diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..203a2923 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,23 @@ +{ + // 使用 IntelliSense 了解相关属性。 + // 悬停以查看现有属性的描述。 + // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python 调试程序: 当前文件", + "type": "debugpy", + "request": "launch", + "program": "/etc/miniconda3/envs/py3_10/lib/python3.10/site-packages/torch/distributed/run.py", + "console": "integratedTerminal", + "justMyCode": true, + "args": [ + "--master_port=7777", + "--nproc_per_node=1", + "/data/WZW/DEIM/train.py", + "-c", "configs/test/deim_hgnetv2_s_visdrone.yml", + "--seed=0" + ] + } + ] +} \ No newline at end of file diff --git a/configs/base/dataloader_dfine.yml b/configs/base/dataloader_dfine.yml new file mode 100644 index 00000000..62193371 --- /dev/null +++ b/configs/base/dataloader_dfine.yml @@ -0,0 +1,39 @@ + +train_dataloader: + dataset: + transforms: + ops: + - {type: RandomPhotometricDistort, p: 0.5} + - {type: RandomZoomOut, fill: 0} + - {type: RandomIoUCrop, p: 0.8} + - {type: SanitizeBoundingBoxes, min_size: 1} + - {type: RandomHorizontalFlip} + - {type: Resize, size: [640, 640], } + - {type: SanitizeBoundingBoxes, min_size: 1} + - {type: ConvertPILImage, dtype: 'float32', scale: True} + - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} + policy: + name: stop_epoch + epoch: 72 # epoch in [71, ~) stop `ops` + ops: ['RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] + + collate_fn: + type: BatchImageCollateFunction + base_size: 640 + base_size_repeat: 3 + stop_epoch: 72 # epoch in [72, ~) stop `multiscales` + + shuffle: True + total_batch_size: 32 # total batch size equals to 32 (4 * 8) + num_workers: 4 + + +val_dataloader: + dataset: + transforms: + ops: + - {type: Resize, size: [640, 640], } + - {type: ConvertPILImage, dtype: 'float32', scale: True} + shuffle: False + total_batch_size: 64 + num_workers: 4 \ No newline at end of file diff --git a/configs/base/dataloader_rtdetrv2.yml b/configs/base/dataloader_rtdetrv2.yml new file mode 100644 index 00000000..d55a4118 --- /dev/null +++ b/configs/base/dataloader_rtdetrv2.yml @@ -0,0 +1,38 @@ + +train_dataloader: + dataset: + transforms: + ops: + - {type: RandomPhotometricDistort, p: 0.5} + - {type: RandomZoomOut, fill: 0} + - {type: RandomIoUCrop, p: 0.8} + - {type: SanitizeBoundingBoxes, min_size: 1} + - {type: RandomHorizontalFlip} + - {type: Resize, size: [640, 640], } + - {type: SanitizeBoundingBoxes, min_size: 1} + - {type: ConvertPILImage, dtype: 'float32', scale: True} + - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} + policy: + name: stop_epoch + epoch: 71 # epoch in [71, ~) stop `ops` + ops: ['RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] + + collate_fn: + type: BatchImageCollateFunction + scales: [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800] + stop_epoch: 71 # epoch in [71, ~) stop `multiscales` + + shuffle: True + total_batch_size: 16 # total batch size equals to 16 (4 * 4) + num_workers: 4 + + +val_dataloader: + dataset: + transforms: + ops: + - {type: Resize, size: [640, 640]} + - {type: ConvertPILImage, dtype: 'float32', scale: True} + shuffle: False + total_batch_size: 32 + num_workers: 4 \ No newline at end of file diff --git a/configs/base/dfine_hgnetv2.yml b/configs/base/dfine_hgnetv2.yml index e9de6d1b..405d5f30 100644 --- a/configs/base/dfine_hgnetv2.yml +++ b/configs/base/dfine_hgnetv2.yml @@ -21,7 +21,7 @@ flat_epoch: 4000000 
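The `stop_epoch` entries in the dataloader configs above switch off the strong augmentations and the multi-scale collate late in training; below is a minimal sketch of that gating, assuming the trainer feeds the current epoch to the policy (the helper names are illustrative, not the repo's implementation):

```python
# Illustrative only: shows the intent of `policy.name: stop_epoch` and
# `collate_fn.stop_epoch`, not the actual DEIM code.
STOP_OPS = ['RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop']

def active_ops(all_ops, epoch, stop_epoch=72):
    # epoch in [stop_epoch, ~): the listed strong-augmentation ops are dropped
    if epoch >= stop_epoch:
        return [op for op in all_ops if op not in STOP_OPS]
    return all_ops

def use_multiscale(epoch, stop_epoch=72):
    # multi-scale resizing in the collate function is disabled from stop_epoch on
    return epoch < stop_epoch

print(active_ops(STOP_OPS + ['RandomHorizontalFlip'], epoch=80))  # only RandomHorizontalFlip remains
print(use_multiscale(epoch=10), use_multiscale(epoch=100))        # True False
```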
no_aug_epoch: 0 HGNetv2: - pretrained: True + pretrained: False local_model_dir: ../RT-DETR-main/D-FINE/weight/hgnetv2/ HybridEncoder: diff --git a/configs/dataset/custom_detection.yml b/configs/dataset/custom_detection.yml index 35435ad6..3039fc3c 100644 --- a/configs/dataset/custom_detection.yml +++ b/configs/dataset/custom_detection.yml @@ -11,8 +11,8 @@ train_dataloader: type: DataLoader dataset: type: CocoDetection - img_folder: /data/yourdataset/train - ann_file: /data/yourdataset/train/train.json + img_folder: /data/WZW/visdrone2019/VisDrone2019-DET-train/images + ann_file: /data/WZW/visdrone2019/VisDrone2019-DET-train/train.json return_masks: False transforms: type: Compose @@ -28,8 +28,8 @@ val_dataloader: type: DataLoader dataset: type: CocoDetection - img_folder: /data/yourdataset/val - ann_file: /data/yourdataset/val/val.json + img_folder: /data/WZW/visdrone2019/VisDrone2019-DET-val/images + ann_file: /data/WZW/visdrone2019/VisDrone2019-DET-val/val.json return_masks: False transforms: type: Compose diff --git a/configs/dataset/visdrone_detection.yml b/configs/dataset/visdrone_detection.yml new file mode 100644 index 00000000..3eca271d --- /dev/null +++ b/configs/dataset/visdrone_detection.yml @@ -0,0 +1,41 @@ +task: detection + +evaluator: + type: CocoEvaluator + iou_types: ['bbox', ] + +num_classes: 10 # your dataset classes +remap_mscoco_category: False + +train_dataloader: + type: DataLoader + dataset: + type: CocoDetection + img_folder: /data/WZW/visdrone2019/VisDrone2019-DET-train/images # 修改全局路径,以下同理 + ann_file: /data/WZW/visdrone2019/VisDrone2019-DET-train/train.json + return_masks: False + transforms: + type: Compose + ops: ~ + shuffle: True + num_workers: 4 + drop_last: True + collate_fn: + type: BatchImageCollateFunction + + +val_dataloader: + type: DataLoader + dataset: + type: CocoDetection + img_folder: /data/WZW/visdrone2019/VisDrone2019-DET-val/images + ann_file: /data/WZW/visdrone2019/VisDrone2019-DET-val/val.json + return_masks: False + transforms: + type: Compose + ops: ~ + shuffle: False + num_workers: 4 + drop_last: False + collate_fn: + type: BatchImageCollateFunction diff --git a/configs/deim/deim_hgnetv2_cgfm_n_custom.yml b/configs/deim/deim_hgnetv2_cgfm_n_custom.yml new file mode 100644 index 00000000..3f823454 --- /dev/null +++ b/configs/deim/deim_hgnetv2_cgfm_n_custom.yml @@ -0,0 +1,63 @@ +__include__: [ + '../dfine/dfine_hgnetv2_n_custom.yml', + '../base/deim.yml' +] + +print_freq: 20 +output_dir: ./outputs/deim_hgnetv2_n_custom + +# 再往上找到base层的encoder进行修改(dfine_hgnetv2) +DEIM: + encoder: HybridEncoder_CGFM +# 此处是修改 dfine_hgnetv2_n_custom.yml +HybridEncoder_CGFM: + in_channels: [512, 1024] + feat_strides: [16, 32] + + # intra + hidden_dim: 128 + # 对应上面feat_strides的32倍下采样 + use_encoder_idx: [1] + dim_feedforward: 512 + + # cross + expansion: 0.34 + depth_mult: 0.5 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.0004 + - + params: '^(?=.*backbone)(?=.*norm|bn).*$' + lr: 0.0004 + weight_decay: 0. + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' + weight_decay: 0. 
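The `params` regexes above route parameters to different learning rates and weight decays. The sketch below only demonstrates how such lookahead patterns classify parameter names; the example names are made up, and matching them against `named_parameters()` keys is an assumption about the optimizer builder, not code from this repo:

```python
import re

# Illustrative only: shows the matching behaviour of the config regexes.
patterns = {
    'backbone (no norm/bn)':     r'^(?=.*backbone)(?!.*norm|bn).*$',
    'backbone norm layers':      r'^(?=.*backbone)(?=.*norm|bn).*$',
    'encoder/decoder norm+bias': r'^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$',
}
names = [  # hypothetical parameter names
    'backbone.stages.0.blocks.0.layers.0.conv.weight',
    'backbone.stages.0.blocks.0.aggregation.0.norm.weight',
    'encoder.fpn_blocks.0.cv1.norm.bias',
    'decoder.dec_score_head.0.bias',
]
for name in names:
    matched = [label for label, pat in patterns.items() if re.match(pat, name)]
    print(f'{name!r} -> {matched or ["default group"]}')
```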
+ + lr: 0.0008 + betas: [0.9, 0.999] + weight_decay: 0.0001 + +# Increase to search for the optimal ema +epoches: 160 # 148 + 12 + +## Our LR-Scheduler +flat_epoch: 78 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 +no_aug_epoch: 12 +lr_gamma: 1.0 + +## Our DataAug +train_dataloader: + dataset: + transforms: + policy: + epoch: [4, 78, 148] # list + + collate_fn: + mixup_epochs: [4, 78] + stop_epoch: 148 + base_size_repeat: ~ \ No newline at end of file diff --git a/configs/deim/deim_hgnetv2_n_custom.yml b/configs/deim/deim_hgnetv2_n_custom.yml new file mode 100644 index 00000000..a9f7eab6 --- /dev/null +++ b/configs/deim/deim_hgnetv2_n_custom.yml @@ -0,0 +1,45 @@ +__include__: [ + '../dfine/dfine_hgnetv2_n_custom.yml', + '../base/deim.yml' +] + +print_freq: 20 +output_dir: ./outputs/deim_hgnetv2_n_custom + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.0004 + - + params: '^(?=.*backbone)(?=.*norm|bn).*$' + lr: 0.0004 + weight_decay: 0. + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' + weight_decay: 0. + + lr: 0.0008 + betas: [0.9, 0.999] + weight_decay: 0.0001 + +# Increase to search for the optimal ema +epoches: 160 # 148 + 12 + +## Our LR-Scheduler +flat_epoch: 78 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 +no_aug_epoch: 12 +lr_gamma: 1.0 + +## Our DataAug +train_dataloader: + dataset: + transforms: + policy: + epoch: [4, 78, 148] # list + + collate_fn: + mixup_epochs: [4, 78] + stop_epoch: 148 + base_size_repeat: ~ \ No newline at end of file diff --git a/configs/deim_dfine/deim_hgnetv2_s_visdrone.yml b/configs/deim_dfine/deim_hgnetv2_s_visdrone.yml new file mode 100644 index 00000000..51ea254c --- /dev/null +++ b/configs/deim_dfine/deim_hgnetv2_s_visdrone.yml @@ -0,0 +1,46 @@ +__include__: [ + './dfine_hgnetv2_s_coco.yml', + '../base/deim.yml' +] + +print_freq: 100 +output_dir: ./outputs/deim_hgnetv2_s_visdrone + + + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*bn).*$' + lr: 0.0002 + - + params: '^(?=.*(?:norm|bn)).*$' # except bias + weight_decay: 0. 
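The `flat_epoch` comments above encode a simple rule: 4 warm-up epochs plus half of the augmented training epochs. A small sketch of that arithmetic, assuming "epoch" in the comment means the epochs before the no-augmentation phase (epoches - no_aug_epoch), which the numbers in these configs are consistent with:

```python
# flat_epoch = 4 + (epoches - no_aug_epoch) // 2
def flat_epoch(epoches, no_aug_epoch):
    return 4 + (epoches - no_aug_epoch) // 2

print(flat_epoch(160, 12))      # 78, as in deim_hgnetv2_n_custom above
print(flat_epoch(132, 12))      # 64, as in the s-sized VisDrone configs in this patch
print(flat_epoch(72 + 12, 12))  # 40, the "40 = 4 + 72 / 2" example from the comment
```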
+ + lr: 0.0004 + betas: [0.9, 0.999] + weight_decay: 0.0001 + + +# Increase to search for the optimal ema +epoches: 132 # 120 + 4n + +## Our LR-Scheduler +flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 +no_aug_epoch: 12 + +## Our DataAug +train_dataloader: + dataset: + transforms: + policy: + epoch: [4, 64, 120] # list + total_batch_size: 4 + + collate_fn: + mixup_epochs: [4, 64] + stop_epoch: 120 + +val_dataloader: + total_batch_size: 8 \ No newline at end of file diff --git a/configs/deim_dfine/dfine_hgnetv2_s_coco.yml b/configs/deim_dfine/dfine_hgnetv2_s_coco.yml index 33857bc4..97f01735 100644 --- a/configs/deim_dfine/dfine_hgnetv2_s_coco.yml +++ b/configs/deim_dfine/dfine_hgnetv2_s_coco.yml @@ -1,5 +1,6 @@ __include__: [ - '../dataset/coco_detection.yml', + # '../dataset/coco_detection.yml', + '/data/WZW/DEIM/configs/dataset/visdrone_detection.yml', '../runtime.yml', '../base/dataloader.yml', '../base/optimizer.yml', diff --git a/configs/dfine/dfine_hgnetv2_n_custom.yml b/configs/dfine/dfine_hgnetv2_n_custom.yml new file mode 100644 index 00000000..23e332b4 --- /dev/null +++ b/configs/dfine/dfine_hgnetv2_n_custom.yml @@ -0,0 +1,84 @@ +__include__: [ + '../dataset/custom_detection.yml', + '../runtime.yml', + '../base/dataloader_dfine.yml', + '../base/optimizer.yml', + '../base/dfine_hgnetv2.yml', +] + +print_freq: 20 +output_dir: ./outputs/dfine_hgnetv2_n_custom + + +DEIM: + backbone: HGNetv2 + +HGNetv2: + name: 'B0' + return_idx: [2, 3] + freeze_at: -1 + freeze_norm: False + use_lab: True + + +HybridEncoder: + in_channels: [512, 1024] + feat_strides: [16, 32] + + # intra + hidden_dim: 128 + # 对应上面feat_strides的32倍下采样 + use_encoder_idx: [1] + dim_feedforward: 512 + + # cross + expansion: 0.34 + depth_mult: 0.5 + + +DFINETransformer: + feat_channels: [128, 128] + feat_strides: [16, 32] + hidden_dim: 128 + dim_feedforward: 512 + num_levels: 2 + + num_layers: 3 + eval_idx: -1 + + num_points: [6, 6] + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.0004 + - + params: '^(?=.*backbone)(?=.*norm|bn).*$' + lr: 0.0004 + weight_decay: 0. + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' + weight_decay: 0. 
+ + lr: 0.0008 + betas: [0.9, 0.999] + weight_decay: 0.0001 + + +# Increase to search for the optimal ema +epoches: 160 # 148 + 4n +train_dataloader: + total_batch_size: 8 + dataset: + transforms: + policy: + epoch: 148 + collate_fn: + stop_epoch: 148 + ema_restart_decay: 0.9999 + base_size_repeat: ~ + +val_dataloader: + total_batch_size: 8 diff --git a/configs/dfine/dfine_hgnetv2_n_mal_custom.yml b/configs/dfine/dfine_hgnetv2_n_mal_custom.yml new file mode 100644 index 00000000..4c38916b --- /dev/null +++ b/configs/dfine/dfine_hgnetv2_n_mal_custom.yml @@ -0,0 +1,89 @@ +__include__: [ + '../dataset/custom_detection.yml', + '../runtime.yml', + '../base/dataloader_dfine.yml', + '../base/optimizer.yml', + '../base/dfine_hgnetv2.yml', +] + +print_freq: 20 +output_dir: ./outputs/dfine_hgnetv2_n_custom + + +DEIM: + backbone: HGNetv2 + +HGNetv2: + name: 'B0' + return_idx: [2, 3] + freeze_at: -1 + freeze_norm: False + use_lab: True + + +HybridEncoder: + in_channels: [512, 1024] + feat_strides: [16, 32] + + # intra + hidden_dim: 128 + use_encoder_idx: [1] + dim_feedforward: 512 + + # cross + expansion: 0.34 + depth_mult: 0.5 + + +DFINETransformer: + feat_channels: [128, 128] + feat_strides: [16, 32] + hidden_dim: 128 + dim_feedforward: 512 + num_levels: 2 + + num_layers: 3 + eval_idx: -1 + + num_points: [6, 6] + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.0004 + - + params: '^(?=.*backbone)(?=.*norm|bn).*$' + lr: 0.0004 + weight_decay: 0. + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' + weight_decay: 0. + + lr: 0.0008 + betas: [0.9, 0.999] + weight_decay: 0.0001 + + +# Increase to search for the optimal ema +epoches: 160 # 148 + 4n +train_dataloader: + total_batch_size: 8 + dataset: + transforms: + policy: + epoch: 148 + collate_fn: + stop_epoch: 148 + ema_restart_decay: 0.9999 + base_size_repeat: ~ + +val_dataloader: + total_batch_size: 8 + +## Our Loss +DEIMCriterion: + weight_dict: {loss_mal: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5} + losses: ['mal', 'boxes', 'local'] + gamma: 1.5 \ No newline at end of file diff --git a/configs/dfine/dfine_hgnetv2_s_custom.yml b/configs/dfine/dfine_hgnetv2_s_custom.yml new file mode 100644 index 00000000..2c841b93 --- /dev/null +++ b/configs/dfine/dfine_hgnetv2_s_custom.yml @@ -0,0 +1,65 @@ +__include__: [ + '../dataset/custom_detection.yml', + '../runtime.yml', + '../base/dataloader_dfine.yml', + '../base/optimizer.yml', + '../base/dfine_hgnetv2.yml', +] + +print_freq: 20 +output_dir: ./outputs/dfine_hgnetv2_s_custom + + +DEIM: + backbone: HGNetv2 + +HGNetv2: + name: 'B0' + return_idx: [1, 2, 3] + freeze_at: -1 + freeze_norm: False + use_lab: True + +DFINETransformer: + num_layers: 3 # 4 5 6 + eval_idx: -1 # -2 -3 -4 + +HybridEncoder: + in_channels: [256, 512, 1024] + hidden_dim: 256 + depth_mult: 0.34 + expansion: 0.5 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.0001 + - + params: '^(?=.*backbone)(?=.*norm|bn).*$' + lr: 0.0001 + weight_decay: 0. + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' + weight_decay: 0. 
+ + lr: 0.0002 + betas: [0.9, 0.999] + weight_decay: 0.0001 + + +# Increase to search for the optimal ema +epoches: 132 # 120 + 4n +train_dataloader: + dataset: + transforms: + policy: + epoch: 120 + collate_fn: + stop_epoch: 120 + ema_restart_decay: 0.9999 + base_size_repeat: 20 + total_batch_size: 4 +val_dataloader: + total_batch_size: 4 \ No newline at end of file diff --git a/configs/runtime.yml b/configs/runtime.yml index 004a27c6..546efb74 100644 --- a/configs/runtime.yml +++ b/configs/runtime.yml @@ -1,11 +1,11 @@ print_freq: 100 output_dir: './logs' checkpoint_freq: 12 - +plot_train_batch_freq: 12 sync_bn: True find_unused_parameters: False - +verbose_type: 'progress' # origin:原始的输出方式 progress:进度条形式 use_amp: False scaler: diff --git a/configs/test/deim_hgnetv2_n_visdrone.yml b/configs/test/deim_hgnetv2_n_visdrone.yml new file mode 100644 index 00000000..f1ada477 --- /dev/null +++ b/configs/test/deim_hgnetv2_n_visdrone.yml @@ -0,0 +1,49 @@ +__include__: [ + '../dfine/dfine_hgnetv2_n_custom.yml', + 'visdrone2019.yml', + '../base/deim.yml', +] + +print_freq: 20 +output_dir: ./outputs/deim_hgnetv2_n_custom + +HGNetv2: + agg: ese + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.0004 + - + params: '^(?=.*backbone)(?=.*norm|bn).*$' + lr: 0.0004 + weight_decay: 0. + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' + weight_decay: 0. + + lr: 0.0008 + betas: [0.9, 0.999] + weight_decay: 0.0001 + +# Increase to search for the optimal ema +epoches: 160 # 148 + 12 + +## Our LR-Scheduler +flat_epoch: 78 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 +no_aug_epoch: 12 +lr_gamma: 1.0 + +## Our DataAug +train_dataloader: + dataset: + transforms: + policy: + epoch: [4, 78, 148] # list + + collate_fn: + mixup_epochs: [4, 78] + stop_epoch: 148 + base_size_repeat: ~ \ No newline at end of file diff --git a/configs/test/deim_hgnetv2_pconv.yml b/configs/test/deim_hgnetv2_pconv.yml new file mode 100644 index 00000000..51e2e7a4 --- /dev/null +++ b/configs/test/deim_hgnetv2_pconv.yml @@ -0,0 +1,58 @@ +__include__: [ + '../dfine/dfine_hgnetv2_n_custom.yml', + 'visdrone2019.yml', + '../base/deim.yml', +] + +print_freq: 20 +output_dir: ./outputs/deim_hgnetv2_n_custom + +DEIM: + backbone: HGNetv2_PConv + +HGNetv2_PConv: + pretrained: False + name: 'B0' + # !!!控制哪些层送入特征金字塔里面 + return_idx: [2, 3] + freeze_at: -1 + freeze_norm: False + use_lab: True + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.0004 + - + params: '^(?=.*backbone)(?=.*norm|bn).*$' + lr: 0.0004 + weight_decay: 0. + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' + weight_decay: 0. 
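`return_idx: [2, 3]` in the backbone configs above selects which stage outputs feed the hybrid encoder; the sketch below mirrors the `if idx in self.return_idx` check in the backbone forward later in this patch, using dummy tensors and the B0 channel/stride table (shapes are illustrative only):

```python
import torch

# Dummy feature maps standing in for the four HGNetv2-B0 stage outputs.
out_channels = [64, 256, 512, 1024]   # per-stage output channels (B0)
out_strides  = [4, 8, 16, 32]         # per-stage output strides
return_idx   = [2, 3]                 # as in the pconv/VisDrone configs above

feats = [torch.zeros(1, c, 640 // s, 640 // s) for c, s in zip(out_channels, out_strides)]
outs = [f for i, f in enumerate(feats) if i in return_idx]
print([tuple(f.shape) for f in outs])
# [(1, 512, 40, 40), (1, 1024, 20, 20)], matching HybridEncoder
# in_channels [512, 1024] and feat_strides [16, 32] in these configs.
```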
+ + lr: 0.0008 + betas: [0.9, 0.999] + weight_decay: 0.0001 + +# Increase to search for the optimal ema +epoches: 160 # 148 + 12 + +## Our LR-Scheduler +flat_epoch: 78 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 +no_aug_epoch: 12 +lr_gamma: 1.0 + +## Our DataAug +train_dataloader: + dataset: + transforms: + policy: + epoch: [4, 78, 148] # list + + collate_fn: + mixup_epochs: [4, 78] + stop_epoch: 148 + base_size_repeat: ~ \ No newline at end of file diff --git a/configs/test/deim_hgnetv2_s_visdrone.yml b/configs/test/deim_hgnetv2_s_visdrone.yml new file mode 100644 index 00000000..83707667 --- /dev/null +++ b/configs/test/deim_hgnetv2_s_visdrone.yml @@ -0,0 +1,44 @@ +__include__: [ + '../dfine/dfine_hgnetv2_s_custom.yml', + 'visdrone2019.yml', + '../base/deim.yml', +] + +print_freq: 20 +output_dir: ./outputs/deim_hgnetv2_s_custom + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*bn).*$' + lr: 0.0002 + - + params: '^(?=.*(?:norm|bn)).*$' # except bias + weight_decay: 0. + + lr: 0.0004 + betas: [0.9, 0.999] + weight_decay: 0.0001 + + +# Increase to search for the optimal ema +epoches: 132 # 120 + 4n + +## Our LR-Scheduler +flat_epoch: 64 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 +no_aug_epoch: 12 + +## Our DataAug +train_dataloader: + dataset: + transforms: + policy: + epoch: [4, 64, 120] # list + + collate_fn: + mixup_epochs: [4, 64] + stop_epoch: 120 + total_batch_size: 4 +val_dataloader: + total_batch_size: 4 \ No newline at end of file diff --git a/configs/test/deim_hgnetv3.yml b/configs/test/deim_hgnetv3.yml new file mode 100644 index 00000000..30cff316 --- /dev/null +++ b/configs/test/deim_hgnetv3.yml @@ -0,0 +1,59 @@ +__include__: [ + '../dfine/dfine_hgnetv2_n_custom.yml', + 'visdrone2019.yml', + '../base/deim.yml', +] + +print_freq: 20 +output_dir: ./outputs/deim_hgnetv2_n_custom + +DEIM: + backbone: HGNetv3 + +HGNetv3: + # agg: se + pretrained: False + name: 'B0' + # !!!控制哪些层送入特征金字塔里面 + return_idx: [2, 3] + freeze_at: -1 + freeze_norm: False + use_lab: True + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.0004 + - + params: '^(?=.*backbone)(?=.*norm|bn).*$' + lr: 0.0004 + weight_decay: 0. + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' + weight_decay: 0. 
+ + lr: 0.0008 + betas: [0.9, 0.999] + weight_decay: 0.0001 + +# Increase to search for the optimal ema +epoches: 160 # 148 + 12 + +## Our LR-Scheduler +flat_epoch: 78 # 4 + epoch // 2, e.g., 40 = 4 + 72 / 2 +no_aug_epoch: 12 +lr_gamma: 1.0 + +## Our DataAug +train_dataloader: + dataset: + transforms: + policy: + epoch: [4, 78, 148] # list + + collate_fn: + mixup_epochs: [4, 78] + stop_epoch: 148 + base_size_repeat: ~ \ No newline at end of file diff --git a/configs/test/visdrone2019.yml b/configs/test/visdrone2019.yml new file mode 100644 index 00000000..0156e779 --- /dev/null +++ b/configs/test/visdrone2019.yml @@ -0,0 +1,70 @@ +task: detection + +evaluator: + type: CocoEvaluator + iou_types: ['bbox', ] + +num_classes: 10 # your dataset classes +remap_mscoco_category: False + +train_dataloader: + type: DataLoader + dataset: + type: CocoDetection + # img_folder: /datasets/MoGuiMianJu/visdrone/VisDrone2019-DET-train/images + # ann_file: /datasets/MoGuiMianJu/visdrone/VisDrone2019-DET-train/annotations/train.json + # img_folder: /root/dataset/dataset_visdrone/VisDrone2019-DET-train/images + # ann_file: /root/dataset/dataset_visdrone/VisDrone2019-DET-train/annotations/train.json + img_folder: /data/WZW/visdrone2019/VisDrone2019-DET-train/images + ann_file: /data/WZW/visdrone2019/VisDrone2019-DET-train/train.json + return_masks: False + transforms: + type: Compose + ops: ~ + shuffle: True + num_workers: 4 + drop_last: True + pin_memory: True + collate_fn: + type: BatchImageCollateFunction + + +val_dataloader: + type: DataLoader + dataset: + type: CocoDetection + # img_folder: /datasets/MoGuiMianJu/visdrone/VisDrone2019-DET-val/images + # ann_file: /datasets/MoGuiMianJu/visdrone/VisDrone2019-DET-val/annotations/val.json + # img_folder: /root/dataset/dataset_visdrone/VisDrone2019-DET-val/images + # ann_file: /root/dataset/dataset_visdrone/VisDrone2019-DET-val/annotations/val.json + img_folder: /data/WZW/visdrone2019/VisDrone2019-DET-val/images + ann_file: /data/WZW/visdrone2019/VisDrone2019-DET-val/val.json + return_masks: False + transforms: + type: Compose + ops: ~ + shuffle: False + num_workers: 4 + drop_last: False + pin_memory: True + collate_fn: + type: BatchImageCollateFunction + +# val_dataloader: +# type: DataLoader +# dataset: +# type: CocoDetection +# img_folder: /datasets/MoGuiMianJu/visdrone/VisDrone2019-DET-test-dev/images +# ann_file: /datasets/MoGuiMianJu/visdrone/VisDrone2019-DET-test-dev/annotations/test.json +# img_folder: /home/dataset/dataset_visdrone/VisDrone2019-DET-test-dev/images +# ann_file: /home/dataset/dataset_visdrone/VisDrone2019-DET-test-dev/annotations/test.json +# return_masks: False +# transforms: +# type: Compose +# ops: ~ +# shuffle: False +# num_workers: 4 +# drop_last: False +# pin_memory: True +# collate_fn: +# type: BatchImageCollateFunction \ No newline at end of file diff --git a/engine/backbone/__init__.py b/engine/backbone/__init__.py index e6902ae9..5ab498d7 100644 --- a/engine/backbone/__init__.py +++ b/engine/backbone/__init__.py @@ -18,3 +18,5 @@ from .csp_darknet import CSPDarkNet, CSPPAN from .hgnetv2 import HGNetv2 +from .hgnetv3 import HGNetv3 +from .hgnetv2_pconv import HGNetv2_PConv \ No newline at end of file diff --git a/engine/backbone/hgnetv2.py b/engine/backbone/hgnetv2.py index a26f38b0..2293401b 100644 --- a/engine/backbone/hgnetv2.py +++ b/engine/backbone/hgnetv2.py @@ -12,6 +12,8 @@ from .common import FrozenBatchNorm2d from ..core import register import logging +from ..extre_module.custom_nn.attention.ema import EMA +from 
..extre_module.custom_nn.attention.simam import SimAM # Constants for initialization kaiming_normal_ = nn.init.kaiming_normal_ @@ -237,6 +239,9 @@ def __init__( # feature aggregation total_chs = in_chs + layer_num * mid_chs + + print(f'-----------this is agg: {agg}---------------') + if agg == 'se': aggregation_squeeze_conv = ConvBNAct( total_chs, @@ -256,7 +261,7 @@ def __init__( aggregation_squeeze_conv, aggregation_excitation_conv, ) - else: + elif agg == 'ese': # 默认ESE注意力机制 aggregation_conv = ConvBNAct( total_chs, out_chs, @@ -269,7 +274,26 @@ def __init__( aggregation_conv, att, ) - + elif agg == 'ema': # mywork: ema注意力 + aggregation_conv = ConvBNAct( + total_chs, out_chs, kernel_size=1, stride=1, use_lab=use_lab + ) + att = EMA(out_chs) + self.aggregation = nn.Sequential( + aggregation_conv, + att, + ) + elif agg == 'simam': # simam注意力 + aggregation_conv = ConvBNAct( + total_chs, out_chs, kernel_size=1, stride=1, use_lab=use_lab + ) + att = SimAM() + self.aggregation = nn.Sequential( + aggregation_conv, + att, + ) + else: + raise Exception(f"param agg{agg} Illegal") self.drop_path = nn.Dropout(drop_path) if drop_path else nn.Identity() def forward(self, x): @@ -278,8 +302,13 @@ def forward(self, x): for layer in self.layers: x = layer(x) output.append(x) + # x1 = layers_1(x) + # x2 = layers_2(x1) + # x3 = layers_3(x2) + # output = [x, x1, x2, x3] x = torch.cat(output, dim=1) x = self.aggregation(x) + # 做残差连接的特征图通道数必须相同!(即x 与 identity chs一样) if self.residual: x = self.drop_path(x) + identity return x @@ -297,7 +326,7 @@ def __init__( light_block=False, kernel_size=3, use_lab=False, - agg='se', + agg='ese', drop_path=0., ): super().__init__() @@ -441,6 +470,7 @@ def __init__(self, freeze_at=0, freeze_norm=True, pretrained=True, + agg='se', local_model_dir='weight/hgnetv2/'): super().__init__() self.use_lab = use_lab @@ -475,7 +505,8 @@ def __init__(self, downsample, light_block, kernel_size, - use_lab)) + use_lab, + agg)) if freeze_at >= 0: self._freeze_parameters(self.stem) @@ -495,7 +526,7 @@ def __init__(self, print(f"Loaded stage1 {name} HGNetV2 from local file.") else: # If the file doesn't exist locally, download from the URL - if torch.distributed.get_rank() == 0: + if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: print(GREEN + "If the pretrained HGNetV2 can't be downloaded automatically. Please check your network connection." + RESET) print(GREEN + "Please check your network connection. Or download the model manually from " + RESET + f"{download_url}" + GREEN + " to " + RESET + f"{local_model_dir}." + RESET) state = torch.hub.load_state_dict_from_url(download_url, map_location='cpu', model_dir=local_model_dir) @@ -509,7 +540,7 @@ def __init__(self, self.load_state_dict(state) except (Exception, KeyboardInterrupt) as e: - if torch.distributed.get_rank() == 0: + if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: print(f"{str(e)}") logging.error(RED + "CRITICAL WARNING: Failed to load pretrained HGNetV2 model" + RESET) logging.error(GREEN + "Please check your network connection. Or download the model manually from " \ diff --git a/engine/backbone/hgnetv2_pconv.py b/engine/backbone/hgnetv2_pconv.py new file mode 100644 index 00000000..2c6f6cfd --- /dev/null +++ b/engine/backbone/hgnetv2_pconv.py @@ -0,0 +1,102 @@ +""" +reference +- https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py + +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
+""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +import os +from .common import FrozenBatchNorm2d +from ..core import register +from .hgnetv2 import HG_Block, HG_Stage, HGNetv2 +from engine.extre_module.custom_nn.conv_module.pconv import Partial_Conv + +# Constants for initialization +kaiming_normal_ = nn.init.kaiming_normal_ +zeros_ = nn.init.zeros_ +ones_ = nn.init.ones_ + +class HG_Block_PConv(HG_Block): + def __init__(self, in_chs, mid_chs, out_chs, layer_num, kernel_size=3, residual=False, light_block=False, use_lab=False, agg='ese', drop_path=0): + super().__init__(in_chs, mid_chs, out_chs, layer_num, kernel_size, residual, light_block, use_lab, agg, drop_path) + + if light_block: + self.layers = nn.ModuleList() + for i in range(layer_num): + self.layers.append( + Partial_Conv( + in_chs if i == 0 else mid_chs, + mid_chs, + ) + ) + + + + +class HG_Stage_PConv(HG_Stage): + def __init__(self, in_chs, mid_chs, out_chs, block_num, layer_num, downsample=True, light_block=False, kernel_size=3, use_lab=False, agg='ese', drop_path=0): + super().__init__(in_chs, mid_chs, out_chs, block_num, layer_num, downsample, light_block, kernel_size, use_lab, agg, drop_path) + + blocks_list = [] + for i in range(block_num): + blocks_list.append( + HG_Block_PConv( + in_chs if i == 0 else out_chs, + mid_chs, + out_chs, + layer_num, + residual=False if i == 0 else True, + kernel_size=kernel_size, + light_block=light_block, + use_lab=use_lab, + agg=agg, + drop_path=drop_path[i] if isinstance(drop_path, (list, tuple)) else drop_path, + ) + ) + self.blocks = nn.Sequential(*blocks_list) + + + +@register() +class HGNetv2_PConv(HGNetv2): + def __init__(self, name, use_lab=False, return_idx=..., freeze_stem_only=True, freeze_at=0, freeze_norm=True, pretrained=True, agg='se', local_model_dir='weight/hgnetv2/'): + super().__init__(name, use_lab, return_idx, freeze_stem_only, freeze_at, freeze_norm, pretrained, agg, local_model_dir) + + stage_config = self.arch_configs[name]['stage_config'] + + # stages + self.stages = nn.ModuleList() + for i, k in enumerate(stage_config): + in_channels, mid_channels, out_channels, block_num, downsample, light_block, kernel_size, layer_num = stage_config[ + k] + self.stages.append( + HG_Stage_PConv( + in_channels, + mid_channels, + out_channels, + block_num, + layer_num, + downsample, + light_block, + kernel_size, + use_lab, + agg)) + + ''' + 注意此处修改逻辑: + 如果把“冻结逻辑”放在父类的 __init__ 里,父类根本就不知道子类会在后面添加哪些额外模块, + 也不知道“要不要冻结这些子模块”。唯一能做的就是在父类里把自己那一套层(stem、backbone 的通用部分、某些归一化层等)先冻结, + 而对“子类专有的阶段”无能为力。 只有在“父类部分 + 子类部分的所有模块都已经创建完毕”之后,再去“遍历并冻结它们的参数”,才能保证冻结操作对所有目标层都生效。 + ''' + if freeze_at >= 0: + self._freeze_parameters(self.stem) + if not freeze_stem_only: + for i in range(min(freeze_at + 1, len(self.stages))): + self._freeze_parameters(self.stages[i]) + + if freeze_norm: + self._freeze_norm(self) + diff --git a/engine/backbone/hgnetv3.py b/engine/backbone/hgnetv3.py new file mode 100644 index 00000000..360785e4 --- /dev/null +++ b/engine/backbone/hgnetv3.py @@ -0,0 +1,573 @@ +""" +reference +- https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py + +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
+""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +import os +from .common import FrozenBatchNorm2d +from ..core import register +import logging +from ..extre_module.custom_nn.attention.ema import EMA +from ..extre_module.custom_nn.attention.simam import SimAM + +# Constants for initialization +kaiming_normal_ = nn.init.kaiming_normal_ +zeros_ = nn.init.zeros_ +ones_ = nn.init.ones_ + +__all__ = ['HGNetv3'] + + +class LearnableAffineBlock(nn.Module): + def __init__( + self, + scale_value=1.0, + bias_value=0.0 + ): + super().__init__() + self.scale = nn.Parameter(torch.tensor([scale_value]), requires_grad=True) + self.bias = nn.Parameter(torch.tensor([bias_value]), requires_grad=True) + + def forward(self, x): + return self.scale * x + self.bias + + +class ConvBNAct(nn.Module): + def __init__( + self, + in_chs, + out_chs, + kernel_size, + stride=1, + groups=1, + padding='', + use_act=True, + use_lab=False + ): + super().__init__() + self.use_act = use_act + self.use_lab = use_lab + if padding == 'same': + self.conv = nn.Sequential( + nn.ZeroPad2d([0, 1, 0, 1]), + nn.Conv2d( + in_chs, + out_chs, + kernel_size, + stride, + groups=groups, + bias=False + ) + ) + else: + self.conv = nn.Conv2d( + in_chs, + out_chs, + kernel_size, + stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=False + ) + self.bn = nn.BatchNorm2d(out_chs) + if self.use_act: + self.act = nn.ReLU() + else: + self.act = nn.Identity() + if self.use_act and self.use_lab: + self.lab = LearnableAffineBlock() + else: + self.lab = nn.Identity() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + x = self.lab(x) + return x + + +class LightConvBNAct(nn.Module): + def __init__( + self, + in_chs, + out_chs, + kernel_size, + groups=1, + use_lab=False, + ): + super().__init__() + self.conv1 = ConvBNAct( + in_chs, + out_chs, + kernel_size=1, + use_act=False, + use_lab=use_lab, + ) + self.conv2 = ConvBNAct( + out_chs, + out_chs, + kernel_size=kernel_size, + groups=out_chs, + use_act=True, + use_lab=use_lab, + ) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + +class StemBlock(nn.Module): + # for HGNetv2 + def __init__(self, in_chs, mid_chs, out_chs, use_lab=False): + super().__init__() + self.stem1 = ConvBNAct( + in_chs, + mid_chs, + kernel_size=3, + stride=2, + use_lab=use_lab, + ) + self.stem2a = ConvBNAct( + mid_chs, + mid_chs // 2, + kernel_size=2, + stride=1, + use_lab=use_lab, + ) + self.stem2b = ConvBNAct( + mid_chs // 2, + mid_chs, + kernel_size=2, + stride=1, + use_lab=use_lab, + ) + self.stem3 = ConvBNAct( + mid_chs * 2, + mid_chs, + kernel_size=3, + stride=2, + use_lab=use_lab, + ) + self.stem4 = ConvBNAct( + mid_chs, + out_chs, + kernel_size=1, + stride=1, + use_lab=use_lab, + ) + self.pool = nn.MaxPool2d(kernel_size=2, stride=1, ceil_mode=True) + + def forward(self, x): + x = self.stem1(x) + x = F.pad(x, (0, 1, 0, 1)) + x2 = self.stem2a(x) + x2 = F.pad(x2, (0, 1, 0, 1)) + x2 = self.stem2b(x2) + x1 = self.pool(x) + x = torch.cat([x1, x2], dim=1) + x = self.stem3(x) + x = self.stem4(x) + return x + + +class EseModule(nn.Module): + def __init__(self, chs): + super().__init__() + self.conv = nn.Conv2d( + chs, + chs, + kernel_size=1, + stride=1, + padding=0, + ) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + identity = x + x = x.mean((2, 3), keepdim=True) + x = self.conv(x) + x = self.sigmoid(x) + return torch.mul(identity, x) + + +class HG_Block(nn.Module): + def __init__( + self, + in_chs, + mid_chs, + out_chs, + 
layer_num, + kernel_size=3, + residual=False, + light_block=False, + use_lab=False, + agg='ese', + drop_path=0., + ): + super().__init__() + self.residual = residual + + self.layers = nn.ModuleList() + for i in range(layer_num): + if light_block: + self.layers.append( + LightConvBNAct( + in_chs if i == 0 else mid_chs, + mid_chs, + kernel_size=kernel_size, + use_lab=use_lab, + ) + ) + else: + self.layers.append( + ConvBNAct( + in_chs if i == 0 else mid_chs, + mid_chs, + kernel_size=kernel_size, + stride=1, + use_lab=use_lab, + ) + ) + + # feature aggregation + total_chs = in_chs + layer_num * mid_chs + + print(f'-----------this is agg: {agg}---------------') + + if agg == 'se': + aggregation_squeeze_conv = ConvBNAct( + total_chs, + out_chs // 2, + kernel_size=1, + stride=1, + use_lab=use_lab, + ) + aggregation_excitation_conv = ConvBNAct( + out_chs // 2, + out_chs, + kernel_size=1, + stride=1, + use_lab=use_lab, + ) + self.aggregation = nn.Sequential( + aggregation_squeeze_conv, + aggregation_excitation_conv, + ) + elif agg == 'ese': # 默认ESE注意力机制 + aggregation_conv = ConvBNAct( + total_chs, + out_chs, + kernel_size=1, + stride=1, + use_lab=use_lab, + ) + att = EseModule(out_chs) + self.aggregation = nn.Sequential( + aggregation_conv, + att, + ) + elif agg == 'ema': # mywork: ema注意力 + aggregation_conv = ConvBNAct( + total_chs, out_chs, kernel_size=1, stride=1, use_lab=use_lab + ) + att = EMA(out_chs) + self.aggregation = nn.Sequential( + aggregation_conv, + att, + ) + elif agg == 'simam': # simam注意力 + aggregation_conv = ConvBNAct( + total_chs, out_chs, kernel_size=1, stride=1, use_lab=use_lab + ) + att = SimAM() + self.aggregation = nn.Sequential( + aggregation_conv, + att, + ) + else: + raise Exception(f"param agg{agg} Illegal") + self.drop_path = nn.Dropout(drop_path) if drop_path else nn.Identity() + + def forward(self, x): + identity = x + output = [x] + for layer in self.layers: + x = layer(x) + output.append(x) + # x1 = layers_1(x) + # x2 = layers_2(x1) + # x3 = layers_3(x2) + # output = [x, x1, x2, x3] + x = torch.cat(output, dim=1) + x = self.aggregation(x) + if self.residual: + x = self.drop_path(x) + identity + return x + + +class HG_Stage(nn.Module): + def __init__( + self, + in_chs, + mid_chs, + out_chs, + block_num, + layer_num, + downsample=True, + light_block=False, + kernel_size=3, + use_lab=False, + agg='ese', + drop_path=0., + ): + super().__init__() + self.downsample = downsample + if downsample: + self.downsample = ConvBNAct( + in_chs, + in_chs, + kernel_size=3, + stride=2, + groups=in_chs, + use_act=False, + use_lab=use_lab, + ) + else: + self.downsample = nn.Identity() + + blocks_list = [] + for i in range(block_num): + blocks_list.append( + HG_Block( + in_chs if i == 0 else out_chs, + mid_chs, + out_chs, + layer_num, + residual=False if i == 0 else True, + kernel_size=kernel_size, + light_block=light_block, + use_lab=use_lab, + agg=agg, + drop_path=drop_path[i] if isinstance(drop_path, (list, tuple)) else drop_path, + ) + ) + self.blocks = nn.Sequential(*blocks_list) + + def forward(self, x): + x = self.downsample(x) + x = self.blocks(x) + return x + + + +@register() +class HGNetv3(nn.Module): + """ + HGNetV2 + Args: + stem_channels: list. Number of channels for the stem block. + stage_type: str. The stage configuration of HGNet. such as the number of channels, stride, etc. + use_lab: boolean. Whether to use LearnableAffineBlock in network. + lr_mult_list: list. Control the learning rate of different stages. + Returns: + model: nn.Layer. 
Specific HGNetV2 model depends on args. + """ + + arch_configs = { + 'B0': { + 'stem_channels': [3, 16, 16], + 'stage_config': { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [16, 16, 64, 1, False, False, 3, 3], + "stage2": [64, 32, 256, 1, True, False, 3, 3], + "stage3": [256, 64, 512, 2, True, True, 5, 3], + "stage4": [512, 128, 1024, 1, True, True, 5, 3], + }, + 'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B0_stage1.pth' + }, + 'B1': { + 'stem_channels': [3, 24, 32], + 'stage_config': { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [32, 32, 64, 1, False, False, 3, 3], + "stage2": [64, 48, 256, 1, True, False, 3, 3], + "stage3": [256, 96, 512, 2, True, True, 5, 3], + "stage4": [512, 192, 1024, 1, True, True, 5, 3], + }, + 'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B1_stage1.pth' + }, + 'B2': { + 'stem_channels': [3, 24, 32], + 'stage_config': { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [32, 32, 96, 1, False, False, 3, 4], + "stage2": [96, 64, 384, 1, True, False, 3, 4], + "stage3": [384, 128, 768, 3, True, True, 5, 4], + "stage4": [768, 256, 1536, 1, True, True, 5, 4], + }, + 'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B2_stage1.pth' + }, + 'B3': { + 'stem_channels': [3, 24, 32], + 'stage_config': { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [32, 32, 128, 1, False, False, 3, 5], + "stage2": [128, 64, 512, 1, True, False, 3, 5], + "stage3": [512, 128, 1024, 3, True, True, 5, 5], + "stage4": [1024, 256, 2048, 1, True, True, 5, 5], + }, + 'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B3_stage1.pth' + }, + 'B4': { + 'stem_channels': [3, 32, 48], + 'stage_config': { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [48, 48, 128, 1, False, False, 3, 6], + "stage2": [128, 96, 512, 1, True, False, 3, 6], + "stage3": [512, 192, 1024, 3, True, True, 5, 6], + "stage4": [1024, 384, 2048, 1, True, True, 5, 6], + }, + 'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B4_stage1.pth' + }, + 'B5': { + 'stem_channels': [3, 32, 64], + 'stage_config': { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [64, 64, 128, 1, False, False, 3, 6], + "stage2": [128, 128, 512, 2, True, False, 3, 6], + "stage3": [512, 256, 1024, 5, True, True, 5, 6], + "stage4": [1024, 512, 2048, 2, True, True, 5, 6], + }, + 'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B5_stage1.pth' + }, + 'B6': { + 'stem_channels': [3, 48, 96], + 'stage_config': { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [96, 96, 192, 2, False, False, 3, 6], + "stage2": [192, 192, 512, 3, True, False, 3, 6], + "stage3": [512, 384, 1024, 6, True, True, 5, 6], + "stage4": [1024, 768, 2048, 3, True, True, 5, 6], + }, + 'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B6_stage1.pth' + }, + } + + def __init__(self, + name, + use_lab=False, + return_idx=[1, 2, 3], + freeze_stem_only=True, + freeze_at=0, + freeze_norm=True, + 
pretrained=True, + agg='ese', + local_model_dir='weight/hgnetv2/'): + super().__init__() + self.use_lab = use_lab + self.return_idx = return_idx + + stem_channels = self.arch_configs[name]['stem_channels'] + stage_config = self.arch_configs[name]['stage_config'] + download_url = self.arch_configs[name]['url'] + + self._out_strides = [4, 8, 16, 32] + self._out_channels = [stage_config[k][2] for k in stage_config] + + # stem + self.stem = StemBlock( + in_chs=stem_channels[0], + mid_chs=stem_channels[1], + out_chs=stem_channels[2], + use_lab=use_lab) + + # stages + self.stages = nn.ModuleList() + for i, k in enumerate(stage_config): + in_channels, mid_channels, out_channels, block_num, downsample, light_block, kernel_size, layer_num = stage_config[ + k] + self.stages.append( + HG_Stage( + in_channels, + mid_channels, + out_channels, + block_num, + layer_num, + downsample, + light_block, + kernel_size, + use_lab, + agg)) + + if freeze_at >= 0: + self._freeze_parameters(self.stem) + if not freeze_stem_only: + for i in range(min(freeze_at + 1, len(self.stages))): + self._freeze_parameters(self.stages[i]) + + if freeze_norm: + self._freeze_norm(self) + + if pretrained: + RED, GREEN, RESET = "\033[91m", "\033[92m", "\033[0m" + try: + model_path = local_model_dir + 'PPHGNetV2_' + name + '_stage1.pth' + if os.path.exists(model_path): + state = torch.load(model_path, map_location='cpu') + print(f"Loaded stage1 {name} HGNetV2 from local file.") + else: + # If the file doesn't exist locally, download from the URL + if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: + print(GREEN + "If the pretrained HGNetV2 can't be downloaded automatically. Please check your network connection." + RESET) + print(GREEN + "Please check your network connection. Or download the model manually from " + RESET + f"{download_url}" + GREEN + " to " + RESET + f"{local_model_dir}." + RESET) + state = torch.hub.load_state_dict_from_url(download_url, map_location='cpu', model_dir=local_model_dir) + torch.distributed.barrier() + else: + torch.distributed.barrier() + state = torch.load(local_model_dir) + + print(f"Loaded stage1 {name} HGNetV2 from URL.") + + self.load_state_dict(state) + + except (Exception, KeyboardInterrupt) as e: + if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: + print(f"{str(e)}") + logging.error(RED + "CRITICAL WARNING: Failed to load pretrained HGNetV2 model" + RESET) + logging.error(GREEN + "Please check your network connection. Or download the model manually from " \ + + RESET + f"{download_url}" + GREEN + " to " + RESET + f"{local_model_dir}." 
+ RESET) + exit() + + + + + def _freeze_norm(self, m: nn.Module): + if isinstance(m, nn.BatchNorm2d): + m = FrozenBatchNorm2d(m.num_features) + else: + for name, child in m.named_children(): + _child = self._freeze_norm(child) + if _child is not child: + setattr(m, name, _child) + return m + + def _freeze_parameters(self, m: nn.Module): + for p in m.parameters(): + p.requires_grad = False + + def forward(self, x): + x = self.stem(x) + outs = [] + for idx, stage in enumerate(self.stages): + x = stage(x) + if idx in self.return_idx: + outs.append(x) + return outs diff --git a/engine/core/yaml_config.py b/engine/core/yaml_config.py index bdd27b41..5953cdcc 100644 --- a/engine/core/yaml_config.py +++ b/engine/core/yaml_config.py @@ -45,6 +45,7 @@ def postprocessor(self, ) -> torch.nn.Module: return super().postprocessor @property + # 损失函数 def criterion(self, ) -> torch.nn.Module: if self._criterion is None and 'criterion' in self.yaml_cfg: self._criterion = create(self.yaml_cfg['criterion'], self.global_cfg) diff --git a/engine/data/dataloader.py b/engine/data/dataloader.py index 20b6a0a1..9c6db24b 100644 --- a/engine/data/dataloader.py +++ b/engine/data/dataloader.py @@ -23,7 +23,7 @@ from copy import deepcopy from PIL import Image, ImageDraw import os - +import numpy as np __all__ = [ 'DataLoader', @@ -178,7 +178,8 @@ def apply_mixup(self, images, targets): return images, targets def __call__(self, items): - images = torch.cat([x[0][None] for x in items], dim=0) + # print(type(items[0][0])) + images = torch.cat([torch.from_numpy(np.array(x[0])[None]) for x in items], dim=0) targets = [x[1] for x in items] # Mixup diff --git a/engine/deim/__init__.py b/engine/deim/__init__.py index acd3dc3c..aca2fe94 100644 --- a/engine/deim/__init__.py +++ b/engine/deim/__init__.py @@ -15,4 +15,5 @@ from .rtdetrv2_decoder import RTDETRTransformerv2 from .postprocessor import PostProcessor -from .deim_criterion import DEIMCriterion \ No newline at end of file +from .deim_criterion import DEIMCriterion +from .hybrid_encoder_cgfm import HybridEncoder_CGFM \ No newline at end of file diff --git a/engine/deim/hybrid_encoder.py b/engine/deim/hybrid_encoder.py index 14b49159..17889467 100644 --- a/engine/deim/hybrid_encoder.py +++ b/engine/deim/hybrid_encoder.py @@ -16,7 +16,7 @@ from .utils import get_activation from ..core import register - +from engine.extre_module.custom_nn.upsample.eucb import EUCB __all__ = ['HybridEncoder'] @@ -189,7 +189,15 @@ def forward(self, x): x_1 = self.conv1(x) x_1 = self.bottlenecks(x_1) return self.conv3(x_1 + x_2) +''' +c1 = 模块的 输入 通道数。 + +c3 = cv1 投影后的通道数,并被拆分成两路各 c3/2。 + +c4 = 中间 cv2、cv3 的输入和输出通道数。 +c2 = 模块的 输出 通道数,由最后一层 cv4 给出。 +''' class RepNCSPELAN4(nn.Module): # csp-elan def __init__(self, c1, c2, c3, c4, n=3, @@ -200,9 +208,11 @@ def __init__(self, c1, c2, c3, c4, n=3, self.cv1 = ConvNormLayer_fuse(c1, c3, 1, 1, bias=bias, act=act) self.cv2 = nn.Sequential(CSPLayer(c3//2, c4, n, 1, bias=bias, act=act, bottletype=VGGBlock), ConvNormLayer_fuse(c4, c4, 3, 1, bias=bias, act=act)) self.cv3 = nn.Sequential(CSPLayer(c4, c4, n, 1, bias=bias, act=act, bottletype=VGGBlock), ConvNormLayer_fuse(c4, c4, 3, 1, bias=bias, act=act)) + # inc=c3+(2*c4) outc=c2 self.cv4 = ConvNormLayer_fuse(c3+(2*c4), c2, 1, 1, bias=bias, act=act) def forward_chunk(self, x): + # 对半 split y = list(self.cv1(x).chunk(2, 1)) y.extend((m(y[-1])) for m in [self.cv2, self.cv3]) return self.cv4(torch.cat(y, 1)) @@ -242,6 +252,7 @@ def with_pos_embed(tensor, pos_embed): return tensor if pos_embed is None else tensor + 
pos_embed def forward(self, src, src_mask=None, pos_embed=None) -> torch.Tensor: + # print('!!!!!!!!!!!!!!!!!!!!', src.size()) residual = src if self.normalize_before: src = self.norm1(src) @@ -255,6 +266,8 @@ def forward(self, src, src_mask=None, pos_embed=None) -> torch.Tensor: residual = src if self.normalize_before: src = self.norm2(src) + # print('!!!!!!!!!!!!!!!!!!!!', src.size()) + # MLP src = self.linear2(self.dropout(self.activation(self.linear1(src)))) src = residual + self.dropout2(src) if not self.normalize_before: @@ -313,6 +326,7 @@ def __init__(self, self.out_strides = feat_strides # channel projection + # 通道数全部投影成hidden_dim self.input_proj = nn.ModuleList() for in_channel in in_channels: proj = nn.Sequential(OrderedDict([ @@ -338,13 +352,16 @@ def __init__(self, # top-down fpn self.lateral_convs = nn.ModuleList() self.fpn_blocks = nn.ModuleList() + self.fpn_upsample_blocks = nn.ModuleList() for _ in range(len(in_channels) - 1, 0, -1): # TODO, add activation for those lateral convs if version == 'dfine': self.lateral_convs.append(ConvNormLayer_fuse(hidden_dim, hidden_dim, 1, 1)) else: self.lateral_convs.append(ConvNormLayer_fuse(hidden_dim, hidden_dim, 1, 1, act=act)) + self.fpn_upsample_blocks.append(EUCB(hidden_dim)) self.fpn_blocks.append( + # 对应RT-DETR的RepBlock RepNCSPELAN4(hidden_dim * 2, hidden_dim, hidden_dim * 2, round(expansion * hidden_dim // 2), round(3 * depth_mult), act=act) \ if version == 'dfine' else CSPLayer(hidden_dim * 2, hidden_dim, round(3 * depth_mult), act=act, expansion=expansion, bottletype=VGGBlock) ) @@ -353,6 +370,13 @@ def __init__(self, self.downsample_convs = nn.ModuleList() self.pan_blocks = nn.ModuleList() for _ in range(len(in_channels) - 1): + # 下采样卷积,将低层特征下采样以匹配高层特征尺寸 + # 通常会将来自不同层的特征图进行拼接或加和操作。为了使这些操作顺利进行, + # 参与融合的特征图必须在通道数上保持一致 + # self.downsample_convs.append( + # nn.Sequential(ContextGuidedBlock_Down(hidden_dim, 2*hidden_dim), ConvNormLayer_fuse(hidden_dim*2, hidden_dim, 1, 1)) \ + # if version == 'dfine' else ConvNormLayer_fuse(hidden_dim, hidden_dim, 3, 2, act=act) + # ) self.downsample_convs.append( nn.Sequential(SCDown(hidden_dim, hidden_dim, 3, 2, act=act)) \ if version == 'dfine' else ConvNormLayer_fuse(hidden_dim, hidden_dim, 3, 2, act=act) @@ -394,9 +418,10 @@ def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.): def forward(self, feats): assert len(feats) == len(self.in_channels) + # 此处通道全部是hidden_dim proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] - # encoder + # transformer encoder(仅针对p5层) if self.num_encoder_layers > 0: for i, enc_ind in enumerate(self.use_encoder_idx): h, w = proj_feats[enc_ind].shape[2:] @@ -411,22 +436,28 @@ def forward(self, feats): memory :torch.Tensor = self.encoder[i](src_flatten, pos_embed=pos_embed) proj_feats[enc_ind] = memory.permute(0, 2, 1).reshape(-1, self.hidden_dim, h, w).contiguous() - # broadcasting and fusion + # fpn融合:自顶向下融合高层特征到低层特征 inner_outs = [proj_feats[-1]] for idx in range(len(self.in_channels) - 1, 0, -1): feat_heigh = inner_outs[0] + # 通道数是 hidden_dim feat_low = proj_feats[idx - 1] feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_heigh) inner_outs[0] = feat_heigh - upsample_feat = F.interpolate(feat_heigh, scale_factor=2., mode='nearest') + # 修改上采样部分, 取对应的列表下标! 
+ # upsample_feat = F.interpolate(feat_heigh, scale_factor=2., mode='nearest') + upsample_feat = self.fpn_upsample_blocks[len(self.in_channels) - 1 - idx](feat_heigh) inner_out = self.fpn_blocks[len(self.in_channels)-1-idx](torch.concat([upsample_feat, feat_low], dim=1)) inner_outs.insert(0, inner_out) + # PAN 融合:自底向上融合低层特征到高层特征 outs = [inner_outs[0]] for idx in range(len(self.in_channels) - 1): feat_low = outs[-1] feat_height = inner_outs[idx + 1] downsample_feat = self.downsample_convs[idx](feat_low) + # 通道数是 hidden_dim*2 + # [batch, channel, height, width] dim=1 按照channel维度拼接! out = self.pan_blocks[idx](torch.concat([downsample_feat, feat_height], dim=1)) outs.append(out) diff --git a/engine/deim/hybrid_encoder_cgfm.py b/engine/deim/hybrid_encoder_cgfm.py new file mode 100644 index 00000000..f519ea1f --- /dev/null +++ b/engine/deim/hybrid_encoder_cgfm.py @@ -0,0 +1,88 @@ +import copy +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .utils import get_activation +from engine.extre_module.custom_nn.featurefusion.cgfm import ContextGuideFusionModule +from engine.deim.hybrid_encoder import HybridEncoder +from ..core import register +__all__ = ['HybridEncoder_CGFM'] + + +@register() +class HybridEncoder_CGFM(HybridEncoder): + __share__ = ['eval_spatial_size', ] + + def __init__(self, in_channels=..., feat_strides=..., hidden_dim=256, nhead=8, dim_feedforward=1024, dropout=0, enc_act='gelu', use_encoder_idx=..., num_encoder_layers=1, pe_temperature=10000, expansion=1, depth_mult=1, act='silu', eval_spatial_size=None, version='dfine'): + super().__init__(in_channels, feat_strides, hidden_dim, nhead, dim_feedforward, dropout, enc_act, use_encoder_idx, num_encoder_layers, pe_temperature, expansion, depth_mult, act, eval_spatial_size, version) + # fpn + self.fpn_feat_fusion_blocks = nn.ModuleList() + for _ in range(len(in_channels) - 1, 0, -1): + self.fpn_feat_fusion_blocks.append( + ContextGuideFusionModule([hidden_dim, hidden_dim], hidden_dim*2) + ) + + # pan + self.pan_feat_fusion_blocks = nn.ModuleList() + for _ in range(len(in_channels) - 1): + self.pan_feat_fusion_blocks.append( + ContextGuideFusionModule([hidden_dim, hidden_dim], hidden_dim*2) + ) + + def forward(self, feats): + assert len(feats) == len(self.in_channels) + # 此处通道全部是hidden_dim + proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + + # encoder + if self.num_encoder_layers > 0: + for i, enc_ind in enumerate(self.use_encoder_idx): + h, w = proj_feats[enc_ind].shape[2:] + # flatten [B, C, H, W] to [B, HxW, C] + src_flatten = proj_feats[enc_ind].flatten(2).permute(0, 2, 1) + if self.training or self.eval_spatial_size is None: + pos_embed = self.build_2d_sincos_position_embedding( + w, h, self.hidden_dim, self.pe_temperature).to(src_flatten.device) + else: + pos_embed = getattr(self, f'pos_embed{enc_ind}', None).to(src_flatten.device) + + memory :torch.Tensor = self.encoder[i](src_flatten, pos_embed=pos_embed) + proj_feats[enc_ind] = memory.permute(0, 2, 1).reshape(-1, self.hidden_dim, h, w).contiguous() + + # broadcasting and fusion + inner_outs = [proj_feats[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + # 通道数是 hidden_dim + feat_low = proj_feats[idx - 1] + feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_heigh) + inner_outs[0] = feat_heigh + upsample_feat = F.interpolate(feat_heigh, scale_factor=2., mode='nearest') + # 把通道拼接换成fusion_blocks + # inner_out = 
self.fpn_blocks[len(self.in_channels)-1-idx](torch.concat([upsample_feat, feat_low], dim=1)) + inner_out = self.fpn_blocks[len(self.in_channels)-1-idx]( + # hidden_dim 参数在构造的时候已经确定了, 故此处无需再次传入 + self.fpn_feat_fusion_blocks[len(self.in_channels)-1-idx]([upsample_feat, feat_low]) + ) + inner_outs.insert(0, inner_out) + + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = self.downsample_convs[idx](feat_low) + # 通道数是 hidden_dim*2 + # [batch, channel, height, width] dim=1 按照channel维度拼接! + # 把通道拼接换成fusion_blocks + # out = self.pan_blocks[idx](torch.concat([downsample_feat, feat_height], dim=1)) + out = self.pan_blocks[idx]( + self.pan_feat_fusion_blocks[idx]([downsample_feat, feat_height]) + ) + outs.append(out) + + return outs + + \ No newline at end of file diff --git a/engine/extre_module/__init__.py b/engine/extre_module/__init__.py new file mode 100644 index 00000000..aaef35d0 --- /dev/null +++ b/engine/extre_module/__init__.py @@ -0,0 +1 @@ +# from .custom_nn.neck.FDPN import FDPN diff --git a/engine/extre_module/custom_nn/attention/DeformableLKA.py b/engine/extre_module/custom_nn/attention/DeformableLKA.py new file mode 100644 index 00000000..75840990 --- /dev/null +++ b/engine/extre_module/custom_nn/attention/DeformableLKA.py @@ -0,0 +1,72 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/WACV2024-DeformableLKA.png +论文链接:https://arxiv.org/abs/2309.00121 +''' + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch, torchvision +import torch.nn as nn + +class DeformConv(nn.Module): + + def __init__(self, in_channels, groups, kernel_size=(3,3), padding=1, stride=1, dilation=1, bias=True): + super(DeformConv, self).__init__() + + self.offset_net = nn.Conv2d(in_channels=in_channels, + out_channels=2 * kernel_size[0] * kernel_size[1], + kernel_size=kernel_size, + padding=padding, + stride=stride, + dilation=dilation, + bias=True) + + self.deform_conv = torchvision.ops.DeformConv2d(in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + padding=padding, + groups=groups, + stride=stride, + dilation=dilation, + bias=False) + + def forward(self, x): + offsets = self.offset_net(x) + out = self.deform_conv(x, offsets) + return out + +class DeformableLKA(nn.Module): + def __init__(self, dim): + super().__init__() + self.conv0 = DeformConv(dim, kernel_size=(5, 5), padding=2, groups=dim) + self.conv_spatial = DeformConv(dim, kernel_size=(7, 7), stride=1, padding=9, groups=dim, dilation=3) + self.conv1 = nn.Conv2d(dim, dim, 1) + + def forward(self, x): + u = x.clone() + attn = self.conv0(x) + attn = self.conv_spatial(attn) + attn = self.conv1(attn) + return u * attn + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel, height, width = 1, 16, 32, 32 + inputs = torch.randn((batch_size, channel, height, width)).to(device) + + module = DeformableLKA(channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git 
a/engine/extre_module/custom_nn/attention/SEAM.py b/engine/extre_module/custom_nn/attention/SEAM.py new file mode 100644 index 00000000..9dfc72a6 --- /dev/null +++ b/engine/extre_module/custom_nn/attention/SEAM.py @@ -0,0 +1,90 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/SEAM.png +论文链接:https://arxiv.org/pdf/2208.02019v2 +''' + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch, math +import torch.nn as nn +import torch.nn.functional as F + +class Residual(nn.Module): + def __init__(self, fn): + super(Residual, self).__init__() + self.fn = fn + + def forward(self, x): + return self.fn(x) + x + +class SEAM(nn.Module): + def __init__(self, c1, n=1, reduction=16): + super(SEAM, self).__init__() + self.DCovN = nn.Sequential( + *[nn.Sequential( + Residual(nn.Sequential( + nn.Conv2d(in_channels=c1, out_channels=c1, kernel_size=3, stride=1, padding=1, groups=c1), + nn.GELU(), + nn.BatchNorm2d(c1) + )), + nn.Conv2d(in_channels=c1, out_channels=c1, kernel_size=1, stride=1, padding=0, groups=1), + nn.GELU(), + nn.BatchNorm2d(c1) + ) for i in range(n)] + ) + self.avg_pool = torch.nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(c1, c1 // reduction, bias=False), + nn.ReLU(inplace=True), + nn.Linear(c1 // reduction, c1, bias=False), + nn.Sigmoid() + ) + + self._initialize_weights() + # self.initialize_layer(self.avg_pool) + self.initialize_layer(self.fc) + + + def forward(self, x): + b, c, _, _ = x.size() + y = self.DCovN(x) + y = self.avg_pool(y).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + y = torch.exp(y) + return x * y.expand_as(x) + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.xavier_uniform_(m.weight, gain=1) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def initialize_layer(self, layer): + if isinstance(layer, (nn.Conv2d, nn.Linear)): + torch.nn.init.normal_(layer.weight, mean=0., std=0.001) + if layer.bias is not None: + torch.nn.init.constant_(layer.bias, 0) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel, height, width = 1, 16, 32, 32 + inputs = torch.randn((batch_size, channel, height, width)).to(device) + + module = SEAM(channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/attention/ca.py b/engine/extre_module/custom_nn/attention/ca.py new file mode 100644 index 00000000..84542cec --- /dev/null +++ b/engine/extre_module/custom_nn/attention/ca.py @@ -0,0 +1,84 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/CVPR2021-Coordinate Attention.png +论文链接:https://arxiv.org/pdf/2103.02907 +''' + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn + +class h_sigmoid(nn.Module): + def __init__(self, inplace=True): + super(h_sigmoid, self).__init__() + self.relu = nn.ReLU6(inplace=inplace) + + def forward(self, x): + return self.relu(x + 3) / 6 + +class 
h_swish(nn.Module): + def __init__(self, inplace=True): + super(h_swish, self).__init__() + self.sigmoid = h_sigmoid(inplace=inplace) + + def forward(self, x): + return x * self.sigmoid(x) + +class CoordAtt(nn.Module): + def __init__(self, inp, reduction=32): + super(CoordAtt, self).__init__() + self.pool_h = nn.AdaptiveAvgPool2d((None, 1)) + self.pool_w = nn.AdaptiveAvgPool2d((1, None)) + + mip = max(8, inp // reduction) + + self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0) + self.bn1 = nn.BatchNorm2d(mip) + self.act = h_swish() + + self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) + self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0) + + def forward(self, x): + identity = x + + n, c, h, w = x.size() + x_h = self.pool_h(x) + x_w = self.pool_w(x).permute(0, 1, 3, 2) + + y = torch.cat([x_h, x_w], dim=2) + y = self.conv1(y) + y = self.bn1(y) + y = self.act(y) + + x_h, x_w = torch.split(y, [h, w], dim=2) + x_w = x_w.permute(0, 1, 3, 2) + + a_h = self.conv_h(x_h).sigmoid() + a_w = self.conv_w(x_w).sigmoid() + + out = identity * a_w * a_h + + return out + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel, height, width = 1, 16, 32, 32 + inputs = torch.randn((batch_size, channel, height, width)).to(device) + + module = CoordAtt(channel, reduction=32).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/attention/ema.py b/engine/extre_module/custom_nn/attention/ema.py new file mode 100644 index 00000000..31af65e1 --- /dev/null +++ b/engine/extre_module/custom_nn/attention/ema.py @@ -0,0 +1,60 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/ICASSP2023-EMA.png +论文链接:https://arxiv.org/pdf/2305.13563v2 +''' + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn + +class EMA(nn.Module): + def __init__(self, channels, factor=8): + super(EMA, self).__init__() + self.groups = factor + assert channels // self.groups > 0 + self.softmax = nn.Softmax(-1) + self.agp = nn.AdaptiveAvgPool2d((1, 1)) + self.pool_h = nn.AdaptiveAvgPool2d((None, 1)) + self.pool_w = nn.AdaptiveAvgPool2d((1, None)) + self.gn = nn.GroupNorm(channels // self.groups, channels // self.groups) + self.conv1x1 = nn.Conv2d(channels // self.groups, channels // self.groups, kernel_size=1, stride=1, padding=0) + self.conv3x3 = nn.Conv2d(channels // self.groups, channels // self.groups, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + b, c, h, w = x.size() + group_x = x.reshape(b * self.groups, -1, h, w) # b*g,c//g,h,w + x_h = self.pool_h(group_x) + x_w = self.pool_w(group_x).permute(0, 1, 3, 2) + hw = self.conv1x1(torch.cat([x_h, x_w], dim=2)) + x_h, x_w = torch.split(hw, [h, w], dim=2) + x1 = self.gn(group_x * x_h.sigmoid() * x_w.permute(0, 1, 3, 2).sigmoid()) + x2 = self.conv3x3(group_x) + x11 = self.softmax(self.agp(x1).reshape(b * self.groups, -1, 1).permute(0, 2, 1)) + x12 = x2.reshape(b * self.groups, c // self.groups, -1) # b*g, 
c//g, hw + x21 = self.softmax(self.agp(x2).reshape(b * self.groups, -1, 1).permute(0, 2, 1)) + x22 = x1.reshape(b * self.groups, c // self.groups, -1) # b*g, c//g, hw + weights = (torch.matmul(x11, x12) + torch.matmul(x21, x22)).reshape(b * self.groups, 1, h, w) + return (group_x * weights.sigmoid()).reshape(b, c, h, w) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel, height, width = 1, 16, 32, 32 + inputs = torch.randn((batch_size, channel, height, width)).to(device) + + module = EMA(channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/attention/lsk.py b/engine/extre_module/custom_nn/attention/lsk.py new file mode 100644 index 00000000..155e5c8d --- /dev/null +++ b/engine/extre_module/custom_nn/attention/lsk.py @@ -0,0 +1,75 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/ICCV2023-LSKBlock.png +论文链接:https://arxiv.org/pdf/2403.11735 +''' + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn + +class LSKBlock_SA(nn.Module): + def __init__(self, dim): + super().__init__() + self.conv0 = nn.Conv2d(dim, dim, 5, padding=2, groups=dim) + self.conv_spatial = nn.Conv2d(dim, dim, 7, stride=1, padding=9, groups=dim, dilation=3) + self.conv1 = nn.Conv2d(dim, dim//2, 1) + self.conv2 = nn.Conv2d(dim, dim//2, 1) + self.conv_squeeze = nn.Conv2d(2, 2, 7, padding=3) + self.conv = nn.Conv2d(dim//2, dim, 1) + + def forward(self, x): + attn1 = self.conv0(x) + attn2 = self.conv_spatial(attn1) + + attn1 = self.conv1(attn1) + attn2 = self.conv2(attn2) + + attn = torch.cat([attn1, attn2], dim=1) + avg_attn = torch.mean(attn, dim=1, keepdim=True) + max_attn, _ = torch.max(attn, dim=1, keepdim=True) + agg = torch.cat([avg_attn, max_attn], dim=1) + sig = self.conv_squeeze(agg).sigmoid() + attn = attn1 * sig[:,0,:,:].unsqueeze(1) + attn2 * sig[:,1,:,:].unsqueeze(1) + attn = self.conv(attn) + return x * attn + +class LSKBlock(nn.Module): + def __init__(self, d_model): + super().__init__() + + self.proj_1 = nn.Conv2d(d_model, d_model, 1) + self.activation = nn.GELU() + self.spatial_gating_unit = LSKBlock_SA(d_model) + self.proj_2 = nn.Conv2d(d_model, d_model, 1) + + def forward(self, x): + shorcut = x.clone() + x = self.proj_1(x) + x = self.activation(x) + x = self.spatial_gating_unit(x) + x = self.proj_2(x) + x = x + shorcut + return x + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel, height, width = 1, 16, 32, 32 + inputs = torch.randn((batch_size, channel, height, width)).to(device) + + module = LSKBlock(channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, channel, height, width), + output_as_string=True, + 
output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/attention/mlca.py b/engine/extre_module/custom_nn/attention/mlca.py new file mode 100644 index 00000000..8a1ad30e --- /dev/null +++ b/engine/extre_module/custom_nn/attention/mlca.py @@ -0,0 +1,78 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/Mixed Local Channel Attention.png +论文链接:https://www.sciencedirect.com/science/article/abs/pii/S0952197623006267 +''' + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch, math +import torch.nn as nn +import torch.nn.functional as F + +class MLCA(nn.Module): + def __init__(self, in_size, local_size=5, gamma = 2, b = 1,local_weight=0.5): + super(MLCA, self).__init__() + + # ECA 计算方法 + self.local_size=local_size + self.gamma = gamma + self.b = b + t = int(abs(math.log(in_size, 2) + self.b) / self.gamma) # eca gamma=2 + k = t if t % 2 else t + 1 + + self.conv = nn.Conv1d(1, 1, kernel_size=k, padding=(k - 1) // 2, bias=False) + self.conv_local = nn.Conv1d(1, 1, kernel_size=k, padding=(k - 1) // 2, bias=False) + + self.local_weight=local_weight + + self.local_arv_pool = nn.AdaptiveAvgPool2d(local_size) + self.global_arv_pool=nn.AdaptiveAvgPool2d(1) + + def forward(self, x): + local_arv=self.local_arv_pool(x) + global_arv=self.global_arv_pool(local_arv) + + b,c,m,n = x.shape + b_local, c_local, m_local, n_local = local_arv.shape + + # (b,c,local_size,local_size) -> (b,c,local_size*local_size)-> (b,local_size*local_size,c)-> (b,1,local_size*local_size*c) + temp_local= local_arv.view(b, c_local, -1).transpose(-1, -2).reshape(b, 1, -1) + temp_global = global_arv.view(b, c, -1).transpose(-1, -2) + + y_local = self.conv_local(temp_local) + y_global = self.conv(temp_global) + + + # (b,c,local_size,local_size) <- (b,c,local_size*local_size)<-(b,local_size*local_size,c) <- (b,1,local_size*local_size*c) + y_local_transpose=y_local.reshape(b, self.local_size * self.local_size,c).transpose(-1,-2).view(b,c, self.local_size , self.local_size) + y_global_transpose = y_global.view(b, -1).transpose(-1, -2).unsqueeze(-1) + + # 反池化 + att_local = y_local_transpose.sigmoid() + att_global = F.adaptive_avg_pool2d(y_global_transpose.sigmoid(),[self.local_size, self.local_size]) + att_all = F.adaptive_avg_pool2d(att_global*(1-self.local_weight)+(att_local*self.local_weight), [m, n]) + + x=x * att_all + return x + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel, height, width = 1, 16, 32, 32 + inputs = torch.randn((batch_size, channel, height, width)).to(device) + + module = MLCA(channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/attention/simam.py b/engine/extre_module/custom_nn/attention/simam.py new file mode 100644 index 00000000..bcbaed6e --- /dev/null +++ b/engine/extre_module/custom_nn/attention/simam.py @@ -0,0 +1,58 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/ICML2021-SimAM.png 
+论文链接:https://proceedings.mlr.press/v139/yang21o/yang21o.pdf +''' + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn + +class SimAM(torch.nn.Module): + def __init__(self, e_lambda=1e-4): + super(SimAM, self).__init__() + + self.activaton = nn.Sigmoid() + self.e_lambda = e_lambda + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += ('lambda=%f)' % self.e_lambda) + return s + + @staticmethod + def get_module_name(): + return "simam" + + def forward(self, x): + b, c, h, w = x.size() + + n = w * h - 1 + + x_minus_mu_square = (x - x.mean(dim=[2, 3], keepdim=True)).pow(2) + y = x_minus_mu_square / (4 * (x_minus_mu_square.sum(dim=[2, 3], keepdim=True) / n + self.e_lambda)) + 0.5 + + return x * self.activaton(y) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel, height, width = 1, 16, 32, 32 + inputs = torch.randn((batch_size, channel, height, width)).to(device) + + module = SimAM(channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + # 这个模块完全没参数,测不了 + # print(ORANGE) + # flops, macs, _ = calculate_flops(model=module, + # input_shape=(batch_size, channel, height, width), + # output_as_string=True, + # output_precision=4, + # print_detailed=True) + # print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/block/RepHMS.py b/engine/extre_module/custom_nn/block/RepHMS.py new file mode 100644 index 00000000..5b09e0da --- /dev/null +++ b/engine/extre_module/custom_nn/block/RepHMS.py @@ -0,0 +1,329 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/RepHMS.png +论文链接:https://arxiv.org/abs/2502.04656 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +from engine.extre_module.ultralytics_nn.conv import Conv +from engine.extre_module.torch_utils import model_fuse_test + +class AVG(nn.Module): + def __init__(self, down_n=2): + super().__init__() + self.avg_pool = nn.functional.adaptive_avg_pool2d + self.down_n = down_n + # self.output_size = np.array([H, W]) + + def forward(self, x): + B, C, H, W = x.shape + H = int(H / self.down_n) + W = int(W / self.down_n) + output_size = np.array([H, W]) + x = self.avg_pool(x, output_size) + return x + +class UniRepLKNetBlock(nn.Module): + + def __init__(self, + dim, + kernel_size, + deploy=False, + attempt_use_lk_impl=True): + super().__init__() + if deploy: + print('------------------------------- Note: deploy mode') + if kernel_size == 0: + self.dwconv = nn.Identity() + elif kernel_size >= 3: + self.dwconv = DilatedReparamBlock(dim, kernel_size, deploy=deploy, + attempt_use_lk_impl=attempt_use_lk_impl) + else: + assert kernel_size in [3] + self.dwconv = get_conv2d_uni(dim, dim, kernel_size=kernel_size, stride=1, padding=kernel_size // 2, + dilation=1, groups=dim, bias=deploy, + attempt_use_lk_impl=attempt_use_lk_impl) + + if deploy or kernel_size == 0: + self.norm = nn.Identity() + else: + self.norm = get_bn(dim) + + + def forward(self, inputs): + + out = self.norm(self.dwconv(inputs)) + return out + + def 
convert_to_deploy(self): + if hasattr(self.dwconv, 'merge_dilated_branches'): + self.dwconv.merge_dilated_branches() + if hasattr(self.norm, 'running_var'): + std = (self.norm.running_var + self.norm.eps).sqrt() + if hasattr(self.dwconv, 'lk_origin'): + self.dwconv.lk_origin.weight.data *= (self.norm.weight / std).view(-1, 1, 1, 1) + self.dwconv.lk_origin.bias.data = self.norm.bias + ( + self.dwconv.lk_origin.bias - self.norm.running_mean) * self.norm.weight / std + else: + conv = nn.Conv2d(self.dwconv.in_channels, self.dwconv.out_channels, self.dwconv.kernel_size, + self.dwconv.padding, self.dwconv.groups, bias=True) + conv.weight.data = self.dwconv.weight * (self.norm.weight / std).view(-1, 1, 1, 1) + conv.bias.data = self.norm.bias - self.norm.running_mean * self.norm.weight / std + self.dwconv = conv + self.norm = nn.Identity() + +class DilatedReparamBlock(nn.Module): + """ + Dilated Reparam Block proposed in UniRepLKNet (https://github.com/AILab-CVC/UniRepLKNet) + We assume the inputs to this block are (N, C, H, W) + """ + def __init__(self, channels, kernel_size, deploy, use_sync_bn=False, attempt_use_lk_impl=True): + super().__init__() + self.lk_origin = get_conv2d_uni(channels, channels, kernel_size, stride=1, + padding=kernel_size//2, dilation=1, groups=channels, bias=deploy, + ) + self.attempt_use_lk_impl = attempt_use_lk_impl + + if kernel_size == 17: + self.kernel_sizes = [5, 9, 3, 3, 3] + self.dilates = [1, 2, 4, 5, 7] + elif kernel_size == 15: + self.kernel_sizes = [5, 7, 3, 3, 3] + self.dilates = [1, 2, 3, 5, 7] + elif kernel_size == 13: + self.kernel_sizes = [5, 7, 3, 3, 3] + self.dilates = [1, 2, 3, 4, 5] + elif kernel_size == 11: + self.kernel_sizes = [5, 5, 3, 3, 3] + self.dilates = [1, 2, 3, 4, 5] + elif kernel_size == 9: + self.kernel_sizes = [7, 5, 3] + self.dilates = [1, 1, 1] + elif kernel_size == 7: + self.kernel_sizes = [5, 3] + self.dilates = [1, 1] + elif kernel_size == 5: + self.kernel_sizes = [3, 1] + self.dilates = [1, 1] + elif kernel_size == 3: + self.kernel_sizes = [3, 1] + self.dilates = [1, 1] + + + else: + raise ValueError('Dilated Reparam Block requires kernel_size >= 5') + + if not deploy: + self.origin_bn = get_bn(channels) + for k, r in zip(self.kernel_sizes, self.dilates): + self.__setattr__('dil_conv_k{}_{}'.format(k, r), + nn.Conv2d(in_channels=channels, out_channels=channels, kernel_size=k, stride=1, + padding=(r * (k - 1) + 1) // 2, dilation=r, groups=channels, + bias=False)) + self.__setattr__('dil_bn_k{}_{}'.format(k, r), get_bn(channels)) + + def forward(self, x): + if not hasattr(self, 'origin_bn'): # deploy mode + return self.lk_origin(x) + out = self.origin_bn(self.lk_origin(x)) + for k, r in zip(self.kernel_sizes, self.dilates): + conv = self.__getattr__('dil_conv_k{}_{}'.format(k, r)) + bn = self.__getattr__('dil_bn_k{}_{}'.format(k, r)) + out = out + bn(conv(x)) + return out + + def merge_dilated_branches(self): + if hasattr(self, 'origin_bn'): + origin_k, origin_b = fuse_bn(self.lk_origin, self.origin_bn) + for k, r in zip(self.kernel_sizes, self.dilates): + conv = self.__getattr__('dil_conv_k{}_{}'.format(k, r)) + bn = self.__getattr__('dil_bn_k{}_{}'.format(k, r)) + branch_k, branch_b = fuse_bn(conv, bn) + origin_k = merge_dilated_into_large_kernel(origin_k, branch_k, r) + origin_b += branch_b + merged_conv = get_conv2d_uni(origin_k.size(0), origin_k.size(0), origin_k.size(2), stride=1, + padding=origin_k.size(2)//2, dilation=1, groups=origin_k.size(0), bias=True, + attempt_use_lk_impl=self.attempt_use_lk_impl) + 
merged_conv.weight.data = origin_k + merged_conv.bias.data = origin_b + self.lk_origin = merged_conv + self.__delattr__('origin_bn') + for k, r in zip(self.kernel_sizes, self.dilates): + self.__delattr__('dil_conv_k{}_{}'.format(k, r)) + self.__delattr__('dil_bn_k{}_{}'.format(k, r)) + +from itertools import repeat +import collections.abc +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable) and not isinstance(x, str): + return tuple(x) + return tuple(repeat(x, n)) + return parse + + +to_1tuple = _ntuple(1) +to_2tuple = _ntuple(2) +to_3tuple = _ntuple(3) +to_4tuple = _ntuple(4) +to_ntuple = _ntuple +def fuse_bn(conv, bn): + kernel = conv.weight + running_mean = bn.running_mean + running_var = bn.running_var + gamma = bn.weight + beta = bn.bias + eps = bn.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std +def get_conv2d_uni(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, + attempt_use_lk_impl=True): + kernel_size = to_2tuple(kernel_size) + if padding is None: + padding = (kernel_size[0] // 2, kernel_size[1] // 2) + else: + padding = to_2tuple(padding) + need_large_impl = kernel_size[0] == kernel_size[1] and kernel_size[0] > 5 and padding == (kernel_size[0] // 2, kernel_size[1] // 2) + + return nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, + padding=padding, dilation=dilation, groups=groups, bias=bias) +def convert_dilated_to_nondilated(kernel, dilate_rate): + identity_kernel = torch.ones((1, 1, 1, 1), dtype=kernel.dtype, device =kernel.device ) + if kernel.size(1) == 1: + # This is a DW kernel + dilated = F.conv_transpose2d(kernel, identity_kernel, stride=dilate_rate) + return dilated + else: + # This is a dense or group-wise (but not DW) kernel + slices = [] + for i in range(kernel.size(1)): + dilated = F.conv_transpose2d(kernel[:,i:i+1,:,:], identity_kernel, stride=dilate_rate) + slices.append(dilated) + return torch.cat(slices, dim=1) + +def merge_dilated_into_large_kernel(large_kernel, dilated_kernel, dilated_r): + large_k = large_kernel.size(2) + dilated_k = dilated_kernel.size(2) + equivalent_kernel_size = dilated_r * (dilated_k - 1) + 1 + equivalent_kernel = convert_dilated_to_nondilated(dilated_kernel, dilated_r) + rows_to_pad = large_k // 2 - equivalent_kernel_size // 2 + merged_kernel = large_kernel + F.pad(equivalent_kernel, [rows_to_pad] * 4) + return merged_kernel +def get_bn(channels): + return nn.BatchNorm2d(channels) + +class DepthBottleneckUniv2(nn.Module): + def __init__(self, + in_channels, + out_channels, + shortcut=True, + kersize=5, + expansion_depth=1, + small_kersize=3, + use_depthwise=True): + super(DepthBottleneckUniv2, self).__init__() + + mid_channel = int(in_channels * expansion_depth) + mid_channel2 = mid_channel + self.conv1 = Conv(in_channels, mid_channel, 1) + self.shortcut = shortcut + if use_depthwise: + self.conv2 = UniRepLKNetBlock(mid_channel, kernel_size=kersize) + self.act = nn.SiLU() + self.one_conv = Conv(mid_channel, mid_channel2, 1) + + self.conv3 = UniRepLKNetBlock(mid_channel2, kernel_size=kersize) + self.act1 = nn.SiLU() + self.one_conv2 = Conv(mid_channel2, out_channels, 1) + else: + self.conv2 = Conv(out_channels, out_channels, 3) + + def forward(self, x): + y = self.conv1(x) + y = self.act(self.conv2(y)) + y = self.one_conv(y) + y = self.act1(self.conv3(y)) + y = self.one_conv2(y) + return y + +class RepHMS(nn.Module): + def __init__(self, 
in_channels, out_channels, width=3, depth=1, depth_expansion=2, kersize=5, shortcut=True, + expansion=0.5, + small_kersize=3, use_depthwise=True): + super(RepHMS, self).__init__() + self.width = width + self.depth = depth + c1 = int(out_channels * expansion) * width + c_ = int(out_channels * expansion) + self.c_ = c_ + self.conv1 = Conv(in_channels, c1, 1) + self.RepElanMSBlock = nn.ModuleList() + for _ in range(width - 1): + DepthBlock = nn.ModuleList([ + DepthBottleneckUniv2(self.c_, self.c_, shortcut, kersize, depth_expansion, small_kersize, use_depthwise) + for _ in range(depth) + ]) + self.RepElanMSBlock.append(DepthBlock) + + self.conv2 = Conv(c_ * 1 + c_ * (width - 1) * depth, out_channels, 1) + + def forward(self, x): + x = self.conv1(x) + x_out = [x[:, i * self.c_:(i + 1) * self.c_] for i in range(self.width)] + x_out[1] = x_out[1] + x_out[0] + cascade = [] + elan = [x_out[0]] + for i in range(self.width - 1): + for j in range(self.depth): + if i > 0: + x_out[i + 1] = x_out[i + 1] + cascade[j] + if j == self.depth - 1: + #cascade = [cascade[-1]] + if self.depth > 1: + cascade =[cascade[-1]] + else: + cascade = [] + x_out[i + 1] = self.RepElanMSBlock[i][j](x_out[i + 1]) + elan.append(x_out[i + 1]) + if i < self.width - 2: + cascade.append(x_out[i + 1]) + + y_out = torch.cat(elan, 1) + y_out = self.conv2(y_out) + return y_out + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = RepHMS(in_channel, out_channel, width=3, depth=1).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(GREEN + 'test reparameterization.' + RESET) + module = model_fuse_test(module) + outputs = module(inputs) + print(GREEN + 'test reparameterization done.' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/block/rgcspelan.py b/engine/extre_module/custom_nn/block/rgcspelan.py new file mode 100644 index 00000000..ed12df66 --- /dev/null +++ b/engine/extre_module/custom_nn/block/rgcspelan.py @@ -0,0 +1,122 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/自研模块-RGCSPELAN.png +自研模块:RepGhostCSPELAN +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +from engine.extre_module.ultralytics_nn.conv import Conv, RepConv, autopad +from engine.extre_module.torch_utils import model_fuse_test + +# RepGhostCSPELAN模块学术描述 +# 1. 
RepGhostCSPELAN模块的应用场景与解决的问题 +# RepGhostCSPELAN模块是一种高效的多阶段特征提取与聚合架构,专为深度学习中的复杂视觉任务设计。该模块特别适用于需要高精度特征表示的场景,例如目标检测、图像分割和场景理解等计算机视觉任务。针对传统卷积神经网络在特征提取过程中计算复杂度和冗余性较高的问题,RepGhostCSPELAN通过其独特的多路径特征处理机制,优化了特征提取效率,显著降低了计算开销,同时保持甚至提升了模型的表达能力。 +# 具体而言,RepGhostCSPELAN能够有效解决以下问题: + +# 特征冗余与计算效率的平衡:通过结合多种卷积操作(如1x1和3x3卷积)与通道分割策略,该模块在保留丰富特征信息的同时,减少了参数量和计算量。 +# 多尺度特征融合的不足:模块通过多阶段特征处理和聚合,增强了模型对不同尺度目标的感知能力,特别适合处理具有复杂背景或多尺度目标的视觉任务。 +# 模型轻量化需求:在边缘设备或实时应用场景中,RepGhostCSPELAN能够以较低的计算成本实现高性能特征提取,满足轻量化模型设计的需求。 + +# 2. RepGhostCSPELAN模块的创新点与优点 +# RepGhostCSPELAN模块在设计上融入了多项创新性理念,展现出显著的学术价值和工程优势。其创新点和优点主要包括以下几个方面: +# 创新点 + +# 动态通道分割与多路径特征处理RepGhostCSPELAN通过初始1x1卷积实现输入特征的动态通道分割,并结合多路径的3x3卷积处理,构建了灵活的特征提取流程。这种设计不仅增强了特征的多样性,还通过通道缩放机制有效控制了计算复杂度。 + +# RepConv与Ghost思想的融合模块创新性地将RepConv(可重参数化卷积)与Ghost模块的思想相结合,利用RepConv的结构化稀疏性和Ghost模块的低成本特征生成能力,实现了高效的特征表达。这种融合在推理阶段能够进一步优化模型结构,降低延迟。 + +# 多阶段特征聚合的层次化设计RepGhostCSPELAN通过多阶段卷积操作和最终的特征拼接,构建了层次化的特征聚合机制。这种设计能够捕捉从低层次纹理到高层次语义的丰富信息,提升了模型对复杂场景的理解能力。 + +# 优点 + +# 高效性与性能的协同优化相较于传统特征提取模块(如CSPNet或SPP),RepGhostCSPELAN在保持高精度的同时,显著降低了计算量和参数量,使其在资源受限场景中具有明显优势。 + +# 模块化与通用性该模块采用模块化设计,易于集成到现有的深度学习框架(如YOLO系列或其他CNN架构)中。其灵活的参数配置(例如通道缩放因子和中间层数)使其能够适配多种任务需求。 + +# 鲁棒性与适应性通过多路径和多阶段的特征处理,RepGhostCSPELAN展现出对噪声、尺度变化和复杂背景的强大鲁棒性,能够在多样化的视觉任务中稳定表现。 + +# 综上所述,RepGhostCSPELAN模块以其创新的多路径特征提取、动态通道处理以及高效的计算优化策略,为计算机视觉任务提供了一种兼具高性能和低复杂度的解决方案。其独特的设计理念不仅推动了轻量化神经网络的发展,也为学术界和工业界的模型优化提供了新的思路。 + + +class RGCSPELAN(nn.Module): + """ + RGCSPELAN: 该模块用于多阶段特征提取和聚合,结合多种卷积操作。 + + 参数: + c1 (int): 输入通道数。 + c2 (int): 输出通道数。 + n (int): 额外的中间卷积层数(默认值为1)。 + scale (float): 中间通道的缩放系数。 + e (float): 隐藏通道的扩展因子。 + """ + def __init__(self, c1, c2, n=1, scale=0.5, e=0.5): + super(RGCSPELAN, self).__init__() + + # 计算中间通道数量 + self.c = int(c2 * e) # 隐藏通道数 + self.mid = int(self.c * scale) # 经过缩放后的中间通道数 + + # 1x1卷积用于将输入特征拆分为两个部分(后续用于chunk或split) + self.cv1 = Conv(c1, 2 * self.c, 1, 1) + + # 最终的1x1卷积层,用于整合所有处理后的特征 + self.cv2 = Conv(self.c + self.mid * (n + 1), c2, 1) + + # 3x3卷积,处理输入特征的第二部分 + self.cv3 = RepConv(self.c, self.mid, 3) + + # 一系列额外的3x3卷积层,用于进一步特征提取 + self.m = nn.ModuleList(Conv(self.mid, self.mid, 3) for _ in range(n - 1)) + + # 1x1卷积,用于进一步处理最后阶段的特征 + self.cv4 = Conv(self.mid, self.mid, 1) + + def forward(self, x): + """前向传播,使用chunk()方法分割特征图。""" + + # 步骤1: 使用1x1卷积将输入特征拆分成两部分 + y = list(self.cv1(x).chunk(2, 1)) + + # 步骤2: 对拆分的第二部分应用3x3卷积 + y[-1] = self.cv3(y[-1]) + + # 步骤3: 依次通过多个3x3卷积进行特征提取 + y.extend(m(y[-1]) for m in self.m) + + # 步骤4: 使用1x1卷积进一步提取特征 + y.append(self.cv4(y[-1])) + + # 步骤5: 将所有处理后的特征图拼接,并通过最终1x1卷积得到输出 + return self.cv2(torch.cat(y, 1)) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = RGCSPELAN(in_channel, out_channel, n=2, scale=0.5, e=0.5).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(GREEN + 'test reparameterization.' + RESET) + module = model_fuse_test(module) + outputs = module(inputs) + print(GREEN + 'test reparameterization done.' 
+ RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/conv_module/DilatedReparamConv.py b/engine/extre_module/custom_nn/conv_module/DilatedReparamConv.py new file mode 100644 index 00000000..a756b3e8 --- /dev/null +++ b/engine/extre_module/custom_nn/conv_module/DilatedReparamConv.py @@ -0,0 +1,189 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/CVPR2024-DilatedReparamConv.png +论文链接:https://arxiv.org/abs/2311.15599 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm.layers import to_2tuple + +from engine.extre_module.ultralytics_nn.conv import Conv +from engine.extre_module.torch_utils import model_fuse_test + +#================== This function decides which conv implementation (the native or iGEMM) to use +# Note that iGEMM large-kernel conv impl will be used if +# - you attempt to do so (attempt_to_use_large_impl=True), and +# - it has been installed (follow https://github.com/AILab-CVC/UniRepLKNet), and +# - the conv layer is depth-wise, stride = 1, non-dilated, kernel_size > 5, and padding == kernel_size // 2 +def get_conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, + attempt_use_lk_impl=True): + kernel_size = to_2tuple(kernel_size) + if padding is None: + padding = (kernel_size[0] // 2, kernel_size[1] // 2) + else: + padding = to_2tuple(padding) + need_large_impl = kernel_size[0] == kernel_size[1] and kernel_size[0] > 5 and padding == (kernel_size[0] // 2, kernel_size[1] // 2) + + if attempt_use_lk_impl and need_large_impl: + # print('---------------- trying to import iGEMM implementation for large-kernel conv') + try: + from depthwise_conv2d_implicit_gemm import DepthWiseConv2dImplicitGEMM + # print('---------------- found iGEMM implementation ') + except: + DepthWiseConv2dImplicitGEMM = None + # print('---------------- found no iGEMM. use original conv. 
follow https://github.com/AILab-CVC/UniRepLKNet to install it.') + if DepthWiseConv2dImplicitGEMM is not None and need_large_impl and in_channels == out_channels \ + and out_channels == groups and stride == 1 and dilation == 1: + # print(f'===== iGEMM Efficient Conv Impl, channels {in_channels}, kernel size {kernel_size} =====') + return DepthWiseConv2dImplicitGEMM(in_channels, kernel_size, bias=bias) + return nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, + padding=padding, dilation=dilation, groups=groups, bias=bias) + +def get_bn(dim, use_sync_bn=False): + if use_sync_bn: + return nn.SyncBatchNorm(dim) + else: + return nn.BatchNorm2d(dim) + +def fuse_bn(conv, bn): + conv_bias = 0 if conv.bias is None else conv.bias + std = (bn.running_var + bn.eps).sqrt() + return conv.weight * (bn.weight / std).reshape(-1, 1, 1, 1), bn.bias + (conv_bias - bn.running_mean) * bn.weight / std + +def convert_dilated_to_nondilated(kernel, dilate_rate): + identity_kernel = torch.ones((1, 1, 1, 1)).to(kernel.device) + if kernel.size(1) == 1: + # This is a DW kernel + dilated = F.conv_transpose2d(kernel, identity_kernel, stride=dilate_rate) + return dilated + else: + # This is a dense or group-wise (but not DW) kernel + slices = [] + for i in range(kernel.size(1)): + dilated = F.conv_transpose2d(kernel[:,i:i+1,:,:], identity_kernel, stride=dilate_rate) + slices.append(dilated) + return torch.cat(slices, dim=1) + +def merge_dilated_into_large_kernel(large_kernel, dilated_kernel, dilated_r): + large_k = large_kernel.size(2) + dilated_k = dilated_kernel.size(2) + equivalent_kernel_size = dilated_r * (dilated_k - 1) + 1 + equivalent_kernel = convert_dilated_to_nondilated(dilated_kernel, dilated_r) + rows_to_pad = large_k // 2 - equivalent_kernel_size // 2 + merged_kernel = large_kernel + F.pad(equivalent_kernel, [rows_to_pad] * 4) + return merged_kernel + +class DilatedReparamConv(nn.Module): + """ + Dilated Reparam Block proposed in UniRepLKNet (https://github.com/AILab-CVC/UniRepLKNet) + We assume the inputs to this block are (N, C, H, W) + """ + def __init__(self, in_channels, out_channels, kernel_size, deploy=False, use_sync_bn=False, attempt_use_lk_impl=True): + super().__init__() + self.lk_origin = get_conv2d(out_channels, out_channels, kernel_size, stride=1, + padding=kernel_size//2, dilation=1, groups=out_channels, bias=deploy, + attempt_use_lk_impl=attempt_use_lk_impl) + self.attempt_use_lk_impl = attempt_use_lk_impl + + if in_channels != out_channels: + self.conv1x1 = Conv(in_channels, out_channels, k=1) # 用作调整通道数 + else: + self.conv1x1 = nn.Identity() + + # Default settings. We did not tune them carefully. Different settings may work better. 
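+        # Editor's note (illustrative sketch, not part of the original UniRepLKNet code): each parallel
+        # branch below is a depth-wise conv with kernel k and dilation r, whose equivalent non-dilated
+        # kernel size is r * (k - 1) + 1. Every (k, r) pair is chosen so this stays within the large
+        # kernel, e.g. for kernel_size == 17 the (k=9, r=2) branch covers 2 * (9 - 1) + 1 = 17 and the
+        # (k=3, r=7) branch covers 7 * (3 - 1) + 1 = 15 <= 17, which is what lets
+        # merge_dilated_into_large_kernel() zero-pad each branch into the single large kernel at deploy time.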
+ if kernel_size == 17: + self.kernel_sizes = [5, 9, 3, 3, 3] + self.dilates = [1, 2, 4, 5, 7] + elif kernel_size == 15: + self.kernel_sizes = [5, 7, 3, 3, 3] + self.dilates = [1, 2, 3, 5, 7] + elif kernel_size == 13: + self.kernel_sizes = [5, 7, 3, 3, 3] + self.dilates = [1, 2, 3, 4, 5] + elif kernel_size == 11: + self.kernel_sizes = [5, 5, 3, 3, 3] + self.dilates = [1, 2, 3, 4, 5] + elif kernel_size == 9: + self.kernel_sizes = [5, 5, 3, 3] + self.dilates = [1, 2, 3, 4] + elif kernel_size == 7: + self.kernel_sizes = [5, 3, 3] + self.dilates = [1, 2, 3] + elif kernel_size == 5: + self.kernel_sizes = [3, 3] + self.dilates = [1, 2] + else: + raise ValueError('Dilated Reparam Block requires kernel_size >= 5') + + if not deploy: + self.origin_bn = get_bn(out_channels, use_sync_bn) + for k, r in zip(self.kernel_sizes, self.dilates): + self.__setattr__('dil_conv_k{}_{}'.format(k, r), + nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=k, stride=1, + padding=(r * (k - 1) + 1) // 2, dilation=r, groups=out_channels, + bias=False)) + self.__setattr__('dil_bn_k{}_{}'.format(k, r), get_bn(out_channels, use_sync_bn=use_sync_bn)) + + def forward(self, x): + x = self.conv1x1(x) + if not hasattr(self, 'origin_bn'): # deploy mode + return self.lk_origin(x) + out = self.origin_bn(self.lk_origin(x)) + for k, r in zip(self.kernel_sizes, self.dilates): + conv = self.__getattr__('dil_conv_k{}_{}'.format(k, r)) + bn = self.__getattr__('dil_bn_k{}_{}'.format(k, r)) + out = out + bn(conv(x)) + return out + + def convert_to_deploy(self): + if hasattr(self, 'origin_bn'): + origin_k, origin_b = fuse_bn(self.lk_origin, self.origin_bn) + for k, r in zip(self.kernel_sizes, self.dilates): + conv = self.__getattr__('dil_conv_k{}_{}'.format(k, r)) + bn = self.__getattr__('dil_bn_k{}_{}'.format(k, r)) + branch_k, branch_b = fuse_bn(conv, bn) + origin_k = merge_dilated_into_large_kernel(origin_k, branch_k, r) + origin_b += branch_b + merged_conv = get_conv2d(origin_k.size(0), origin_k.size(0), origin_k.size(2), stride=1, + padding=origin_k.size(2)//2, dilation=1, groups=origin_k.size(0), bias=True, + attempt_use_lk_impl=self.attempt_use_lk_impl) + merged_conv.weight.data = origin_k + merged_conv.bias.data = origin_b + self.lk_origin = merged_conv + self.__delattr__('origin_bn') + for k, r in zip(self.kernel_sizes, self.dilates): + self.__delattr__('dil_conv_k{}_{}'.format(k, r)) + self.__delattr__('dil_bn_k{}_{}'.format(k, r)) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = DilatedReparamConv(in_channel, out_channel, kernel_size=11).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(GREEN + 'test reparameterization.' + RESET) + module = model_fuse_test(module) + outputs = module(inputs) + print(GREEN + 'test reparameterization done.' 
+ RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/conv_module/ScConv.py b/engine/extre_module/custom_nn/conv_module/ScConv.py new file mode 100644 index 00000000..16113dc0 --- /dev/null +++ b/engine/extre_module/custom_nn/conv_module/ScConv.py @@ -0,0 +1,157 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/CVPR2023-SCConv.png +论文链接:https://openaccess.thecvf.com/content/CVPR2023/papers/Li_SCConv_Spatial_and_Channel_Reconstruction_Convolution_for_Feature_Redundancy_CVPR_2023_paper.pdf +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from engine.extre_module.ultralytics_nn.conv import Conv + +class GroupBatchnorm2d(nn.Module): + def __init__(self, c_num:int, + group_num:int = 16, + eps:float = 1e-10 + ): + super(GroupBatchnorm2d,self).__init__() + assert c_num >= group_num + self.group_num = group_num + self.gamma = nn.Parameter(torch.randn(c_num, 1, 1)) + self.beta = nn.Parameter(torch.zeros(c_num, 1, 1)) + self.eps = eps + + def forward(self, x): + N, C, H, W = x.size() + x = x.view( N, self.group_num, -1 ) + mean = x.mean( dim = 2, keepdim = True ) + std = x.std ( dim = 2, keepdim = True ) + x = (x - mean) / (std+self.eps) + x = x.view(N, C, H, W) + return x * self.gamma + self.beta + +class SRU(nn.Module): + def __init__(self, + oup_channels:int, + group_num:int = 16, + gate_treshold:float = 0.5 + ): + super().__init__() + + self.gn = GroupBatchnorm2d( oup_channels, group_num = group_num ) + self.gate_treshold = gate_treshold + self.sigomid = nn.Sigmoid() + + def forward(self,x): + gn_x = self.gn(x) + w_gamma = self.gn.gamma/sum(self.gn.gamma) + reweigts = self.sigomid( gn_x * w_gamma ) + # Gate + info_mask = reweigts>=self.gate_treshold + noninfo_mask= reweigts= index: + pad_l = 0 + s = (idx + 1 - index) * (k // stride) + else: + pad_l = (index - 1 - idx) * (k // stride) + s = 0 + if VH == 'H': + # assume add sufficient padding for origin conv + suppose_len = (ori_w + 2 * ori_p - ori_k) // stride + 1 + pad_r = 0 if (s + suppose_len) <= (w + pad_l) else s + suppose_len - w - pad_l + new_pad = (pad_l, pad_r, 0, 0) + dim = 3 + # e = w + pad_l + pad_r - s - suppose_len + else: + # assume add sufficient padding for origin conv + suppose_len = (ori_h + 2 * ori_p - ori_k) // stride + 1 + pad_r = 0 if (s + suppose_len) <= (h + pad_l) else s + suppose_len - h - pad_l + new_pad = (0, 0, pad_l, pad_r) + dim = 2 + # e = h + pad_l + pad_r - s - suppose_len + # print('new_pad', new_pad) + if len(set(new_pad)) > 1: + x = F.pad(x, new_pad) + # split_list = [s, suppose_len, e] + # padding on v direction + if padding * 2 + 1 != k: + pad = padding - k // 2 + if VH == 'H': # horizonal + x = torch.narrow(x, 2, pad, h - 2 * pad) + else: # vertical + x = torch.narrow(x, 3, pad, w - 2 * pad) + + xs = torch.narrow(x, dim, s, suppose_len) + return xs + + def shift(self, kernels): + ''' + We assume the conv does not change the feature map size, so padding = bigger_kernel_size//2. Otherwise, + you may configure padding as you wish, and change the padding of small_conv accordingly. 
+ ''' + mink, maxk = min(kernels), max(kernels) + mid_p = maxk // 2 + # 1. new window size is mink. middle point index in the window + offset_idx_left = mid_p % mink + offset_idx_right = (math.ceil(maxk / mink) * mink - mid_p - 1) % mink + # 2. padding + padding = offset_idx_left % mink + while padding < offset_idx_right: + padding += mink + # 3. make sure last pixel can be scan by min window + while padding < (mink - 1): + padding += mink + # 4. index of windows start point of middle point + after_padding_index = padding - offset_idx_left + index = math.ceil((mid_p + 1) / mink) + real_start_idx = index - after_padding_index // mink + # 5. output:padding how to padding input in v&h direction; + # after_padding_index: middle point of original kernel will located in which window + # real_start_idx: start window index after padding in original kernel along long side + return padding, after_padding_index, real_start_idx + + +def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups, dilation=1, bn=True, use_small_conv=True): + if isinstance(kernel_size, int) or len(set(kernel_size)) == 1: + return conv_bn_ori( + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups, + dilation, + bn) + else: + big_kernel, small_kernel = kernel_size + return LoRAConvsByWeight(in_channels, out_channels, bn=bn, + big_kernel=big_kernel, small_kernel=small_kernel, + group=groups, stride=stride, + use_small_conv=use_small_conv) + + +def fuse_bn(conv, bn): + kernel = conv.weight + running_mean = bn.running_mean + running_var = bn.running_var + gamma = bn.weight + beta = bn.bias + eps = bn.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + +class ReparamLargeKernelConv(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + small_kernel=5, + stride=1, + groups=1, + small_kernel_merged=False, + Decom=True, + bn=True, + ): + super(ReparamLargeKernelConv, self).__init__() + self.kernel_size = kernel_size + self.small_kernel = small_kernel + self.Decom = Decom + # We assume the conv does not change the feature map size, so padding = k//2. Otherwise, you may configure padding as you wish, and change the padding of small_conv accordingly. 
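+        # Editor's note (illustrative sketch, not from the original RepLKNet code): with padding = k // 2
+        # and a shared stride, every branch keeps the same spatial size, so for the non-Decom path
+        # convert_to_deploy() below can fold the small branch into the large one by zero-padding its fused
+        # kernel. With this file's own defaults (kernel_size=13, small_kernel=5), the 5x5 kernel is padded
+        # by (13 - 5) // 2 = 4 pixels per side before being added to the 13x13 kernel in
+        # get_equivalent_kernel_bias().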
+ padding = kernel_size // 2 + if small_kernel_merged: # cpp版本的conv,加快速度 + self.lkb_reparam = get_conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=1, + groups=groups, + bias=True, + ) + else: + if self.Decom: + self.LoRA = conv_bn( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(kernel_size, small_kernel), + stride=stride, + padding=padding, + dilation=1, + groups=groups, + bn=bn + ) + else: + self.lkb_origin = conv_bn( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=1, + groups=groups, + bn=bn, + ) + + if (small_kernel is not None) and small_kernel < kernel_size: + self.small_conv = conv_bn( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=small_kernel, + stride=stride, + padding=small_kernel // 2, + groups=groups, + dilation=1, + bn=bn, + ) + + self.bn = get_bn(out_channels) + self.act = nn.SiLU() + + def forward(self, inputs): + if hasattr(self, "lkb_reparam"): + out = self.lkb_reparam(inputs) + elif self.Decom: + # out = self.LoRA1(inputs) + self.LoRA2(inputs) + out = self.LoRA(inputs) + if hasattr(self, "small_conv"): + out += self.small_conv(inputs) + else: + out = self.lkb_origin(inputs) + if hasattr(self, "small_conv"): + out += self.small_conv(inputs) + return self.act(self.bn(out)) + + def get_equivalent_kernel_bias(self): + eq_k, eq_b = fuse_bn(self.lkb_origin.conv, self.lkb_origin.bn) + if hasattr(self, "small_conv"): + small_k, small_b = fuse_bn(self.small_conv.conv, self.small_conv.bn) + eq_b += small_b + # add to the central part + eq_k += nn.functional.pad( + small_k, [(self.kernel_size - self.small_kernel) // 2] * 4 + ) + return eq_k, eq_b + + def convert_to_deploy(self): + if hasattr(self, 'lkb_origin'): + eq_k, eq_b = self.get_equivalent_kernel_bias() + self.lkb_reparam = get_conv2d( + in_channels=self.lkb_origin.conv.in_channels, + out_channels=self.lkb_origin.conv.out_channels, + kernel_size=self.lkb_origin.conv.kernel_size, + stride=self.lkb_origin.conv.stride, + padding=self.lkb_origin.conv.padding, + dilation=self.lkb_origin.conv.dilation, + groups=self.lkb_origin.conv.groups, + bias=True, + ) + self.lkb_reparam.weight.data = eq_k + self.lkb_reparam.bias.data = eq_b + self.__delattr__("lkb_origin") + if hasattr(self, "small_conv"): + self.__delattr__("small_conv") + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = ReparamLargeKernelConv(in_channel, out_channel, kernel_size=13).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(GREEN + 'test reparameterization.' + RESET) + module = model_fuse_test(module) + outputs = module(inputs) + print(GREEN + 'test reparameterization done.' 
+ RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/conv_module/dbb.py b/engine/extre_module/custom_nn/conv_module/dbb.py new file mode 100644 index 00000000..e02614e0 --- /dev/null +++ b/engine/extre_module/custom_nn/conv_module/dbb.py @@ -0,0 +1,298 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/CVPR2021-Diverse Branch Block.png +论文链接:https://arxiv.org/abs/2103.13425 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from engine.extre_module.ultralytics_nn.conv import Conv, autopad + +from engine.extre_module.torch_utils import model_fuse_test + +def transI_fusebn(kernel, bn): + gamma = bn.weight + std = (bn.running_var + bn.eps).sqrt() + return kernel * ((gamma / std).reshape(-1, 1, 1, 1)), bn.bias - bn.running_mean * gamma / std + +def transII_addbranch(kernels, biases): + return sum(kernels), sum(biases) + +def transIII_1x1_kxk(k1, b1, k2, b2, groups): + if groups == 1: + k = F.conv2d(k2, k1.permute(1, 0, 2, 3)) # + b_hat = (k2 * b1.reshape(1, -1, 1, 1)).sum((1, 2, 3)) + else: + k_slices = [] + b_slices = [] + k1_T = k1.permute(1, 0, 2, 3) + k1_group_width = k1.size(0) // groups + k2_group_width = k2.size(0) // groups + for g in range(groups): + k1_T_slice = k1_T[:, g*k1_group_width:(g+1)*k1_group_width, :, :] + k2_slice = k2[g*k2_group_width:(g+1)*k2_group_width, :, :, :] + k_slices.append(F.conv2d(k2_slice, k1_T_slice)) + b_slices.append((k2_slice * b1[g*k1_group_width:(g+1)*k1_group_width].reshape(1, -1, 1, 1)).sum((1, 2, 3))) + k, b_hat = transIV_depthconcat(k_slices, b_slices) + return k, b_hat + b2 + +def transIV_depthconcat(kernels, biases): + return torch.cat(kernels, dim=0), torch.cat(biases) + +def transV_avg(channels, kernel_size, groups): + input_dim = channels // groups + k = torch.zeros((channels, input_dim, kernel_size, kernel_size)) + k[np.arange(channels), np.tile(np.arange(input_dim), groups), :, :] = 1.0 / kernel_size ** 2 + return k + +# This has not been tested with non-square kernels (kernel.size(2) != kernel.size(3)) nor even-size kernels +def transVI_multiscale(kernel, target_kernel_size): + H_pixels_to_pad = (target_kernel_size - kernel.size(2)) // 2 + W_pixels_to_pad = (target_kernel_size - kernel.size(3)) // 2 + return F.pad(kernel, [H_pixels_to_pad, H_pixels_to_pad, W_pixels_to_pad, W_pixels_to_pad]) + +def conv_bn(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, + padding_mode='zeros'): + conv_layer = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, + stride=stride, padding=padding, dilation=dilation, groups=groups, + bias=False, padding_mode=padding_mode) + bn_layer = nn.BatchNorm2d(num_features=out_channels, affine=True) + se = nn.Sequential() + se.add_module('conv', conv_layer) + se.add_module('bn', bn_layer) + return se + + +class IdentityBasedConv1x1(nn.Module): + def __init__(self, channels, groups=1): + super().__init__() + assert channels % groups == 0 + input_dim = channels // groups + self.conv = nn.Conv2d(in_channels=channels, out_channels=channels, kernel_size=1, groups=groups, bias=False) + + id_value = np.zeros((channels, 
input_dim, 1, 1)) + for i in range(channels): + id_value[i, i % input_dim, 0, 0] = 1 + self.id_tensor = torch.from_numpy(id_value) + nn.init.zeros_(self.conv.weight) + self.groups = groups + + def forward(self, input): + kernel = self.conv.weight + self.id_tensor.to(self.conv.weight.device).type_as(self.conv.weight) + result = F.conv2d(input, kernel, None, stride=1, groups=self.groups) + return result + + def get_actual_kernel(self): + return self.conv.weight + self.id_tensor.to(self.conv.weight.device).type_as(self.conv.weight) + +class BNAndPadLayer(nn.Module): + def __init__(self, + pad_pixels, + num_features, + eps=1e-5, + momentum=0.1, + affine=True, + track_running_stats=True): + super(BNAndPadLayer, self).__init__() + self.bn = nn.BatchNorm2d(num_features, eps, momentum, affine, track_running_stats) + self.pad_pixels = pad_pixels + + def forward(self, input): + output = self.bn(input) + if self.pad_pixels > 0: + if self.bn.affine: + pad_values = self.bn.bias.detach() - self.bn.running_mean * self.bn.weight.detach() / torch.sqrt(self.bn.running_var + self.bn.eps) + else: + pad_values = - self.bn.running_mean / torch.sqrt(self.bn.running_var + self.bn.eps) + output = F.pad(output, [self.pad_pixels] * 4) + pad_values = pad_values.view(1, -1, 1, 1) + output[:, :, 0:self.pad_pixels, :] = pad_values + output[:, :, -self.pad_pixels:, :] = pad_values + output[:, :, :, 0:self.pad_pixels] = pad_values + output[:, :, :, -self.pad_pixels:] = pad_values + return output + + @property + def weight(self): + return self.bn.weight + + @property + def bias(self): + return self.bn.bias + + @property + def running_mean(self): + return self.bn.running_mean + + @property + def running_var(self): + return self.bn.running_var + + @property + def eps(self): + return self.bn.eps + + +class DiverseBranchBlock(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, + stride=1, padding=None, dilation=1, groups=1, + internal_channels_1x1_3x3=None, + deploy=False, single_init=False): + super(DiverseBranchBlock, self).__init__() + self.deploy = deploy + + self.nonlinear = Conv.default_act + + self.kernel_size = kernel_size + self.in_channels = in_channels + self.out_channels = out_channels + self.groups = groups + + if padding is None: + padding = autopad(kernel_size, padding, dilation) + assert padding == kernel_size // 2 + + if deploy: + self.dbb_reparam = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, + padding=padding, dilation=dilation, groups=groups, bias=True) + + else: + + self.dbb_origin = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups) + + self.dbb_avg = nn.Sequential() + if groups < out_channels: + self.dbb_avg.add_module('conv', + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, + stride=1, padding=0, groups=groups, bias=False)) + self.dbb_avg.add_module('bn', BNAndPadLayer(pad_pixels=padding, num_features=out_channels)) + self.dbb_avg.add_module('avg', nn.AvgPool2d(kernel_size=kernel_size, stride=stride, padding=0)) + self.dbb_1x1 = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=stride, + padding=0, groups=groups) + else: + self.dbb_avg.add_module('avg', nn.AvgPool2d(kernel_size=kernel_size, stride=stride, padding=padding)) + + self.dbb_avg.add_module('avgbn', nn.BatchNorm2d(out_channels)) + + + if internal_channels_1x1_3x3 is None: + internal_channels_1x1_3x3 = 
in_channels if groups < out_channels else 2 * in_channels # For mobilenet, it is better to have 2X internal channels + + self.dbb_1x1_kxk = nn.Sequential() + if internal_channels_1x1_3x3 == in_channels: + self.dbb_1x1_kxk.add_module('idconv1', IdentityBasedConv1x1(channels=in_channels, groups=groups)) + else: + self.dbb_1x1_kxk.add_module('conv1', nn.Conv2d(in_channels=in_channels, out_channels=internal_channels_1x1_3x3, + kernel_size=1, stride=1, padding=0, groups=groups, bias=False)) + self.dbb_1x1_kxk.add_module('bn1', BNAndPadLayer(pad_pixels=padding, num_features=internal_channels_1x1_3x3, affine=True)) + self.dbb_1x1_kxk.add_module('conv2', nn.Conv2d(in_channels=internal_channels_1x1_3x3, out_channels=out_channels, + kernel_size=kernel_size, stride=stride, padding=0, groups=groups, bias=False)) + self.dbb_1x1_kxk.add_module('bn2', nn.BatchNorm2d(out_channels)) + + # The experiments reported in the paper used the default initialization of bn.weight (all as 1). But changing the initialization may be useful in some cases. + if single_init: + # Initialize the bn.weight of dbb_origin as 1 and others as 0. This is not the default setting. + self.single_init() + + def get_equivalent_kernel_bias(self): + k_origin, b_origin = transI_fusebn(self.dbb_origin.conv.weight, self.dbb_origin.bn) + + if hasattr(self, 'dbb_1x1'): + k_1x1, b_1x1 = transI_fusebn(self.dbb_1x1.conv.weight, self.dbb_1x1.bn) + k_1x1 = transVI_multiscale(k_1x1, self.kernel_size) + else: + k_1x1, b_1x1 = 0, 0 + + if hasattr(self.dbb_1x1_kxk, 'idconv1'): + k_1x1_kxk_first = self.dbb_1x1_kxk.idconv1.get_actual_kernel() + else: + k_1x1_kxk_first = self.dbb_1x1_kxk.conv1.weight + k_1x1_kxk_first, b_1x1_kxk_first = transI_fusebn(k_1x1_kxk_first, self.dbb_1x1_kxk.bn1) + k_1x1_kxk_second, b_1x1_kxk_second = transI_fusebn(self.dbb_1x1_kxk.conv2.weight, self.dbb_1x1_kxk.bn2) + k_1x1_kxk_merged, b_1x1_kxk_merged = transIII_1x1_kxk(k_1x1_kxk_first, b_1x1_kxk_first, k_1x1_kxk_second, b_1x1_kxk_second, groups=self.groups) + + k_avg = transV_avg(self.out_channels, self.kernel_size, self.groups) + k_1x1_avg_second, b_1x1_avg_second = transI_fusebn(k_avg.to(self.dbb_avg.avgbn.weight.device), self.dbb_avg.avgbn) + if hasattr(self.dbb_avg, 'conv'): + k_1x1_avg_first, b_1x1_avg_first = transI_fusebn(self.dbb_avg.conv.weight, self.dbb_avg.bn) + k_1x1_avg_merged, b_1x1_avg_merged = transIII_1x1_kxk(k_1x1_avg_first, b_1x1_avg_first, k_1x1_avg_second, b_1x1_avg_second, groups=self.groups) + else: + k_1x1_avg_merged, b_1x1_avg_merged = k_1x1_avg_second, b_1x1_avg_second + + return transII_addbranch((k_origin, k_1x1, k_1x1_kxk_merged, k_1x1_avg_merged), (b_origin, b_1x1, b_1x1_kxk_merged, b_1x1_avg_merged)) + + def convert_to_deploy(self): + if hasattr(self, 'dbb_reparam'): + return + kernel, bias = self.get_equivalent_kernel_bias() + self.dbb_reparam = nn.Conv2d(in_channels=self.dbb_origin.conv.in_channels, out_channels=self.dbb_origin.conv.out_channels, + kernel_size=self.dbb_origin.conv.kernel_size, stride=self.dbb_origin.conv.stride, + padding=self.dbb_origin.conv.padding, dilation=self.dbb_origin.conv.dilation, groups=self.dbb_origin.conv.groups, bias=True) + self.dbb_reparam.weight.data = kernel + self.dbb_reparam.bias.data = bias + for para in self.parameters(): + para.detach_() + self.__delattr__('dbb_origin') + self.__delattr__('dbb_avg') + if hasattr(self, 'dbb_1x1'): + self.__delattr__('dbb_1x1') + self.__delattr__('dbb_1x1_kxk') + + def forward(self, inputs): + if hasattr(self, 'dbb_reparam'): + return 
self.nonlinear(self.dbb_reparam(inputs)) + + out = self.dbb_origin(inputs) + if hasattr(self, 'dbb_1x1'): + out += self.dbb_1x1(inputs) + out += self.dbb_avg(inputs) + out += self.dbb_1x1_kxk(inputs) + return self.nonlinear(out) + + def init_gamma(self, gamma_value): + if hasattr(self, "dbb_origin"): + torch.nn.init.constant_(self.dbb_origin.bn.weight, gamma_value) + if hasattr(self, "dbb_1x1"): + torch.nn.init.constant_(self.dbb_1x1.bn.weight, gamma_value) + if hasattr(self, "dbb_avg"): + torch.nn.init.constant_(self.dbb_avg.avgbn.weight, gamma_value) + if hasattr(self, "dbb_1x1_kxk"): + torch.nn.init.constant_(self.dbb_1x1_kxk.bn2.weight, gamma_value) + + def single_init(self): + self.init_gamma(0.0) + if hasattr(self, "dbb_origin"): + torch.nn.init.constant_(self.dbb_origin.bn.weight, 1.0) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = DiverseBranchBlock(in_channel, out_channel, kernel_size=3, stride=1).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(GREEN + 'test reparameterization.' + RESET) + module = model_fuse_test(module) + outputs = module(inputs) + print(GREEN + 'test reparameterization done.' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/conv_module/dcnv2.py b/engine/extre_module/custom_nn/conv_module/dcnv2.py new file mode 100644 index 00000000..dd819733 --- /dev/null +++ b/engine/extre_module/custom_nn/conv_module/dcnv2.py @@ -0,0 +1,102 @@ +''' +本文件由BiliBili:魔傀面具整理 +论文链接:https://arxiv.org/pdf/1811.11168 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch, math +import torch.nn as nn + +from engine.extre_module.ultralytics_nn.conv import Conv, autopad + +# Deformable Conv v2 +class DCNv2(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride=1, + padding=None, groups=1, dilation=1, act=True, deformable_groups=1): + super(DCNv2, self).__init__() + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = (kernel_size, kernel_size) + self.stride = (stride, stride) + padding = autopad(kernel_size, padding, dilation) + self.padding = (padding, padding) + self.dilation = (dilation, dilation) + self.groups = groups + self.deformable_groups = deformable_groups + + self.weight = nn.Parameter( + torch.empty(out_channels, in_channels, *self.kernel_size) + ) + self.bias = nn.Parameter(torch.empty(out_channels)) + + out_channels_offset_mask = (self.deformable_groups * 3 * + self.kernel_size[0] * self.kernel_size[1]) + self.conv_offset_mask = nn.Conv2d( + self.in_channels, + out_channels_offset_mask, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + bias=True, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.act = Conv.default_act if act is True else act if isinstance(act, 
nn.Module) else nn.Identity() + self.reset_parameters() + + def forward(self, x): + offset_mask = self.conv_offset_mask(x) + o1, o2, mask = torch.chunk(offset_mask, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + x = torch.ops.torchvision.deform_conv2d( + x, + self.weight, + offset, + mask, + self.bias, + self.stride[0], self.stride[1], + self.padding[0], self.padding[1], + self.dilation[0], self.dilation[1], + self.groups, + self.deformable_groups, + True + ) + x = self.bn(x) + x = self.act(x) + return x + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + std = 1. / math.sqrt(n) + self.weight.data.uniform_(-std, std) + self.bias.data.zero_() + self.conv_offset_mask.weight.data.zero_() + self.conv_offset_mask.bias.data.zero_() + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = DCNv2(in_channel, out_channel, kernel_size=3, stride=1).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/conv_module/deconv.py b/engine/extre_module/custom_nn/conv_module/deconv.py new file mode 100644 index 00000000..5b3ee5b5 --- /dev/null +++ b/engine/extre_module/custom_nn/conv_module/deconv.py @@ -0,0 +1,208 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/IEEETIP2024-DEConv.png +论文链接:https://arxiv.org/pdf/2301.04805 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import math +import torch +from torch import nn +from einops.layers.torch import Rearrange + +from engine.extre_module.ultralytics_nn.conv import Conv +from engine.extre_module.torch_utils import model_fuse_test + +class Conv2d_cd(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, + padding=1, dilation=1, groups=1, bias=False, theta=1.0): + + super(Conv2d_cd, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) + self.theta = theta + + def get_weight(self): + conv_weight = self.conv.weight + conv_shape = conv_weight.shape + conv_weight = Rearrange('c_in c_out k1 k2 -> c_in c_out (k1 k2)')(conv_weight) + if conv_weight.is_cuda: + conv_weight_cd = torch.cuda.FloatTensor(conv_shape[0], conv_shape[1], 3 * 3).fill_(0) + else: + conv_weight_cd = torch.FloatTensor(conv_shape[0], conv_shape[1], 3 * 3).fill_(0) + conv_weight_cd = conv_weight_cd.to(conv_weight.dtype) + conv_weight_cd[:, :, :] = conv_weight[:, :, :] + conv_weight_cd[:, :, 4] = conv_weight[:, :, 4] - conv_weight[:, :, :].sum(2) + conv_weight_cd = Rearrange('c_in c_out (k1 k2) -> c_in c_out k1 k2', k1=conv_shape[2], k2=conv_shape[3])(conv_weight_cd) + return conv_weight_cd, self.conv.bias + + +class Conv2d_ad(nn.Module): + def 
__init__(self, in_channels, out_channels, kernel_size=3, stride=1, + padding=1, dilation=1, groups=1, bias=False, theta=1.0): + + super(Conv2d_ad, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) + self.theta = theta + + def get_weight(self): + conv_weight = self.conv.weight + conv_shape = conv_weight.shape + conv_weight = Rearrange('c_in c_out k1 k2 -> c_in c_out (k1 k2)')(conv_weight) + conv_weight_ad = conv_weight - self.theta * conv_weight[:, :, [3, 0, 1, 6, 4, 2, 7, 8, 5]] + conv_weight_ad = Rearrange('c_in c_out (k1 k2) -> c_in c_out k1 k2', k1=conv_shape[2], k2=conv_shape[3])(conv_weight_ad) + return conv_weight_ad, self.conv.bias + + +class Conv2d_rd(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, + padding=2, dilation=1, groups=1, bias=False, theta=1.0): + + super(Conv2d_rd, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) + self.theta = theta + + def forward(self, x): + + if math.fabs(self.theta - 0.0) < 1e-8: + out_normal = self.conv(x) + return out_normal + else: + conv_weight = self.conv.weight + conv_shape = conv_weight.shape + if conv_weight.is_cuda: + conv_weight_rd = torch.cuda.FloatTensor(conv_shape[0], conv_shape[1], 5 * 5).fill_(0) + else: + conv_weight_rd = torch.FloatTensor(conv_shape[0], conv_shape[1], 5 * 5).fill_(0) + conv_weight_rd = conv_weight_rd.to(conv_weight.dtype) + conv_weight = Rearrange('c_in c_out k1 k2 -> c_in c_out (k1 k2)')(conv_weight) + conv_weight_rd[:, :, [0, 2, 4, 10, 14, 20, 22, 24]] = conv_weight[:, :, 1:] + conv_weight_rd[:, :, [6, 7, 8, 11, 13, 16, 17, 18]] = -conv_weight[:, :, 1:] * self.theta + conv_weight_rd[:, :, 12] = conv_weight[:, :, 0] * (1 - self.theta) + conv_weight_rd = conv_weight_rd.view(conv_shape[0], conv_shape[1], 5, 5) + out_diff = nn.functional.conv2d(input=x, weight=conv_weight_rd, bias=self.conv.bias, stride=self.conv.stride, padding=self.conv.padding, groups=self.conv.groups) + + return out_diff + + +class Conv2d_hd(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, + padding=1, dilation=1, groups=1, bias=False, theta=1.0): + + super(Conv2d_hd, self).__init__() + self.conv = nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) + + def get_weight(self): + conv_weight = self.conv.weight + conv_shape = conv_weight.shape + if conv_weight.is_cuda: + conv_weight_hd = torch.cuda.FloatTensor(conv_shape[0], conv_shape[1], 3 * 3).fill_(0) + else: + conv_weight_hd = torch.FloatTensor(conv_shape[0], conv_shape[1], 3 * 3).fill_(0) + conv_weight_hd = conv_weight_hd.to(conv_weight.dtype) + conv_weight_hd[:, :, [0, 3, 6]] = conv_weight[:, :, :] + conv_weight_hd[:, :, [2, 5, 8]] = -conv_weight[:, :, :] + conv_weight_hd = Rearrange('c_in c_out (k1 k2) -> c_in c_out k1 k2', k1=conv_shape[2], k2=conv_shape[2])(conv_weight_hd) + return conv_weight_hd, self.conv.bias + + +class Conv2d_vd(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, + padding=1, dilation=1, groups=1, bias=False): + + super(Conv2d_vd, self).__init__() + self.conv = nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) + + def get_weight(self): + conv_weight = 
self.conv.weight + conv_shape = conv_weight.shape + if conv_weight.is_cuda: + conv_weight_vd = torch.cuda.FloatTensor(conv_shape[0], conv_shape[1], 3 * 3).fill_(0) + else: + conv_weight_vd = torch.FloatTensor(conv_shape[0], conv_shape[1], 3 * 3).fill_(0) + conv_weight_vd = conv_weight_vd.to(conv_weight.dtype) + conv_weight_vd[:, :, [0, 1, 2]] = conv_weight[:, :, :] + conv_weight_vd[:, :, [6, 7, 8]] = -conv_weight[:, :, :] + conv_weight_vd = Rearrange('c_in c_out (k1 k2) -> c_in c_out k1 k2', k1=conv_shape[2], k2=conv_shape[2])(conv_weight_vd) + return conv_weight_vd, self.conv.bias + + +class DEConv(nn.Module): + def __init__(self, inc, ouc): + super(DEConv, self).__init__() + self.conv1_1 = Conv2d_cd(inc, inc, 3, bias=True) + self.conv1_2 = Conv2d_hd(inc, inc, 3, bias=True) + self.conv1_3 = Conv2d_vd(inc, inc, 3, bias=True) + self.conv1_4 = Conv2d_ad(inc, inc, 3, bias=True) + self.conv1_5 = nn.Conv2d(inc, inc, 3, padding=1, bias=True) + + self.bn = nn.BatchNorm2d(inc) + self.act = nn.SiLU() + + if inc != ouc: + self.conv1x1 = Conv(inc, ouc, 1) + else: + self.conv1x1 = nn.Identity() + + def forward(self, x): + if hasattr(self, 'conv1_1'): + w1, b1 = self.conv1_1.get_weight() + w2, b2 = self.conv1_2.get_weight() + w3, b3 = self.conv1_3.get_weight() + w4, b4 = self.conv1_4.get_weight() + w5, b5 = self.conv1_5.weight, self.conv1_5.bias + + w = w1 + w2 + w3 + w4 + w5 + b = b1 + b2 + b3 + b4 + b5 + res = nn.functional.conv2d(input=x, weight=w, bias=b, stride=1, padding=1, groups=1) + else: + res = self.conv1_5(x) + + if hasattr(self, 'bn'): + res = self.bn(res) + + return self.conv1x1(self.act(res)) + + def convert_to_deploy(self): + w1, b1 = self.conv1_1.get_weight() + w2, b2 = self.conv1_2.get_weight() + w3, b3 = self.conv1_3.get_weight() + w4, b4 = self.conv1_4.get_weight() + w5, b5 = self.conv1_5.weight, self.conv1_5.bias + + self.conv1_5.weight = torch.nn.Parameter(w1 + w2 + w3 + w4 + w5) + self.conv1_5.bias = torch.nn.Parameter(b1 + b2 + b3 + b4 + b5) + + del self.conv1_1 + del self.conv1_2 + del self.conv1_3 + del self.conv1_4 + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = DEConv(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(GREEN + 'test reparameterization.' + RESET) + module = model_fuse_test(module) + outputs = module(inputs) + print(GREEN + 'test reparameterization done.' 
+ RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/conv_module/deepdbb.py b/engine/extre_module/custom_nn/conv_module/deepdbb.py new file mode 100644 index 00000000..5f6ca2e2 --- /dev/null +++ b/engine/extre_module/custom_nn/conv_module/deepdbb.py @@ -0,0 +1,466 @@ +''' +本文件由BiliBili:魔傀面具整理 +论文链接:https://www.sciencedirect.com/science/article/abs/pii/S1474034624003574 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from engine.extre_module.ultralytics_nn.conv import Conv, autopad + +from engine.extre_module.torch_utils import model_fuse_test + +def transI_fusebn(kernel, bn): + gamma = bn.weight + std = (bn.running_var + bn.eps).sqrt() + return kernel * ((gamma / std).reshape(-1, 1, 1, 1)), bn.bias - bn.running_mean * gamma / std + +def transII_addbranch(kernels, biases): + return sum(kernels), sum(biases) + +def transIII_1x1_kxk(k1, b1, k2, b2, groups): + if groups == 1: + k = F.conv2d(k2, k1.permute(1, 0, 2, 3)) # + b_hat = (k2 * b1.reshape(1, -1, 1, 1)).sum((1, 2, 3)) + else: + k_slices = [] + b_slices = [] + k1_T = k1.permute(1, 0, 2, 3) + k1_group_width = k1.size(0) // groups + k2_group_width = k2.size(0) // groups + for g in range(groups): + k1_T_slice = k1_T[:, g*k1_group_width:(g+1)*k1_group_width, :, :] + k2_slice = k2[g*k2_group_width:(g+1)*k2_group_width, :, :, :] + k_slices.append(F.conv2d(k2_slice, k1_T_slice)) + b_slices.append((k2_slice * b1[g*k1_group_width:(g+1)*k1_group_width].reshape(1, -1, 1, 1)).sum((1, 2, 3))) + k, b_hat = transIV_depthconcat(k_slices, b_slices) + return k, b_hat + b2 + +def transIV_depthconcat(kernels, biases): + return torch.cat(kernels, dim=0), torch.cat(biases) + +def transV_avg(channels, kernel_size, groups): + input_dim = channels // groups + k = torch.zeros((channels, input_dim, kernel_size, kernel_size)) + k[np.arange(channels), np.tile(np.arange(input_dim), groups), :, :] = 1.0 / kernel_size ** 2 + return k + +# This has not been tested with non-square kernels (kernel.size(2) != kernel.size(3)) nor even-size kernels +def transVI_multiscale(kernel, target_kernel_size): + H_pixels_to_pad = (target_kernel_size - kernel.size(2)) // 2 + W_pixels_to_pad = (target_kernel_size - kernel.size(3)) // 2 + return F.pad(kernel, [H_pixels_to_pad, H_pixels_to_pad, W_pixels_to_pad, W_pixels_to_pad]) + +def conv_bn(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, + padding_mode='zeros'): + conv_layer = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, + stride=stride, padding=padding, dilation=dilation, groups=groups, + bias=False, padding_mode=padding_mode) + bn_layer = nn.BatchNorm2d(num_features=out_channels, affine=True) + se = nn.Sequential() + se.add_module('conv', conv_layer) + se.add_module('bn', bn_layer) + return se + + +class IdentityBasedConv1x1(nn.Module): + def __init__(self, channels, groups=1): + super().__init__() + assert channels % groups == 0 + input_dim = channels // groups + self.conv = nn.Conv2d(in_channels=channels, out_channels=channels, kernel_size=1, groups=groups, bias=False) + + id_value = np.zeros((channels, input_dim, 1, 1)) 
+ for i in range(channels): + id_value[i, i % input_dim, 0, 0] = 1 + self.id_tensor = torch.from_numpy(id_value) + nn.init.zeros_(self.conv.weight) + self.groups = groups + + def forward(self, input): + kernel = self.conv.weight + self.id_tensor.to(self.conv.weight.device).type_as(self.conv.weight) + result = F.conv2d(input, kernel, None, stride=1, groups=self.groups) + return result + + def get_actual_kernel(self): + return self.conv.weight + self.id_tensor.to(self.conv.weight.device).type_as(self.conv.weight) + +class BNAndPadLayer(nn.Module): + def __init__(self, + pad_pixels, + num_features, + eps=1e-5, + momentum=0.1, + affine=True, + track_running_stats=True): + super(BNAndPadLayer, self).__init__() + self.bn = nn.BatchNorm2d(num_features, eps, momentum, affine, track_running_stats) + self.pad_pixels = pad_pixels + + def forward(self, input): + output = self.bn(input) + if self.pad_pixels > 0: + if self.bn.affine: + pad_values = self.bn.bias.detach() - self.bn.running_mean * self.bn.weight.detach() / torch.sqrt(self.bn.running_var + self.bn.eps) + else: + pad_values = - self.bn.running_mean / torch.sqrt(self.bn.running_var + self.bn.eps) + output = F.pad(output, [self.pad_pixels] * 4) + pad_values = pad_values.view(1, -1, 1, 1) + output[:, :, 0:self.pad_pixels, :] = pad_values + output[:, :, -self.pad_pixels:, :] = pad_values + output[:, :, :, 0:self.pad_pixels] = pad_values + output[:, :, :, -self.pad_pixels:] = pad_values + return output + + @property + def weight(self): + return self.bn.weight + + @property + def bias(self): + return self.bn.bias + + @property + def running_mean(self): + return self.bn.running_mean + + @property + def running_var(self): + return self.bn.running_var + + @property + def eps(self): + return self.bn.eps + +class DiverseBranchBlockNOAct(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, + stride=1, padding=None, dilation=1, groups=1, + internal_channels_1x1_3x3=None, + deploy=False, single_init=False): + super(DiverseBranchBlockNOAct, self).__init__() + self.deploy = deploy + + # self.nonlinear = Conv.default_act + + self.kernel_size = kernel_size + self.out_channels = out_channels + self.groups = groups + + if padding is None: + # padding=None + padding = autopad(kernel_size, padding, dilation) + assert padding == kernel_size // 2 + + if deploy: + self.dbb_reparam = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, + stride=stride, + padding=padding, dilation=dilation, groups=groups, bias=True) + + else: + + self.dbb_origin = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, + stride=stride, padding=padding, dilation=dilation, groups=groups) + + self.dbb_avg = nn.Sequential() + if groups < out_channels: + self.dbb_avg.add_module('conv', + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, + stride=1, padding=0, groups=groups, bias=False)) + self.dbb_avg.add_module('bn', BNAndPadLayer(pad_pixels=padding, num_features=out_channels)) + self.dbb_avg.add_module('avg', nn.AvgPool2d(kernel_size=kernel_size, stride=stride, padding=0)) + self.dbb_1x1 = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=stride, + padding=0, groups=groups) + else: + self.dbb_avg.add_module('avg', nn.AvgPool2d(kernel_size=kernel_size, stride=stride, padding=padding)) + + self.dbb_avg.add_module('avgbn', nn.BatchNorm2d(out_channels)) + + if internal_channels_1x1_3x3 is None: + internal_channels_1x1_3x3 = in_channels if groups < 
out_channels else 2 * in_channels # For mobilenet, it is better to have 2X internal channels + + self.dbb_1x1_kxk = nn.Sequential() + if internal_channels_1x1_3x3 == in_channels: + self.dbb_1x1_kxk.add_module('idconv1', IdentityBasedConv1x1(channels=in_channels, groups=groups)) + else: + self.dbb_1x1_kxk.add_module('conv1', + nn.Conv2d(in_channels=in_channels, out_channels=internal_channels_1x1_3x3, + kernel_size=1, stride=1, padding=0, groups=groups, bias=False)) + self.dbb_1x1_kxk.add_module('bn1', BNAndPadLayer(pad_pixels=padding, num_features=internal_channels_1x1_3x3, + affine=True)) + self.dbb_1x1_kxk.add_module('conv2', + nn.Conv2d(in_channels=internal_channels_1x1_3x3, out_channels=out_channels, + kernel_size=kernel_size, stride=stride, padding=0, groups=groups, + bias=False)) + self.dbb_1x1_kxk.add_module('bn2', nn.BatchNorm2d(out_channels)) + + # The experiments reported in the paper used the default initialization of bn.weight (all as 1). But changing the initialization may be useful in some cases. + if single_init: + # Initialize the bn.weight of dbb_origin as 1 and others as 0. This is not the default setting. + self.single_init() + + def get_equivalent_kernel_bias(self): + k_origin, b_origin = transI_fusebn(self.dbb_origin.conv.weight, self.dbb_origin.bn) + + if hasattr(self, 'dbb_1x1'): + k_1x1, b_1x1 = transI_fusebn(self.dbb_1x1.conv.weight, self.dbb_1x1.bn) + k_1x1 = transVI_multiscale(k_1x1, self.kernel_size) + else: + k_1x1, b_1x1 = 0, 0 + + if hasattr(self.dbb_1x1_kxk, 'idconv1'): + k_1x1_kxk_first = self.dbb_1x1_kxk.idconv1.get_actual_kernel() + else: + k_1x1_kxk_first = self.dbb_1x1_kxk.conv1.weight + k_1x1_kxk_first, b_1x1_kxk_first = transI_fusebn(k_1x1_kxk_first, self.dbb_1x1_kxk.bn1) + k_1x1_kxk_second, b_1x1_kxk_second = transI_fusebn(self.dbb_1x1_kxk.conv2.weight, self.dbb_1x1_kxk.bn2) + k_1x1_kxk_merged, b_1x1_kxk_merged = transIII_1x1_kxk(k_1x1_kxk_first, b_1x1_kxk_first, k_1x1_kxk_second, + b_1x1_kxk_second, groups=self.groups) + + k_avg = transV_avg(self.out_channels, self.kernel_size, self.groups) + k_1x1_avg_second, b_1x1_avg_second = transI_fusebn(k_avg.to(self.dbb_avg.avgbn.weight.device), + self.dbb_avg.avgbn) + if hasattr(self.dbb_avg, 'conv'): + k_1x1_avg_first, b_1x1_avg_first = transI_fusebn(self.dbb_avg.conv.weight, self.dbb_avg.bn) + k_1x1_avg_merged, b_1x1_avg_merged = transIII_1x1_kxk(k_1x1_avg_first, b_1x1_avg_first, k_1x1_avg_second, + b_1x1_avg_second, groups=self.groups) + else: + k_1x1_avg_merged, b_1x1_avg_merged = k_1x1_avg_second, b_1x1_avg_second + + return transII_addbranch((k_origin, k_1x1, k_1x1_kxk_merged, k_1x1_avg_merged), + (b_origin, b_1x1, b_1x1_kxk_merged, b_1x1_avg_merged)) + + def switch_to_deploy(self): + if hasattr(self, 'dbb_reparam'): + return + kernel, bias = self.get_equivalent_kernel_bias() + self.dbb_reparam = nn.Conv2d(in_channels=self.dbb_origin.conv.in_channels, + out_channels=self.dbb_origin.conv.out_channels, + kernel_size=self.dbb_origin.conv.kernel_size, stride=self.dbb_origin.conv.stride, + padding=self.dbb_origin.conv.padding, dilation=self.dbb_origin.conv.dilation, + groups=self.dbb_origin.conv.groups, bias=True) + self.dbb_reparam.weight.data = kernel + self.dbb_reparam.bias.data = bias + for para in self.parameters(): + para.detach_() + self.__delattr__('dbb_origin') + self.__delattr__('dbb_avg') + if hasattr(self, 'dbb_1x1'): + self.__delattr__('dbb_1x1') + self.__delattr__('dbb_1x1_kxk') + + def forward(self, inputs): + if hasattr(self, 'dbb_reparam'): + # return self.nonlinear(self.dbb_reparam(inputs)) 
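+            # Descriptive note: the activation is intentionally omitted in this NOAct variant;
+            # it serves as the kxk origin branch inside DeepDiverseBranchBlock, which applies
+            # Conv.default_act once after summing all branches.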
+ return self.dbb_reparam(inputs) + + out = self.dbb_origin(inputs) + + # print(inputs.shape) + # print(self.dbb_1x1(inputs).shape) + if hasattr(self, 'dbb_1x1'): + out += self.dbb_1x1(inputs) + out += self.dbb_avg(inputs) + out += self.dbb_1x1_kxk(inputs) + # return self.nonlinear(out) + + return out + + def init_gamma(self, gamma_value): + if hasattr(self, "dbb_origin"): + torch.nn.init.constant_(self.dbb_origin.bn.weight, gamma_value) + if hasattr(self, "dbb_1x1"): + torch.nn.init.constant_(self.dbb_1x1.bn.weight, gamma_value) + if hasattr(self, "dbb_avg"): + torch.nn.init.constant_(self.dbb_avg.avgbn.weight, gamma_value) + if hasattr(self, "dbb_1x1_kxk"): + torch.nn.init.constant_(self.dbb_1x1_kxk.bn2.weight, gamma_value) + + def single_init(self): + self.init_gamma(0.0) + if hasattr(self, "dbb_origin"): + torch.nn.init.constant_(self.dbb_origin.bn.weight, 1.0) + + @property + def weight(self): ##含有@property + if hasattr(self, 'dbb_reparam'): + # return self.nonlinear(self.dbb_reparam(inputs)) + return self.dbb_reparam.weight + +class DeepDiverseBranchBlock(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, + stride=1, padding=None, dilation=1, groups=1, + internal_channels_1x1_3x3=None, + deploy=False, single_init=False,conv_orgin=DiverseBranchBlockNOAct): + super(DeepDiverseBranchBlock, self).__init__() + self.deploy = deploy + + self.nonlinear = Conv.default_act + + self.kernel_size = kernel_size + self.out_channels = out_channels + self.groups = groups + # padding=0 + if padding is None: + padding = autopad(kernel_size, padding, dilation) + assert padding == kernel_size // 2 + + if deploy: + self.dbb_reparam = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, + stride=stride, + padding=padding, dilation=dilation, groups=groups, bias=True) + + else: + + self.dbb_origin = DiverseBranchBlockNOAct(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, + stride=stride, padding=padding, dilation=dilation, groups=groups) + + self.dbb_avg = nn.Sequential() + if groups < out_channels: + self.dbb_avg.add_module('conv', + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, + stride=1, padding=0, groups=groups, bias=False)) + self.dbb_avg.add_module('bn', BNAndPadLayer(pad_pixels=padding, num_features=out_channels)) + self.dbb_avg.add_module('avg', nn.AvgPool2d(kernel_size=kernel_size, stride=stride, padding=0)) + self.dbb_1x1 = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=stride, + padding=0, groups=groups) + else: + self.dbb_avg.add_module('avg', nn.AvgPool2d(kernel_size=kernel_size, stride=stride, padding=padding)) + + self.dbb_avg.add_module('avgbn', nn.BatchNorm2d(out_channels)) + + if internal_channels_1x1_3x3 is None: + internal_channels_1x1_3x3 = in_channels if groups < out_channels else 2 * in_channels # For mobilenet, it is better to have 2X internal channels + + self.dbb_1x1_kxk = nn.Sequential() + if internal_channels_1x1_3x3 == in_channels: + self.dbb_1x1_kxk.add_module('idconv1', IdentityBasedConv1x1(channels=in_channels, groups=groups)) + else: + self.dbb_1x1_kxk.add_module('conv1', + nn.Conv2d(in_channels=in_channels, out_channels=internal_channels_1x1_3x3, + kernel_size=1, stride=1, padding=0, groups=groups, bias=False)) + self.dbb_1x1_kxk.add_module('bn1', BNAndPadLayer(pad_pixels=padding, num_features=internal_channels_1x1_3x3, + affine=True)) + self.dbb_1x1_kxk.add_module('conv2', + nn.Conv2d(in_channels=internal_channels_1x1_3x3, 
out_channels=out_channels, + kernel_size=kernel_size, stride=stride, padding=0, groups=groups, + bias=False)) + self.dbb_1x1_kxk.add_module('bn2', nn.BatchNorm2d(out_channels)) + + # The experiments reported in the paper used the default initialization of bn.weight (all as 1). But changing the initialization may be useful in some cases. + if single_init: + # Initialize the bn.weight of dbb_origin as 1 and others as 0. This is not the default setting. + self.single_init() + + def get_equivalent_kernel_bias(self): + self.dbb_origin.switch_to_deploy() + # k_origin, b_origin = transI_fusebn(self.dbb_origin.conv.dbb_reparam.weight, self.dbb_origin.bn) + + k_origin, b_origin = self.dbb_origin.dbb_reparam.weight, self.dbb_origin.dbb_reparam.bias + + if hasattr(self, 'dbb_1x1'): + k_1x1, b_1x1 = transI_fusebn(self.dbb_1x1.conv.weight, self.dbb_1x1.bn) + k_1x1 = transVI_multiscale(k_1x1, self.kernel_size) + else: + k_1x1, b_1x1 = 0, 0 + + if hasattr(self.dbb_1x1_kxk, 'idconv1'): + k_1x1_kxk_first = self.dbb_1x1_kxk.idconv1.get_actual_kernel() + else: + k_1x1_kxk_first = self.dbb_1x1_kxk.conv1.weight + k_1x1_kxk_first, b_1x1_kxk_first = transI_fusebn(k_1x1_kxk_first, self.dbb_1x1_kxk.bn1) + k_1x1_kxk_second, b_1x1_kxk_second = transI_fusebn(self.dbb_1x1_kxk.conv2.weight, self.dbb_1x1_kxk.bn2) + k_1x1_kxk_merged, b_1x1_kxk_merged = transIII_1x1_kxk(k_1x1_kxk_first, b_1x1_kxk_first, k_1x1_kxk_second, + b_1x1_kxk_second, groups=self.groups) + + k_avg = transV_avg(self.out_channels, self.kernel_size, self.groups) + k_1x1_avg_second, b_1x1_avg_second = transI_fusebn(k_avg.to(self.dbb_avg.avgbn.weight.device), + self.dbb_avg.avgbn) + if hasattr(self.dbb_avg, 'conv'): + k_1x1_avg_first, b_1x1_avg_first = transI_fusebn(self.dbb_avg.conv.weight, self.dbb_avg.bn) + k_1x1_avg_merged, b_1x1_avg_merged = transIII_1x1_kxk(k_1x1_avg_first, b_1x1_avg_first, k_1x1_avg_second, + b_1x1_avg_second, groups=self.groups) + else: + k_1x1_avg_merged, b_1x1_avg_merged = k_1x1_avg_second, b_1x1_avg_second + + return transII_addbranch((k_origin, k_1x1, k_1x1_kxk_merged, k_1x1_avg_merged), + (b_origin, b_1x1, b_1x1_kxk_merged, b_1x1_avg_merged)) + + def convert_to_deploy(self): + if hasattr(self, 'dbb_reparam'): + return + kernel, bias = self.get_equivalent_kernel_bias() + self.dbb_reparam = nn.Conv2d(in_channels=self.dbb_origin.dbb_reparam.in_channels, + out_channels=self.dbb_origin.dbb_reparam.out_channels, + kernel_size=self.dbb_origin.dbb_reparam.kernel_size, stride=self.dbb_origin.dbb_reparam.stride, + padding=self.dbb_origin.dbb_reparam.padding, dilation=self.dbb_origin.dbb_reparam.dilation, + groups=self.dbb_origin.dbb_reparam.groups, bias=True) + self.dbb_reparam.weight.data = kernel + self.dbb_reparam.bias.data = bias + for para in self.parameters(): + para.detach_() + self.__delattr__('dbb_origin') + self.__delattr__('dbb_avg') + if hasattr(self, 'dbb_1x1'): + self.__delattr__('dbb_1x1') + self.__delattr__('dbb_1x1_kxk') + + def forward(self, inputs): + if hasattr(self, 'dbb_reparam'): + return self.nonlinear(self.dbb_reparam(inputs)) + # return self.dbb_reparam(inputs) + + out = self.dbb_origin(inputs) + if hasattr(self, 'dbb_1x1'): + out += self.dbb_1x1(inputs) + out += self.dbb_avg(inputs) + out += self.dbb_1x1_kxk(inputs) + return self.nonlinear(out) + + # return out + + def init_gamma(self, gamma_value): + if hasattr(self, "dbb_origin"): + torch.nn.init.constant_(self.dbb_origin.bn.weight, gamma_value) + if hasattr(self, "dbb_1x1"): + torch.nn.init.constant_(self.dbb_1x1.bn.weight, gamma_value) + if hasattr(self, 
"dbb_avg"): + torch.nn.init.constant_(self.dbb_avg.avgbn.weight, gamma_value) + if hasattr(self, "dbb_1x1_kxk"): + torch.nn.init.constant_(self.dbb_1x1_kxk.bn2.weight, gamma_value) + + def single_init(self): + self.init_gamma(0.0) + if hasattr(self, "dbb_origin"): + torch.nn.init.constant_(self.dbb_origin.bn.weight, 1.0) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = DeepDiverseBranchBlock(in_channel, out_channel, kernel_size=3, stride=1).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(GREEN + 'test reparameterization.' + RESET) + module = model_fuse_test(module) + outputs = module(inputs) + print(GREEN + 'test reparameterization done.' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/conv_module/dynamic_snake_conv.py b/engine/extre_module/custom_nn/conv_module/dynamic_snake_conv.py new file mode 100644 index 00000000..7072dbb3 --- /dev/null +++ b/engine/extre_module/custom_nn/conv_module/dynamic_snake_conv.py @@ -0,0 +1,381 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/ICCV2023-dynamic_snake_conv.png +论文链接:https://arxiv.org/pdf/2307.08388 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +from engine.extre_module.ultralytics_nn.conv import Conv + +class DySnakeConv(nn.Module): + def __init__(self, inc, ouc, k=3) -> None: + super().__init__() + + self.conv_0 = Conv(inc, inc, k) + self.conv_x = DSConv(inc, inc, 0, k) + self.conv_y = DSConv(inc, inc, 1, k) + + self.conv1x1 = Conv(inc * 3, ouc, 1) + + def forward(self, x): + return self.conv1x1(torch.cat([self.conv_0(x), self.conv_x(x), self.conv_y(x)], dim=1)) + +class DSConv(nn.Module): + def __init__(self, in_ch, out_ch, morph, kernel_size=3, if_offset=True, extend_scope=1): + """ + The Dynamic Snake Convolution + :param in_ch: input channel + :param out_ch: output channel + :param kernel_size: the size of kernel + :param extend_scope: the range to expand (default 1 for this method) + :param morph: the morphology of the convolution kernel is mainly divided into two types + along the x-axis (0) and the y-axis (1) (see the paper for details) + :param if_offset: whether deformation is required, if it is False, it is the standard convolution kernel + """ + super(DSConv, self).__init__() + # use the to learn the deformable offset + self.offset_conv = nn.Conv2d(in_ch, 2 * kernel_size, 3, padding=1) + self.bn = nn.BatchNorm2d(2 * kernel_size) + self.kernel_size = kernel_size + + # two types of the DSConv (along x-axis and y-axis) + self.dsc_conv_x = nn.Conv2d( + in_ch, + out_ch, + kernel_size=(kernel_size, 1), + stride=(kernel_size, 1), + padding=0, + ) + self.dsc_conv_y = nn.Conv2d( + in_ch, + out_ch, + kernel_size=(1, kernel_size), + stride=(1, kernel_size), + padding=0, + ) + + self.gn = nn.GroupNorm(out_ch // 
4, out_ch) + self.act = Conv.default_act + + self.extend_scope = extend_scope + self.morph = morph + self.if_offset = if_offset + + def forward(self, f): + offset = self.offset_conv(f) + offset = self.bn(offset) + # We need a range of deformation between -1 and 1 to mimic the snake's swing + offset = torch.tanh(offset) + input_shape = f.shape + dsc = DSC(input_shape, self.kernel_size, self.extend_scope, self.morph) + deformed_feature = dsc.deform_conv(f, offset, self.if_offset) + if self.morph == 0: + x = self.dsc_conv_x(deformed_feature.type(f.dtype)) + x = self.gn(x) + x = self.act(x) + return x + else: + x = self.dsc_conv_y(deformed_feature.type(f.dtype)) + x = self.gn(x) + x = self.act(x) + return x + + +# Core code, for ease of understanding, we mark the dimensions of input and output next to the code +class DSC(object): + def __init__(self, input_shape, kernel_size, extend_scope, morph): + self.num_points = kernel_size + self.width = input_shape[2] + self.height = input_shape[3] + self.morph = morph + self.extend_scope = extend_scope # offset (-1 ~ 1) * extend_scope + + # define feature map shape + """ + B: Batch size C: Channel W: Width H: Height + """ + self.num_batch = input_shape[0] + self.num_channels = input_shape[1] + + """ + input: offset [B,2*K,W,H] K: Kernel size (2*K: 2D image, deformation contains and ) + output_x: [B,1,W,K*H] coordinate map + output_y: [B,1,K*W,H] coordinate map + """ + + def _coordinate_map_3D(self, offset, if_offset): + device = offset.device + # offset + y_offset, x_offset = torch.split(offset, self.num_points, dim=1) + + y_center = torch.arange(0, self.width).repeat([self.height]) + y_center = y_center.reshape(self.height, self.width) + y_center = y_center.permute(1, 0) + y_center = y_center.reshape([-1, self.width, self.height]) + y_center = y_center.repeat([self.num_points, 1, 1]).float() + y_center = y_center.unsqueeze(0) + + x_center = torch.arange(0, self.height).repeat([self.width]) + x_center = x_center.reshape(self.width, self.height) + x_center = x_center.permute(0, 1) + x_center = x_center.reshape([-1, self.width, self.height]) + x_center = x_center.repeat([self.num_points, 1, 1]).float() + x_center = x_center.unsqueeze(0) + + if self.morph == 0: + """ + Initialize the kernel and flatten the kernel + y: only need 0 + x: -num_points//2 ~ num_points//2 (Determined by the kernel size) + !!! The related PPT will be submitted later, and the PPT will contain the whole changes of each step + """ + y = torch.linspace(0, 0, 1) + x = torch.linspace( + -int(self.num_points // 2), + int(self.num_points // 2), + int(self.num_points), + ) + + y, x = torch.meshgrid(y, x) + y_spread = y.reshape(-1, 1) + x_spread = x.reshape(-1, 1) + + y_grid = y_spread.repeat([1, self.width * self.height]) + y_grid = y_grid.reshape([self.num_points, self.width, self.height]) + y_grid = y_grid.unsqueeze(0) # [B*K*K, W,H] + + x_grid = x_spread.repeat([1, self.width * self.height]) + x_grid = x_grid.reshape([self.num_points, self.width, self.height]) + x_grid = x_grid.unsqueeze(0) # [B*K*K, W,H] + + y_new = y_center + y_grid + x_new = x_center + x_grid + + y_new = y_new.repeat(self.num_batch, 1, 1, 1).to(device) + x_new = x_new.repeat(self.num_batch, 1, 1, 1).to(device) + + y_offset_new = y_offset.detach().clone() + + if if_offset: + y_offset = y_offset.permute(1, 0, 2, 3) + y_offset_new = y_offset_new.permute(1, 0, 2, 3) + center = int(self.num_points // 2) + + # The center position remains unchanged and the rest of the positions begin to swing + # This part is quite simple. 
The main idea is that "offset is an iterative process" + y_offset_new[center] = 0 + for index in range(1, center): + y_offset_new[center + index] = (y_offset_new[center + index - 1] + y_offset[center + index]) + y_offset_new[center - index] = (y_offset_new[center - index + 1] + y_offset[center - index]) + y_offset_new = y_offset_new.permute(1, 0, 2, 3).to(device) + y_new = y_new.add(y_offset_new.mul(self.extend_scope)) + + y_new = y_new.reshape( + [self.num_batch, self.num_points, 1, self.width, self.height]) + y_new = y_new.permute(0, 3, 1, 4, 2) + y_new = y_new.reshape([ + self.num_batch, self.num_points * self.width, 1 * self.height + ]) + x_new = x_new.reshape( + [self.num_batch, self.num_points, 1, self.width, self.height]) + x_new = x_new.permute(0, 3, 1, 4, 2) + x_new = x_new.reshape([ + self.num_batch, self.num_points * self.width, 1 * self.height + ]) + return y_new, x_new + + else: + """ + Initialize the kernel and flatten the kernel + y: -num_points//2 ~ num_points//2 (Determined by the kernel size) + x: only need 0 + """ + y = torch.linspace( + -int(self.num_points // 2), + int(self.num_points // 2), + int(self.num_points), + ) + x = torch.linspace(0, 0, 1) + + y, x = torch.meshgrid(y, x) + y_spread = y.reshape(-1, 1) + x_spread = x.reshape(-1, 1) + + y_grid = y_spread.repeat([1, self.width * self.height]) + y_grid = y_grid.reshape([self.num_points, self.width, self.height]) + y_grid = y_grid.unsqueeze(0) + + x_grid = x_spread.repeat([1, self.width * self.height]) + x_grid = x_grid.reshape([self.num_points, self.width, self.height]) + x_grid = x_grid.unsqueeze(0) + + y_new = y_center + y_grid + x_new = x_center + x_grid + + y_new = y_new.repeat(self.num_batch, 1, 1, 1) + x_new = x_new.repeat(self.num_batch, 1, 1, 1) + + y_new = y_new.to(device) + x_new = x_new.to(device) + x_offset_new = x_offset.detach().clone() + + if if_offset: + x_offset = x_offset.permute(1, 0, 2, 3) + x_offset_new = x_offset_new.permute(1, 0, 2, 3) + center = int(self.num_points // 2) + x_offset_new[center] = 0 + for index in range(1, center): + x_offset_new[center + index] = (x_offset_new[center + index - 1] + x_offset[center + index]) + x_offset_new[center - index] = (x_offset_new[center - index + 1] + x_offset[center - index]) + x_offset_new = x_offset_new.permute(1, 0, 2, 3).to(device) + x_new = x_new.add(x_offset_new.mul(self.extend_scope)) + + y_new = y_new.reshape( + [self.num_batch, 1, self.num_points, self.width, self.height]) + y_new = y_new.permute(0, 3, 1, 4, 2) + y_new = y_new.reshape([ + self.num_batch, 1 * self.width, self.num_points * self.height + ]) + x_new = x_new.reshape( + [self.num_batch, 1, self.num_points, self.width, self.height]) + x_new = x_new.permute(0, 3, 1, 4, 2) + x_new = x_new.reshape([ + self.num_batch, 1 * self.width, self.num_points * self.height + ]) + return y_new, x_new + + """ + input: input feature map [N,C,D,W,H];coordinate map [N,K*D,K*W,K*H] + output: [N,1,K*D,K*W,K*H] deformed feature map + """ + def _bilinear_interpolate_3D(self, input_feature, y, x): + device = input_feature.device + y = y.reshape([-1]).float() + x = x.reshape([-1]).float() + + zero = torch.zeros([]).int() + max_y = self.width - 1 + max_x = self.height - 1 + + # find 8 grid locations + y0 = torch.floor(y).int() + y1 = y0 + 1 + x0 = torch.floor(x).int() + x1 = x0 + 1 + + # clip out coordinates exceeding feature map volume + y0 = torch.clamp(y0, zero, max_y) + y1 = torch.clamp(y1, zero, max_y) + x0 = torch.clamp(x0, zero, max_x) + x1 = torch.clamp(x1, zero, max_x) + + input_feature_flat = 
input_feature.flatten() + input_feature_flat = input_feature_flat.reshape( + self.num_batch, self.num_channels, self.width, self.height) + input_feature_flat = input_feature_flat.permute(0, 2, 3, 1) + input_feature_flat = input_feature_flat.reshape(-1, self.num_channels) + dimension = self.height * self.width + + base = torch.arange(self.num_batch) * dimension + base = base.reshape([-1, 1]).float() + + repeat = torch.ones([self.num_points * self.width * self.height + ]).unsqueeze(0) + repeat = repeat.float() + + base = torch.matmul(base, repeat) + base = base.reshape([-1]) + + base = base.to(device) + + base_y0 = base + y0 * self.height + base_y1 = base + y1 * self.height + + # top rectangle of the neighbourhood volume + index_a0 = base_y0 - base + x0 + index_c0 = base_y0 - base + x1 + + # bottom rectangle of the neighbourhood volume + index_a1 = base_y1 - base + x0 + index_c1 = base_y1 - base + x1 + + # get 8 grid values + value_a0 = input_feature_flat[index_a0.type(torch.int64)].to(device) + value_c0 = input_feature_flat[index_c0.type(torch.int64)].to(device) + value_a1 = input_feature_flat[index_a1.type(torch.int64)].to(device) + value_c1 = input_feature_flat[index_c1.type(torch.int64)].to(device) + + # find 8 grid locations + y0 = torch.floor(y).int() + y1 = y0 + 1 + x0 = torch.floor(x).int() + x1 = x0 + 1 + + # clip out coordinates exceeding feature map volume + y0 = torch.clamp(y0, zero, max_y + 1) + y1 = torch.clamp(y1, zero, max_y + 1) + x0 = torch.clamp(x0, zero, max_x + 1) + x1 = torch.clamp(x1, zero, max_x + 1) + + x0_float = x0.float() + x1_float = x1.float() + y0_float = y0.float() + y1_float = y1.float() + + vol_a0 = ((y1_float - y) * (x1_float - x)).unsqueeze(-1).to(device) + vol_c0 = ((y1_float - y) * (x - x0_float)).unsqueeze(-1).to(device) + vol_a1 = ((y - y0_float) * (x1_float - x)).unsqueeze(-1).to(device) + vol_c1 = ((y - y0_float) * (x - x0_float)).unsqueeze(-1).to(device) + + outputs = (value_a0 * vol_a0 + value_c0 * vol_c0 + value_a1 * vol_a1 + + value_c1 * vol_c1) + + if self.morph == 0: + outputs = outputs.reshape([ + self.num_batch, + self.num_points * self.width, + 1 * self.height, + self.num_channels, + ]) + outputs = outputs.permute(0, 3, 1, 2) + else: + outputs = outputs.reshape([ + self.num_batch, + 1 * self.width, + self.num_points * self.height, + self.num_channels, + ]) + outputs = outputs.permute(0, 3, 1, 2) + return outputs + + def deform_conv(self, input, offset, if_offset): + y, x = self._coordinate_map_3D(offset, if_offset) + deformed_feature = self._bilinear_interpolate_3D(input, y, x) + return deformed_feature + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = DySnakeConv(in_channel, out_channel, 3).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/conv_module/gConv.py b/engine/extre_module/custom_nn/conv_module/gConv.py new file mode 100644 index 
00000000..829f8578 --- /dev/null +++ b/engine/extre_module/custom_nn/conv_module/gConv.py @@ -0,0 +1,68 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/gConv.png +论文链接:https://arxiv.org/abs/2209.11448 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch, math +import torch.nn as nn +from torch.nn.init import trunc_normal_ + +from engine.extre_module.ultralytics_nn.conv import Conv + +class gConv(nn.Module): + def __init__(self, in_dim, dim, kernel_size=3, gate_act=nn.Sigmoid): + super().__init__() + self.dim = dim + + self.kernel_size = kernel_size + + self.norm_layer = nn.BatchNorm2d(dim) + + self.Wv = nn.Sequential( + nn.Conv2d(dim, dim, 1), + nn.Conv2d(dim, dim, kernel_size=kernel_size, padding=kernel_size//2, groups=dim, padding_mode='reflect') + ) + + self.Wg = nn.Sequential( + nn.Conv2d(dim, dim, 1), + gate_act() if gate_act in [nn.Sigmoid, nn.Tanh] else gate_act(inplace=True) + ) + + self.proj = nn.Conv2d(dim, dim, 1) + + self.conv1x1 = Conv(in_dim, dim, 1) if in_dim != dim else nn.Identity() + + def forward(self, X): + X = self.conv1x1(X) + iden = X + X = self.norm_layer(X) + out = self.Wv(X) * self.Wg(X) + out = self.proj(out) + return out + iden + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = gConv(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/conv_module/pconv.py b/engine/extre_module/custom_nn/conv_module/pconv.py new file mode 100644 index 00000000..89327fbd --- /dev/null +++ b/engine/extre_module/custom_nn/conv_module/pconv.py @@ -0,0 +1,70 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/CVPR2023-partial convolution.png +论文链接:https://arxiv.org/pdf/2303.03667 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + + +import torch +import torch.nn as nn +from engine.extre_module.ultralytics_nn.conv import Conv + +class Partial_Conv(nn.Module): + def __init__(self, inc, ouc, n_div=4, forward='split_cat'): + super().__init__() + # 参与卷积的通道数 + self.dim_conv3 = inc // n_div + # 保持不变,不参与卷积 + self.dim_untouched = inc - self.dim_conv3 + self.partial_conv3 = nn.Conv2d(self.dim_conv3, self.dim_conv3, 3, 1, 1, bias=False) + + if inc != ouc: + self.conv1x1 = Conv(inc, ouc, k=1) # 用作调整通道数 + else: + self.conv1x1 = nn.Identity() + + if forward == 'slicing': + self.forward = self.forward_slicing + elif forward == 'split_cat': + self.forward = self.forward_split_cat + else: + raise NotImplementedError + + def forward_slicing(self, x): + # only for inference + x = x.clone() # !!! 
Keep the original input intact for the residual connection later + x[:, :self.dim_conv3, :, :] = self.partial_conv3(x[:, :self.dim_conv3, :, :]) + return self.conv1x1(x) + + def forward_split_cat(self, x): + # for training/inference + x1, x2 = torch.split(x, [self.dim_conv3, self.dim_untouched], dim=1) + x1 = self.partial_conv3(x1) + x = torch.cat((x1, x2), 1) + return self.conv1x1(x) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = Partial_Conv(in_channel, out_channel, n_div=4).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/conv_module/psconv.py b/engine/extre_module/custom_nn/conv_module/psconv.py new file mode 100644 index 00000000..be9bae2c --- /dev/null +++ b/engine/extre_module/custom_nn/conv_module/psconv.py @@ -0,0 +1,57 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/AAAI2025-PSConv.png +论文链接:https://arxiv.org/pdf/2412.16986 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from engine.extre_module.ultralytics_nn.conv import Conv, autopad + +class PSConv(nn.Module): + ''' Pinwheel-shaped Convolution using the Asymmetric Padding method. 
''' + + def __init__(self, c1, c2, k, s): + super().__init__() + + # self.k = k + p = [(k, 0, 1, 0), (0, k, 0, 1), (0, 1, k, 0), (1, 0, 0, k)] + self.pad = [nn.ZeroPad2d(padding=(p[g])) for g in range(4)] + self.cw = Conv(c1, c2 // 4, (1, k), s=s, p=0) + self.ch = Conv(c1, c2 // 4, (k, 1), s=s, p=0) + self.cat = Conv(c2, c2, 2, s=1, p=0) + + def forward(self, x): + yw0 = self.cw(self.pad[0](x)) + yw1 = self.cw(self.pad[1](x)) + yh0 = self.ch(self.pad[2](x)) + yh1 = self.ch(self.pad[3](x)) + return self.cat(torch.cat([yw0, yw1, yh0, yh1], dim=1)) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = PSConv(in_channel, out_channel, k=3, s=1).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/conv_module/wdbb.py b/engine/extre_module/custom_nn/conv_module/wdbb.py new file mode 100644 index 00000000..35401c5a --- /dev/null +++ b/engine/extre_module/custom_nn/conv_module/wdbb.py @@ -0,0 +1,399 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/WideDBB.png +论文链接:https://www.sciencedirect.com/science/article/abs/pii/S1474034624003574 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from engine.extre_module.ultralytics_nn.conv import Conv, autopad + +from engine.extre_module.torch_utils import model_fuse_test + +def transI_fusebn(kernel, bn): + gamma = bn.weight + std = (bn.running_var + bn.eps).sqrt() + return kernel * ((gamma / std).reshape(-1, 1, 1, 1)), bn.bias - bn.running_mean * gamma / std + +def transII_addbranch(kernels, biases): + return sum(kernels), sum(biases) + +def transIII_1x1_kxk(k1, b1, k2, b2, groups): + if groups == 1: + k = F.conv2d(k2, k1.permute(1, 0, 2, 3)) # + b_hat = (k2 * b1.reshape(1, -1, 1, 1)).sum((1, 2, 3)) + else: + k_slices = [] + b_slices = [] + k1_T = k1.permute(1, 0, 2, 3) + k1_group_width = k1.size(0) // groups + k2_group_width = k2.size(0) // groups + for g in range(groups): + k1_T_slice = k1_T[:, g*k1_group_width:(g+1)*k1_group_width, :, :] + k2_slice = k2[g*k2_group_width:(g+1)*k2_group_width, :, :, :] + k_slices.append(F.conv2d(k2_slice, k1_T_slice)) + b_slices.append((k2_slice * b1[g*k1_group_width:(g+1)*k1_group_width].reshape(1, -1, 1, 1)).sum((1, 2, 3))) + k, b_hat = transIV_depthconcat(k_slices, b_slices) + return k, b_hat + b2 + +def transIV_depthconcat(kernels, biases): + return torch.cat(kernels, dim=0), torch.cat(biases) + +def transV_avg(channels, kernel_size, groups): + input_dim = channels // groups + k = torch.zeros((channels, input_dim, kernel_size, kernel_size)) + k[np.arange(channels), np.tile(np.arange(input_dim), groups), :, :] = 1.0 / kernel_size ** 2 + return k + +# This has not been tested with non-square kernels (kernel.size(2) != 
kernel.size(3)) nor even-size kernels +def transVI_multiscale(kernel, target_kernel_size): + H_pixels_to_pad = (target_kernel_size - kernel.size(2)) // 2 + W_pixels_to_pad = (target_kernel_size - kernel.size(3)) // 2 + return F.pad(kernel, [H_pixels_to_pad, H_pixels_to_pad, W_pixels_to_pad, W_pixels_to_pad]) + +def conv_bn(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, + padding_mode='zeros'): + conv_layer = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, + stride=stride, padding=padding, dilation=dilation, groups=groups, + bias=False, padding_mode=padding_mode) + bn_layer = nn.BatchNorm2d(num_features=out_channels, affine=True) + se = nn.Sequential() + se.add_module('conv', conv_layer) + se.add_module('bn', bn_layer) + return se + + +class IdentityBasedConv1x1(nn.Module): + def __init__(self, channels, groups=1): + super().__init__() + assert channels % groups == 0 + input_dim = channels // groups + self.conv = nn.Conv2d(in_channels=channels, out_channels=channels, kernel_size=1, groups=groups, bias=False) + + id_value = np.zeros((channels, input_dim, 1, 1)) + for i in range(channels): + id_value[i, i % input_dim, 0, 0] = 1 + self.id_tensor = torch.from_numpy(id_value) + nn.init.zeros_(self.conv.weight) + self.groups = groups + + def forward(self, input): + kernel = self.conv.weight + self.id_tensor.to(self.conv.weight.device).type_as(self.conv.weight) + result = F.conv2d(input, kernel, None, stride=1, groups=self.groups) + return result + + def get_actual_kernel(self): + return self.conv.weight + self.id_tensor.to(self.conv.weight.device).type_as(self.conv.weight) + +class BNAndPadLayer(nn.Module): + def __init__(self, + pad_pixels, + num_features, + eps=1e-5, + momentum=0.1, + affine=True, + track_running_stats=True): + super(BNAndPadLayer, self).__init__() + self.bn = nn.BatchNorm2d(num_features, eps, momentum, affine, track_running_stats) + self.pad_pixels = pad_pixels + + def forward(self, input): + output = self.bn(input) + if self.pad_pixels > 0: + if self.bn.affine: + pad_values = self.bn.bias.detach() - self.bn.running_mean * self.bn.weight.detach() / torch.sqrt(self.bn.running_var + self.bn.eps) + else: + pad_values = - self.bn.running_mean / torch.sqrt(self.bn.running_var + self.bn.eps) + output = F.pad(output, [self.pad_pixels] * 4) + pad_values = pad_values.view(1, -1, 1, 1) + output[:, :, 0:self.pad_pixels, :] = pad_values + output[:, :, -self.pad_pixels:, :] = pad_values + output[:, :, :, 0:self.pad_pixels] = pad_values + output[:, :, :, -self.pad_pixels:] = pad_values + return output + + @property + def weight(self): + return self.bn.weight + + @property + def bias(self): + return self.bn.bias + + @property + def running_mean(self): + return self.bn.running_mean + + @property + def running_var(self): + return self.bn.running_var + + @property + def eps(self): + return self.bn.eps + +class WideDiverseBranchBlock(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, + stride=1, padding=None, dilation=1, groups=1, + internal_channels_1x1_3x3=None, + deploy=False, single_init=False): + super(WideDiverseBranchBlock, self).__init__() + self.deploy = deploy + + self.nonlinear = Conv.default_act + + self.kernel_size = kernel_size + self.out_channels = out_channels + self.groups = groups + + if padding is None: + padding = autopad(kernel_size, padding, dilation) + assert padding == kernel_size // 2 + + if deploy: + self.dbb_reparam = nn.Conv2d(in_channels=in_channels, 
out_channels=out_channels, kernel_size=kernel_size, + stride=stride, + padding=padding, dilation=dilation, groups=groups, bias=True) + + else: + + self.dbb_origin = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, + stride=stride, padding=padding, dilation=dilation, groups=groups) + + self.dbb_avg = nn.Sequential() + if groups < out_channels: + self.dbb_avg.add_module('conv', + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, + stride=1, padding=0, groups=groups, bias=False)) + self.dbb_avg.add_module('bn', BNAndPadLayer(pad_pixels=padding, num_features=out_channels)) + self.dbb_avg.add_module('avg', nn.AvgPool2d(kernel_size=kernel_size, stride=stride, padding=0)) + self.dbb_1x1 = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=stride, + padding=0, groups=groups) + else: + self.dbb_avg.add_module('avg', nn.AvgPool2d(kernel_size=kernel_size, stride=stride, padding=padding)) + + self.dbb_avg.add_module('avgbn', nn.BatchNorm2d(out_channels)) + + if internal_channels_1x1_3x3 is None: + internal_channels_1x1_3x3 = in_channels if groups < out_channels else 2 * in_channels # For mobilenet, it is better to have 2X internal channels + + self.dbb_1x1_kxk = nn.Sequential() + if internal_channels_1x1_3x3 == in_channels: + self.dbb_1x1_kxk.add_module('idconv1', IdentityBasedConv1x1(channels=in_channels, groups=groups)) + else: + self.dbb_1x1_kxk.add_module('conv1', + nn.Conv2d(in_channels=in_channels, out_channels=internal_channels_1x1_3x3, + kernel_size=1, stride=1, padding=0, groups=groups, bias=False)) + self.dbb_1x1_kxk.add_module('bn1', BNAndPadLayer(pad_pixels=padding, num_features=internal_channels_1x1_3x3, + affine=True)) + self.dbb_1x1_kxk.add_module('conv2', + nn.Conv2d(in_channels=internal_channels_1x1_3x3, out_channels=out_channels, + kernel_size=kernel_size, stride=stride, padding=0, groups=groups, + bias=False)) + self.dbb_1x1_kxk.add_module('bn2', nn.BatchNorm2d(out_channels)) + + # The experiments reported in the paper used the default initialization of bn.weight (all as 1). But changing the initialization may be useful in some cases. + if single_init: + # Initialize the bn.weight of dbb_origin as 1 and others as 0. This is not the default setting. 
+ self.single_init() + + if padding - kernel_size // 2 >= 0: + self.crop = 0 + hor_padding = [padding - kernel_size // 2, padding] + ver_padding = [padding, padding - kernel_size // 2] + else: + self.crop = kernel_size // 2 - padding + hor_padding = [0, padding] + ver_padding = [padding, 0] + + # Vertical convolution(3x1) during training + self.ver_conv = nn.Conv2d(in_channels=in_channels, + out_channels=out_channels, + kernel_size=(kernel_size, 1), + stride=stride, + padding=ver_padding, + dilation=dilation, + groups=groups, + bias=False, + ) + # Horizontal convolution(1x3) during training + self.hor_conv = nn.Conv2d(in_channels=in_channels, + out_channels=out_channels, + kernel_size=(1, kernel_size), + stride=stride, + padding=hor_padding, + dilation=dilation, + groups=groups, + bias=False, + ) + # Batch normalization for vertical convolution + self.ver_bn = nn.BatchNorm2d(num_features=out_channels, + affine=True) + # Batch normalization for horizontal convolution + self.hor_bn = nn.BatchNorm2d(num_features=out_channels, + affine=True) + + def _add_to_square_kernel(self, square_kernel, asym_kernel): + ''' + Used to add an asymmetric kernel to the center of a square kernel + square_kernel : the square kernel to which the asymmetric kernel will be added + asym_kernel : the asymmetric kernel that will be added to the square kernel + ''' + # Get the height and width of the asymmetric kernel + asym_h = asym_kernel.size(2) + asym_w = asym_kernel.size(3) + # Get the height and width of the square kernel + square_h = square_kernel.size(2) + square_w = square_kernel.size(3) + # Add the asymmetric kernel to the center of the square kernel + square_kernel[:, + :, + square_h // 2 - asym_h // 2: square_h // 2 - asym_h // 2 + asym_h, + square_w // 2 - asym_w // 2: square_w // 2 - asym_w // 2 + asym_w] += asym_kernel + + def get_equivalent_kernel_bias_1xk_kx1_kxk(self): + ''' + Used to calculate the equivalent kernel and bias of + the fused convolution layer in deploy mode + ''' + # Fuse batch normalization with convolutional weights and biases + hor_k, hor_b = transI_fusebn(self.hor_conv.weight, self.hor_bn) + ver_k, ver_b = transI_fusebn(self.ver_conv.weight, self.ver_bn) + square_k, square_b = transI_fusebn(self.dbb_origin.conv.weight, self.dbb_origin.bn) + + + # Add the fused horizontal and vertical kernels to the center of the square kernel + self._add_to_square_kernel(square_k, hor_k) + self._add_to_square_kernel(square_k, ver_k) + # Return the square kernel and the sum of the biases for the three convolutional layers + return square_k, hor_b + ver_b + square_b + + + def get_equivalent_kernel_bias(self): + # k_origin, b_origin = transI_fusebn(self.dbb_origin.conv.weight, self.dbb_origin.bn) + k_origin, b_origin = self.get_equivalent_kernel_bias_1xk_kx1_kxk() + + + if hasattr(self, 'dbb_1x1'): + k_1x1, b_1x1 = transI_fusebn(self.dbb_1x1.conv.weight, self.dbb_1x1.bn) + k_1x1 = transVI_multiscale(k_1x1, self.kernel_size) + else: + k_1x1, b_1x1 = 0, 0 + + if hasattr(self.dbb_1x1_kxk, 'idconv1'): + k_1x1_kxk_first = self.dbb_1x1_kxk.idconv1.get_actual_kernel() + else: + k_1x1_kxk_first = self.dbb_1x1_kxk.conv1.weight + k_1x1_kxk_first, b_1x1_kxk_first = transI_fusebn(k_1x1_kxk_first, self.dbb_1x1_kxk.bn1) + k_1x1_kxk_second, b_1x1_kxk_second = transI_fusebn(self.dbb_1x1_kxk.conv2.weight, self.dbb_1x1_kxk.bn2) + k_1x1_kxk_merged, b_1x1_kxk_merged = transIII_1x1_kxk(k_1x1_kxk_first, b_1x1_kxk_first, k_1x1_kxk_second, + b_1x1_kxk_second, groups=self.groups) + + k_avg = transV_avg(self.out_channels, 
self.kernel_size, self.groups) + k_1x1_avg_second, b_1x1_avg_second = transI_fusebn(k_avg.to(self.dbb_avg.avgbn.weight.device), + self.dbb_avg.avgbn) + if hasattr(self.dbb_avg, 'conv'): + k_1x1_avg_first, b_1x1_avg_first = transI_fusebn(self.dbb_avg.conv.weight, self.dbb_avg.bn) + k_1x1_avg_merged, b_1x1_avg_merged = transIII_1x1_kxk(k_1x1_avg_first, b_1x1_avg_first, k_1x1_avg_second, + b_1x1_avg_second, groups=self.groups) + else: + k_1x1_avg_merged, b_1x1_avg_merged = k_1x1_avg_second, b_1x1_avg_second + + return transII_addbranch((k_origin, k_1x1, k_1x1_kxk_merged, k_1x1_avg_merged), + (b_origin, b_1x1, b_1x1_kxk_merged, b_1x1_avg_merged)) + + def convert_to_deploy(self): + if hasattr(self, 'dbb_reparam'): + return + kernel, bias = self.get_equivalent_kernel_bias() + self.dbb_reparam = nn.Conv2d(in_channels=self.dbb_origin.conv.in_channels, + out_channels=self.dbb_origin.conv.out_channels, + kernel_size=self.dbb_origin.conv.kernel_size, stride=self.dbb_origin.conv.stride, + padding=self.dbb_origin.conv.padding, dilation=self.dbb_origin.conv.dilation, + groups=self.dbb_origin.conv.groups, bias=True) + self.dbb_reparam.weight.data = kernel + self.dbb_reparam.bias.data = bias + for para in self.parameters(): + para.detach_() + self.__delattr__('dbb_origin') + self.__delattr__('dbb_avg') + if hasattr(self, 'dbb_1x1'): + self.__delattr__('dbb_1x1') + self.__delattr__('dbb_1x1_kxk') + self.__delattr__('hor_conv') + self.__delattr__('hor_bn') + self.__delattr__('ver_conv') + self.__delattr__('ver_bn') + + + def forward(self, inputs): + if hasattr(self, 'dbb_reparam'): + return self.nonlinear(self.dbb_reparam(inputs)) + + out = self.dbb_origin(inputs) + if hasattr(self, 'dbb_1x1'): + out += self.dbb_1x1(inputs) + out += self.dbb_avg(inputs) + out += self.dbb_1x1_kxk(inputs) + + if self.crop > 0: + ver_input = inputs[:, :, :, self.crop:-self.crop] + hor_input = inputs[:, :, self.crop:-self.crop, :] + else: + ver_input = inputs + hor_input = inputs + vertical_outputs = self.ver_conv(ver_input) + vertical_outputs = self.ver_bn(vertical_outputs) + horizontal_outputs = self.hor_conv(hor_input) + horizontal_outputs = self.hor_bn(horizontal_outputs) + result = out + vertical_outputs + horizontal_outputs + + return self.nonlinear(result) + + def init_gamma(self, gamma_value): + if hasattr(self, "dbb_origin"): + torch.nn.init.constant_(self.dbb_origin.bn.weight, gamma_value) + if hasattr(self, "dbb_1x1"): + torch.nn.init.constant_(self.dbb_1x1.bn.weight, gamma_value) + if hasattr(self, "dbb_avg"): + torch.nn.init.constant_(self.dbb_avg.avgbn.weight, gamma_value) + if hasattr(self, "dbb_1x1_kxk"): + torch.nn.init.constant_(self.dbb_1x1_kxk.bn2.weight, gamma_value) + + def single_init(self): + self.init_gamma(0.0) + if hasattr(self, "dbb_origin"): + torch.nn.init.constant_(self.dbb_origin.bn.weight, 1.0) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = WideDiverseBranchBlock(in_channel, out_channel, kernel_size=3, stride=1).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(GREEN + 'test reparameterization.' 
+ RESET) + module = model_fuse_test(module) + outputs = module(inputs) + print(GREEN + 'test reparameterization done.' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/conv_module/wtconv2d.py b/engine/extre_module/custom_nn/conv_module/wtconv2d.py new file mode 100644 index 00000000..5b07e52a --- /dev/null +++ b/engine/extre_module/custom_nn/conv_module/wtconv2d.py @@ -0,0 +1,218 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/ECCV2024-WTConv2D.png +论文链接:https://arxiv.org/pdf/2407.05848 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Function +import dill as pickle + +import pywt +import pywt.data + +from engine.extre_module.ultralytics_nn.conv import Conv + +def create_wavelet_filter(wave, in_size, out_size, type=torch.float): + w = pywt.Wavelet(wave) + dec_hi = torch.tensor(w.dec_hi[::-1], dtype=type) + dec_lo = torch.tensor(w.dec_lo[::-1], dtype=type) + dec_filters = torch.stack([dec_lo.unsqueeze(0) * dec_lo.unsqueeze(1), + dec_lo.unsqueeze(0) * dec_hi.unsqueeze(1), + dec_hi.unsqueeze(0) * dec_lo.unsqueeze(1), + dec_hi.unsqueeze(0) * dec_hi.unsqueeze(1)], dim=0) + + dec_filters = dec_filters[:, None].repeat(in_size, 1, 1, 1) + + rec_hi = torch.tensor(w.rec_hi[::-1], dtype=type).flip(dims=[0]) + rec_lo = torch.tensor(w.rec_lo[::-1], dtype=type).flip(dims=[0]) + rec_filters = torch.stack([rec_lo.unsqueeze(0) * rec_lo.unsqueeze(1), + rec_lo.unsqueeze(0) * rec_hi.unsqueeze(1), + rec_hi.unsqueeze(0) * rec_lo.unsqueeze(1), + rec_hi.unsqueeze(0) * rec_hi.unsqueeze(1)], dim=0) + + rec_filters = rec_filters[:, None].repeat(out_size, 1, 1, 1) + + return dec_filters, rec_filters + +def wavelet_transform(x, filters): + b, c, h, w = x.shape + pad = (filters.shape[2] // 2 - 1, filters.shape[3] // 2 - 1) + x = F.conv2d(x, filters.to(x.dtype).to(x.device), stride=2, groups=c, padding=pad) + x = x.reshape(b, c, 4, h // 2, w // 2) + return x + + +def inverse_wavelet_transform(x, filters): + b, c, _, h_half, w_half = x.shape + pad = (filters.shape[2] // 2 - 1, filters.shape[3] // 2 - 1) + x = x.reshape(b, c * 4, h_half, w_half) + x = F.conv_transpose2d(x, filters.to(x.dtype).to(x.device), stride=2, groups=c, padding=pad) + return x + + +# Define the WaveletTransform class +class WaveletTransform(Function): + @staticmethod + def forward(ctx, input, filters): + ctx.filters = filters + with torch.no_grad(): + x = wavelet_transform(input, filters) + return x + + @staticmethod + def backward(ctx, grad_output): + grad = inverse_wavelet_transform(grad_output, ctx.filters) + return grad, None + +# Define the InverseWaveletTransform class +class InverseWaveletTransform(Function): + @staticmethod + def forward(ctx, input, filters): + ctx.filters = filters + with torch.no_grad(): + x = inverse_wavelet_transform(input, filters) + return x + + @staticmethod + def backward(ctx, grad_output): + grad = wavelet_transform(grad_output, ctx.filters) + return grad, None + +# Initialize the WaveletTransform +def wavelet_transform_init(filters): + def apply(input): + return WaveletTransform.apply(input, filters) + return apply + +# Initialize the 
InverseWaveletTransform +def inverse_wavelet_transform_init(filters): + def apply(input): + return InverseWaveletTransform.apply(input, filters) + return apply + +class WTConv2d(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=5, stride=1, bias=True, wt_levels=1, wt_type='db1'): + super(WTConv2d, self).__init__() + + self.in_channels = in_channels + self.wt_levels = wt_levels + self.stride = stride + self.dilation = 1 + + self.wt_filter, self.iwt_filter = create_wavelet_filter(wt_type, in_channels, in_channels, torch.float) + self.wt_filter = nn.Parameter(self.wt_filter, requires_grad=False) + self.iwt_filter = nn.Parameter(self.iwt_filter, requires_grad=False) + + self.wt_function = wavelet_transform_init(self.wt_filter) + self.iwt_function = inverse_wavelet_transform_init(self.iwt_filter) + + self.base_conv = nn.Conv2d(in_channels, in_channels, kernel_size, padding='same', stride=1, dilation=1, groups=in_channels, bias=bias) + self.base_scale = _ScaleModule([1,in_channels,1,1]) + + self.wavelet_convs = nn.ModuleList( + [nn.Conv2d(in_channels*4, in_channels*4, kernel_size, padding='same', stride=1, dilation=1, groups=in_channels*4, bias=False) for _ in range(self.wt_levels)] + ) + self.wavelet_scale = nn.ModuleList( + [_ScaleModule([1,in_channels*4,1,1], init_scale=0.1) for _ in range(self.wt_levels)] + ) + + if self.stride > 1: + self.stride_filter = nn.Parameter(torch.ones(in_channels, 1, 1, 1), requires_grad=False) + self.do_stride = lambda x_in: F.conv2d(x_in, self.stride_filter.to(x_in.dtype).to(x_in.device), bias=None, stride=self.stride, groups=in_channels) + else: + self.do_stride = None + + if in_channels != out_channels: + self.conv1x1 = Conv(in_channels, out_channels, 1) + else: + self.conv1x1 = nn.Identity() + + def forward(self, x): + + x_ll_in_levels = [] + x_h_in_levels = [] + shapes_in_levels = [] + + curr_x_ll = x + + for i in range(self.wt_levels): + curr_shape = curr_x_ll.shape + shapes_in_levels.append(curr_shape) + if (curr_shape[2] % 2 > 0) or (curr_shape[3] % 2 > 0): + curr_pads = (0, curr_shape[3] % 2, 0, curr_shape[2] % 2) + curr_x_ll = F.pad(curr_x_ll, curr_pads) + + curr_x = self.wt_function(curr_x_ll) + curr_x_ll = curr_x[:,:,0,:,:] + + shape_x = curr_x.shape + curr_x_tag = curr_x.reshape(shape_x[0], shape_x[1] * 4, shape_x[3], shape_x[4]) + curr_x_tag = self.wavelet_scale[i](self.wavelet_convs[i](curr_x_tag)) + curr_x_tag = curr_x_tag.reshape(shape_x) + + x_ll_in_levels.append(curr_x_tag[:,:,0,:,:]) + x_h_in_levels.append(curr_x_tag[:,:,1:4,:,:]) + + next_x_ll = 0 + + for i in range(self.wt_levels-1, -1, -1): + curr_x_ll = x_ll_in_levels.pop() + curr_x_h = x_h_in_levels.pop() + curr_shape = shapes_in_levels.pop() + + curr_x_ll = curr_x_ll + next_x_ll + + curr_x = torch.cat([curr_x_ll.unsqueeze(2), curr_x_h], dim=2) + next_x_ll = self.iwt_function(curr_x) + + next_x_ll = next_x_ll[:, :, :curr_shape[2], :curr_shape[3]] + + x_tag = next_x_ll + assert len(x_ll_in_levels) == 0 + + x = self.base_scale(self.base_conv(x)) + x = x + x_tag + + if self.do_stride is not None: + x = self.do_stride(x) + + return self.conv1x1(x) + +class _ScaleModule(nn.Module): + def __init__(self, dims, init_scale=1.0, init_bias=0): + super(_ScaleModule, self).__init__() + self.dims = dims + self.weight = nn.Parameter(torch.ones(*dims) * init_scale) + self.bias = None + + def forward(self, x): + return torch.mul(self.weight, x) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", 
"\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = WTConv2d(in_channel, out_channel, kernel_size=5, stride=1).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/downsample/ADown.py b/engine/extre_module/custom_nn/downsample/ADown.py new file mode 100644 index 00000000..c8ded54b --- /dev/null +++ b/engine/extre_module/custom_nn/downsample/ADown.py @@ -0,0 +1,50 @@ +''' +本文件由BiliBili:魔傀面具整理 +论文链接:https://arxiv.org/pdf/2402.13616 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn + +from engine.extre_module.ultralytics_nn.conv import Conv, autopad + +class ADown(nn.Module): + def __init__(self, c1, c2): # ch_in, ch_out, shortcut, kernels, groups, expand + super().__init__() + self.c = c2 // 2 + self.cv1 = Conv(c1 // 2, self.c, 3, 2, 1) + self.cv2 = Conv(c1 // 2, self.c, 1, 1, 0) + + def forward(self, x): + x = torch.nn.functional.avg_pool2d(x, 2, 1, 0, False, True) + x1,x2 = x.chunk(2, 1) + x1 = self.cv1(x1) + x2 = torch.nn.functional.max_pool2d(x2, 3, 2, 1) + x2 = self.cv2(x2) + return torch.cat((x1, x2), 1) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = ADown(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/downsample/DRFD.py b/engine/extre_module/custom_nn/downsample/DRFD.py new file mode 100644 index 00000000..2bd00d10 --- /dev/null +++ b/engine/extre_module/custom_nn/downsample/DRFD.py @@ -0,0 +1,82 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/DRFD.png +论文链接:https://ieeexplore.ieee.org/document/10142024 +''' + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn + +class Cut(nn.Module): + def __init__(self, in_channels, out_channels): + super().__init__() + self.conv_fusion = nn.Conv2d(in_channels * 4, out_channels, kernel_size=1, stride=1) + self.batch_norm = nn.BatchNorm2d(out_channels) + + def forward(self, x): + x0 = x[:, :, 0::2, 0::2] # x = [B, C, H/2, W/2] + x1 = x[:, :, 1::2, 0::2] + x2 = x[:, :, 0::2, 1::2] + x3 = x[:, :, 1::2, 1::2] + x = torch.cat([x0, x1, x2, x3], dim=1) # x = [B, 4*C, H/2, W/2] + x = self.conv_fusion(x) # x = [B, 
out_channels, H/2, W/2] + x = self.batch_norm(x) + return x + +class DRFD(nn.Module): + def __init__(self, in_channels, out_channels): + super().__init__() + self.cut_c = Cut(in_channels=in_channels, out_channels=out_channels) + self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1, groups=in_channels) + self.conv_x = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=2, padding=1, groups=out_channels) + self.act_x = nn.GELU() + self.batch_norm_x = nn.BatchNorm2d(out_channels) + self.batch_norm_m = nn.BatchNorm2d(out_channels) + self.max_m = nn.MaxPool2d(kernel_size=2, stride=2) + self.fusion = nn.Conv2d(3 * out_channels, out_channels, kernel_size=1, stride=1) + + def forward(self, x): # input: x = [B, C, H, W] + c = x # c = [B, C, H, W] + x = self.conv(x) # x = [B, C, H, W] --> [B, 2C, H, W] + m = x # m = [B, 2C, H, W] + + # CutD + c = self.cut_c(c) # c = [B, C, H, W] --> [B, 2C, H/2, W/2] + + # ConvD + x = self.conv_x(x) # x = [B, 2C, H, W] --> [B, 2C, H/2, W/2] + x = self.act_x(x) + x = self.batch_norm_x(x) + + # MaxD + m = self.max_m(m) # m = [B, 2C, H/2, W/2] + m = self.batch_norm_m(m) + + # Concat + conv + x = torch.cat([c, x, m], dim=1) # x = [B, 6C, H/2, W/2] + x = self.fusion(x) # x = [B, 6C, H/2, W/2] --> [B, 2C, H/2, W/2] + + return x # x = [B, 2C, H/2, W/2] + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = DRFD(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/downsample/HWD.py b/engine/extre_module/custom_nn/downsample/HWD.py new file mode 100644 index 00000000..d08f8d50 --- /dev/null +++ b/engine/extre_module/custom_nn/downsample/HWD.py @@ -0,0 +1,52 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/HWD.png +论文链接:https://www.sciencedirect.com/science/article/pii/S0031320323005174 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +from engine.extre_module.ultralytics_nn.conv import Conv + +class HWD(nn.Module): + def __init__(self, in_ch, out_ch): + super(HWD, self).__init__() + from pytorch_wavelets import DWTForward + self.wt = DWTForward(J=1, mode='zero', wave='haar') + self.conv = Conv(in_ch * 4, out_ch, 1, 1) + + def forward(self, x): + yL, yH = self.wt(x) + y_HL = yH[0][:,:,0,::] + y_LH = yH[0][:,:,1,::] + y_HH = yH[0][:,:,2,::] + x = torch.cat([yL, y_HL, y_LH, y_HH], dim=1) + x = self.conv(x) + + return x + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = 
torch.randn((batch_size, in_channel, height, width)).to(device) + + module = HWD(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/downsample/SPDConv.py b/engine/extre_module/custom_nn/downsample/SPDConv.py new file mode 100644 index 00000000..161cc51e --- /dev/null +++ b/engine/extre_module/custom_nn/downsample/SPDConv.py @@ -0,0 +1,47 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/SPDConv.png +论文链接:https://arxiv.org/abs/2208.03641 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +from engine.extre_module.ultralytics_nn.conv import Conv + +class SPDConv(nn.Module): + # Changing the dimension of the Tensor + def __init__(self, inc, ouc, dimension=1): + super().__init__() + self.d = dimension + self.conv = Conv(inc * 4, ouc, k=3) + + def forward(self, x): + x = torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1) + x = self.conv(x) + return x + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = SPDConv(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/downsample/WaveletPool.py b/engine/extre_module/custom_nn/downsample/WaveletPool.py new file mode 100644 index 00000000..d9150199 --- /dev/null +++ b/engine/extre_module/custom_nn/downsample/WaveletPool.py @@ -0,0 +1,84 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/ICLR2018-WaveletPool.png +论文链接:https://openreview.net/pdf?id=rkhlb8lCZ +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +from engine.extre_module.ultralytics_nn.conv import Conv + +class WaveletPool(nn.Module): + def __init__(self): + """ + 小波池化 (Wavelet Pooling) 层,使用 Haar 小波基进行 2x2 下采样。 + 该层的作用是将输入特征图降采样,并将其转换为小波系数(低频 LL 和高频 LH、HL、HH 分量)。 + """ + super(WaveletPool, self).__init__() + + # 定义 Haar 小波的变换滤波器(低频 LL、高频 LH、HL、HH 分量) + ll = np.array([[0.5, 0.5], [0.5, 0.5]]) # 低频分量 + lh = np.array([[-0.5, -0.5], [0.5, 0.5]]) # 垂直高频分量 + hl = np.array([[-0.5, 0.5], [-0.5, 0.5]]) # 水平高频分量 + hh = np.array([[0.5, -0.5], [-0.5, 0.5]]) # 对角高频分量 + + # 组合所有滤波器,并沿第 0 维度 (输出通道维度) 堆叠 + filts = np.stack([ + ll[None, ::-1, ::-1], # 低频分量 (LL) + lh[None, ::-1, ::-1], 
# 垂直高频分量 (LH) + hl[None, ::-1, ::-1], # 水平高频分量 (HL) + hh[None, ::-1, ::-1] # 对角高频分量 (HH) + ], axis=0) + + # 将滤波器转换为 PyTorch 张量,并设为不可训练参数 + self.weight = nn.Parameter( + torch.tensor(filts).to(torch.get_default_dtype()), # 转换为默认数据类型 + requires_grad=False # 该参数在训练过程中不进行更新 + ) + + def forward(self, x): + """ + 前向传播函数,执行小波变换池化操作。 + :param x: 输入特征图,形状为 (B, C, H, W),其中 C 是通道数。 + :return: 下采样后的特征图,形状为 (B, 4C, H/2, W/2),其中 4C 代表 4 个小波分量。 + """ + + # 获取输入的通道数 C,每个通道都会被分解为 4 个小波分量 + C = x.shape[1] # 输入特征图的通道数 + + # 复制滤波器,使其适用于所有通道,并扩展到完整的通道数 + filters = torch.cat([self.weight, ] * C, dim=0) + + # 进行 2D 卷积 (相当于小波变换),步长 2 进行 2x2 下采样 + y = F.conv2d(x, filters, groups=C, stride=2) + + return y + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, height, width = 1, 16, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = WaveletPool().to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/downsample/YOLOV7Down.py b/engine/extre_module/custom_nn/downsample/YOLOV7Down.py new file mode 100644 index 00000000..aed3c9d5 --- /dev/null +++ b/engine/extre_module/custom_nn/downsample/YOLOV7Down.py @@ -0,0 +1,51 @@ +''' +本文件由BiliBili:魔傀面具整理 +论文链接:https://arxiv.org/pdf/2207.02696 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +from engine.extre_module.ultralytics_nn.conv import Conv, autopad + +class V7DownSampling(nn.Module): + def __init__(self, inc, ouc) -> None: + super(V7DownSampling, self).__init__() + + ouc = ouc // 2 + self.maxpool = nn.Sequential( + nn.MaxPool2d(kernel_size=2, stride=2), + Conv(inc, ouc, k=1) + ) + self.conv = nn.Sequential( + Conv(inc, ouc, k=1), + Conv(ouc, ouc, k=3, s=2), + ) + + def forward(self, x): + return torch.cat([self.maxpool(x), self.conv(x)], dim=1) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = V7DownSampling(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/downsample/gcnet.py b/engine/extre_module/custom_nn/downsample/gcnet.py new file mode 100644 index 00000000..9b207e53 --- /dev/null +++ b/engine/extre_module/custom_nn/downsample/gcnet.py @@ -0,0 +1,129 @@ +''' +本文件由BiliBili:魔傀面具整理 
+engine/extre_module/module_images/IEEETIP2020-ContextGuidedBlock_Down.png
+论文链接:https://arxiv.org/pdf/1811.08201
+'''
+
+import os, sys
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..')
+
+import warnings
+warnings.filterwarnings('ignore')
+from calflops import calculate_flops
+
+import torch
+import torch.nn as nn
+from engine.extre_module.ultralytics_nn.conv import Conv, autopad
+
+class FGlo(nn.Module):
+    """
+    the FGlo class is employed to refine the joint feature of both local feature and surrounding context.
+    """
+    def __init__(self, channel, reduction=16):
+        super(FGlo, self).__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Sequential(
+            nn.Linear(channel, channel // reduction),
+            nn.ReLU(inplace=True),
+            nn.Linear(channel // reduction, channel),
+            nn.Sigmoid()
+        )
+
+    def forward(self, x):
+        b, c, _, _ = x.size()
+        y = self.avg_pool(x).view(b, c)
+        y = self.fc(y).view(b, c, 1, 1)
+        return x * y
+
+class ContextGuidedBlock(nn.Module):
+    def __init__(self, nIn, nOut, dilation_rate=2, reduction=16, add=True):
+        """
+        args:
+            nIn: number of input channels
+            nOut: number of output channels,
+            add: if true, residual learning
+        """
+        super().__init__()
+        n = int(nOut / 2)
+        self.conv1x1 = Conv(nIn, n, 1, 1)  # 1x1 Conv is employed to reduce the computation
+        self.F_loc = nn.Conv2d(n, n, 3, padding=1, groups=n)
+        self.F_sur = nn.Conv2d(n, n, 3, padding=autopad(3, None, dilation_rate), dilation=dilation_rate, groups=n)  # surrounding context
+        self.bn_act = nn.Sequential(
+            nn.BatchNorm2d(nOut),
+            Conv.default_act
+        )
+        self.add = add
+        self.F_glo = FGlo(nOut, reduction)
+
+    def forward(self, input):
+        output = self.conv1x1(input)
+        loc = self.F_loc(output)
+        sur = self.F_sur(output)
+
+        joi_feat = torch.cat([loc, sur], 1)
+
+        joi_feat = self.bn_act(joi_feat)
+
+        output = self.F_glo(joi_feat)  # F_glo is employed to refine the joint feature
+        # if residual version
+        if self.add:
+            output = input + output
+        return output
+
+class ContextGuidedBlock_Down(nn.Module):
+    """
+    the size of feature map divided 2, (H,W,C)---->(H/2, W/2, 2C)
+    """
+    def __init__(self, nIn, nOut, dilation_rate=2, reduction=16):
+        """
+        args:
+            nIn: the channel of input feature map
+            nOut: the channel of output feature map
+        """
+        super().__init__()
+        self.conv1x1 = Conv(nIn, nOut, 3, s=2)  # size/2, channel: nIn--->nOut
+
+        self.F_loc = nn.Conv2d(nOut, nOut, 3, padding=1, groups=nOut)
+        self.F_sur = nn.Conv2d(nOut, nOut, 3, padding=autopad(3, None, dilation_rate), dilation=dilation_rate, groups=nOut)
+
+        self.bn = nn.BatchNorm2d(2 * nOut, eps=1e-3)
+        self.act = Conv.default_act
+        self.reduce = Conv(2 * nOut, nOut, 1, 1)  # reduce dimension: 2*nOut--->nOut
+
+        self.F_glo = FGlo(nOut, reduction)
+
+    def forward(self, input):
+        output = self.conv1x1(input)
+        loc = self.F_loc(output)
+        sur = self.F_sur(output)
+
+        joi_feat = torch.cat([loc, sur], 1)  # the joint feature
+        joi_feat = self.bn(joi_feat)
+        joi_feat = self.act(joi_feat)
+        joi_feat = self.reduce(joi_feat)  # channel = nOut
+
+        output = self.F_glo(joi_feat)  # F_glo is employed to refine the joint feature
+
+        return output
+
+if __name__ == '__main__':
+    RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m"
+    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+    batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32
+    inputs = torch.randn((batch_size, in_channel, height, width)).to(device)
+
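+    # Expected from the code above: ContextGuidedBlock_Down halves H and W and maps nIn -> nOut, so the (1, 16, 32, 32) tensor defined above should yield a (1, 32, 16, 16) output.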
module = ContextGuidedBlock_Down(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) + + # python tools/benchmark/get_info.py -c configs/test/deim_hgnetv2_n_visdrone.yml \ No newline at end of file diff --git a/engine/extre_module/custom_nn/downsample/lawds.py b/engine/extre_module/custom_nn/downsample/lawds.py new file mode 100644 index 00000000..f207a12e --- /dev/null +++ b/engine/extre_module/custom_nn/downsample/lawds.py @@ -0,0 +1,86 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/自研模块-Light Adaptive-weight downsampling.png +自研模块:Light Adaptive-weight downsampling +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +from einops import rearrange +from engine.extre_module.ultralytics_nn.conv import Conv, autopad + +# LAWDS模块描述 + +# 1. LAWDS模块适合的任务及解决的问题 + +# 轻量自适应权重下采样(Light Adaptive-Weight Downsampling,简称LAWDS)模块专为高维视觉数据的特征提取与降维任务设计,特别适用于需要高效空间分辨率压缩的计算机视觉场景,如图像分类、目标检测和语义分割等。该模块通过引入自适应权重机制,解决了传统下采样方法(如最大池化或卷积下采样)在信息保留与计算效率之间难以平衡的问题,尤其在处理复杂纹理或高频细节时,能够显著减少信息丢失。 + +# LAWDS模块的核心目标是优化下采样过程中的特征选择,使其在降低空间分辨率的同时,动态保留对任务至关重要的语义信息。这对于轻量化模型设计尤为重要,能够在边缘设备或资源受限环境中实现高效推理,同时保持高性能表现。 + +# 2. LAWDS模块的创新点与优点 + +# 创新点: + +# 自适应权重生成机制:LAWDS模块通过结合全局上下文的注意力机制(基于平均池化和1x1卷积),动态生成空间自适应权重。这种机制突破了传统固定核下采样的局限性,能够根据输入特征的语义内容自适应地调整下采样策略,从而在不同场景下实现更优的特征保留。 + +# 分组卷积与通道重组:模块采用分组卷积(group convolution)结合通道重组(rearrange)操作,在扩展通道维度的同时降低计算复杂度。这种设计不仅增强了特征表达能力,还通过高效的通道交互保留了跨通道的语义关联。 + +# 多尺度信息融合:通过对下采样特征进行多尺度(s1×s2)重组并施加软最大化(softmax)权重,LAWDS能够在空间维度上实现细粒度的信息加权融合。这种方法在理论上等价于一种局部自注意力机制,但计算开销显著降低,具有更高的工程实用性。 + +# 优点: + +# 高效性与轻量化:LAWDS在保持高性能的同时,通过分组卷积和高效注意力机制大幅减少了参数量和计算量,使其非常适合资源受限的部署场景,如移动端或嵌入式设备。 + +# 鲁棒性与通用性:自适应权重机制赋予了模块强大的泛化能力,使其在多样化的视觉任务和数据分布中均能表现出色,尤其是在处理具有高动态范围或复杂背景的图像时。 + +# 综上所述,LAWDS模块通过创新的自适应权重生成与高效特征重组机制,为计算机视觉任务提供了一种兼具高效性、鲁棒性和通用性的下采样解决方案,为轻量化模型设计和边缘计算领域开辟了新的可能性。 + +class LAWDS(nn.Module): + # Light Adaptive-weight downsampling + def __init__(self, in_ch, out_ch, group=16) -> None: + super().__init__() + + self.softmax = nn.Softmax(dim=-1) + self.attention = nn.Sequential( + nn.AvgPool2d(kernel_size=3, stride=1, padding=1), + Conv(in_ch, in_ch, k=1) + ) + + self.ds_conv = Conv(in_ch, in_ch * 4, k=3, s=2, g=(in_ch // group)) + self.conv1x1 = Conv(in_ch, out_ch, 1) if in_ch != out_ch else nn.Identity() + + def forward(self, x): + # bs, ch, 2*h, 2*w => bs, ch, h, w, 4 + att = rearrange(self.attention(x), 'bs ch (s1 h) (s2 w) -> bs ch h w (s1 s2)', s1=2, s2=2) + att = self.softmax(att) + + # bs, 4 * ch, h, w => bs, ch, h, w, 4 + x = rearrange(self.ds_conv(x), 'bs (s ch) h w -> bs ch h w s', s=4) + x = torch.sum(x * att, dim=-1) + return self.conv1x1(x) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = LAWDS(in_channel, out_channel, group=16).to(device) + + outputs = module(inputs) + 
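+    # In LAWDS the softmax turns each 2x2 neighbourhood of attention values into four weights that blend the four stride-2 feature slices, so this (1, 16, 32, 32) input should produce a (1, 32, 16, 16) output.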
print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/featurefusion/CSFCN.py b/engine/extre_module/custom_nn/featurefusion/CSFCN.py new file mode 100644 index 00000000..e159eb34 --- /dev/null +++ b/engine/extre_module/custom_nn/featurefusion/CSFCN.py @@ -0,0 +1,221 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/IEEETIP2023-CSFCN.png +论文链接:https://ieeexplore.ieee.x-lib.xyz/document/10268334 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from engine.extre_module.ultralytics_nn.conv import Conv + +class PSPModule(nn.Module): + # (1, 2, 3, 6) + # (1, 3, 6, 8) + # (1, 4, 8,12) + def __init__(self, grids=(1, 2, 3, 6), channels=256): + super(PSPModule, self).__init__() + + self.grids = grids + self.channels = channels + + def forward(self, feats): + + b, c , h , w = feats.size() + ar = w / h + + return torch.cat([ + F.adaptive_avg_pool2d(feats, (self.grids[0], max(1, round(ar * self.grids[0])))).view(b, self.channels, -1), + F.adaptive_avg_pool2d(feats, (self.grids[1], max(1, round(ar * self.grids[1])))).view(b, self.channels, -1), + F.adaptive_avg_pool2d(feats, (self.grids[2], max(1, round(ar * self.grids[2])))).view(b, self.channels, -1), + F.adaptive_avg_pool2d(feats, (self.grids[3], max(1, round(ar * self.grids[3])))).view(b, self.channels, -1) + ], dim=2) + +class LocalAttenModule(nn.Module): + def __init__(self, in_channels=256, inter_channels=32): + super(LocalAttenModule, self).__init__() + + self.conv = nn.Sequential( + Conv(in_channels, inter_channels,1), + nn.Conv2d(inter_channels, in_channels, kernel_size=3, padding=1, bias=False)) + + self.tanh_spatial = nn.Tanh() + self.conv[1].weight.data.zero_() + self.keras_init_weight() + def keras_init_weight(self): + for ly in self.children(): + if isinstance(ly, (nn.Conv2d,nn.Conv1d)): + nn.init.xavier_normal_(ly.weight) + # nn.init.xavier_normal_(ly.weight,gain=nn.init.calculate_gain('relu')) + if not ly.bias is None: nn.init.constant_(ly.bias, 0) + + def forward(self, x): + res1 = x + res2 = x + + x = self.conv(x) + x_mask = self.tanh_spatial(x) + + res1 = res1 * x_mask + + return res1 + res2 + +class CFC_CRB(nn.Module): + def __init__(self, in_channels=512, grids=(6, 3, 2, 1)): # 先ce后ffm + + super(CFC_CRB, self).__init__() + self.grids = grids + inter_channels = in_channels + self.inter_channels = inter_channels + + self.reduce_channel = Conv(in_channels, inter_channels, 3) + self.query_conv = nn.Conv2d(in_channels=inter_channels, out_channels=32, kernel_size=1) + self.key_conv = nn.Conv1d(in_channels=inter_channels, out_channels=32, kernel_size=1) + self.value_conv = nn.Conv1d(in_channels=inter_channels, out_channels=self.inter_channels, kernel_size=1) + self.key_channels = 32 + + self.value_psp = PSPModule(grids, inter_channels) + self.key_psp = PSPModule(grids, inter_channels) + + self.softmax = nn.Softmax(dim=-1) + + self.local_attention = LocalAttenModule(inter_channels, inter_channels//8) + self.keras_init_weight() + + def keras_init_weight(self): + for ly in self.children(): + if isinstance(ly, 
(nn.Conv2d,nn.Conv1d)): + nn.init.xavier_normal_(ly.weight) + # nn.init.xavier_normal_(ly.weight,gain=nn.init.calculate_gain('relu')) + if not ly.bias is None: nn.init.constant_(ly.bias, 0) + + def forward(self, x): + + x = self.reduce_channel(x) # 降维- 128 + + m_batchsize,_,h,w = x.size() + + query = self.query_conv(x).view(m_batchsize,32,-1).permute(0,2,1) ## b c n -> b n c + + key = self.key_conv(self.key_psp(x)) ## b c s + + sim_map = torch.matmul(query,key) + + sim_map = self.softmax(sim_map) + # sim_map = self.attn_drop(sim_map) + value = self.value_conv(self.value_psp(x)) #.permute(0,2,1) ## b c s + + # context = torch.matmul(sim_map,value) ## B N S * B S C -> B N C + context = torch.bmm(value,sim_map.permute(0,2,1)) # B C S * B S N - > B C N + + # context = context.permute(0,2,1).view(m_batchsize,self.inter_channels,h,w) + context = context.view(m_batchsize,self.inter_channels,h,w) + # out = x + self.gamma * context + context = self.local_attention(context) + + out = x + context + + return out + +class SFC_G2(nn.Module): + def __init__(self, inc, ouc): + super(SFC_G2, self).__init__() + + self.groups = 2 + self.conv_8 = Conv(inc[1], ouc, 3) + self.conv_32 = Conv(inc[0], ouc, 3) + + self.conv_offset = nn.Sequential( + Conv(ouc * 2, 64), + nn.Conv2d(64, self.groups * 4 + 2, kernel_size=3, padding=1, bias=False) + ) + + self.keras_init_weight() + self.conv_offset[1].weight.data.zero_() + + def keras_init_weight(self): + for ly in self.children(): + if isinstance(ly, (nn.Conv2d, nn.Conv1d)): + nn.init.xavier_normal_(ly.weight) + if not ly.bias is None: nn.init.constant_(ly.bias, 0) + + def forward(self, x): + sp, cp = x + n, _, out_h, out_w = cp.size() + + # x_32 + sp = self.conv_32(sp) # 语义特征 1 / 8 256 + sp = F.interpolate(sp, cp.size()[2:], mode='bilinear', align_corners=True) + # x_8 + cp = self.conv_8(cp) + + conv_results = self.conv_offset(torch.cat([cp, sp], 1)) + + sp = sp.reshape(n*self.groups,-1,out_h,out_w) + cp = cp.reshape(n*self.groups,-1,out_h,out_w) + + offset_l = conv_results[:, 0:self.groups*2, :, :].reshape(n*self.groups,-1,out_h,out_w) + offset_h = conv_results[:, self.groups*2:self.groups*4, :, :].reshape(n*self.groups,-1,out_h,out_w) + + norm = torch.tensor([[[[out_w, out_h]]]]).type_as(sp).to(sp.device) + w = torch.linspace(-1.0, 1.0, out_h).view(-1, 1).repeat(1, out_w) + h = torch.linspace(-1.0, 1.0, out_w).repeat(out_h, 1) + grid = torch.cat((h.unsqueeze(2), w.unsqueeze(2)), 2) + grid = grid.repeat(n*self.groups, 1, 1, 1).type_as(sp).to(sp.device) + + grid_l = grid + offset_l.permute(0, 2, 3, 1) / norm + grid_h = grid + offset_h.permute(0, 2, 3, 1) / norm + + cp = F.grid_sample(cp, grid_l , align_corners=True) ## 考虑是否指定align_corners + sp = F.grid_sample(sp, grid_h , align_corners=True) ## 考虑是否指定align_corners + + cp = cp.reshape(n, -1, out_h, out_w) + sp = sp.reshape(n, -1, out_h, out_w) + + att = 1 + torch.tanh(conv_results[:, self.groups*4:, :, :]) + sp = sp * att[:, 0:1, :, :] + cp * att[:, 1:2, :, :] + + return sp + +class CSFCN(nn.Module): + def __init__(self, inc, ouc) -> None: + super().__init__() + + self.CFC_CRB = CFC_CRB(inc[0]) + self.SFC_G2 = SFC_G2(inc, ouc) + + def forward(self, x): + p3, p5 = x + p3 = self.CFC_CRB(p3) + p5 = self.SFC_G2((p3, p5)) + return p3, p5 + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel_p3, height_p3, width_p3 = 1, 32, 
80, 80 + batch_size, channel_p5, height_p5, width_p5 = 1, 64, 20, 20 + ouc_channel = 32 + inputs_1 = torch.randn((batch_size, channel_p3, height_p3, width_p3)).to(device) + inputs_2 = torch.randn((batch_size, channel_p5, height_p5, width_p5)).to(device) + + module = CSFCN([channel_p3, channel_p5], ouc_channel).to(device) + + outputs = module([inputs_1, inputs_2]) + print(GREEN + f'p3.size:{inputs_1.size()} p5.size:{inputs_2.size()} outputs_p3.size:{outputs[0].size()} outputs_p5.size:{outputs[1].size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + args=[[inputs_1, inputs_2]], + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/featurefusion/cgfm.py b/engine/extre_module/custom_nn/featurefusion/cgfm.py new file mode 100644 index 00000000..86fc7dfe --- /dev/null +++ b/engine/extre_module/custom_nn/featurefusion/cgfm.py @@ -0,0 +1,116 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/自研模块-ContextGuideFusionModule.png +自研模块:ContextGuideFusionModule +公开讲解视频:https://www.bilibili.com/video/BV1Vx4y1n7hZ/ +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +from torch.nn import init + +from engine.extre_module.ultralytics_nn.conv import Conv + +# ContextGuideFusionModule +# 1. 适用任务与解决问题 +# ContextGuideFusionModule 模块专为深度学习框架中的复杂特征融合任务而设计,特别适用于计算机视觉领域,如图像分割、目标检测以及多模态数据整合等任务。该模块旨在解决如何高效融合来自多个源或网络分支的异构特征表示这一核心问题,这些特征通常在通道维度或上下文重点上存在差异。通过协调这些特征,模块有效缓解了信息丢失、特征不对齐以及融合效果欠佳等问题,这些问题往往会削弱模型在需要精确捕捉空间与语义关系的任务中的表现。其自适应的架构确保了互补性上下文线索的稳健整合,使其成为处理特征间复杂相互依赖关系的理想选择。 +# 2. 
创新点与优势 +# ContextGuideFusionModule 引入了一系列开创性的设计,与传统特征融合方法相比,具有显著的创新性和性能优势: + +# 动态通道适配:有别于假设输入维度一致的传统方法,该模块通过自适应卷积层(adjust_conv)对齐不匹配的输入通道。这一创新使其能够无缝融合来自不同网络阶段或模态的特征,极大地拓宽了其在多样化架构中的适用性,且无需繁琐的预处理。 + +# 上下文感知的特征重校准:模块利用挤压-激励(SE)注意力机制,对拼接后的特征进行智能重校准,突出上下文相关的关键信息。这种有针对性的特征增强确保融合过程优先考虑有意义的模式,从而提升融合特征的判别能力。 + +# 双向特征增强:模块的一个独特创新在于其跨分支引导机制,即通过互补分支重校准的特征对每个输入特征图进行加权。这种双向交互促进了上下文信息的协同交换,能够捕捉到单向或简单加性融合策略难以发现的复杂依赖关系。 + +# 灵活的输出映射:模块通过条件性的 1x1 卷积(conv1x1)实现输出维度的灵活适配,确保与下游层的兼容性,同时在维持融合表示完整性的前提下提升计算效率。 + +# 这些创新带来的优势包括更强的泛化能力,使模块能够稳健处理多样的输入配置;更高的性能表现,通过生成信息丰富的融合特征图显著提升复杂视觉任务的模型精度。此外,其模块化设计便于集成到现有架构中,提供了一种兼顾计算效率与表达能力的可扩展解决方案。 + +class SEAttention(nn.Module): + def __init__(self, channel=512,reduction=16): + super().__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction, bias=False), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel, bias=False), + nn.Sigmoid() + ) + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + init.constant_(m.weight, 1) + init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + init.normal_(m.weight, std=0.001) + if m.bias is not None: + init.constant_(m.bias, 0) + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + return x * y.expand_as(x) + +class ContextGuideFusionModule(nn.Module): + def __init__(self, inc, ouc) -> None: + super().__init__() + + self.adjust_conv = nn.Identity() + if inc[0] != inc[1]: + # 对齐inc[1]通道数 + self.adjust_conv = Conv(inc[0], inc[1], k=1) + + self.se = SEAttention(inc[1] * 2) + + if (inc[1] * 2) != ouc: + # 对齐outc通道数 + self.conv1x1 = Conv(inc[1] * 2, ouc) + else: + self.conv1x1 = nn.Identity() + + def forward(self, x): + x0, x1 = x + # 如果 x0 和 x1 的通道数不同,则调整 x0 的通道数,使其与 x1 相同。(self.adjust_conv里面设定了) + x0 = self.adjust_conv(x0) + + x_concat = torch.cat([x0, x1], dim=1) # n c h w + x_concat = self.se(x_concat) + x0_weight, x1_weight = torch.split(x_concat, [x0.size()[1], x1.size()[1]], dim=1) + x0_weight = x0 * x0_weight + x1_weight = x1 * x1_weight + return self.conv1x1(torch.cat([x0 + x1_weight, x1 + x0_weight], dim=1)) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel_1, channel_2, height, width = 1, 32, 16, 32, 32 + ouc_channel = 32 + inputs_1 = torch.randn((batch_size, channel_1, height, width)).to(device) + inputs_2 = torch.randn((batch_size, channel_2, height, width)).to(device) + + module = ContextGuideFusionModule([channel_1, channel_2], ouc_channel).to(device) + + outputs = module([inputs_1, inputs_2]) + print(GREEN + f'inputs1.size:{inputs_1.size()} inputs2.size:{inputs_2.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + args=[[inputs_1, inputs_2]], + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/featurefusion/mfm.py b/engine/extre_module/custom_nn/featurefusion/mfm.py new file mode 100644 index 00000000..5cf5a30c --- /dev/null +++ b/engine/extre_module/custom_nn/featurefusion/mfm.py @@ -0,0 +1,79 @@ +''' +本文件由BiliBili:魔傀面具整理 
+engine/extre_module/module_images/CVPR2024-MFM.png +论文链接:https://arxiv.org/pdf/2403.01105 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from engine.extre_module.ultralytics_nn.conv import Conv + +class MFM(nn.Module): + def __init__(self, inc, dim, reduction=8): + super(MFM, self).__init__() + + self.height = len(inc) + d = max(int(dim/reduction), 4) + + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.mlp = nn.Sequential( + nn.Conv2d(dim, d, 1, bias=False), + nn.ReLU(), + nn.Conv2d(d, dim * self.height, 1, bias=False) + ) + + self.softmax = nn.Softmax(dim=1) + + self.conv1x1 = nn.ModuleList([]) + for i in inc: + if i != dim: + self.conv1x1.append(Conv(i, dim, 1)) + else: + self.conv1x1.append(nn.Identity()) + + def forward(self, in_feats_): + in_feats = [] + for idx, layer in enumerate(self.conv1x1): + in_feats.append(layer(in_feats_[idx])) + + B, C, H, W = in_feats[0].shape + + in_feats = torch.cat(in_feats, dim=1) + in_feats = in_feats.view(B, self.height, C, H, W) + + feats_sum = torch.sum(in_feats, dim=1) + attn = self.mlp(self.avg_pool(feats_sum)) + attn = self.softmax(attn.view(B, self.height, C, 1, 1)) + + out = torch.sum(in_feats*attn, dim=1) + return out + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel_1, channel_2, height, width = 1, 32, 16, 32, 32 + ouc_channel = 32 + inputs_1 = torch.randn((batch_size, channel_1, height, width)).to(device) + inputs_2 = torch.randn((batch_size, channel_2, height, width)).to(device) + + module = MFM([channel_1, channel_2], ouc_channel).to(device) + + outputs = module([inputs_1, inputs_2]) + print(GREEN + f'inputs1.size:{inputs_1.size()} inputs2.size:{inputs_2.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + args=[[inputs_1, inputs_2]], + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/featurefusion/msga.py b/engine/extre_module/custom_nn/featurefusion/msga.py new file mode 100644 index 00000000..06c89ba6 --- /dev/null +++ b/engine/extre_module/custom_nn/featurefusion/msga.py @@ -0,0 +1,191 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/BMVC2024-MASAG.png +论文链接:https://arxiv.org/abs/2407.21640 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from engine.extre_module.ultralytics_nn.conv import Conv + +class GlobalExtraction(nn.Module): + def __init__(self, dim = None): + super().__init__() + self.avgpool = self.globalavgchannelpool + self.maxpool = self.globalmaxchannelpool + self.proj = nn.Sequential( + nn.Conv2d(2, 1, 1,1), + nn.BatchNorm2d(1) + ) + def globalavgchannelpool(self, x): + x = x.mean(1, keepdim = True) + return x + + def globalmaxchannelpool(self, x): + x = x.max(dim = 1, keepdim=True)[0] + return x + + def forward(self, x): + x_ = x.clone() + x = self.avgpool(x) + x2 = self.maxpool(x_) + + cat = 
torch.cat((x,x2), dim = 1) + + proj = self.proj(cat) + return proj + +class ContextExtraction(nn.Module): + def __init__(self, dim, reduction = None): + super().__init__() + self.reduction = 1 if reduction == None else 2 + + self.dconv = self.DepthWiseConv2dx2(dim) + self.proj = self.Proj(dim) + + def DepthWiseConv2dx2(self, dim): + dconv = nn.Sequential( + nn.Conv2d(in_channels = dim, + out_channels = dim, + kernel_size = 3, + padding = 1, + groups = dim), + nn.BatchNorm2d(num_features = dim), + nn.ReLU(inplace = True), + nn.Conv2d(in_channels = dim, + out_channels = dim, + kernel_size = 3, + padding = 2, + dilation = 2), + nn.BatchNorm2d(num_features = dim), + nn.ReLU(inplace = True) + ) + return dconv + + def Proj(self, dim): + proj = nn.Sequential( + nn.Conv2d(in_channels = dim, + out_channels = dim //self.reduction, + kernel_size = 1 + ), + nn.BatchNorm2d(num_features = dim//self.reduction) + ) + return proj + def forward(self,x): + x = self.dconv(x) + x = self.proj(x) + return x + +class MultiscaleFusion(nn.Module): + def __init__(self, dim): + super().__init__() + self.local= ContextExtraction(dim) + self.global_ = GlobalExtraction() + self.bn = nn.BatchNorm2d(num_features=dim) + + def forward(self, x, g,): + x = self.local(x) + g = self.global_(g) + + fuse = self.bn(x + g) + return fuse + + +class MultiScaleGatedAttn(nn.Module): + # Version 1 + def __init__(self, inc, ouc): + super().__init__() + dim = ouc + + if inc[0] != ouc: + self.conv1 = Conv(inc[0], ouc) + else: + self.conv1 = nn.Identity() + + if inc[1] != ouc: + self.conv2 = Conv(inc[1], ouc) + else: + self.conv2 = nn.Identity() + + self.multi = MultiscaleFusion(dim) + self.selection = nn.Conv2d(dim, 2,1) + self.proj = nn.Conv2d(dim, dim,1) + self.bn = nn.BatchNorm2d(dim) + self.bn_2 = nn.BatchNorm2d(dim) + self.conv_block = nn.Sequential( + nn.Conv2d(in_channels=dim, out_channels=dim, + kernel_size=1, stride=1)) + + def forward(self, inputs): + x, g = inputs + x = self.conv1(x) + g = self.conv2(g) + x_ = x.clone() + g_ = g.clone() + + #stacked = torch.stack((x_, g_), dim = 1) # B, 2, C, H, W + + multi = self.multi(x, g) # B, C, H, W + + ### Option 2 ### + multi = self.selection(multi) # B, num_path, H, W + + attention_weights = F.softmax(multi, dim=1) # Shape: [B, 2, H, W] + #attention_weights = torch.sigmoid(multi) + A, B = attention_weights.split(1, dim=1) # Each will have shape [B, 1, H, W] + + x_att = A.expand_as(x_) * x_ # Using expand_as to match the channel dimensions + g_att = B.expand_as(g_) * g_ + + x_att = x_att + x_ + g_att = g_att + g_ + ## Bidirectional Interaction + + x_sig = torch.sigmoid(x_att) + g_att_2 = x_sig * g_att + + + g_sig = torch.sigmoid(g_att) + x_att_2 = g_sig * x_att + + interaction = x_att_2 * g_att_2 + + projected = torch.sigmoid(self.bn(self.proj(interaction))) + + weighted = projected * x_ + + y = self.conv_block(weighted) + + #y = self.bn_2(weighted + y) + y = self.bn_2(y) + return y + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel_1, channel_2, height, width = 1, 32, 16, 32, 32 + ouc_channel = 32 + inputs_1 = torch.randn((batch_size, channel_1, height, width)).to(device) + inputs_2 = torch.randn((batch_size, channel_2, height, width)).to(device) + + module = MultiScaleGatedAttn([channel_1, channel_2], ouc_channel).to(device) + + outputs = module([inputs_1, inputs_2]) + print(GREEN + 
f'inputs1.size:{inputs_1.size()} inputs2.size:{inputs_2.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + args=[[inputs_1, inputs_2]], + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/mlp/ConvolutionalGLU.py b/engine/extre_module/custom_nn/mlp/ConvolutionalGLU.py new file mode 100644 index 00000000..82ecf984 --- /dev/null +++ b/engine/extre_module/custom_nn/mlp/ConvolutionalGLU.py @@ -0,0 +1,102 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/CVPR2024-CGLU.png +论文链接:https://arxiv.org/pdf/2311.17132 +''' + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn + +class ConvolutionalGLU(nn.Module): + """ + ConvolutionalGLU(卷积门控线性单元)模块 + + 该模块结合了通道分割、深度可分离卷积和门控机制,以提高特征表达能力。 + + 参数: + in_features (int): 输入通道数。 + hidden_features (int, 可选): 隐藏层通道数,默认为输入通道数。 + out_features (int, 可选): 输出通道数,默认为输入通道数。 + act_layer (nn.Module, 可选): 激活函数,默认使用 GELU。 + drop (float, 可选): Dropout 概率,默认值为 0。 + """ + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.) -> None: + super().__init__() + + # 如果未指定 out_features 和 hidden_features,则默认与 in_features 一致 + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + # 计算隐藏层通道数,并保证为 2/3 的比例 + hidden_features = int(2 * hidden_features / 3) + + # 1x1 卷积用于通道扩展,并进行通道分割(GLU 机制) + self.fc1 = nn.Conv2d(in_features, hidden_features * 2, kernel_size=1) + + # 深度可分离卷积层,提取局部特征 + self.dwconv = nn.Sequential( + nn.Conv2d(hidden_features, hidden_features, kernel_size=3, stride=1, padding=1, bias=True, groups=hidden_features), + act_layer() # 激活函数(默认使用 GELU) + ) + + # 1x1 卷积用于恢复通道数 + self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1) + + # Dropout 层,防止过拟合 + self.drop = nn.Dropout(drop) + + def forward(self, x): + """ + 前向传播过程: + 1. 先存储输入 x 作为残差连接的 shortcut。 + 2. 通过 1x1 卷积 self.fc1,将输入通道扩展为 2 倍,并分成两个部分 (x, v)。 + 3. x 经过深度可分离卷积 self.dwconv 处理后,与门控信号 v 相乘,实现门控机制。 + 4. 经过 Dropout 防止过拟合。 + 5. 通过 1x1 卷积 self.fc2 将通道数恢复到输出通道数。 + 6. 再次进行 Dropout。 + 7. 
残差连接,将原始输入 x_shortcut 与处理后的 x 相加。 + """ + + # 残差连接的快捷分支 + x_shortcut = x + + # 1x1 卷积,通道扩展并分割为 x 和门控信号 v + x, v = self.fc1(x).chunk(2, dim=1) + + # 深度可分离卷积处理,并通过门控信号 v 进行调制 + x = self.dwconv(x) * v + + # Dropout 以减少过拟合 + x = self.drop(x) + + # 通过 1x1 卷积恢复通道数 + x = self.fc2(x) + + # 再次进行 Dropout + x = self.drop(x) + + # 残差连接,最终输出 + return x_shortcut + x + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = ConvolutionalGLU(in_features=in_channel, out_features=out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/mlp/DFFN.py b/engine/extre_module/custom_nn/mlp/DFFN.py new file mode 100644 index 00000000..c292c45d --- /dev/null +++ b/engine/extre_module/custom_nn/mlp/DFFN.py @@ -0,0 +1,131 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/IJCAI2024-DFFN.png +论文链接:https://www.ijcai.org/proceedings/2024/0081.pdf +''' + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn + +class LayerNormGeneral(nn.Module): + r""" General LayerNorm for different situations. + + Args: + affine_shape (int, list or tuple): The shape of affine weight and bias. + Usually the affine_shape=C, but in some implementation, like torch.nn.LayerNorm, + the affine_shape is the same as normalized_dim by default. + To adapt to different situations, we offer this argument here. + normalized_dim (tuple or list): Which dims to compute mean and variance. + scale (bool): Flag indicates whether to use scale or not. + bias (bool): Flag indicates whether to use scale or not. + + We give several examples to show how to specify the arguments. + + LayerNorm (https://arxiv.org/abs/1607.06450): + For input shape of (B, *, C) like (B, N, C) or (B, H, W, C), + affine_shape=C, normalized_dim=(-1, ), scale=True, bias=True; + For input shape of (B, C, H, W), + affine_shape=(C, 1, 1), normalized_dim=(1, ), scale=True, bias=True. + + Modified LayerNorm (https://arxiv.org/abs/2111.11418) + that is idental to partial(torch.nn.GroupNorm, num_groups=1): + For input shape of (B, N, C), + affine_shape=C, normalized_dim=(1, 2), scale=True, bias=True; + For input shape of (B, H, W, C), + affine_shape=C, normalized_dim=(1, 2, 3), scale=True, bias=True; + For input shape of (B, C, H, W), + affine_shape=(C, 1, 1), normalized_dim=(1, 2, 3), scale=True, bias=True. + + For the several metaformer baslines, + IdentityFormer, RandFormer and PoolFormerV2 utilize Modified LayerNorm without bias (bias=False); + ConvFormer and CAFormer utilizes LayerNorm without bias (bias=False). 
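+        A concrete instantiation following the cases above (illustrative): for an input of
+        shape (B, C, H, W), LayerNormGeneral(affine_shape=(C, 1, 1), normalized_dim=(1, 2, 3),
+        bias=False) realizes the Modified LayerNorm without bias used by IdentityFormer,
+        RandFormer and PoolFormerV2.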
+ """ + def __init__(self, affine_shape=None, normalized_dim=(-1, ), scale=True, + bias=True, eps=1e-5): + super().__init__() + self.normalized_dim = normalized_dim + self.use_scale = scale + self.use_bias = bias + self.weight = nn.Parameter(torch.ones(affine_shape)) if scale else None + self.bias = nn.Parameter(torch.zeros(affine_shape)) if bias else None + self.eps = eps + + def forward(self, x): + c = x - x.mean(self.normalized_dim, keepdim=True) + s = c.pow(2).mean(self.normalized_dim, keepdim=True) + x = c / torch.sqrt(s + self.eps) + if self.use_scale: + x = x * self.weight + if self.use_bias: + x = x + self.bias + return x + +class FrequencyGate(nn.Module): + """ Frequency-Gate. + Args: + dim (int): Input channels. + """ + def __init__(self, dim): + super().__init__() + self.norm = LayerNormGeneral((dim, 1, 1), normalized_dim=(1, 2, 3)) + self.conv = nn.Sequential( + nn.Conv2d(dim, dim, 1, 1, 0), + nn.Conv2d(dim, dim, 3, 1, 1, groups=dim), + ) + + def forward(self, x): + x1, x2 = x.chunk(2, dim =1) + x2 = self.conv(self.norm(x2)) + return x1 * x2 + +class DFFN(nn.Module): + """ Dual frequency aggregation Feed-Forward Network. + Args: + in_features (int): Number of input channels. + hidden_features (int | None): Number of hidden channels. Default: None + out_features (int | None): Number of output channels. Default: None + act_layer (nn.Module): Activation layer. Default: nn.GELU + drop (float): Dropout rate. Default: 0.0 + """ + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2d(in_features, hidden_features, 1) + self.act = act_layer() + self.fg = FrequencyGate(hidden_features//2) + self.fc2 = nn.Conv2d(hidden_features//2, out_features, 1) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fg(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = DFFN(in_features=in_channel, out_features=out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/mlp/FMFFN.py b/engine/extre_module/custom_nn/mlp/FMFFN.py new file mode 100644 index 00000000..f3608380 --- /dev/null +++ b/engine/extre_module/custom_nn/mlp/FMFFN.py @@ -0,0 +1,72 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/ICLR2024-FMFFN.png +论文链接:https://arxiv.org/pdf/2310.16387 +''' + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +from einops import rearrange + +class WindowFrequencyModulation(nn.Module): + def __init__(self, dim, window_size): + super().__init__() + self.dim = dim + self.window_size = window_size + self.ratio = 1 + 
self.complex_weight= nn.Parameter(torch.cat((torch.ones(self.window_size, self.window_size//2+1, self.ratio*dim, 1, dtype=torch.float32),\ + torch.zeros(self.window_size, self.window_size//2+1, self.ratio*dim, 1, dtype=torch.float32)),dim=-1)) + + def forward(self, x): + x = rearrange(x, 'b c (w1 p1) (w2 p2) -> b w1 w2 p1 p2 c', p1=self.window_size, p2=self.window_size) + + x = x.to(torch.float32) + + x= torch.fft.rfft2(x,dim=(3, 4), norm='ortho') + + weight = torch.view_as_complex(self.complex_weight) + x = x * weight + x = torch.fft.irfft2(x, s=(self.window_size, self.window_size), dim=(3, 4), norm='ortho') + + x = rearrange(x, 'b w1 w2 p1 p2 c -> b c (w1 p1) (w2 p2)') + return x + +class FMFFN(nn.Module): + def __init__(self,in_features, hidden_features=None, out_features=None, window_size=4, act_layer=nn.GELU) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + self.ffn = nn.Sequential( + nn.Conv2d(in_features, hidden_features, 1), + act_layer(), + nn.Conv2d(hidden_features, out_features, 1) + ) + + self.fm = WindowFrequencyModulation(out_features, window_size) + + def forward(self, x): + return self.fm(self.ffn(x)) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = FMFFN(in_features=in_channel, out_features=out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/module/APBottleneck.py b/engine/extre_module/custom_nn/module/APBottleneck.py new file mode 100644 index 00000000..cba43281 --- /dev/null +++ b/engine/extre_module/custom_nn/module/APBottleneck.py @@ -0,0 +1,56 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/AAAI2025-PSConv.png +论文链接:https://arxiv.org/pdf/2412.16986 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +from engine.extre_module.ultralytics_nn.conv import Conv + +class APBottleneck(nn.Module): + """Asymmetric Padding bottleneck.""" + + def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5): + """Initializes a bottleneck module with given input/output channels, shortcut option, group, kernels, and + expansion. 
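+        Each of the four branches zero-pads two pixels on a different pair of sides (nn.ZeroPad2d order: left,
+        right, top, bottom), so with the default 3x3 kernels the padding-free cv1 keeps the spatial size while
+        sampling a corner-shifted receptive field; the four c_/4-channel outputs are concatenated back to c_
+        channels and fused by cv2, with an optional residual when c1 == c2.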
+ """ + super().__init__() + c_ = int(c2 * e) # hidden channels + p = [(2,0,2,0),(0,2,0,2),(0,2,2,0),(2,0,0,2)] + self.pad = [nn.ZeroPad2d(padding=(p[g])) for g in range(4)] + self.cv1 = Conv(c1, c_ // 4, k[0], 1, p=0) + # self.cv1 = nn.ModuleList([nn.Conv2d(c1, c_, k[0], stride=1, padding= p[g], bias=False) for g in range(4)]) + self.cv2 = Conv(c_, c2, k[1], 1, g=g) + self.add = shortcut and c1 == c2 + + def forward(self, x): + """'forward()' applies the YOLO FPN to input data.""" + # y = self.pad[g](x) for g in range(4) + return x + self.cv2((torch.cat([self.cv1(self.pad[g](x)) for g in range(4)], 1))) if self.add else self.cv2((torch.cat([self.cv1(self.pad[g](x)) for g in range(4)], 1))) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = APBottleneck(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/module/DWR.py b/engine/extre_module/custom_nn/module/DWR.py new file mode 100644 index 00000000..a46831e2 --- /dev/null +++ b/engine/extre_module/custom_nn/module/DWR.py @@ -0,0 +1,61 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/DWR.png +论文链接:https://arxiv.org/pdf/2212.01173 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn + +from engine.extre_module.ultralytics_nn.conv import Conv + +class DWR(nn.Module): + def __init__(self, inc, ouc) -> None: + super().__init__() + + if inc != ouc: + self.conv1x1 = Conv(inc, ouc, 1) + else: + self.conv1x1 = nn.Identity() + + self.conv_3x3 = Conv(ouc, ouc // 2, 3) + + self.conv_3x3_d1 = Conv(ouc // 2, ouc, 3, d=1) + self.conv_3x3_d3 = Conv(ouc // 2, ouc // 2, 3, d=3) + self.conv_3x3_d5 = Conv(ouc // 2, ouc // 2, 3, d=5) + + self.conv_1x1 = Conv(ouc * 2, ouc, k=1) + + def forward(self, x): + x = self.conv1x1(x) + conv_3x3 = self.conv_3x3(x) + x1, x2, x3 = self.conv_3x3_d1(conv_3x3), self.conv_3x3_d3(conv_3x3), self.conv_3x3_d5(conv_3x3) + x_out = torch.cat([x1, x2, x3], dim=1) + x_out = self.conv_1x1(x_out) + x + return x_out + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = DWR(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No 
newline at end of file diff --git a/engine/extre_module/custom_nn/module/DynamicFilter.py b/engine/extre_module/custom_nn/module/DynamicFilter.py new file mode 100644 index 00000000..67af802a --- /dev/null +++ b/engine/extre_module/custom_nn/module/DynamicFilter.py @@ -0,0 +1,149 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/AAAI2024-DynamicFilter.png +论文链接:https://arxiv.org/pdf/2303.03932 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +from timm.layers import to_2tuple + +from engine.extre_module.ultralytics_nn.conv import Conv + +def resize_complex_weight(origin_weight, new_h, new_w): + h, w, num_heads = origin_weight.shape[0:3] # size, w, c, 2 + origin_weight = origin_weight.reshape(1, h, w, num_heads * 2).permute(0, 3, 1, 2) + new_weight = torch.nn.functional.interpolate( + origin_weight, + size=(new_h, new_w), + mode='bicubic', + align_corners=True + ).permute(0, 2, 3, 1).reshape(new_h, new_w, num_heads, 2) + return new_weight + +class StarReLU(nn.Module): + """ + StarReLU: s * relu(x) ** 2 + b + """ + + def __init__(self, scale_value=1.0, bias_value=0.0, + scale_learnable=True, bias_learnable=True, + mode=None, inplace=False): + super().__init__() + self.inplace = inplace + self.relu = nn.ReLU(inplace=inplace) + self.scale = nn.Parameter(scale_value * torch.ones(1), + requires_grad=scale_learnable) + self.bias = nn.Parameter(bias_value * torch.ones(1), + requires_grad=bias_learnable) + + def forward(self, x): + return self.scale * self.relu(x) ** 2 + self.bias + +class DynamicFilterMlp(nn.Module): + """ MLP as used in MetaFormer models, eg Transformer, MLP-Mixer, PoolFormer, MetaFormer baslines and related networks. + Mostly copied from timm. 
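+        In DynamicFilter below it serves as the `reweight` branch: the globally pooled (B, dim) descriptor is
+        mapped to num_filters * med_channels routing logits, which forward() reshapes to (B, num_filters, -1)
+        and soft-maxes over the filter axis to mix the learned complex frequency filters.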
+ """ + + def __init__(self, dim, mlp_ratio=4, out_features=None, act_layer=StarReLU, drop=0., + bias=False, **kwargs): + super().__init__() + in_features = dim + out_features = out_features or in_features + hidden_features = int(mlp_ratio * in_features) + drop_probs = to_2tuple(drop) + + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) + self.act = act_layer() + self.drop1 = nn.Dropout(drop_probs[0]) + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) + self.drop2 = nn.Dropout(drop_probs[1]) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop1(x) + x = self.fc2(x) + x = self.drop2(x) + return x + +class DynamicFilter(nn.Module): + def __init__(self, inc, dim, size=14, expansion_ratio=2, reweight_expansion_ratio=.25, + act1_layer=StarReLU, act2_layer=nn.Identity, + bias=False, num_filters=4, weight_resize=False, + **kwargs): + super().__init__() + size = to_2tuple(size) + self.size = size[0] + self.filter_size = size[1] // 2 + 1 + self.num_filters = num_filters + self.dim = dim + self.med_channels = int(expansion_ratio * dim) + self.weight_resize = weight_resize + self.pwconv1 = nn.Linear(dim, self.med_channels, bias=bias) + self.act1 = act1_layer() + self.reweight = DynamicFilterMlp(dim, reweight_expansion_ratio, num_filters * self.med_channels) + self.complex_weights = nn.Parameter( + torch.randn(self.size, self.filter_size, num_filters, 2, + dtype=torch.float32) * 0.02) + self.act2 = act2_layer() + self.pwconv2 = nn.Linear(self.med_channels, dim, bias=bias) + + self.conv1x1 = Conv(inc, dim, 1) if inc != dim else nn.Identity() + + def forward(self, x): + B, _, H, W, = x.shape + + x = self.conv1x1(x) + x = x.permute(0, 2, 3, 1) + routeing = self.reweight(x.mean(dim=(1, 2))).view(B, self.num_filters, + -1).softmax(dim=1) + x = self.pwconv1(x) + x = self.act1(x) + x = torch.fft.rfft2(x, dim=(1, 2), norm='ortho') + + if self.weight_resize: + complex_weights = resize_complex_weight(self.complex_weights, x.shape[1], + x.shape[2]) + complex_weights = torch.view_as_complex(complex_weights.contiguous()) + else: + complex_weights = torch.view_as_complex(self.complex_weights) + routeing = routeing.to(torch.complex64) + weight = torch.einsum('bfc,hwf->bhwc', routeing, complex_weights) + if self.weight_resize: + weight = weight.view(-1, x.shape[1], x.shape[2], self.med_channels) + else: + weight = weight.view(-1, self.size, self.filter_size, self.med_channels) + x = x * weight + x = torch.fft.irfft2(x, s=(H, W), dim=(1, 2), norm='ortho') + + x = self.act2(x) + x = self.pwconv2(x) + return x.permute(0, 3, 1, 2) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + # 此模块不支持多尺度训练,且需要height=width,size参数可以填height或者width + module = DynamicFilter(in_channel, out_channel, size=height).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/module/FATBlock.py b/engine/extre_module/custom_nn/module/FATBlock.py new 
file mode 100644 index 00000000..65bcdece --- /dev/null +++ b/engine/extre_module/custom_nn/module/FATBlock.py @@ -0,0 +1,365 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/ICLR2024-FATBlock.png +论文链接:https://arxiv.org/pdf/2310.16387 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +from torch import Tensor +from einops import rearrange +from einops.layers.torch import Rearrange +from timm.layers import DropPath + +from engine.extre_module.ultralytics_nn.conv import Conv + +def img2windows(img, H_sp, W_sp): + """ + Input: Image (B, C, H, W) + Output: Window Partition (B', N, C) + """ + B, C, H, W = img.shape + img_reshape = img.view(B, C, H // H_sp, H_sp, W // W_sp, W_sp) + img_perm = img_reshape.permute(0, 2, 4, 3, 5, 1).contiguous().reshape(-1, H_sp* W_sp, C) + return img_perm + + +def windows2img(img_splits_hw, H_sp, W_sp, H, W): + """ + Input: Window Partition (B', N, C) + Output: Image (B, H, W, C) + """ + B = int(img_splits_hw.shape[0] / (H * W / H_sp / W_sp)) + + img = img_splits_hw.view(B, H // H_sp, W // W_sp, H_sp, W_sp, -1) + img = img.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return img + +def ste_round(x: Tensor) -> Tensor: + return torch.round(x) - x.detach() + x + + +def window_partition(x, window_size): + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + +def window_reverse(windows, window_size, H, W): + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + def __init__(self, dim, idx, split_size=8, dim_out=None, num_heads=6, attn_drop=0., proj_drop=0., qk_scale=None, position_bias=True): + super().__init__() + self.dim = dim + self.dim_out = dim_out or dim + self.split_size = split_size + self.num_heads = num_heads + self.idx = idx + self.position_bias = position_bias + + head_dim = dim // num_heads + + self.scale = qk_scale or head_dim ** -0.5 + + if idx == 0: + H_sp, W_sp = self.split_size*2, self.split_size*2 + elif idx == 1: + H_sp, W_sp = self.split_size//2, self.split_size//2 + elif idx == 2: + H_sp, W_sp = self.split_size//2, self.split_size*2 + elif idx == 3: + H_sp, W_sp = self.split_size*2, self.split_size//2 + else: + print ("ERROR MODE", idx) + exit(0) + self.H_sp = H_sp + self.W_sp = W_sp + window_size = [H_sp,W_sp] + self.attn_drop = nn.Dropout(attn_drop) + self.window_size = window_size + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start 
from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + + def im2win(self, x, H, W): + B, N, C = x.shape + x = x.transpose(-2,-1).contiguous().view(B, C, H, W) + x = img2windows(x, self.H_sp, self.W_sp) + x = x.reshape(-1, self.H_sp* self.W_sp, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3).contiguous() + return x + + def forward(self, qkv, H, W, mask=None): + """ + Input: qkv: (B, 3*L, C), H, W, mask: (B, N, N), N is the window size + Output: x (B, H, W, C) + """ + q,k,v = qkv[0], qkv[1], qkv[2] + + B, L, C = q.shape + assert L == H * W, "flatten img_tokens has wrong size" + + # partition the q,k,v, image to window + q = self.im2win(q, H, W) + k = self.im2win(k, H, W) + v = self.im2win(v, H, W) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + N = attn.shape[3] + + # use mask for shift window + if mask is not None: + nW = mask.shape[0] + + attn = attn.view(B, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = nn.functional.softmax(attn, dim=-1, dtype=attn.dtype) + attn = self.attn_drop(attn) + + x = (attn @ v) + x = x.transpose(1, 2).reshape(-1, self.H_sp* self.W_sp, C) # B head N N @ B head N C + + # merge the window, window to image + x = windows2img(x, self.H_sp, self.W_sp, H, W) # B H' W' C + + return x + +class Swin_FDWA(nn.Module): + + def __init__(self, dim, num_heads, + window_size=8, window_size_fm=16, shift_size=4, mlp_ratio=4., qkv_bias=False, qk_scale=None, + drop=0., attn_drop=0., drop_path=0., + norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.split_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.norm1 = norm_layer(dim) + self.branch_num = 4 + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(drop) + + self.attns = nn.ModuleList([ + WindowAttention( + dim//self.branch_num , idx = i, + split_size=window_size, num_heads=num_heads//self.branch_num , dim_out=dim//self.branch_num , + qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, position_bias=True) + for i in range(self.branch_num)]) + + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.fm = WindowFrequencyModulation(dim, window_size_fm) + + self.ffn = nn.Sequential( + nn.Linear(dim, 4 * dim), + nn.GELU(), + nn.Linear(4 * dim, dim), + ) + self.norm2 = norm_layer(dim) + + + def calculate_mask(self, H, W,split_size=[8,8]): + # The implementation builds on Swin Transformer code https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py + # calculate attention mask for Rwin + img_mask_0 = torch.zeros((1, H, W, 1)).cpu() # 1 H W 1 idx=0 + shift_size =(split_size[0]//2,split_size[1]//2) + + h_slices_0 = (slice(0, -split_size[0]), + slice(-split_size[0], -shift_size[0]), + slice(-shift_size[0], None)) + w_slices_0 = (slice(0, -split_size[1]), + slice(-split_size[1], -shift_size[1]), + slice(-shift_size[1], None)) + + cnt = 0 + for h in h_slices_0: + for w in w_slices_0: + img_mask_0[:, h, w, :] = cnt + cnt +=1 + + + # calculate mask for H-Shift + img_mask_0 = img_mask_0.view(1, H // split_size[0], split_size[0], W // split_size[1], split_size[1], 1) + img_mask_0 = img_mask_0.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, split_size[0], split_size[1], 1) # nW, sw[0], sw[1], 1 + mask_windows_0 = img_mask_0.view(-1, split_size[0] * split_size[1]) + attn_mask_0 = mask_windows_0.unsqueeze(1) - mask_windows_0.unsqueeze(2) + attn_mask_0 = attn_mask_0.masked_fill(attn_mask_0 != 0, float(-100.0)).masked_fill(attn_mask_0 == 0, float(0.0)) + + # calculate mask for V-Shift + + return attn_mask_0 + + def forward(self, x, x_size): + H , W = x_size + B, L, C = x.shape + assert L == H * W, "flatten img_tokens has wrong size" + img = self.norm1(x) + qkv = self.qkv(img).reshape(B, -1, 3, C).permute(2, 0, 1, 3) # 3, B, HW, C + + if self.shift_size>0: + qkv = qkv.view(3, B, H, W, C) + + qkv0,qkv1,qkv2,qkv3= qkv.chunk(4,4) + qkv_0 = torch.roll(qkv0, shifts=(-self.split_size,-self.split_size), dims=(2, 3)) + qkv_0 = qkv_0.view(3, B, L, C//4) + + qkv_1 = torch.roll(qkv1, shifts=(-self.split_size//4,-self.split_size//4), dims=(2, 3)) + qkv_1 = qkv_1.view(3, B, L, C//4) + + qkv_2 = torch.roll(qkv2, shifts=(-self.split_size//4,-self.split_size), dims=(2, 3)) + qkv_2 = qkv_2.view(3, B, L, C//4) + + qkv_3 = torch.roll(qkv3, shifts=(-self.split_size,-self.split_size//4), dims=(2, 3)) + qkv_3 = qkv_3.view(3, B, L, C//4) + + x1_shift = self.attns[0](qkv_0, H, W) + x2_shift = self.attns[1](qkv_1, H, W) + x3_shift = self.attns[2](qkv_2, H, W) + x4_shift = self.attns[3](qkv_3, H, W) + + x1 = torch.roll(x1_shift, shifts=(self.split_size, self.split_size), dims=(1, 2)) + x2 = torch.roll(x2_shift, shifts=(self.split_size//4, self.split_size//4), dims=(1, 2)) + x3 = torch.roll(x3_shift, shifts=(self.split_size//4, self.split_size), dims=(1, 2)) + x4 = torch.roll(x4_shift, shifts=(self.split_size, self.split_size//4), dims=(1, 2)) + + x1 = x1.view(B, L, C//4) + x2 = x2.view(B, L, C//4) + x3 = x3.view(B, L, C//4) + x4 = x4.view(B, L, C//4) + # Concat + attened_x = torch.cat([x1,x2,x3,x4], dim=2) + + else: + qkv0,qkv1,qkv2,qkv3= qkv.chunk(4,3) + x1 = self.attns[0](qkv0, H, W).view(B, L, C//4) + x2 = self.attns[1](qkv1, H, W).view(B, L, C//4) + x3 = self.attns[2](qkv2, H, W).view(B, L, C//4) + x4 = self.attns[3](qkv3, H, W).view(B, L, C//4) + # Concat + attened_x = torch.cat([x1,x2,x3,x4], dim=2) + + attened_x = self.proj(attened_x) + x = x + self.drop_path(attened_x) + x = x + self.fm(self.ffn(self.norm2(x)),H,W) + + return x + +class WindowFrequencyModulation(nn.Module): + def __init__(self, dim, window_size): + super().__init__() + self.dim = dim + self.window_size = 
window_size + self.ratio = 1 + self.complex_weight= nn.Parameter(torch.cat((torch.ones(self.window_size, self.window_size//2+1, self.ratio*dim, 1, dtype=torch.float32),\ + torch.zeros(self.window_size, self.window_size//2+1, self.ratio*dim, 1, dtype=torch.float32)),dim=-1)) + + def forward(self, x, H, W, spatial_size=None): + B,L,C = x.shape + + x = x.view(B,H,W,self.ratio*C) + B, H,W, C = x.shape + + x = rearrange(x, 'b (w1 p1) (w2 p2) c -> b w1 w2 p1 p2 c', p1=self.window_size, p2=self.window_size) + + x = x.to(torch.float32) + + x= torch.fft.rfft2(x,dim=(3, 4), norm='ortho') + + weight = torch.view_as_complex(self.complex_weight) + x = x * weight + x = torch.fft.irfft2(x, s=(self.window_size, self.window_size), dim=(3, 4), norm='ortho') + + x = rearrange(x, 'b w1 w2 p1 p2 c -> b (w1 p1) (w2 p2) c ') + + x = x.view(B, -1, C) + return x + +class FAT_Block(nn.Module): + def __init__(self, in_channel, trans_dim, head_dim=4, window_size=4, window_size_fm = 4, drop_path=0.01, type='W'): + """ SwinTransformer and Conv Block + """ + super(FAT_Block, self).__init__() + self.trans_dim = trans_dim + self.head_dim = head_dim + + self.drop_path = drop_path + self.type = type + assert self.type in ['W', 'SW'] + + self.trans_block = Swin_FDWA( + dim=trans_dim, + num_heads=head_dim, + window_size=window_size, + window_size_fm=window_size_fm, + shift_size=0 if (type=='W') else window_size//2) + + + self.conv1_1 = nn.Conv2d(self.trans_dim, self.trans_dim, 1, 1, 0, bias=True) + self.conv1_2 = nn.Conv2d(self.trans_dim, self.trans_dim, 1, 1, 0, bias=True) + + self.conv1x1 = Conv(in_channel, trans_dim, 1) if in_channel != trans_dim else nn.Identity() + + def forward(self, x): + x = self.conv1x1(x) + trans_x = self.conv1_1(x) + b,c,h,w = trans_x.shape + trans_x = Rearrange('b c h w -> b (h w)c')(trans_x) + trans_x = self.trans_block(trans_x,(h,w)) + trans_x = Rearrange('b (h w) c -> b c h w', h=h, w=w)(trans_x) + + res = self.conv1_2(trans_x) + + x = x + res + return x + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = FAT_Block(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/module/LEGM.py b/engine/extre_module/custom_nn/module/LEGM.py new file mode 100644 index 00000000..3d31ff14 --- /dev/null +++ b/engine/extre_module/custom_nn/module/LEGM.py @@ -0,0 +1,280 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/CVPR2024-LEGM.png +论文链接:https://arxiv.org/pdf/2403.01105 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm.layers import trunc_normal_ + +from engine.extre_module.ultralytics_nn.conv import Conv + +# LEGM模块描述 +# 1. 
LEGM模块的应用场景与解决的问题 +# Local Feature-Embedded Global Feature Extraction Module(LEGM)是一种高效的特征提取架构,专为需要兼顾局部细节与全局上下文的复杂视觉任务设计。该模块特别适用于目标检测、图像分类、语义分割以及视频分析等计算机视觉任务。LEGM通过其独特的多尺度特征融合与注意力机制,解决了传统模型在处理复杂场景时对局部特征与全局特征整合不足的问题,为高精度和高效能的特征表示提供了全新的解决方案。 +# 具体而言,LEGM能够有效应对以下挑战: + +# 局部与全局特征的平衡:传统卷积神经网络或Transformer架构在捕捉全局上下文时可能忽略局部细节,而LEGM通过嵌入局部特征的全局特征提取机制,实现了两者的无缝融合。 +# 计算效率与性能的矛盾:LEGM结合了卷积操作与窗口化注意力机制,显著降低了计算复杂度,同时保持了强大的特征表达能力,适用于资源受限的实时应用场景。 +# 尺度变化与背景干扰的鲁棒性:通过动态窗口移位和多头注意力设计,LEGM增强了模型对多尺度目标和复杂背景的适应能力,特别适合处理高分辨率或动态场景的视觉任务。 + +# 2. LEGM模块的创新点与优点 +# LEGM模块在设计上融合了多种前沿理念,展现出显著的创新性与工程价值。其创新点和优点包括以下几个方面: +# 创新点: + +# 局部特征嵌入的全局注意力机制LEGM通过WATT(Window-based Attention with Relative Position Bias)模块,将局部特征的卷积处理与全局特征的注意力机制深度结合。这种局部嵌入式全局提取策略,不仅增强了特征的语义丰富性,还显著提升了模型对复杂场景的理解能力。 + +# 动态窗口移位与混合卷积策略模块创新性地引入了动态窗口移位机制(shift_size),配合多种卷积类型(如深度可分离卷积和标准卷积),实现了灵活的特征捕捉方式。这种设计有效扩展了感受野,同时降低了计算开销。 + +# 自适应层归一化(LayNormal)设计LEGM引入了LayNormal模块,通过学习输入特征的均值和方差动态调整归一化参数,增强了模型对不同数据分布的适应性。这种自适应归一化机制为特征处理提供了更高的灵活性和鲁棒性。 + + +# 优点: + +# 高效的多尺度特征融合相较于传统的Transformer或CNN模块,LEGM在保持全局特征提取能力的同时,通过局部特征的嵌入式设计,显著提升了多尺度目标的检测精度,尤其适用于高分辨率图像处理。 + +# 模块化与灵活性LEGM采用高度模块化的设计,参数(如窗口大小、头数、卷积类型等)可根据任务需求灵活调整,使其易于集成到多种主流架构(如ResNet、YOLO或Swin Transformer)中,具有广泛的适用性。 + +# 轻量化和高性能的协同优化通过结合深度可分离卷积和窗口化注意力机制,LEGM在大幅降低计算量和参数量的同时,依然保持了卓越的性能表现,非常适合边缘设备或实时推理场景。 + +# 综上所述,LEGM模块以其创新的局部嵌入式全局特征提取机制、动态窗口化注意力设计以及自适应归一化策略,为计算机视觉领域提供了一种高效、灵活且鲁棒的特征提取解决方案。其独特的设计理念不仅推动了高性能视觉模型的发展,也为学术研究与工业应用开辟了新的可能性。 + + + +def window_partition(x, window_size): + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size**2, C) + return windows + + +def window_reverse(windows, window_size, H, W): + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + +def get_relative_positions(window_size): + coords_h = torch.arange(window_size) + coords_w = torch.arange(window_size) + # coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing='xy')) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) + coords_flatten = torch.flatten(coords, 1) + relative_positions = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_positions = relative_positions.permute(1, 2, 0).contiguous() + relative_positions_log = torch.sign(relative_positions) * torch.log(1. 
+ relative_positions.abs()) + return relative_positions_log + +class WATT(nn.Module): + def __init__(self, dim, window_size, num_heads): + + super().__init__() + self.dim = dim + self.window_size = window_size + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim ** -0.5 + + relative_positions = get_relative_positions(self.window_size) + self.register_buffer("relative_positions", relative_positions) + self.meta = nn.Sequential( + nn.Linear(2, 256, bias=True), + nn.ReLU(True), + nn.Linear(256, num_heads, bias=True) + ) + + self.softmax = nn.Softmax(dim=-1) + + def forward(self, qkv): + B_, N, _ = qkv.shape + qkv = qkv.reshape(B_, N, 3, self.num_heads, self.dim // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + relative_position_bias = self.meta(self.relative_positions) + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() + attn = attn + relative_position_bias.unsqueeze(0) + attn = self.softmax(attn) + x = (attn @ v).transpose(1, 2).reshape(B_, N, self.dim) + return x + +class Att(nn.Module): + def __init__(self, dim, num_heads, window_size, shift_size, use_attn=False, conv_type=None): + super().__init__() + self.dim = dim + self.head_dim = int(dim // num_heads) + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.use_attn = use_attn + self.conv_type = conv_type + + if self.conv_type == 'Conv': + self.conv = nn.Sequential( + nn.Conv2d(dim, dim, kernel_size=3, padding=1, padding_mode='reflect'), + nn.ReLU(True), + nn.Conv2d(dim, dim, kernel_size=3, padding=1, padding_mode='reflect') + ) + + if self.conv_type == 'DWConv': + self.conv = nn.Conv2d(dim, dim, kernel_size=5, padding=2, groups=dim, padding_mode='reflect') + if self.conv_type == 'DWConv' or self.use_attn: + self.V = nn.Conv2d(dim, dim, 1) + self.proj = nn.Conv2d(dim, dim, 1) + if self.use_attn: + self.QK = nn.Conv2d(dim, dim * 2, 1) + self.attn = WATT(dim, window_size, num_heads) + + def check_size(self, x, shift=False): + _, _, h, w = x.size() + mod_pad_h = (self.window_size - h % self.window_size) % self.window_size + mod_pad_w = (self.window_size - w % self.window_size) % self.window_size + + if shift: + x = F.pad(x, (self.shift_size, (self.window_size-self.shift_size+mod_pad_w) % self.window_size, + self.shift_size, (self.window_size-self.shift_size+mod_pad_h) % self.window_size), mode='reflect') + else: + x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h), 'reflect') + return x + + def forward(self, X): + B, C, H, W = X.shape + + if self.conv_type == 'DWConv' or self.use_attn: + V = self.V(X) + + if self.use_attn: + QK = self.QK(X) + QKV = torch.cat([QK, V], dim=1) + + # shift + shifted_QKV = self.check_size(QKV, self.shift_size > 0) + Ht, Wt = shifted_QKV.shape[2:] + + # partition windows + shifted_QKV = shifted_QKV.permute(0, 2, 3, 1) + qkv = window_partition(shifted_QKV, self.window_size) # nW*B, window_size**2, C + + attn_windows = self.attn(qkv) + + # merge windows + shifted_out = window_reverse(attn_windows, self.window_size, Ht, Wt) # B H' W' C + + # reverse cyclic shift + out = shifted_out[:, self.shift_size:(self.shift_size+H), self.shift_size:(self.shift_size+W), :] + attn_out = out.permute(0, 3, 1, 2) + + if self.conv_type in ['Conv', 'DWConv']: + conv_out = self.conv(V) + out = self.proj(conv_out + attn_out) + else: + out = self.proj(attn_out) + + else: + if self.conv_type == 'Conv': + out = self.conv(X) + elif self.conv_type == 'DWConv': + 
out = self.proj(self.conv(V)) + + return out + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.mlp = nn.Sequential( + nn.Conv2d(in_features, hidden_features, 1), + nn.ReLU(True), + nn.Conv2d(hidden_features, out_features, 1) + ) + + def forward(self, x): + return self.mlp(x) + +class LayNormal(nn.Module): + def __init__(self, dim, eps=1e-5, detach_grad=False): + super(LayNormal, self).__init__() + self.eps = eps + self.detach_grad = detach_grad + self.weight = nn.Parameter(torch.ones((1, dim, 1, 1))) + self.bias = nn.Parameter(torch.zeros((1, dim, 1, 1))) + self.meta1 = nn.Conv2d(1, dim, 1) + self.meta2 = nn.Conv2d(1, dim, 1) + trunc_normal_(self.meta1.weight, std=.02) + nn.init.constant_(self.meta1.bias, 1) + trunc_normal_(self.meta2.weight, std=.02) + nn.init.constant_(self.meta2.bias, 0) + + def forward(self, input): + mean = torch.mean(input, dim=(1, 2, 3), keepdim=True) + std = torch.sqrt((input - mean).pow(2).mean(dim=(1, 2, 3), keepdim=True) + self.eps) + normalized_input = (input - mean) / std + if self.detach_grad: + rescale, rebias = self.meta1(std.detach()), self.meta2(mean.detach()) + else: + rescale, rebias = self.meta1(std), self.meta2(mean) + out = normalized_input * self.weight + self.bias + return out, rescale, rebias + +class LEGM(nn.Module): + def __init__(self, inc, dim, num_heads=8, mlp_ratio=4., + norm_layer=LayNormal, mlp_norm=False, + window_size=8, shift_size=0, use_attn=True, conv_type=None): + super().__init__() + self.use_attn = use_attn + self.mlp_norm = mlp_norm + + self.norm1 = norm_layer(dim) if use_attn else nn.Identity() + self.attn = Att(dim, num_heads=num_heads, window_size=window_size, + shift_size=shift_size, use_attn=use_attn, conv_type=conv_type) + + self.norm2 = norm_layer(dim) if use_attn and mlp_norm else nn.Identity() + self.mlp = Mlp(dim, hidden_features=int(dim * mlp_ratio)) + + self.conv1x1 = Conv(inc, dim, 1) if inc != dim else nn.Identity() + + def forward(self, x): + x = self.conv1x1(x) + + identity = x + if self.use_attn: x, rescale, rebias = self.norm1(x) + x = self.attn(x) + if self.use_attn: x = x * rescale + rebias + x = identity + x + + identity = x + if self.use_attn and self.mlp_norm: x, rescale, rebias = self.norm2(x) + x = self.mlp(x) + if self.use_attn and self.mlp_norm: x = x * rescale + rebias + x = identity + x + + return x + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = LEGM(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/module/MSBlock.py b/engine/extre_module/custom_nn/module/MSBlock.py new file mode 100644 index 00000000..30ec8772 --- /dev/null +++ b/engine/extre_module/custom_nn/module/MSBlock.py @@ -0,0 +1,83 @@ +''' +本文件由BiliBili:魔傀面具整理 
+engine/extre_module/module_images/TPAMI2025-MSBlock.png +论文链接:https://arxiv.org/abs/2308.05480 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn + +from engine.extre_module.ultralytics_nn.conv import Conv + +class MSBlockLayer(nn.Module): + def __init__(self, inc, ouc, k) -> None: + super().__init__() + + self.in_conv = Conv(inc, ouc, 1) + self.mid_conv = Conv(ouc, ouc, k, g=ouc) + self.out_conv = Conv(ouc, inc, 1) + + def forward(self, x): + return self.out_conv(self.mid_conv(self.in_conv(x))) + +class MSBlock(nn.Module): + def __init__(self, inc, ouc, kernel_sizes=[1, 3, 3], in_expand_ratio=3., mid_expand_ratio=2., layers_num=3, in_down_ratio=2.) -> None: + super().__init__() + + in_channel = int(inc * in_expand_ratio // in_down_ratio) + self.mid_channel = in_channel // len(kernel_sizes) + groups = int(self.mid_channel * mid_expand_ratio) + self.in_conv = Conv(inc, in_channel) + + self.mid_convs = [] + for kernel_size in kernel_sizes: + if kernel_size == 1: + self.mid_convs.append(nn.Identity()) + continue + mid_convs = [MSBlockLayer(self.mid_channel, groups, k=kernel_size) for _ in range(int(layers_num))] + self.mid_convs.append(nn.Sequential(*mid_convs)) + self.mid_convs = nn.ModuleList(self.mid_convs) + self.out_conv = Conv(in_channel, ouc, 1) + + self.attention = None + + def forward(self, x): + out = self.in_conv(x) + channels = [] + for i,mid_conv in enumerate(self.mid_convs): + channel = out[:,i * self.mid_channel:(i+1) * self.mid_channel,...] + if i >= 1: + channel = channel + channels[i-1] + channel = mid_conv(channel) + channels.append(channel) + out = torch.cat(channels, dim=1) + out = self.out_conv(out) + if self.attention is not None: + out = self.attention(out) + return out + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = MSBlock(in_channel, out_channel, kernel_sizes=[1, 3, 3], layers_num=3).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/module/MSCB.py b/engine/extre_module/custom_nn/module/MSCB.py new file mode 100644 index 00000000..4ddf8bfa --- /dev/null +++ b/engine/extre_module/custom_nn/module/MSCB.py @@ -0,0 +1,162 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/CVPR2024-MSCB.png +论文链接:https://arxiv.org/abs/2405.06880 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch, math +import torch.nn as nn + +from engine.extre_module.ultralytics_nn.conv import Conv + +# MSCB模块 +# 1. 
MSCB模块适合的任务及解决的问题 +# 多尺度卷积模块(MSCB, Multi-Scale Convolution Block)是一种创新性的深度学习架构,特别适用于需要高效捕获多尺度特征的计算机视觉任务。该模块在图像分类、目标检测、语义分割以及场景理解等任务中表现出色,尤其是在处理复杂场景或具有多样化尺度特征的图像数据时,能够显著提升模型性能。 +# MSCB模块通过结合点态卷积(pointwise convolution)和多尺度深度可分离卷积(depthwise separable convolution),解决了传统卷积神经网络在特征提取中的两个关键问题:尺度不变性不足和计算复杂度高。传统卷积操作通常采用单一尺度的卷积核,难以同时捕捉图像中不同尺度的语义信息,而MSCB通过并行或串行的多尺度卷积核设计,能够灵活适应目标对象的尺度变化,从而增强模型对复杂场景的鲁棒性。此外,MSCB模块通过深度可分离卷积显著降低了计算量和参数量,使其特别适合在资源受限的设备(如移动端或嵌入式系统)上部署高效的视觉模型。 +# 在实际应用中,MSCB模块尤其适用于以下场景: + +# 高分辨率图像处理:如医学影像分析中,MSCB能够捕获从细微病灶到整体器官结构的多样化特征。 +# 实时视觉任务:如自动驾驶中的目标检测,MSCB能够在保持高精度的同时降低推理延迟。 +# 轻量化网络设计:在边缘设备上运行的模型中,MSCB通过高效的多尺度特征提取实现性能与效率的平衡。 + +# 2. MSCB模块的创新点与优点 +# MSCB模块的创新性体现在其独特的多尺度特征融合机制、高效的计算设计以及灵活的架构适配能力。以下是其核心创新点与优点的详细分析: +# 创新点 + +# 多尺度特征自适应融合MSCB通过引入多尺度深度可分离卷积(MSDC),在单一模块内并行或串行地处理多种卷积核尺度(如1x1、3x3、5x5),从而捕获从局部细节到全局语义的多层次特征。与传统的多尺度方法(如Inception模块)不同,MSCB通过深度可分离卷积大幅降低了计算复杂度,同时通过通道混洗(channel shuffle)操作优化了跨尺度特征的交互与融合,提升了特征表达能力。 + +# 动态并行/串行模式MSCB模块支持并行或串行的深度卷积模式(通过dw_parallel参数控制),这种灵活性使其能够根据任务需求动态调整特征提取策略。并行模式适合需要同时提取多种尺度特征的场景,而串行模式则通过逐层累加特征增强了模型的深度表达能力。这种设计为网络架构的定制化提供了新的可能性。 + +# 通道混洗优化特征交互MSCB在多尺度特征融合后引入了通道混洗机制,通过分组重排通道的方式增强了不同尺度特征之间的信息流动。这种操作不仅降低了通道间的冗余性,还显著提升了模型对复杂模式的学习能力,尤其是在处理高维特征图时表现突出。 + +# 高效的残差连接与扩展因子MSCB通过引入扩展因子(expansion factor)在点态卷积中动态调整通道数,从而在保持轻量化的同时增强了特征表达能力。此外,当步幅为1时,MSCB支持残差连接(skip connection),通过身份映射保留原始输入信息,进一步缓解了深层网络中的梯度消失问题,同时提升了训练稳定性。 + +# 优点 + +# 高效性与轻量化MSCB采用深度可分离卷积替代传统卷积操作,显著降低了参数量和计算复杂度。例如,与标准3x3卷积相比,深度可分离卷积的计算量可减少至原来的1/9。这种高效性使得MSCB非常适合资源受限场景,同时在高性能硬件上也能实现更快的推理速度。 + +# 多尺度特征提取的鲁棒性通过多尺度卷积核的协同工作,MSCB能够有效捕获图像中不同尺度和语义层次的特征。这种能力在处理具有尺度变化的目标(如远近不同的物体)或复杂背景的场景时尤为重要,显著提升了模型的泛化能力。 + +# 模块化与通用性MSCB作为一个高度模块化的组件,可以无缝集成到现有的卷积神经网络架构中(如ResNet、MobileNet等),无需对整体网络结构进行大幅修改。其灵活的参数配置(如卷积核大小、步幅、扩展因子等)使其能够适配多种任务需求。 + +# 支持边缘部署由于其低计算复杂度和高效的特征提取能力,MSCB特别适合在边缘设备上部署轻量化模型。例如,在移动端的目标检测任务中,MSCB能够在保持高精度的同时显著降低功耗和延迟。 + +# 总结 +# MSCB模块通过创新性的多尺度特征提取、高效的深度可分离卷积以及灵活的架构设计,为计算机视觉任务提供了一种兼具高性能和低复杂度的解决方案。其在尺度不变性、计算效率和模型鲁棒性方面的突破,使其在学术研究和工业应用中均具有广阔的前景。无论是用于高精度的图像分类,还是资源受限的边缘计算,MSCB都能为深度学习模型注入新的活力。 + + +class MSDC(nn.Module): + def __init__(self, in_channels, kernel_sizes, stride, dw_parallel=True): + super(MSDC, self).__init__() + + self.in_channels = in_channels + self.kernel_sizes = kernel_sizes + self.dw_parallel = dw_parallel + + self.dwconvs = nn.ModuleList([ + nn.Sequential( + Conv(self.in_channels, self.in_channels, kernel_size, s=stride, g=self.in_channels) + ) + for kernel_size in self.kernel_sizes + ]) + + def forward(self, x): + # Apply the convolution layers in a loop + outputs = [] + for dwconv in self.dwconvs: + dw_out = dwconv(x) + outputs.append(dw_out) + if self.dw_parallel == False: + x = x+dw_out + # You can return outputs based on what you intend to do with them + return outputs + +class MSCB(nn.Module): + """ + Multi-scale convolution block (MSCB) + """ + def __init__(self, in_channels, out_channels, kernel_sizes=[1,3,5], stride=1, expansion_factor=2, dw_parallel=True, add=True): + super(MSCB, self).__init__() + + self.in_channels = in_channels + self.out_channels = out_channels + self.stride = stride + self.kernel_sizes = kernel_sizes + self.expansion_factor = expansion_factor + self.dw_parallel = dw_parallel + self.add = add + self.n_scales = len(self.kernel_sizes) + # check stride value + assert self.stride in [1, 2] + # Skip connection if stride is 1 + self.use_skip_connection = True if self.stride == 1 else False + + # expansion factor + self.ex_channels = int(self.in_channels * self.expansion_factor) + self.pconv1 = nn.Sequential( + # pointwise convolution + Conv(self.in_channels, 
self.ex_channels, 1) + ) + self.msdc = MSDC(self.ex_channels, self.kernel_sizes, self.stride, dw_parallel=self.dw_parallel) + if self.add == True: + self.combined_channels = self.ex_channels*1 + else: + self.combined_channels = self.ex_channels*self.n_scales + self.pconv2 = nn.Sequential( + # pointwise convolution + Conv(self.combined_channels, self.out_channels, 1, act=False) + ) + if self.use_skip_connection and (self.in_channels != self.out_channels): + self.conv1x1 = nn.Conv2d(self.in_channels, self.out_channels, 1, 1, 0, bias=False) + + def forward(self, x): + pout1 = self.pconv1(x) + msdc_outs = self.msdc(pout1) + if self.add == True: + dout = 0 + for dwout in msdc_outs: + dout = dout + dwout + else: + dout = torch.cat(msdc_outs, dim=1) + dout = self.channel_shuffle(dout, math.gcd(self.combined_channels,self.out_channels)) + out = self.pconv2(dout) + if self.use_skip_connection: + if self.in_channels != self.out_channels: + x = self.conv1x1(x) + return x + out + else: + return out + + def channel_shuffle(self, x, groups): + batchsize, num_channels, height, width = x.data.size() + channels_per_group = num_channels // groups + x = x.view(batchsize, groups, channels_per_group, height, width) + x = torch.transpose(x, 1, 2).contiguous() + x = x.view(batchsize, -1, height, width) + return x + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = MSCB(in_channel, out_channel, kernel_sizes=[1, 3, 5]).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/module/StripBlock.py b/engine/extre_module/custom_nn/module/StripBlock.py new file mode 100644 index 00000000..2a2ce5d9 --- /dev/null +++ b/engine/extre_module/custom_nn/module/StripBlock.py @@ -0,0 +1,114 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/StripBlock.png +论文链接:https://arxiv.org/pdf/2501.03775 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +from timm.layers import DropPath + +from engine.extre_module.ultralytics_nn.conv import Conv, DWConv + +class StripMlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2d(in_features, hidden_features, 1) + self.dwconv = DWConv(hidden_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Conv2d(hidden_features, out_features, 1) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.dwconv(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + +class Strip_Block(nn.Module): + def __init__(self, dim, k1, k2): + super().__init__() + self.conv0 
= nn.Conv2d(dim, dim, 5, padding=2, groups=dim) + self.conv_spatial1 = nn.Conv2d(dim,dim,kernel_size=(k1, k2), stride=1, padding=(k1//2, k2//2), groups=dim) + self.conv_spatial2 = nn.Conv2d(dim,dim,kernel_size=(k2, k1), stride=1, padding=(k2//2, k1//2), groups=dim) + + self.conv1 = nn.Conv2d(dim, dim, 1) + + def forward(self, x): + attn = self.conv0(x) + attn = self.conv_spatial1(attn) + attn = self.conv_spatial2(attn) + attn = self.conv1(attn) + + return x * attn + +class Strip_Attention(nn.Module): + def __init__(self, d_model,k1,k2): + super().__init__() + self.proj_1 = nn.Conv2d(d_model, d_model, 1) + self.activation = nn.GELU() + self.spatial_gating_unit = Strip_Block(d_model,k1,k2) + self.proj_2 = nn.Conv2d(d_model, d_model, 1) + + def forward(self, x): + shorcut = x.clone() + x = self.proj_1(x) + x = self.activation(x) + # x = self.spatial_gating_unit(x) + x = self.proj_2(x) + x = x + shorcut + return x + +class StripBlock(nn.Module): + def __init__(self, inc, dim, mlp_ratio=4., k1=1, k2=19, drop=0.,drop_path=0., act_layer=nn.GELU): + super().__init__() + self.norm1 = nn.BatchNorm2d(dim) + self.norm2 = nn.BatchNorm2d(dim) + self.attn = Strip_Attention(dim, k1, k2) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = StripMlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + layer_scale_init_value = 1e-2 + self.layer_scale_1 = nn.Parameter( + layer_scale_init_value * torch.ones((dim)), requires_grad=True) + self.layer_scale_2 = nn.Parameter( + layer_scale_init_value * torch.ones((dim)), requires_grad=True) + + self.conv1x1 = Conv(inc, dim, k=1) if inc != dim else nn.Identity() + + def forward(self, x): + x = self.conv1x1(x) + x = x + self.drop_path(self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * self.attn(self.norm1(x))) + x = x + self.drop_path(self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * self.mlp(self.norm2(x))) + return x + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = StripBlock(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/module/UniRepLKBlock.py b/engine/extre_module/custom_nn/module/UniRepLKBlock.py new file mode 100644 index 00000000..aaea8ffc --- /dev/null +++ b/engine/extre_module/custom_nn/module/UniRepLKBlock.py @@ -0,0 +1,362 @@ + +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/CVPR2024-UniRepLKNetBlock.png +论文链接:https://arxiv.org/abs/2311.15599 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from timm.layers import trunc_normal_, DropPath, to_2tuple + +from engine.extre_module.ultralytics_nn.conv 
import Conv +from engine.extre_module.torch_utils import model_fuse_test + +#================== This function decides which conv implementation (the native or iGEMM) to use +# Note that iGEMM large-kernel conv impl will be used if +# - you attempt to do so (attempt_to_use_large_impl=True), and +# - it has been installed (follow https://github.com/AILab-CVC/UniRepLKNet), and +# - the conv layer is depth-wise, stride = 1, non-dilated, kernel_size > 5, and padding == kernel_size // 2 +def get_conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, + attempt_use_lk_impl=True): + kernel_size = to_2tuple(kernel_size) + if padding is None: + padding = (kernel_size[0] // 2, kernel_size[1] // 2) + else: + padding = to_2tuple(padding) + need_large_impl = kernel_size[0] == kernel_size[1] and kernel_size[0] > 5 and padding == (kernel_size[0] // 2, kernel_size[1] // 2) + + if attempt_use_lk_impl and need_large_impl: + # print('---------------- trying to import iGEMM implementation for large-kernel conv') + try: + from depthwise_conv2d_implicit_gemm import DepthWiseConv2dImplicitGEMM + # print('---------------- found iGEMM implementation ') + except: + DepthWiseConv2dImplicitGEMM = None + # print('---------------- found no iGEMM. use original conv. follow https://github.com/AILab-CVC/UniRepLKNet to install it.') + if DepthWiseConv2dImplicitGEMM is not None and need_large_impl and in_channels == out_channels \ + and out_channels == groups and stride == 1 and dilation == 1: + # print(f'===== iGEMM Efficient Conv Impl, channels {in_channels}, kernel size {kernel_size} =====') + return DepthWiseConv2dImplicitGEMM(in_channels, kernel_size, bias=bias) + return nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, + padding=padding, dilation=dilation, groups=groups, bias=bias) + +def get_bn(dim, use_sync_bn=False): + if use_sync_bn: + return nn.SyncBatchNorm(dim) + else: + return nn.BatchNorm2d(dim) + +def fuse_bn(conv, bn): + conv_bias = 0 if conv.bias is None else conv.bias + std = (bn.running_var + bn.eps).sqrt() + return conv.weight * (bn.weight / std).reshape(-1, 1, 1, 1), bn.bias + (conv_bias - bn.running_mean) * bn.weight / std + +def convert_dilated_to_nondilated(kernel, dilate_rate): + identity_kernel = torch.ones((1, 1, 1, 1)).to(kernel.device) + if kernel.size(1) == 1: + # This is a DW kernel + dilated = F.conv_transpose2d(kernel, identity_kernel, stride=dilate_rate) + return dilated + else: + # This is a dense or group-wise (but not DW) kernel + slices = [] + for i in range(kernel.size(1)): + dilated = F.conv_transpose2d(kernel[:,i:i+1,:,:], identity_kernel, stride=dilate_rate) + slices.append(dilated) + return torch.cat(slices, dim=1) + +def merge_dilated_into_large_kernel(large_kernel, dilated_kernel, dilated_r): + large_k = large_kernel.size(2) + dilated_k = dilated_kernel.size(2) + equivalent_kernel_size = dilated_r * (dilated_k - 1) + 1 + equivalent_kernel = convert_dilated_to_nondilated(dilated_kernel, dilated_r) + rows_to_pad = large_k // 2 - equivalent_kernel_size // 2 + merged_kernel = large_kernel + F.pad(equivalent_kernel, [rows_to_pad] * 4) + return merged_kernel + +class NCHWtoNHWC(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x.permute(0, 2, 3, 1) + +class NHWCtoNCHW(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x.permute(0, 3, 1, 2) + +class GRNwithNHWC(nn.Module): + """ GRN (Global Response Normalization) layer + 
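Computes the per-channel global response Gx = ||x||_2 over (H, W), normalises it by its cross-channel mean
+    (Nx = Gx / mean_C(Gx)), and rescales the input as (gamma * Nx + 1) * x plus an optional beta bias.
+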
Originally proposed in ConvNeXt V2 (https://arxiv.org/abs/2301.00808) + This implementation is more efficient than the original (https://github.com/facebookresearch/ConvNeXt-V2) + We assume the inputs to this layer are (N, H, W, C) + """ + def __init__(self, dim, use_bias=True): + super().__init__() + self.use_bias = use_bias + self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim)) + if self.use_bias: + self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim)) + + def forward(self, x): + Gx = torch.norm(x, p=2, dim=(1, 2), keepdim=True) + Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6) + if self.use_bias: + return (self.gamma * Nx + 1) * x + self.beta + else: + return (self.gamma * Nx + 1) * x + +class SEBlock(nn.Module): + """ + Squeeze-and-Excitation Block proposed in SENet (https://arxiv.org/abs/1709.01507) + We assume the inputs to this layer are (N, C, H, W) + """ + def __init__(self, input_channels, internal_neurons): + super(SEBlock, self).__init__() + self.down = nn.Conv2d(in_channels=input_channels, out_channels=internal_neurons, + kernel_size=1, stride=1, bias=True) + self.up = nn.Conv2d(in_channels=internal_neurons, out_channels=input_channels, + kernel_size=1, stride=1, bias=True) + self.input_channels = input_channels + self.nonlinear = nn.ReLU(inplace=True) + self.gap = nn.AdaptiveAvgPool2d((1, 1)) + + def forward(self, inputs): + x = self.gap(inputs) + x = self.down(x) + x = self.nonlinear(x) + x = self.up(x) + x = F.sigmoid(x) + return inputs * x.view(-1, self.input_channels, 1, 1) + +class DilatedReparamBlock(nn.Module): + """ + Dilated Reparam Block proposed in UniRepLKNet (https://github.com/AILab-CVC/UniRepLKNet) + We assume the inputs to this block are (N, C, H, W) + """ + def __init__(self, channels, kernel_size, deploy=False, use_sync_bn=False, attempt_use_lk_impl=True): + super().__init__() + self.lk_origin = get_conv2d(channels, channels, kernel_size, stride=1, + padding=kernel_size//2, dilation=1, groups=channels, bias=deploy, + attempt_use_lk_impl=attempt_use_lk_impl) + self.attempt_use_lk_impl = attempt_use_lk_impl + + # Default settings. We did not tune them carefully. Different settings may work better. 
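+        # Each supported kernel_size is re-parameterised as the dense depth-wise lk_origin conv plus several small
+        # dilated depth-wise branches whose equivalent size r * (k - 1) + 1 never exceeds kernel_size; at inference
+        # time convert_to_deploy() folds every branch (and its BN) back into a single large-kernel conv.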
+ if kernel_size == 17: + self.kernel_sizes = [5, 9, 3, 3, 3] + self.dilates = [1, 2, 4, 5, 7] + elif kernel_size == 15: + self.kernel_sizes = [5, 7, 3, 3, 3] + self.dilates = [1, 2, 3, 5, 7] + elif kernel_size == 13: + self.kernel_sizes = [5, 7, 3, 3, 3] + self.dilates = [1, 2, 3, 4, 5] + elif kernel_size == 11: + self.kernel_sizes = [5, 5, 3, 3, 3] + self.dilates = [1, 2, 3, 4, 5] + elif kernel_size == 9: + self.kernel_sizes = [5, 5, 3, 3] + self.dilates = [1, 2, 3, 4] + elif kernel_size == 7: + self.kernel_sizes = [5, 3, 3] + self.dilates = [1, 2, 3] + elif kernel_size == 5: + self.kernel_sizes = [3, 3] + self.dilates = [1, 2] + else: + raise ValueError('Dilated Reparam Block requires kernel_size >= 5') + + if not deploy: + self.origin_bn = get_bn(channels, use_sync_bn) + for k, r in zip(self.kernel_sizes, self.dilates): + self.__setattr__('dil_conv_k{}_{}'.format(k, r), + nn.Conv2d(in_channels=channels, out_channels=channels, kernel_size=k, stride=1, + padding=(r * (k - 1) + 1) // 2, dilation=r, groups=channels, + bias=False)) + self.__setattr__('dil_bn_k{}_{}'.format(k, r), get_bn(channels, use_sync_bn=use_sync_bn)) + + def forward(self, x): + if not hasattr(self, 'origin_bn'): # deploy mode + return self.lk_origin(x) + out = self.origin_bn(self.lk_origin(x)) + for k, r in zip(self.kernel_sizes, self.dilates): + conv = self.__getattr__('dil_conv_k{}_{}'.format(k, r)) + bn = self.__getattr__('dil_bn_k{}_{}'.format(k, r)) + out = out + bn(conv(x)) + return out + + def convert_to_deploy(self): + if hasattr(self, 'origin_bn'): + origin_k, origin_b = fuse_bn(self.lk_origin, self.origin_bn) + for k, r in zip(self.kernel_sizes, self.dilates): + conv = self.__getattr__('dil_conv_k{}_{}'.format(k, r)) + bn = self.__getattr__('dil_bn_k{}_{}'.format(k, r)) + branch_k, branch_b = fuse_bn(conv, bn) + origin_k = merge_dilated_into_large_kernel(origin_k, branch_k, r) + origin_b += branch_b + merged_conv = get_conv2d(origin_k.size(0), origin_k.size(0), origin_k.size(2), stride=1, + padding=origin_k.size(2)//2, dilation=1, groups=origin_k.size(0), bias=True, + attempt_use_lk_impl=self.attempt_use_lk_impl) + merged_conv.weight.data = origin_k + merged_conv.bias.data = origin_b + self.lk_origin = merged_conv + self.__delattr__('origin_bn') + for k, r in zip(self.kernel_sizes, self.dilates): + self.__delattr__('dil_conv_k{}_{}'.format(k, r)) + self.__delattr__('dil_bn_k{}_{}'.format(k, r)) + + +class UniRepLKNetBlock(nn.Module): + def __init__(self, + inc, + dim, + kernel_size, + drop_path=0., + layer_scale_init_value=1e-6, + deploy=False, + attempt_use_lk_impl=True, + with_cp=False, + use_sync_bn=False, + ffn_factor=4): + super().__init__() + self.with_cp = with_cp + # if deploy: + # print('------------------------------- Note: deploy mode') + # if self.with_cp: + # print('****** note with_cp = True, reduce memory consumption but may slow down training ******') + + if inc != dim: + self.conv1x1 = Conv(inc, dim, 1) + else: + self.conv1x1 = nn.Identity() + + self.need_contiguous = (not deploy) or kernel_size >= 7 + + if kernel_size == 0: + self.dwconv = nn.Identity() + self.norm = nn.Identity() + elif deploy: + self.dwconv = get_conv2d(dim, dim, kernel_size=kernel_size, stride=1, padding=kernel_size // 2, + dilation=1, groups=dim, bias=True, + attempt_use_lk_impl=attempt_use_lk_impl) + self.norm = nn.Identity() + elif kernel_size >= 7: + self.dwconv = DilatedReparamBlock(dim, kernel_size, deploy=deploy, + use_sync_bn=use_sync_bn, + attempt_use_lk_impl=attempt_use_lk_impl) + self.norm = get_bn(dim, 
use_sync_bn=use_sync_bn) + elif kernel_size == 1: + self.dwconv = nn.Conv2d(dim, dim, kernel_size=kernel_size, stride=1, padding=kernel_size // 2, + dilation=1, groups=1, bias=deploy) + self.norm = get_bn(dim, use_sync_bn=use_sync_bn) + else: + assert kernel_size in [3, 5] + self.dwconv = nn.Conv2d(dim, dim, kernel_size=kernel_size, stride=1, padding=kernel_size // 2, + dilation=1, groups=dim, bias=deploy) + self.norm = get_bn(dim, use_sync_bn=use_sync_bn) + + self.se = SEBlock(dim, dim // 4) + + ffn_dim = int(ffn_factor * dim) + self.pwconv1 = nn.Sequential( + NCHWtoNHWC(), + nn.Linear(dim, ffn_dim)) + self.act = nn.Sequential( + nn.GELU(), + GRNwithNHWC(ffn_dim, use_bias=not deploy)) + if deploy: + self.pwconv2 = nn.Sequential( + nn.Linear(ffn_dim, dim), + NHWCtoNCHW()) + else: + self.pwconv2 = nn.Sequential( + nn.Linear(ffn_dim, dim, bias=False), + NHWCtoNCHW(), + get_bn(dim, use_sync_bn=use_sync_bn)) + + self.gamma = nn.Parameter(layer_scale_init_value * torch.ones(dim), + requires_grad=True) if (not deploy) and layer_scale_init_value is not None \ + and layer_scale_init_value > 0 else None + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, inputs): + def _f(x): + if self.need_contiguous: + x = x.contiguous() + y = self.se(self.norm(self.dwconv(x))) + y = self.pwconv2(self.act(self.pwconv1(y))) + if self.gamma is not None: + y = self.gamma.view(1, -1, 1, 1) * y + return self.drop_path(y) + x + + inputs = self.conv1x1(inputs) + + if self.with_cp and inputs.requires_grad: + return checkpoint.checkpoint(_f, inputs) + else: + return _f(inputs) + + def convert_to_deploy(self): + if hasattr(self.dwconv, 'convert_to_deploy'): + self.dwconv.convert_to_deploy() + if hasattr(self.norm, 'running_var') and hasattr(self.dwconv, 'lk_origin'): + std = (self.norm.running_var + self.norm.eps).sqrt() + self.dwconv.lk_origin.weight.data *= (self.norm.weight / std).view(-1, 1, 1, 1) + self.dwconv.lk_origin.bias.data = self.norm.bias + (self.dwconv.lk_origin.bias - self.norm.running_mean) * self.norm.weight / std + self.norm = nn.Identity() + if self.gamma is not None: + final_scale = self.gamma.data + self.gamma = None + else: + final_scale = 1 + if self.act[1].use_bias and len(self.pwconv2) == 3: + grn_bias = self.act[1].beta.data + self.act[1].__delattr__('beta') + self.act[1].use_bias = False + linear = self.pwconv2[0] + grn_bias_projected_bias = (linear.weight.data @ grn_bias.view(-1, 1)).squeeze() + bn = self.pwconv2[2] + std = (bn.running_var + bn.eps).sqrt() + new_linear = nn.Linear(linear.in_features, linear.out_features, bias=True) + new_linear.weight.data = linear.weight * (bn.weight / std * final_scale).view(-1, 1) + linear_bias = 0 if linear.bias is None else linear.bias.data + linear_bias += grn_bias_projected_bias + new_linear.bias.data = (bn.bias + (linear_bias - bn.running_mean) * bn.weight / std) * final_scale + self.pwconv2 = nn.Sequential(new_linear, self.pwconv2[1]) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = UniRepLKNetBlock(in_channel, out_channel, 11).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(GREEN 
+ 'test reparameterization.' + RESET) + module = model_fuse_test(module) + outputs = module(inputs) + print(GREEN + 'test reparameterization done.' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/module/efficientVIM.py b/engine/extre_module/custom_nn/module/efficientVIM.py new file mode 100644 index 00000000..60811191 --- /dev/null +++ b/engine/extre_module/custom_nn/module/efficientVIM.py @@ -0,0 +1,283 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/CVPR2025-EfficientVIM.png +论文链接:https://arxiv.org/abs/2411.15241 +论文链接:https://arxiv.org/abs/2311.17132 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn + +from engine.extre_module.ultralytics_nn.conv import Conv + +class LayerNorm2D(nn.Module): + """LayerNorm for channels of 2D tensor(B C H W)""" + def __init__(self, num_channels, eps=1e-5, affine=True): + super(LayerNorm2D, self).__init__() + self.num_channels = num_channels + self.eps = eps + self.affine = affine + + if self.affine: + self.weight = nn.Parameter(torch.ones(1, num_channels, 1, 1)) + self.bias = nn.Parameter(torch.zeros(1, num_channels, 1, 1)) + else: + self.register_parameter('weight', None) + self.register_parameter('bias', None) + + def forward(self, x): + mean = x.mean(dim=1, keepdim=True) # (B, 1, H, W) + var = x.var(dim=1, keepdim=True, unbiased=False) # (B, 1, H, W) + + x_normalized = (x - mean) / torch.sqrt(var + self.eps) # (B, C, H, W) + + if self.affine: + x_normalized = x_normalized * self.weight + self.bias + + return x_normalized + + +class LayerNorm1D(nn.Module): + """LayerNorm for channels of 1D tensor(B C L)""" + def __init__(self, num_channels, eps=1e-5, affine=True): + super(LayerNorm1D, self).__init__() + self.num_channels = num_channels + self.eps = eps + self.affine = affine + + if self.affine: + self.weight = nn.Parameter(torch.ones(1, num_channels, 1)) + self.bias = nn.Parameter(torch.zeros(1, num_channels, 1)) + else: + self.register_parameter('weight', None) + self.register_parameter('bias', None) + + def forward(self, x): + mean = x.mean(dim=1, keepdim=True) # (B, 1, H, W) + var = x.var(dim=1, keepdim=True, unbiased=False) # (B, 1, H, W) + + x_normalized = (x - mean) / torch.sqrt(var + self.eps) # (B, C, H, W) + + if self.affine: + x_normalized = x_normalized * self.weight + self.bias + + return x_normalized + + +class ConvLayer2D(nn.Module): + def __init__(self, in_dim, out_dim, kernel_size=3, stride=1, padding=0, dilation=1, groups=1, norm=nn.BatchNorm2d, act_layer=nn.ReLU, bn_weight_init=1): + super(ConvLayer2D, self).__init__() + self.conv = nn.Conv2d( + in_dim, + out_dim, + kernel_size=(kernel_size, kernel_size), + stride=(stride, stride), + padding=(padding, padding), + dilation=(dilation, dilation), + groups=groups, + bias=False + ) + self.norm = norm(num_features=out_dim) if norm else None + self.act = act_layer() if act_layer else None + + if self.norm: + torch.nn.init.constant_(self.norm.weight, bn_weight_init) + torch.nn.init.constant_(self.norm.bias, 0) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv(x) + if self.norm: + x = self.norm(x) + if self.act: + x = self.act(x) + return x + + +class 
ConvLayer1D(nn.Module): + def __init__(self, in_dim, out_dim, kernel_size=3, stride=1, padding=0, dilation=1, groups=1, norm=nn.BatchNorm1d, act_layer=nn.ReLU, bn_weight_init=1): + super(ConvLayer1D, self).__init__() + self.conv = nn.Conv1d( + in_dim, + out_dim, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=False + ) + self.norm = norm(num_features=out_dim) if norm else None + self.act = act_layer() if act_layer else None + + if self.norm: + torch.nn.init.constant_(self.norm.weight, bn_weight_init) + torch.nn.init.constant_(self.norm.bias, 0) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv(x) + if self.norm: + x = self.norm(x) + if self.act: + x = self.act(x) + return x + + +class FFN(nn.Module): + def __init__(self, in_dim, dim): + super().__init__() + self.fc1 = ConvLayer2D(in_dim, dim, 1) + self.fc2 = ConvLayer2D(dim, in_dim, 1, act_layer=None, bn_weight_init=0) + + def forward(self, x): + x = self.fc2(self.fc1(x)) + return x + +class HSMSSD(nn.Module): + def __init__(self, d_model, ssd_expand=1, A_init_range=(1, 16), state_dim = 64): + super().__init__() + self.ssd_expand = ssd_expand + self.d_inner = int(self.ssd_expand * d_model) + self.state_dim = state_dim + + self.BCdt_proj = ConvLayer1D(d_model, 3*state_dim, 1, norm=None, act_layer=None) + conv_dim = self.state_dim*3 + self.dw = ConvLayer2D(conv_dim, conv_dim, 3,1,1, groups=conv_dim, norm=None, act_layer=None, bn_weight_init=0) + self.hz_proj = ConvLayer1D(d_model, 2*self.d_inner, 1, norm=None, act_layer=None) + self.out_proj = ConvLayer1D(self.d_inner, d_model, 1, norm=None, act_layer=None, bn_weight_init=0) + + A = torch.empty(self.state_dim, dtype=torch.float32).uniform_(*A_init_range) + self.A = torch.nn.Parameter(A) + self.act = nn.SiLU() + self.D = nn.Parameter(torch.ones(1)) + self.D._no_weight_decay = True + + def forward(self, x, size): + batch, _, L= x.shape + + BCdt = self.dw(self.BCdt_proj(x).view(batch,-1, size[0], size[1])).flatten(2) + B,C,dt = torch.split(BCdt, [self.state_dim, self.state_dim, self.state_dim], dim=1) + A = (dt.contiguous() + self.A.view(1,-1,1)).softmax(-1) + + AB = (A * B.contiguous()) + h = x @ AB.transpose(-2,-1) + + h, z = torch.split(self.hz_proj(h), [self.d_inner, self.d_inner], dim=1) + h = self.out_proj(h.contiguous() * self.act(z.contiguous())+ h.contiguous() * self.D) + y = h @ C.contiguous() # B C N, B C L -> B C L + + y = y.view(batch,-1, size[0], size[1]).contiguous()# + x * self.D # B C H W + return y, h + +class ConvolutionalGLU(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.) 
-> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + hidden_features = int(2 * hidden_features / 3) + self.fc1 = nn.Conv2d(in_features, hidden_features * 2, 1) + self.dwconv = nn.Sequential( + nn.Conv2d(hidden_features, hidden_features, kernel_size=3, stride=1, padding=1, bias=True, groups=hidden_features), + act_layer() + ) + self.fc2 = nn.Conv2d(hidden_features, out_features, 1) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x_shortcut = x + x, v = self.fc1(x).chunk(2, dim=1) + x = self.dwconv(x) * v + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x_shortcut + x + +class EfficientViMBlock(nn.Module): + def __init__(self, inc, ouc, mlp_ratio=4., ssd_expand=1, state_dim=64): + super().__init__() + self.dim = inc + self.mlp_ratio = mlp_ratio + + self.mixer = HSMSSD(d_model=inc, ssd_expand=ssd_expand,state_dim=state_dim) + self.norm = LayerNorm1D(inc) + + self.dwconv1 = ConvLayer2D(inc, inc, 3, padding=1, groups=inc, bn_weight_init=0, act_layer = None) + self.dwconv2 = ConvLayer2D(inc, inc, 3, padding=1, groups=inc, bn_weight_init=0, act_layer = None) + + self.ffn = FFN(in_dim=inc, dim=int(inc * mlp_ratio)) + + #LayerScale + self.alpha = nn.Parameter(1e-4 * torch.ones(4, inc), requires_grad=True) + + if inc != ouc: + self.conv1x1 = Conv(inc, ouc) + else: + self.conv1x1 = nn.Identity() + + def forward(self, x): + alpha = torch.sigmoid(self.alpha).view(4,-1,1,1) + + # DWconv1 + x = (1-alpha[0]) * x + alpha[0] * self.dwconv1(x) + + # HSM-SSD + x_prev = x + _, _, H, W = x.size() + x, h = self.mixer(self.norm(x.flatten(2)), (H, W)) + x = (1-alpha[1]) * x_prev + alpha[1] * x + + # DWConv2 + x = (1-alpha[2]) * x + alpha[2] * self.dwconv2(x) + + # FFN + x = (1-alpha[3]) * x + alpha[3] * self.ffn(x) + + return self.conv1x1(x) + +class EfficientViMBlock_CGLU(EfficientViMBlock): + def __init__(self, inc, ouc, mlp_ratio=4, ssd_expand=1, state_dim=64): + super().__init__(inc, ouc, mlp_ratio, ssd_expand, state_dim) + + self.ffn = ConvolutionalGLU(inc, hidden_features=int(inc * mlp_ratio)) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + print(RED + '-'*20 + " EfficientViMBlock " + '-'*20 + RESET) + + module = EfficientViMBlock(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) + + print(RED + '-'*20 + " EfficientViMBlock_CGLU " + '-'*20 + RESET) + + module = EfficientViMBlock_CGLU(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/module/elgca.py b/engine/extre_module/custom_nn/module/elgca.py new file mode 100644 index 
00000000..1cf6bebd --- /dev/null +++ b/engine/extre_module/custom_nn/module/elgca.py @@ -0,0 +1,179 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/IEEETGRS2024-ELGCA.png +论文链接:https://arxiv.org/abs/2403.17909 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch, numbers +import torch.nn as nn +from einops import rearrange + +from engine.extre_module.ultralytics_nn.conv import Conv + +def to_3d(x): + return rearrange(x, 'b c h w -> b (h w) c') + +def to_4d(x,h,w): + return rearrange(x, 'b (h w) c -> b c h w',h=h,w=w) + +class BiasFree_LayerNorm(nn.Module): + def __init__(self, normalized_shape): + super(BiasFree_LayerNorm, self).__init__() + if isinstance(normalized_shape, numbers.Integral): + normalized_shape = (normalized_shape,) + normalized_shape = torch.Size(normalized_shape) + + assert len(normalized_shape) == 1 + + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.normalized_shape = normalized_shape + + def forward(self, x): + sigma = x.var(-1, keepdim=True, unbiased=False) + return x / torch.sqrt(sigma+1e-5) * self.weight + +class WithBias_LayerNorm(nn.Module): + def __init__(self, normalized_shape): + super(WithBias_LayerNorm, self).__init__() + if isinstance(normalized_shape, numbers.Integral): + normalized_shape = (normalized_shape,) + normalized_shape = torch.Size(normalized_shape) + + assert len(normalized_shape) == 1 + + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.normalized_shape = normalized_shape + + def forward(self, x): + mu = x.mean(-1, keepdim=True) + sigma = x.var(-1, keepdim=True, unbiased=False) + return (x - mu) / torch.sqrt(sigma+1e-5) * self.weight + self.bias + +class LayerNorm(nn.Module): + def __init__(self, dim, LayerNorm_type='BiasFree'): + super(LayerNorm, self).__init__() + if LayerNorm_type =='BiasFree': + self.body = BiasFree_LayerNorm(dim) + else: + self.body = WithBias_LayerNorm(dim) + + def forward(self, x): + h, w = x.shape[-2:] + return to_4d(self.body(to_3d(x)), h, w) + +class ELGCA_MLP(nn.Module): + def __init__(self, dim, mlp_ratio=4): + super().__init__() + + self.fc1 = nn.Conv2d(dim, dim * mlp_ratio, 1) + self.pos = nn.Conv2d(dim * mlp_ratio, dim * mlp_ratio, 3, padding=1, groups=dim * mlp_ratio) + self.fc2 = nn.Conv2d(dim * mlp_ratio, dim, 1) + self.act = nn.GELU() + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = x + self.act(self.pos(x)) + x = self.fc2(x) + return x + +class ELGCA(nn.Module): + """ + Efficient local global context aggregation module + dim: number of channels of input + heads: number of heads utilized in computing attention + """ + def __init__(self, dim, heads=4): + super().__init__() + self.heads = heads + self.dwconv = nn.Conv2d(dim//2, dim//2, 3, padding=1, groups=dim//2) + self.qkvl = nn.Conv2d(dim//2, (dim//4)*self.heads, 1, padding=0) + self.pool_q = nn.AvgPool2d(kernel_size=3, stride=2, padding=1) + self.pool_k = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) + + self.act = nn.GELU() + + def forward(self, x): + B, C, H, W = x.shape + + x1, x2 = torch.split(x, [C//2, C//2], dim=1) + # apply depth-wise convolution on half channels + x1 = self.act(self.dwconv(x1)) + + # linear projection of other half before computing attention + x2 = self.act(self.qkvl(x2)) + + x2 = x2.reshape(B, self.heads, C//4, H, W) + + q = torch.sum(x2[:, :-3, :, :, :], 
dim=1) + k = x2[:,-3, :, :, :] + + q = self.pool_q(q) + k = self.pool_k(k) + + v = x2[:,-2,:,:,:].flatten(2) + lfeat = x2[:,-1,:,:,:] + + qk = torch.matmul(q.flatten(2), k.flatten(2).transpose(1,2)) + qk = torch.softmax(qk, dim=1).transpose(1,2) + + x2 = torch.matmul(qk, v).reshape(B, C//4, H, W) + + x = torch.cat([x1, lfeat, x2], dim=1) + + return x + +class ELGCA_EncoderBlock(nn.Module): + """ + dim: number of channels of input features + """ + def __init__(self, inc, dim, mlp_ratio=4, heads=4): + super().__init__() + + self.layer_norm1 = LayerNorm(dim, 'BiasFree') + self.layer_norm2 = LayerNorm(dim, 'BiasFree') + self.mlp = ELGCA_MLP(dim=dim, mlp_ratio=mlp_ratio) + self.attn = ELGCA(dim, heads=heads) + + self.conv1x1 = Conv(inc, dim, 1) if inc != dim else nn.Identity() + + def forward(self, x): + x = self.conv1x1(x) + inp_copy = x + + x = self.layer_norm1(inp_copy) + x = self.attn(x) + out = x + inp_copy + + x = self.layer_norm2(out) + x = self.mlp(x) + out = out + x + + return out + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = ELGCA_EncoderBlock(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/module/example.py b/engine/extre_module/custom_nn/module/example.py new file mode 100644 index 00000000..03fadc9a --- /dev/null +++ b/engine/extre_module/custom_nn/module/example.py @@ -0,0 +1,62 @@ +''' +本文件由BiliBili:魔傀面具整理 +论文链接:https://arxiv.org/pdf/2412.16986 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from functools import partial +from engine.extre_module.ultralytics_nn.conv import Conv, autopad +from engine.extre_module.ultralytics_nn.block import C3_Block, C2f_Block, C3k2_Block + +class APBottleneck(nn.Module): + """Asymmetric Padding bottleneck.""" + + def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5): + """Initializes a bottleneck module with given input/output channels, shortcut option, group, kernels, and + expansion. 
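+ Each of the four asymmetrically zero-padded copies of the input passes through the shared cv1 (which itself uses no padding), + the resulting c_/4-channel maps are concatenated and projected by cv2, with a residual connection added when shortcut is set and c1 == c2.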
+ """ + super().__init__() + c_ = int(c2 * e) # hidden channels + p = [(2,0,2,0),(0,2,0,2),(0,2,2,0),(2,0,0,2)] + self.pad = [nn.ZeroPad2d(padding=(p[g])) for g in range(4)] + self.cv1 = Conv(c1, c_ // 4, k[0], 1, p=0) + # self.cv1 = nn.ModuleList([nn.Conv2d(c1, c_, k[0], stride=1, padding= p[g], bias=False) for g in range(4)]) + self.cv2 = Conv(c_, c2, k[1], 1, g=g) + self.add = shortcut and c1 == c2 + + def forward(self, x): + """'forward()' applies the YOLO FPN to input data.""" + # y = self.pad[g](x) for g in range(4) + return x + self.cv2((torch.cat([self.cv1(self.pad[g](x)) for g in range(4)], 1))) if self.add else self.cv2((torch.cat([self.cv1(self.pad[g](x)) for g in range(4)], 1))) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = APBottleneck(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(YELLOW + '-'*40 + ' C3 ' + '-'*40 + RESET) + module = C3_Block(in_channel, out_channel, partial(APBottleneck, e=1.0), 2).to(device) + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(YELLOW + '-'*40 + ' C2f ' + '-'*40 + RESET) + module = C2f_Block(in_channel, out_channel, partial(APBottleneck, e=1.0), 2).to(device) + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(YELLOW + '-'*40 + ' C3k2 ' + '-'*40 + RESET) + module = C3k2_Block(in_channel, out_channel, partial(APBottleneck, e=1.0), 2).to(device) + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) diff --git a/engine/extre_module/custom_nn/module/fasterblock.py b/engine/extre_module/custom_nn/module/fasterblock.py new file mode 100644 index 00000000..53b5f449 --- /dev/null +++ b/engine/extre_module/custom_nn/module/fasterblock.py @@ -0,0 +1,169 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/CVPR2023-FasterBlock.png +论文链接:https://arxiv.org/pdf/2303.03667 +论文链接:https://arxiv.org/abs/2311.17132 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +from timm.layers import DropPath + +from engine.extre_module.ultralytics_nn.conv import Conv + +class Partial_conv3(nn.Module): + def __init__(self, dim, n_div=4, forward='split_cat'): + super().__init__() + self.dim_conv3 = dim // n_div + self.dim_untouched = dim - self.dim_conv3 + self.partial_conv3 = nn.Conv2d(self.dim_conv3, self.dim_conv3, 3, 1, 1, bias=False) + + if forward == 'slicing': + self.forward = self.forward_slicing + elif forward == 'split_cat': + self.forward = self.forward_split_cat + else: + raise NotImplementedError + + def forward_slicing(self, x): + # only for inference + x = x.clone() # !!! 
Keep the original input intact for the residual connection later + x[:, :self.dim_conv3, :, :] = self.partial_conv3(x[:, :self.dim_conv3, :, :]) + return x + + def forward_split_cat(self, x): + # for training/inference + x1, x2 = torch.split(x, [self.dim_conv3, self.dim_untouched], dim=1) + x1 = self.partial_conv3(x1) + x = torch.cat((x1, x2), 1) + return x + +class Faster_Block(nn.Module): + def __init__(self, + inc, + ouc, + n_div=4, + mlp_ratio=2, + drop_path=0.1, + layer_scale_init_value=0.0, + pconv_fw_type='split_cat', + act = None + ): + super().__init__() + self.ouc = ouc + self.mlp_ratio = mlp_ratio + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.n_div = n_div + + mlp_hidden_dim = int(ouc * mlp_ratio) + + mlp_layer = [ + Conv(ouc, mlp_hidden_dim, 1), + nn.Conv2d(mlp_hidden_dim, ouc, 1, bias=False) + ] + + self.mlp = nn.Sequential(*mlp_layer) + + self.spatial_mixing = Partial_conv3( + ouc, + n_div, + pconv_fw_type + ) + + self.adjust_channel = None + if inc != ouc: + self.adjust_channel = Conv(inc, ouc, 1) + + if layer_scale_init_value > 0: + self.layer_scale = nn.Parameter(layer_scale_init_value * torch.ones((ouc)), requires_grad=True) + self.forward = self.forward_layer_scale + else: + self.forward = self.forward + + def forward(self, x): + if self.adjust_channel is not None: + x = self.adjust_channel(x) + shortcut = x + x = self.spatial_mixing(x) + x = shortcut + self.drop_path(self.mlp(x)) + return x + + def forward_layer_scale(self, x): + if self.adjust_channel is not None: + x = self.adjust_channel(x) + shortcut = x + x = self.spatial_mixing(x) + x = shortcut + self.drop_path( + self.layer_scale.unsqueeze(-1).unsqueeze(-1) * self.mlp(x)) + return x + +class ConvolutionalGLU(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.) 
-> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + hidden_features = int(2 * hidden_features / 3) + self.fc1 = nn.Conv2d(in_features, hidden_features * 2, 1) + self.dwconv = nn.Sequential( + nn.Conv2d(hidden_features, hidden_features, kernel_size=3, stride=1, padding=1, bias=True, groups=hidden_features), + act_layer() + ) + self.fc2 = nn.Conv2d(hidden_features, out_features, 1) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x_shortcut = x + x, v = self.fc1(x).chunk(2, dim=1) + x = self.dwconv(x) * v + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x_shortcut + x + +class Faster_Block_CGLU(Faster_Block): + def __init__(self, inc, ouc, n_div=4, mlp_ratio=2, drop_path=0.1, layer_scale_init_value=0, pconv_fw_type='split_cat'): + super().__init__(inc, ouc, n_div, mlp_ratio, drop_path, layer_scale_init_value, pconv_fw_type) + self.mlp = ConvolutionalGLU(ouc) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + print(RED + '-'*20 + " Faster_Block " + '-'*20 + RESET) + + module = Faster_Block(in_channel, out_channel, n_div=4).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) + + print(RED + '-'*20 + " Faster_Block_CGLU " + '-'*20 + RESET) + + module = Faster_Block_CGLU(in_channel, out_channel, n_div=4).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/module/iRMB.py b/engine/extre_module/custom_nn/module/iRMB.py new file mode 100644 index 00000000..f2e6464a --- /dev/null +++ b/engine/extre_module/custom_nn/module/iRMB.py @@ -0,0 +1,158 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/ICCV2023-iRMB.png +论文链接:https://arxiv.org/abs/2301.01146 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init +from timm.layers import DropPath +from einops import rearrange + +from engine.extre_module.ultralytics_nn.conv import Conv + +class SEAttention(nn.Module): + def __init__(self, channel=512,reduction=16): + super().__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction, bias=False), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel, bias=False), + nn.Sigmoid() + ) + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + init.constant_(m.bias, 
0) + elif isinstance(m, nn.BatchNorm2d): + init.constant_(m.weight, 1) + init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + init.normal_(m.weight, std=0.001) + if m.bias is not None: + init.constant_(m.bias, 0) + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + return x * y.expand_as(x) + +class iRMB(nn.Module): + def __init__(self, dim_in, dim_out, norm_in=True, has_skip=True, exp_ratio=1.0, + act=True, v_proj=True, dw_ks=3, stride=1, dilation=1, se_ratio=0.0, dim_head=16, window_size=7, + attn_s=True, qkv_bias=False, attn_drop=0., drop=0., drop_path=0., v_group=False, attn_pre=False): + super().__init__() + self.norm = nn.BatchNorm2d(dim_in) if norm_in else nn.Identity() + self.act = Conv.default_act if act else nn.Identity() + dim_mid = int(dim_in * exp_ratio) + self.has_skip = (dim_in == dim_out and stride == 1) and has_skip + self.attn_s = attn_s + if self.attn_s: + assert dim_in % dim_head == 0, 'dim should be divisible by num_heads' + self.dim_head = dim_head + self.window_size = window_size + self.num_head = dim_in // dim_head + self.scale = self.dim_head ** -0.5 + self.attn_pre = attn_pre + self.qk = nn.Conv2d(dim_in, int(dim_in * 2), 1, bias=qkv_bias) + self.v = nn.Sequential( + nn.Conv2d(dim_in, dim_mid, kernel_size=1, groups=self.num_head if v_group else 1, bias=qkv_bias), + self.act + ) + self.attn_drop = nn.Dropout(attn_drop) + else: + if v_proj: + self.v = nn.Sequential( + nn.Conv2d(dim_in, dim_mid, kernel_size=1, groups=self.num_head if v_group else 1, bias=qkv_bias), + self.act + ) + else: + self.v = nn.Identity() + self.conv_local = Conv(dim_mid, dim_mid, k=dw_ks, s=stride, d=dilation, g=dim_mid) + self.se = SEAttention(dim_mid, reduction=se_ratio) if se_ratio > 0.0 else nn.Identity() + + self.proj_drop = nn.Dropout(drop) + self.proj = nn.Conv2d(dim_mid, dim_out, kernel_size=1) + self.drop_path = DropPath(drop_path) if drop_path else nn.Identity() + + def forward(self, x): + shortcut = x + x = self.norm(x) + B, C, H, W = x.shape + if self.attn_s: + # padding + if self.window_size <= 0: + window_size_W, window_size_H = W, H + else: + window_size_W, window_size_H = self.window_size, self.window_size + pad_l, pad_t = 0, 0 + pad_r = (window_size_W - W % window_size_W) % window_size_W + pad_b = (window_size_H - H % window_size_H) % window_size_H + x = F.pad(x, (pad_l, pad_r, pad_t, pad_b, 0, 0,)) + n1, n2 = (H + pad_b) // window_size_H, (W + pad_r) // window_size_W + x = rearrange(x, 'b c (h1 n1) (w1 n2) -> (b n1 n2) c h1 w1', n1=n1, n2=n2).contiguous() + # attention + b, c, h, w = x.shape + qk = self.qk(x) + qk = rearrange(qk, 'b (qk heads dim_head) h w -> qk b heads (h w) dim_head', qk=2, heads=self.num_head, dim_head=self.dim_head).contiguous() + q, k = qk[0], qk[1] + attn_spa = (q @ k.transpose(-2, -1)) * self.scale + attn_spa = attn_spa.softmax(dim=-1) + attn_spa = self.attn_drop(attn_spa) + if self.attn_pre: + x = rearrange(x, 'b (heads dim_head) h w -> b heads (h w) dim_head', heads=self.num_head).contiguous() + x_spa = attn_spa @ x + x_spa = rearrange(x_spa, 'b heads (h w) dim_head -> b (heads dim_head) h w', heads=self.num_head, h=h, w=w).contiguous() + x_spa = self.v(x_spa) + else: + v = self.v(x) + v = rearrange(v, 'b (heads dim_head) h w -> b heads (h w) dim_head', heads=self.num_head).contiguous() + x_spa = attn_spa @ v + x_spa = rearrange(x_spa, 'b heads (h w) dim_head -> b (heads dim_head) h w', heads=self.num_head, h=h, w=w).contiguous() + # unpadding + x = rearrange(x_spa, '(b n1 
n2) c h1 w1 -> b c (h1 n1) (w1 n2)', n1=n1, n2=n2).contiguous() + if pad_r > 0 or pad_b > 0: + x = x[:, :, :H, :W].contiguous() + else: + x = self.v(x) + + x = x + self.se(self.conv_local(x)) if self.has_skip else self.se(self.conv_local(x)) + + x = self.proj_drop(x) + x = self.proj(x) + + x = (shortcut + self.drop_path(x)) if self.has_skip else x + return x + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = iRMB(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/module/mambaout.py b/engine/extre_module/custom_nn/module/mambaout.py new file mode 100644 index 00000000..1984e527 --- /dev/null +++ b/engine/extre_module/custom_nn/module/mambaout.py @@ -0,0 +1,203 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/CVPR2025-MambaOut.png +engine/extre_module/module_images/CVPR2025-MambaOut-UniRepBlock.png +engine/extre_module/module_images/CVPR2025-MambaOut-DRC.png +论文链接:https://arxiv.org/abs/2405.07992 +论文链接:https://arxiv.org/abs/2311.15599 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +from functools import partial +from timm.layers import DropPath + +from engine.extre_module.custom_nn.conv_module.DilatedReparamConv import DilatedReparamConv +from engine.extre_module.custom_nn.module.UniRepLKBlock import UniRepLKNetBlock +from engine.extre_module.ultralytics_nn.conv import Conv +from engine.extre_module.torch_utils import model_fuse_test + +class LayerNormGeneral(nn.Module): + r""" General LayerNorm for different situations. + + Args: + affine_shape (int, list or tuple): The shape of affine weight and bias. + Usually the affine_shape=C, but in some implementation, like torch.nn.LayerNorm, + the affine_shape is the same as normalized_dim by default. + To adapt to different situations, we offer this argument here. + normalized_dim (tuple or list): Which dims to compute mean and variance. + scale (bool): Flag indicates whether to use scale or not. + bias (bool): Flag indicates whether to use scale or not. + + We give several examples to show how to specify the arguments. + + LayerNorm (https://arxiv.org/abs/1607.06450): + For input shape of (B, *, C) like (B, N, C) or (B, H, W, C), + affine_shape=C, normalized_dim=(-1, ), scale=True, bias=True; + For input shape of (B, C, H, W), + affine_shape=(C, 1, 1), normalized_dim=(1, ), scale=True, bias=True. 
+ + Modified LayerNorm (https://arxiv.org/abs/2111.11418) + that is idental to partial(torch.nn.GroupNorm, num_groups=1): + For input shape of (B, N, C), + affine_shape=C, normalized_dim=(1, 2), scale=True, bias=True; + For input shape of (B, H, W, C), + affine_shape=C, normalized_dim=(1, 2, 3), scale=True, bias=True; + For input shape of (B, C, H, W), + affine_shape=(C, 1, 1), normalized_dim=(1, 2, 3), scale=True, bias=True. + + For the several metaformer baslines, + IdentityFormer, RandFormer and PoolFormerV2 utilize Modified LayerNorm without bias (bias=False); + ConvFormer and CAFormer utilizes LayerNorm without bias (bias=False). + """ + def __init__(self, affine_shape=None, normalized_dim=(-1, ), scale=True, + bias=True, eps=1e-5): + super().__init__() + self.normalized_dim = normalized_dim + self.use_scale = scale + self.use_bias = bias + self.weight = nn.Parameter(torch.ones(affine_shape)) if scale else None + self.bias = nn.Parameter(torch.zeros(affine_shape)) if bias else None + self.eps = eps + + def forward(self, x): + c = x - x.mean(self.normalized_dim, keepdim=True) + s = c.pow(2).mean(self.normalized_dim, keepdim=True) + x = c / torch.sqrt(s + self.eps) + if self.use_scale: + x = x * self.weight + if self.use_bias: + x = x + self.bias + return x + +class MambaOut(nn.Module): + r""" Our implementation of Gated CNN Block: https://arxiv.org/pdf/1612.08083 + Args: + conv_ratio: control the number of channels to conduct depthwise convolution. + Conduct convolution on partial channels can improve practical efficiency. + The idea of partial channels is from ShuffleNet V2 (https://arxiv.org/abs/1807.11164) and + also used by InceptionNeXt (https://arxiv.org/abs/2303.16900) and FasterNet (https://arxiv.org/abs/2303.03667) + """ + def __init__(self, inc, dim, expansion_ratio=8/3, kernel_size=7, conv_ratio=1.0, + norm_layer=partial(LayerNormGeneral,eps=1e-6,normalized_dim=(1, 2, 3)), + act_layer=nn.GELU, + drop_path=0., + **kwargs): + super().__init__() + self.norm = norm_layer((dim, 1, 1)) + hidden = int(expansion_ratio * dim) + self.fc1 = nn.Conv2d(dim, hidden * 2, 1) + self.act = act_layer() + conv_channels = int(conv_ratio * dim) + self.split_indices = (hidden, hidden - conv_channels, conv_channels) + self.conv = nn.Conv2d(conv_channels, conv_channels, kernel_size=kernel_size, padding=kernel_size//2, groups=conv_channels) + self.fc2 = nn.Conv2d(hidden, dim, 1) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + self.conv1x1 = Conv(inc, dim, 1) if inc != dim else nn.Identity() + + def forward(self, x): + x = self.conv1x1(x) + shortcut = x # [B, H, W, C] + x = self.norm(x) + g, i, c = torch.split(self.fc1(x), self.split_indices, dim=1) + # c = c.permute(0, 3, 1, 2) # [B, H, W, C] -> [B, C, H, W] + c = self.conv(c) + # c = c.permute(0, 2, 3, 1) # [B, C, H, W] -> [B, H, W, C] + x = self.fc2(self.act(g) * torch.cat((i, c), dim=1)) + x = self.drop_path(x) + return x + shortcut + +class MambaOut_DilatedReparamConv(MambaOut): + r""" Our implementation of Gated CNN Block: https://arxiv.org/pdf/1612.08083 + Args: + conv_ratio: control the number of channels to conduct depthwise convolution. + Conduct convolution on partial channels can improve practical efficiency. 
+ The idea of partial channels is from ShuffleNet V2 (https://arxiv.org/abs/1807.11164) and + also used by InceptionNeXt (https://arxiv.org/abs/2303.16900) and FasterNet (https://arxiv.org/abs/2303.03667) + """ + def __init__(self, inc, dim, expansion_ratio=8 / 3, kernel_size=7, conv_ratio=1, norm_layer=partial(LayerNormGeneral, eps=0.000001, normalized_dim=(1, 2, 3)), act_layer=nn.GELU, drop_path=0, **kwargs): + super().__init__(inc, dim, expansion_ratio, kernel_size, conv_ratio, norm_layer, act_layer, drop_path, **kwargs) + conv_channels = int(conv_ratio * dim) + self.conv = DilatedReparamConv(conv_channels, conv_channels, kernel_size=kernel_size) + +class MambaOut_UniRepLKBlock(MambaOut): + r""" Our implementation of Gated CNN Block: https://arxiv.org/pdf/1612.08083 + Args: + conv_ratio: control the number of channels to conduct depthwise convolution. + Conduct convolution on partial channels can improve practical efficiency. + The idea of partial channels is from ShuffleNet V2 (https://arxiv.org/abs/1807.11164) and + also used by InceptionNeXt (https://arxiv.org/abs/2303.16900) and FasterNet (https://arxiv.org/abs/2303.03667) + """ + def __init__(self, inc, dim, expansion_ratio=8 / 3, kernel_size=7, conv_ratio=1, norm_layer=partial(LayerNormGeneral, eps=0.000001, normalized_dim=(1, 2, 3)), act_layer=nn.GELU, drop_path=0, **kwargs): + super().__init__(inc, dim, expansion_ratio, kernel_size, conv_ratio, norm_layer, act_layer, drop_path, **kwargs) + conv_channels = int(conv_ratio * dim) + self.conv = UniRepLKNetBlock(conv_channels, conv_channels, kernel_size=kernel_size) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + print(RED + '-'*20 + " MambaOut " + '-'*20 + RESET) + + module = MambaOut(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) + + print(RED + '-'*20 + " MambaOut_DilatedReparamConv " + '-'*20 + RESET) + + module = MambaOut_DilatedReparamConv(in_channel, out_channel, kernel_size=11).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(GREEN + 'test reparameterization.' + RESET) + module = model_fuse_test(module) + outputs = module(inputs) + print(GREEN + 'test reparameterization done.' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) + + print(RED + '-'*20 + " MambaOut_UniRepLKBlock " + '-'*20 + RESET) + + module = MambaOut_UniRepLKBlock(in_channel, out_channel, kernel_size=11).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(GREEN + 'test reparameterization.' + RESET) + module = model_fuse_test(module) + outputs = module(inputs) + print(GREEN + 'test reparameterization done.' 
+ RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/module/starblock.py b/engine/extre_module/custom_nn/module/starblock.py new file mode 100644 index 00000000..634aa426 --- /dev/null +++ b/engine/extre_module/custom_nn/module/starblock.py @@ -0,0 +1,62 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/CVPR2024-StarBlock.png +论文链接:https://arxiv.org/pdf/2403.19967 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +from timm.layers import DropPath + +from engine.extre_module.ultralytics_nn.conv import Conv + +class Star_Block(nn.Module): + def __init__(self, inc, ouc, mlp_ratio=3, drop_path=0.): + super().__init__() + self.dwconv = Conv(inc, inc, 7, g=inc, act=False) + self.f1 = nn.Conv2d(inc, mlp_ratio * inc, 1) + self.f2 = nn.Conv2d(inc, mlp_ratio * inc, 1) + self.g = Conv(mlp_ratio * inc, inc, 1, act=False) + self.dwconv2 = nn.Conv2d(inc, inc, 7, 1, (7 - 1) // 2, groups=inc) + self.act = nn.ReLU6() + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + if inc != ouc: + self.conv1x1 = Conv(inc, ouc, k=1) + else: + self.conv1x1 = nn.Identity() + + def forward(self, x): + input = x + x = self.dwconv(x) + x1, x2 = self.f1(x), self.f2(x) + x = self.act(x1) * x2 + x = self.dwconv2(self.g(x)) + x = input + self.drop_path(x) + return self.conv1x1(x) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = Star_Block(in_channel, out_channel, mlp_ratio=3).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/neck/FDPN.py b/engine/extre_module/custom_nn/neck/FDPN.py new file mode 100644 index 00000000..03c6fc77 --- /dev/null +++ b/engine/extre_module/custom_nn/neck/FDPN.py @@ -0,0 +1,282 @@ +''' +本文件由BiliBili:魔傀面具整理 +自研模块:FocusingDiffusionPyramidNetwork +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import copy +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from engine.core import register +from engine.extre_module.ultralytics_nn.conv import Conv, autopad +from engine.extre_module.ultralytics_nn.block import C2f + +__all__ = ['FDPN'] + +class ADown(nn.Module): + def __init__(self, c1, c2): # ch_in, ch_out, shortcut, kernels, groups, expand + super().__init__() + self.c = c2 // 2 + self.cv1 = Conv(c1 // 2, self.c, 3, 2, 1) + self.cv2 = Conv(c1 // 2, self.c, 1, 1, 0) + + def forward(self, x): + 
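# A 2x2 stride-1 average pool first smooths the feature map, the channels are then split into two halves: + # one half is downsampled by a 3x3 stride-2 conv and the other by a 3x3 stride-2 max pool followed by a 1x1 conv, then the two halves are concatenated. +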
x = torch.nn.functional.avg_pool2d(x, 2, 1, 0, False, True) + x1,x2 = x.chunk(2, 1) + x1 = self.cv1(x1) + x2 = torch.nn.functional.max_pool2d(x2, 3, 2, 1) + x2 = self.cv2(x2) + return torch.cat((x1, x2), 1) + +class FocusFeature(nn.Module): + def __init__(self, inc, kernel_sizes=(5, 7, 9, 11), e=0.5) -> None: + super().__init__() + hidc = int(inc[1] * e) + + self.conv1 = nn.Sequential( + nn.Upsample(scale_factor=2), + Conv(inc[0], hidc, 1) + ) + self.conv2 = Conv(inc[1], hidc, 1) if e != 1 else nn.Identity() + self.conv3 = ADown(inc[2], hidc) + + + self.dw_conv = nn.ModuleList(nn.Conv2d(hidc * 3, hidc * 3, kernel_size=k, padding=autopad(k), groups=hidc * 3) for k in kernel_sizes) + self.pw_conv = Conv(hidc * 3, hidc * 3) + self.conv_1x1 = Conv(hidc * 3, int(hidc / e)) + + def forward(self, x): + x1, x2, x3 = x + x1 = self.conv1(x1) + x2 = self.conv2(x2) + x3 = self.conv3(x3) + + x = torch.cat([x1, x2, x3], dim=1) + feature = torch.sum(torch.stack([x] + [layer(x) for layer in self.dw_conv], dim=0), dim=0) + feature = self.pw_conv(feature) + + x = x + feature + return self.conv_1x1(x) + +@register(force=True) # 避免因为导入导致的多次注册 +class FDPN(nn.Module): + def __init__(self, + in_channels=[512, 1024, 2048], # 输入特征图的通道数列表,例如来自骨干网络的不同层 + feat_strides=[8, 16, 32], # 输入特征图的步幅列表,表示特征图相对于输入图像的缩放比例 + hidden_dim=256, # 隐藏层维度,所有特征图将被投影到这个维度 + nhead=8, # Transformer 编码器中多头自注意力的头数 + dim_feedforward=1024, # Transformer 编码器中前馈网络的维度 + dropout=0.0, # Transformer 编码器中的 dropout 概率 + enc_act='gelu', # Transformer 编码器中的激活函数类型 + use_encoder_idx=[2], # 指定哪些层使用 Transformer 编码器(索引列表) + num_encoder_layers=1, # Transformer 编码器的层数 + pe_temperature=10000, # 位置编码的温度参数,用于控制频率 + fdpn_ks=[3, 5, 7, 9], # FDPN中的FocusFeature-kernel_sizes参数 + depth_mult=1.0, # 深度乘数,用于调整网络深度 + out_strides=[8, 16, 32], # 输出特征图的步幅列表 + eval_spatial_size=None, # 评估时的空间尺寸 (H, W),用于预计算位置编码 + ): + super().__init__() + from engine.deim.hybrid_encoder import TransformerEncoderLayer, TransformerEncoder # 避免 circular import + + # 保存传入的参数为类的成员变量 + self.in_channels = in_channels # 输入通道数列表 + self.feat_strides = feat_strides # 输入特征步幅列表 + self.hidden_dim = hidden_dim # 隐藏层维度 + self.use_encoder_idx = use_encoder_idx # 使用 Transformer 编码器的层索引 + self.num_encoder_layers = num_encoder_layers # Transformer 编码器层数 + self.pe_temperature = pe_temperature # 位置编码温度参数 + self.eval_spatial_size = eval_spatial_size # 评估时的空间尺寸 + self.out_channels = [hidden_dim for _ in range(len(in_channels))] # 输出通道数,统一为 hidden_dim + self.out_strides = out_strides # 输出步幅 + + assert len(in_channels) == 3 # 仅支持3层特征图的输入 + + # 输入投影层:将不同通道数的输入特征图投影到统一的 hidden_dim + self.input_proj = nn.ModuleList() + for in_channel in in_channels: + # 每个投影层包含 1x1 卷积和批量归一化 + proj = nn.Sequential(OrderedDict([ + ('conv', nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False)), # 1x1 卷积变换通道数 + ('norm', nn.BatchNorm2d(hidden_dim)) # 批量归一化 + ])) + self.input_proj.append(proj) + + # Transformer 编码器:对指定层进行特征增强 + # 定义单层 Transformer 编码器 + encoder_layer = TransformerEncoderLayer( + hidden_dim, # 输入维度 + nhead=nhead, # 注意力头数 + dim_feedforward=dim_feedforward, # 前馈网络维度 + dropout=dropout, # dropout 概率 + activation=enc_act # 激活函数 + ) + # 为每个指定层创建独立的 Transformer 编码器 + self.encoder = nn.ModuleList([ + TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers) # 深拷贝确保独立性 + for _ in range(len(use_encoder_idx)) + ]) + + # --------------------------- 第一阶段 + self.FocusFeature_1 = FocusFeature(inc=[hidden_dim, hidden_dim, hidden_dim], kernel_sizes=fdpn_ks) + + self.p4_to_p5_down1 = Conv(hidden_dim, hidden_dim, k=3, s=2) + 
self.p5_block1 = C2f(hidden_dim * 2, hidden_dim, round(3 * depth_mult), shortcut=True) + + self.p4_to_p3_up1 = nn.Upsample(scale_factor=2) + self.p3_block1 = C2f(hidden_dim * 2, hidden_dim, round(3 * depth_mult), shortcut=True) + + # --------------------------- 第二阶段 + self.FocusFeature_2 = FocusFeature(inc=[hidden_dim, hidden_dim, hidden_dim], kernel_sizes=fdpn_ks) + + self.p4_to_p5_down2 = Conv(hidden_dim, hidden_dim, k=3, s=2) + self.p5_block2 = C2f(hidden_dim * 3, hidden_dim, round(3 * depth_mult), shortcut=True) + + if len(out_strides) == 3: + self.p4_to_p3_up2 = nn.Upsample(scale_factor=2) + self.p3_block2 = C2f(hidden_dim * 3, hidden_dim, round(3 * depth_mult), shortcut=True) + + # 初始化参数,包括预计算位置编码 + self._reset_parameters() + + def _reset_parameters(self): + # 如果指定了评估时的空间尺寸,则预计算位置编码 + if self.eval_spatial_size: + for idx in self.use_encoder_idx: + stride = self.feat_strides[idx] # 当前层的步幅 + # 根据特征图尺寸和步幅计算位置编码 + pos_embed = self.build_2d_sincos_position_embedding( + self.eval_spatial_size[1] // stride, # 宽度 + self.eval_spatial_size[0] // stride, # 高度 + self.hidden_dim, # 嵌入维度 + self.pe_temperature # 温度参数 + ) + # 将位置编码存储为类的属性 + setattr(self, f'pos_embed{idx}', pos_embed) + # self.register_buffer(f'pos_embed{idx}', pos_embed) + + @staticmethod + def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.): + """ + 生成 2D sine-cosine 位置编码 + Args: + w (int): 特征图宽度 + h (int): 特征图高度 + embed_dim (int): 嵌入维度,必须能被 4 整除 + temperature (float): 温度参数,控制频率 + Returns: + torch.Tensor: 位置编码张量,形状为 [1, w*h, embed_dim] + """ + # 创建宽度和高度的网格 + grid_w = torch.arange(int(w), dtype=torch.float32) + grid_h = torch.arange(int(h), dtype=torch.float32) + grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij') # 生成 2D 网格 + assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' + pos_dim = embed_dim // 4 # 每个方向 (w, h) 的编码维度 + # 计算频率因子 + omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim + omega = 1. 
/ (temperature ** omega) + + # 计算宽度和高度的 sin 和 cos 编码 + out_w = grid_w.flatten()[..., None] @ omega[None] # [w*h, pos_dim] + out_h = grid_h.flatten()[..., None] @ omega[None] # [w*h, pos_dim] + + # 拼接 sin 和 cos 编码,形成最终的位置编码 + return torch.concat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None, :, :] + + def forward(self, feats): + """ + 前向传播函数 + Args: + feats (list[torch.Tensor]): 输入特征图列表,形状为 [B, C, H, W],长度需与 in_channels 一致 + Returns: + list[torch.Tensor]: 融合后的多尺度特征图列表 + """ + + # 检查输入特征图数量是否与预期一致 + assert len(feats) == len(self.in_channels) + + # 输入投影:将所有特征图投影到 hidden_dim 通道 + proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + + # Transformer 编码器:对指定层进行特征增强 + if self.num_encoder_layers > 0: + for i, enc_ind in enumerate(self.use_encoder_idx): + h, w = proj_feats[enc_ind].shape[2:] # 获取当前特征图的高度和宽度 + # 将特征图展平并调整维度:[B, C, H, W] -> [B, H*W, C] + src_flatten = proj_feats[enc_ind].flatten(2).permute(0, 2, 1) + # 根据训练或评估模式选择位置编码 + if self.training or self.eval_spatial_size is None: + # 训练时动态生成位置编码 + pos_embed = self.build_2d_sincos_position_embedding( + w, h, self.hidden_dim, self.pe_temperature).to(src_flatten.device) + else: + # 评估时使用预计算的位置编码 + pos_embed = getattr(self, f'pos_embed{enc_ind}', None).to(src_flatten.device) + + # Transformer 编码器处理 + memory = self.encoder[i](src_flatten, pos_embed=pos_embed) + # 将输出重塑回特征图形状:[B, H*W, C] -> [B, C, H, W] + proj_feats[enc_ind] = memory.permute(0, 2, 1).reshape(-1, self.hidden_dim, h, w).contiguous() + + fouce_feature1 = self.FocusFeature_1(proj_feats[::-1]) # 倒序是因为FocusFeature要求从小特征图到大特征图输入 + + fouce_feature1_to_p5_1 = self.p4_to_p5_down1(fouce_feature1) # fouce_feature1 to p5 + fouce_feature1_to_p5_2 = self.p5_block1(torch.cat([fouce_feature1_to_p5_1, proj_feats[2]], dim=1)) + + fouce_feature1_to_p3_1 = self.p4_to_p3_up1(fouce_feature1) # fouce_feature1 to p3 + fouce_feature1_to_p3_2 = self.p3_block1(torch.cat([fouce_feature1_to_p3_1, proj_feats[0]], dim=1)) + + fouce_feature2 = self.FocusFeature_2([fouce_feature1_to_p5_2, fouce_feature1, fouce_feature1_to_p3_2]) + + fouce_feature2_to_p5 = self.p4_to_p5_down2(fouce_feature2) # fouce_feature2 to p5 + fouce_feature2_to_p5 = self.p5_block2(torch.cat([fouce_feature2_to_p5, fouce_feature1_to_p5_1, fouce_feature1_to_p5_2], dim=1)) + + if len(self.out_strides) == 3: + fouce_feature2_to_p3 = self.p4_to_p3_up2(fouce_feature2) # fouce_feature2 to p3 + fouce_feature2_to_p3 = self.p3_block2(torch.cat([fouce_feature2_to_p3, fouce_feature1_to_p3_1, fouce_feature1_to_p3_2], dim=1)) + return [fouce_feature2_to_p3, fouce_feature2, fouce_feature2_to_p5] + else: + return [fouce_feature2, fouce_feature2_to_p5] + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + bs, image_height, image_width = 1, 640, 640 + params = { + 'in_channels' : [32, 64, 128], + 'feat_strides' : [8, 16, 32], + 'hidden_dim' : 128, + 'use_encoder_idx' : [2], + 'fdpn_ks' : [3, 5, 7, 9], + 'depth_mult' : 1.0, + 'out_strides' : [16, 32], + 'eval_spatial_size' : [image_height, image_width] + } + + feats = [torch.randn((bs, params['in_channels'][i], image_height // params['feat_strides'][i], image_width // params['feat_strides'][i])).to(device) for i in range(len(params['in_channels']))] + module = FDPN(**params).to(device) + outputs = module(feats) + + input_feats_info = ', '.join([str(i.size()) for i in feats]) + print(GREEN + 
f'input feature:[{input_feats_info}]' + RESET) + output_feats_info = ', '.join([str(i.size()) for i in outputs]) + print(GREEN + f'output feature:[{output_feats_info}]' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + args=[feats], + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/norm/dyt.py b/engine/extre_module/custom_nn/norm/dyt.py new file mode 100644 index 00000000..187e26f4 --- /dev/null +++ b/engine/extre_module/custom_nn/norm/dyt.py @@ -0,0 +1,53 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/CVPR2025-Dynamic Tanh.png +论文链接:https://arxiv.org/abs/2503.10622 +''' + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn + +class DynamicTanh(nn.Module): + def __init__(self, normalized_shape, channels_last=False, alpha_init_value=0.5): + super().__init__() + self.normalized_shape = normalized_shape + self.alpha_init_value = alpha_init_value + self.channels_last = channels_last + + self.alpha = nn.Parameter(torch.ones(1) * alpha_init_value) + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + + def forward(self, x): + x = torch.tanh(self.alpha * x) + if self.channels_last: + x = x * self.weight + self.bias + else: + x = x * self.weight[:, None, None] + self.bias[:, None, None] + return x + + def extra_repr(self): + return f"normalized_shape={self.normalized_shape}, alpha_init_value={self.alpha_init_value}, channels_last={self.channels_last}" + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel, height, width = 1, 16, 32, 32 + inputs = torch.randn((batch_size, channel, height, width)).to(device) + + module = DynamicTanh(channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/norm/repbn.py b/engine/extre_module/custom_nn/norm/repbn.py new file mode 100644 index 00000000..71f0a972 --- /dev/null +++ b/engine/extre_module/custom_nn/norm/repbn.py @@ -0,0 +1,69 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/ICML2024-RepBN.png +论文链接:https://arxiv.org/pdf/2405.11582 +''' + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn + +class RepBN(nn.Module): + def __init__(self, channels): + super(RepBN, self).__init__() + self.alpha = nn.Parameter(torch.ones(1)) + self.bn = nn.BatchNorm1d(channels) + + def forward(self, x): + x = x.transpose(1, 2) + x = self.bn(x) + self.alpha * x + x = x.transpose(1, 2) + return x + +class LinearNorm(nn.Module): + def __init__(self, dim, norm1=nn.LayerNorm, norm2=RepBN, warm=0, step=10000, r0=1.0): + super(LinearNorm, self).__init__() + self.register_buffer('warm', torch.tensor(warm)) + self.register_buffer('iter', torch.tensor(step)) + self.register_buffer('total_step', torch.tensor(step)) + self.r0 = r0 + self.norm1 = norm1(dim) + self.norm2 = norm2(dim) + + def 
forward(self, x): + if self.training: + if self.warm > 0: + self.warm.copy_(self.warm - 1) + x = self.norm1(x) + else: + lamda = self.r0 * self.iter / self.total_step + if self.iter > 0: + self.iter.copy_(self.iter - 1) + x1 = self.norm1(x) + x2 = self.norm2(x) + x = lamda * x1 + (1 - lamda) * x2 + else: + x = self.norm2(x) + return x + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel, height, width = 1, 16, 32, 32 + inputs = torch.randn((batch_size, height * width, channel)).to(device) + + module = LinearNorm(channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, height * width, channel), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/stem/SRFD.py b/engine/extre_module/custom_nn/stem/SRFD.py new file mode 100644 index 00000000..a14d21c3 --- /dev/null +++ b/engine/extre_module/custom_nn/stem/SRFD.py @@ -0,0 +1,105 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/SRFD.png +论文链接:https://ieeexplore.ieee.org/document/10142024 +''' + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn + +class Cut(nn.Module): + def __init__(self, in_channels, out_channels): + super().__init__() + self.conv_fusion = nn.Conv2d(in_channels * 4, out_channels, kernel_size=1, stride=1) + self.batch_norm = nn.BatchNorm2d(out_channels) + + def forward(self, x): + x0 = x[:, :, 0::2, 0::2] # x = [B, C, H/2, W/2] + x1 = x[:, :, 1::2, 0::2] + x2 = x[:, :, 0::2, 1::2] + x3 = x[:, :, 1::2, 1::2] + x = torch.cat([x0, x1, x2, x3], dim=1) # x = [B, 4*C, H/2, W/2] + x = self.conv_fusion(x) # x = [B, out_channels, H/2, W/2] + x = self.batch_norm(x) + return x + +class SRFD(nn.Module): + def __init__(self, in_channels=3, out_channels=96): + super().__init__() + out_c14 = int(out_channels / 4) # out_channels / 4 + out_c12 = int(out_channels / 2) # out_channels / 2 + + # 7x7 convolution with stride 1 for feature reinforcement, Channels from 3 to 1/4C. + self.conv_init = nn.Conv2d(in_channels, out_c14, kernel_size=7, stride=1, padding=3) + + # original size to 2x downsampling layer + self.conv_1 = nn.Conv2d(out_c14, out_c12, kernel_size=3, stride=1, padding=1, groups=out_c14) + self.conv_x1 = nn.Conv2d(out_c12, out_c12, kernel_size=3, stride=2, padding=1, groups=out_c12) + self.batch_norm_x1 = nn.BatchNorm2d(out_c12) + self.cut_c = Cut(out_c14, out_c12) + self.fusion1 = nn.Conv2d(out_channels, out_c12, kernel_size=1, stride=1) + + # 2x to 4x downsampling layer + self.conv_2 = nn.Conv2d(out_c12, out_channels, kernel_size=3, stride=1, padding=1, groups=out_c12) + self.conv_x2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=2, padding=1, groups=out_channels) + self.batch_norm_x2 = nn.BatchNorm2d(out_channels) + self.max_m = nn.MaxPool2d(kernel_size=2, stride=2) + self.batch_norm_m = nn.BatchNorm2d(out_channels) + self.cut_r = Cut(out_c12, out_channels) + self.fusion2 = nn.Conv2d(out_channels * 3, out_channels, kernel_size=1, stride=1) + + def forward(self, x): + # 7x7 convolution with stride 1 for feature reinforcement, Channels from 3 to 1/4C. 
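+        # (Added note, not in the original source) Shape walkthrough assuming the
+        # defaults in_channels=3, out_channels=96, so out_c14=24 and out_c12=48:
+        #   conv_init : [B, 3, H, W]      -> [B, 24, H, W]
+        #   stage 1   : [B, 24, H, W]     -> [B, 48, H/2, W/2]   (ConvD + CutD, fused by fusion1)
+        #   stage 2   : [B, 48, H/2, W/2] -> [B, 96, H/4, W/4]   (ConvD + MaxD + CutD, fused by fusion2)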
+ x = self.conv_init(x) # x = [B, C/4, H, W] + + # original size to 2x downsampling layer + c = x # c = [B, C/4, H, W] + # CutD + c = self.cut_c(c) # c = [B, C, H/2, W/2] --> [B, C/2, H/2, W/2] + # ConvD + x = self.conv_1(x) # x = [B, C/4, H, W] --> [B, C/2, H/2, W/2] + x = self.conv_x1(x) # x = [B, C/2, H/2, W/2] + x = self.batch_norm_x1(x) + # Concat + conv + x = torch.cat([x, c], dim=1) # x = [B, C, H/2, W/2] + x = self.fusion1(x) # x = [B, C, H/2, W/2] --> [B, C/2, H/2, W/2] + + # 2x to 4x downsampling layer + r = x # r = [B, C/2, H/2, W/2] + x = self.conv_2(x) # x = [B, C/2, H/2, W/2] --> [B, C, H/2, W/2] + m = x # m = [B, C, H/2, W/2] + # ConvD + x = self.conv_x2(x) # x = [B, C, H/4, W/4] + x = self.batch_norm_x2(x) + # MaxD + m = self.max_m(m) # m = [B, C, H/4, W/4] + m = self.batch_norm_m(m) + # CutD + r = self.cut_r(r) # r = [B, C, H/4, W/4] + # Concat + conv + x = torch.cat([x, r, m], dim=1) # x = [B, C*3, H/4, W/4] + x = self.fusion2(x) # x = [B, C*3, H/4, W/4] --> [B, C, H/4, W/4] + return x # x = [B, C, H/4, W/4] + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, in_channel, out_channel, height, width = 1, 16, 32, 32, 32 + inputs = torch.randn((batch_size, in_channel, height, width)).to(device) + + module = SRFD(in_channel, out_channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, in_channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/transformer/CascadedGroupAttention.py b/engine/extre_module/custom_nn/transformer/CascadedGroupAttention.py new file mode 100644 index 00000000..166221b6 --- /dev/null +++ b/engine/extre_module/custom_nn/transformer/CascadedGroupAttention.py @@ -0,0 +1,178 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/CVPR2023-Cascaded Group Attention.png +论文链接:https://arxiv.org/pdf/2305.07027 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import itertools +import torch + +from engine.extre_module.ultralytics_nn.conv import Conv + +class CascadedGroupAtt(torch.nn.Module): + r""" Cascaded Group Attention. + + Args: + dim (int): Number of input channels. + key_dim (int): The dimension for query and key. + num_heads (int): Number of attention heads. + attn_ratio (int): Multiplier for the query dim for value dimension. + resolution (int): Input resolution, correspond to the window size. + kernels (List[int]): The kernel size of the dw conv on query. 
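+
+    Note (added for clarity, not from the paper text): with num_heads heads the input
+    channels are split into num_heads equal groups; head i takes group i plus the
+    output of head i-1, so the heads form a cascade rather than running fully in
+    parallel. resolution must equal the spatial size of the input to this module
+    (i.e. the window size), since the relative attention biases are built for
+    exactly resolution * resolution positions.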
+ """ + def __init__(self, dim, key_dim, num_heads=4, + attn_ratio=4, + resolution=14, + kernels=[5, 5, 5, 5]): + super().__init__() + self.num_heads = num_heads + self.scale = key_dim ** -0.5 + self.key_dim = key_dim + self.d = dim // num_heads + self.attn_ratio = attn_ratio + + qkvs = [] + dws = [] + for i in range(num_heads): + qkvs.append(Conv(dim // (num_heads), self.key_dim * 2 + self.d, act=False)) + dws.append(Conv(self.key_dim, self.key_dim, kernels[i], g=self.key_dim, act=False)) + self.qkvs = torch.nn.ModuleList(qkvs) + self.dws = torch.nn.ModuleList(dws) + self.proj = torch.nn.Sequential(torch.nn.ReLU(), Conv(self.d * num_heads, dim, act=False)) + + points = list(itertools.product(range(resolution), range(resolution))) + N = len(points) + attention_offsets = {} + idxs = [] + for p1 in points: + for p2 in points: + offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1])) + if offset not in attention_offsets: + attention_offsets[offset] = len(attention_offsets) + idxs.append(attention_offsets[offset]) + self.attention_biases = torch.nn.Parameter( + torch.zeros(num_heads, len(attention_offsets))) + self.register_buffer('attention_bias_idxs', torch.LongTensor(idxs).view(N, N)) + + @torch.no_grad() + def train(self, mode=True): + super().train(mode) + if mode and hasattr(self, 'ab'): + del self.ab + else: + self.ab = self.attention_biases[:, self.attention_bias_idxs] + + def forward(self, x): # x (B,C,H,W) + B, C, H, W = x.shape + trainingab = self.attention_biases[:, self.attention_bias_idxs] + feats_in = x.chunk(len(self.qkvs), dim=1) + feats_out = [] + feat = feats_in[0] + for i, qkv in enumerate(self.qkvs): + if i > 0: # add the previous output to the input + feat = feat + feats_in[i] + feat = qkv(feat) + q, k, v = feat.view(B, -1, H, W).split([self.key_dim, self.key_dim, self.d], dim=1) # B, C/h, H, W + q = self.dws[i](q) + q, k, v = q.flatten(2), k.flatten(2), v.flatten(2) # B, C/h, N + attn = ( + (q.transpose(-2, -1) @ k) * self.scale + + + (trainingab[i] if self.training else self.ab[i]) + ) + attn = attn.softmax(dim=-1) # BNN + feat = (v @ attn.transpose(-2, -1)).view(B, self.d, H, W) # BCHW + feats_out.append(feat) + x = self.proj(torch.cat(feats_out, 1)) + return x + + +class CascadedGroupAttention(torch.nn.Module): + r""" Local Window Attention. CVPR2023-EfficientViT + + Args: + dim (int): Number of input channels. + key_dim (int): The dimension for query and key. + num_heads (int): Number of attention heads. + attn_ratio (int): Multiplier for the query dim for value dimension. + resolution (int): Input resolution. + window_resolution (int): Local window resolution. + kernels (List[int]): The kernel size of the dw conv on query. 
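+
+    Example (illustrative only, added by the editor; a 20x20 input is larger than
+    the default 7x7 window, so it exercises the pad / partition / reverse path):
+        >>> attn = CascadedGroupAttention(dim=64)
+        >>> attn(torch.randn(1, 64, 20, 20)).shape
+        torch.Size([1, 64, 20, 20])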
+ """ + def __init__(self, dim, key_dim=16, num_heads=4, + attn_ratio=4, + resolution=14, + window_resolution=7, + kernels=[5, 5, 5, 5]): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.resolution = resolution + assert window_resolution > 0, 'window_size must be greater than 0' + self.window_resolution = window_resolution + + self.attn = CascadedGroupAtt(dim, key_dim, num_heads, + attn_ratio=attn_ratio, + resolution=window_resolution, + kernels=kernels) + + def forward(self, x): + B, C, H, W = x.shape + + if H <= self.window_resolution and W <= self.window_resolution: + x = self.attn(x) + else: + x = x.permute(0, 2, 3, 1) + pad_b = (self.window_resolution - H % + self.window_resolution) % self.window_resolution + pad_r = (self.window_resolution - W % + self.window_resolution) % self.window_resolution + padding = pad_b > 0 or pad_r > 0 + + if padding: + x = torch.nn.functional.pad(x, (0, 0, 0, pad_r, 0, pad_b)) + + pH, pW = H + pad_b, W + pad_r + nH = pH // self.window_resolution + nW = pW // self.window_resolution + # window partition, BHWC -> B(nHh)(nWw)C -> BnHnWhwC -> (BnHnW)hwC -> (BnHnW)Chw + x = x.view(B, nH, self.window_resolution, nW, self.window_resolution, C).transpose(2, 3).reshape( + B * nH * nW, self.window_resolution, self.window_resolution, C + ).permute(0, 3, 1, 2) + x = self.attn(x) + # window reverse, (BnHnW)Chw -> (BnHnW)hwC -> BnHnWhwC -> B(nHh)(nWw)C -> BHWC + x = x.permute(0, 2, 3, 1).view(B, nH, nW, self.window_resolution, self.window_resolution, + C).transpose(2, 3).reshape(B, pH, pW, C) + + if padding: + x = x[:, :H, :W].contiguous() + + x = x.permute(0, 3, 1, 2) + + return x + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel, height, width = 1, 16, 20, 20 + inputs = torch.randn((batch_size, channel, height, width)).to(device) + + module = CascadedGroupAttention(channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/transformer/DAttention.py b/engine/extre_module/custom_nn/transformer/DAttention.py new file mode 100644 index 00000000..b6087f12 --- /dev/null +++ b/engine/extre_module/custom_nn/transformer/DAttention.py @@ -0,0 +1,250 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/CVPR2022-DAttention.png +论文链接:https://arxiv.org/abs/2201.00520 +''' + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import numpy as np +import torch, einops +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.init import trunc_normal_ + +class LayerNormProxy(nn.Module): + def __init__(self, dim): + super().__init__() + self.norm = nn.LayerNorm(dim) + + def forward(self, x): + x = einops.rearrange(x, 'b c h w -> b h w c') + x = self.norm(x) + return einops.rearrange(x, 'b h w c -> b c h w') + +class DAttention(nn.Module): + # Vision Transformer with Deformable Attention CVPR2022 + # fixed_pe=True need adujust 640x640 + def __init__( + self, channel, q_size, n_heads=8, n_groups=4, + attn_drop=0.0, proj_drop=0.0, stride=1, + offset_range_factor=4, use_pe=True, 
dwc_pe=True, + no_off=False, fixed_pe=False, ksize=3, log_cpb=False, kv_size=None + ): + super().__init__() + n_head_channels = channel // n_heads + self.dwc_pe = dwc_pe + self.n_head_channels = n_head_channels + self.scale = self.n_head_channels ** -0.5 + self.n_heads = n_heads + self.q_h, self.q_w = q_size + # self.kv_h, self.kv_w = kv_size + self.kv_h, self.kv_w = self.q_h // stride, self.q_w // stride + self.nc = n_head_channels * n_heads + self.n_groups = n_groups + self.n_group_channels = self.nc // self.n_groups + self.n_group_heads = self.n_heads // self.n_groups + self.use_pe = use_pe + self.fixed_pe = fixed_pe + self.no_off = no_off + self.offset_range_factor = offset_range_factor + self.ksize = ksize + self.log_cpb = log_cpb + self.stride = stride + kk = self.ksize + pad_size = kk // 2 if kk != stride else 0 + + self.conv_offset = nn.Sequential( + nn.Conv2d(self.n_group_channels, self.n_group_channels, kk, stride, pad_size, groups=self.n_group_channels), + LayerNormProxy(self.n_group_channels), + nn.GELU(), + nn.Conv2d(self.n_group_channels, 2, 1, 1, 0, bias=False) + ) + if self.no_off: + for m in self.conv_offset.parameters(): + m.requires_grad_(False) + + self.proj_q = nn.Conv2d( + self.nc, self.nc, + kernel_size=1, stride=1, padding=0 + ) + + self.proj_k = nn.Conv2d( + self.nc, self.nc, + kernel_size=1, stride=1, padding=0 + ) + + self.proj_v = nn.Conv2d( + self.nc, self.nc, + kernel_size=1, stride=1, padding=0 + ) + + self.proj_out = nn.Conv2d( + self.nc, self.nc, + kernel_size=1, stride=1, padding=0 + ) + + self.proj_drop = nn.Dropout(proj_drop, inplace=True) + self.attn_drop = nn.Dropout(attn_drop, inplace=True) + + if self.use_pe and not self.no_off: + if self.dwc_pe: + self.rpe_table = nn.Conv2d( + self.nc, self.nc, kernel_size=3, stride=1, padding=1, groups=self.nc) + elif self.fixed_pe: + self.rpe_table = nn.Parameter( + torch.zeros(self.n_heads, self.q_h * self.q_w, self.kv_h * self.kv_w) + ) + trunc_normal_(self.rpe_table, std=0.01) + elif self.log_cpb: + # Borrowed from Swin-V2 + self.rpe_table = nn.Sequential( + nn.Linear(2, 32, bias=True), + nn.ReLU(inplace=True), + nn.Linear(32, self.n_group_heads, bias=False) + ) + else: + self.rpe_table = nn.Parameter( + torch.zeros(self.n_heads, self.q_h * 2 - 1, self.q_w * 2 - 1) + ) + trunc_normal_(self.rpe_table, std=0.01) + else: + self.rpe_table = None + + @torch.no_grad() + def _get_ref_points(self, H_key, W_key, B, dtype, device): + + ref_y, ref_x = torch.meshgrid( + torch.linspace(0.5, H_key - 0.5, H_key, dtype=dtype, device=device), + torch.linspace(0.5, W_key - 0.5, W_key, dtype=dtype, device=device), + indexing='ij' + ) + ref = torch.stack((ref_y, ref_x), -1) + ref[..., 1].div_(W_key - 1.0).mul_(2.0).sub_(1.0) + ref[..., 0].div_(H_key - 1.0).mul_(2.0).sub_(1.0) + ref = ref[None, ...].expand(B * self.n_groups, -1, -1, -1) # B * g H W 2 + + return ref + + @torch.no_grad() + def _get_q_grid(self, H, W, B, dtype, device): + + ref_y, ref_x = torch.meshgrid( + torch.arange(0, H, dtype=dtype, device=device), + torch.arange(0, W, dtype=dtype, device=device), + indexing='ij' + ) + ref = torch.stack((ref_y, ref_x), -1) + ref[..., 1].div_(W - 1.0).mul_(2.0).sub_(1.0) + ref[..., 0].div_(H - 1.0).mul_(2.0).sub_(1.0) + ref = ref[None, ...].expand(B * self.n_groups, -1, -1, -1) # B * g H W 2 + + return ref + + def forward(self, x): + + B, C, H, W = x.size() + dtype, device = x.dtype, x.device + + q = self.proj_q(x) + q_off = einops.rearrange(q, 'b (g c) h w -> (b g) c h w', g=self.n_groups, c=self.n_group_channels) + offset = 
self.conv_offset(q_off).contiguous() # B * g 2 Hg Wg + Hk, Wk = offset.size(2), offset.size(3) + n_sample = Hk * Wk + + if self.offset_range_factor >= 0 and not self.no_off: + offset_range = torch.tensor([1.0 / (Hk - 1.0), 1.0 / (Wk - 1.0)], device=device).reshape(1, 2, 1, 1) + offset = offset.tanh().mul(offset_range).mul(self.offset_range_factor) + + offset = einops.rearrange(offset, 'b p h w -> b h w p') + reference = self._get_ref_points(Hk, Wk, B, dtype, device) + + if self.no_off: + offset = offset.fill_(0.0) + + if self.offset_range_factor >= 0: + pos = offset + reference + else: + pos = (offset + reference).clamp(-1., +1.) + + if self.no_off: + x_sampled = F.avg_pool2d(x, kernel_size=self.stride, stride=self.stride) + assert x_sampled.size(2) == Hk and x_sampled.size(3) == Wk, f"Size is {x_sampled.size()}" + else: + pos = pos.type(x.dtype) + x_sampled = F.grid_sample( + input=x.reshape(B * self.n_groups, self.n_group_channels, H, W), + grid=pos[..., (1, 0)], # y, x -> x, y + mode='bilinear', align_corners=True) # B * g, Cg, Hg, Wg + + + x_sampled = x_sampled.reshape(B, C, 1, n_sample) + + q = q.reshape(B * self.n_heads, self.n_head_channels, H * W) + k = self.proj_k(x_sampled).reshape(B * self.n_heads, self.n_head_channels, n_sample) + v = self.proj_v(x_sampled).reshape(B * self.n_heads, self.n_head_channels, n_sample) + + attn = torch.einsum('b c m, b c n -> b m n', q, k) # B * h, HW, Ns + attn = attn.mul(self.scale) + + if self.use_pe and (not self.no_off): + + if self.dwc_pe: + residual_lepe = self.rpe_table(q.reshape(B, C, H, W)).reshape(B * self.n_heads, self.n_head_channels, H * W) + elif self.fixed_pe: + rpe_table = self.rpe_table + attn_bias = rpe_table[None, ...].expand(B, -1, -1, -1) + attn = attn + attn_bias.reshape(B * self.n_heads, H * W, n_sample) + elif self.log_cpb: + q_grid = self._get_q_grid(H, W, B, dtype, device) + displacement = (q_grid.reshape(B * self.n_groups, H * W, 2).unsqueeze(2) - pos.reshape(B * self.n_groups, n_sample, 2).unsqueeze(1)).mul(4.0) # d_y, d_x [-8, +8] + displacement = torch.sign(displacement) * torch.log2(torch.abs(displacement) + 1.0) / np.log2(8.0) + attn_bias = self.rpe_table(displacement) # B * g, H * W, n_sample, h_g + attn = attn + einops.rearrange(attn_bias, 'b m n h -> (b h) m n', h=self.n_group_heads) + else: + rpe_table = self.rpe_table + rpe_bias = rpe_table[None, ...].expand(B, -1, -1, -1) + q_grid = self._get_q_grid(H, W, B, dtype, device) + displacement = (q_grid.reshape(B * self.n_groups, H * W, 2).unsqueeze(2) - pos.reshape(B * self.n_groups, n_sample, 2).unsqueeze(1)).mul(0.5) + attn_bias = F.grid_sample( + input=einops.rearrange(rpe_bias, 'b (g c) h w -> (b g) c h w', c=self.n_group_heads, g=self.n_groups), + grid=displacement[..., (1, 0)], + mode='bilinear', align_corners=True) # B * g, h_g, HW, Ns + + attn_bias = attn_bias.reshape(B * self.n_heads, H * W, n_sample) + attn = attn + attn_bias + + attn = F.softmax(attn, dim=2) + attn = self.attn_drop(attn) + + out = torch.einsum('b m n, b c n -> b c m', attn, v) + + if self.use_pe and self.dwc_pe: + out = out + residual_lepe + out = out.reshape(B, C, H, W) + + y = self.proj_drop(self.proj_out(out)) + + return y + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel, height, width = 1, 16, 20, 20 + inputs = torch.randn((batch_size, channel, height, width)).to(device) + + # 
此模块不支持多尺度训练,q_size的参数是一个元组,其为当前特征图的height,width。 + module = DAttention(channel, q_size=(height, width)).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/transformer/DPBAttention.py b/engine/extre_module/custom_nn/transformer/DPBAttention.py new file mode 100644 index 00000000..b099a44e --- /dev/null +++ b/engine/extre_module/custom_nn/transformer/DPBAttention.py @@ -0,0 +1,160 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/ICLR2022-DPBAttention.png +论文链接:https://arxiv.org/pdf/2108.00154 +''' + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn + +class DynamicPosBias(nn.Module): + r"""DPB module + + Use a MLP to predict position bias used in attention. + """ + def __init__(self, dim, num_heads, residual): + super().__init__() + self.residual = residual + self.num_heads = num_heads + self.pos_dim = dim // 4 + self.pos_proj = nn.Linear(2, self.pos_dim) + self.pos1 = nn.Sequential( + nn.LayerNorm(self.pos_dim), + nn.ReLU(inplace=True), + nn.Linear(self.pos_dim, self.pos_dim), + ) + self.pos2 = nn.Sequential( + nn.LayerNorm(self.pos_dim), + nn.ReLU(inplace=True), + nn.Linear(self.pos_dim, self.pos_dim) + ) + self.pos3 = nn.Sequential( + nn.LayerNorm(self.pos_dim), + nn.ReLU(inplace=True), + nn.Linear(self.pos_dim, self.num_heads) + ) + def forward(self, biases): + if self.residual: + pos = self.pos_proj(biases) # 2Wh-1 * 2Ww-1, heads + pos = pos + self.pos1(pos) + pos = pos + self.pos2(pos) + pos = self.pos3(pos) + else: + pos = self.pos3(self.pos2(self.pos1(self.pos_proj(biases)))) + return pos + + def flops(self, N): + flops = N * 2 * self.pos_dim + flops += N * self.pos_dim * self.pos_dim + flops += N * self.pos_dim * self.pos_dim + flops += N * self.pos_dim * self.num_heads + return flops + +class DPB_Attention(nn.Module): + r""" Multi-head self attention module with relative position bias. + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__(self, dim, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0., + position_bias=True): + + super().__init__() + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + self.position_bias = position_bias + if self.position_bias: + self.pos = DynamicPosBias(self.dim // 4, self.num_heads, residual=False) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Gh*Gw, Gh*Gw) or None + """ + B_, C, H, W = x.shape + group_size, N = (H, W), H * W + x = x.flatten(2).permute(0, 2, 1) + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4).contiguous() + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1).contiguous()) # (num_windows*B, N, N), N = Gh*Gw + + if self.position_bias: + # generate mother-set + position_bias_h = torch.arange(1 - group_size[0], group_size[0], device=attn.device) + position_bias_w = torch.arange(1 - group_size[1], group_size[1], device=attn.device) + biases = torch.stack(torch.meshgrid([position_bias_h, position_bias_w])) # 2, 2Gh-1, 2W2-1 + biases = biases.flatten(1).transpose(0, 1).contiguous().float() + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(group_size[0], device=attn.device) + coords_w = torch.arange(group_size[1], device=attn.device) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Gh, Gw + coords_flatten = torch.flatten(coords, 1) # 2, Gh*Gw + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Gh*Gw, Gh*Gw + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Gh*Gw, Gh*Gw, 2 + relative_coords[:, :, 0] += group_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += group_size[1] - 1 + relative_coords[:, :, 0] *= 2 * group_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Gh*Gw, Gh*Gw + + pos = self.pos(biases) # 2Gh-1 * 2Gw-1, heads + # select position bias + relative_position_bias = pos[relative_position_index.view(-1)].view( + group_size[0] * group_size[1], group_size[0] * group_size[1], -1) # Gh*Gw,Gh*Gw,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Gh*Gw, Gh*Gw + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nG = mask.shape[0] + attn = attn.view(B_ // nG, nG, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) # (B, nG, nHead, N, N) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).contiguous().reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x.permute(0, 2, 1).view([B_, C, H, W]).contiguous() + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel, height, width = 1, 32, 20, 20 + inputs = torch.randn((batch_size, channel, height, width)).to(device) + + 
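+    # (Added note) DPB_Attention consumes an NCHW feature map, flattens it to (B, H*W, C)
+    # internally, and rebuilds the dynamic relative-position bias from the current H and W
+    # on every call, so it is not tied to a fixed input resolution.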
module = DPB_Attention(channel, num_heads=8).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/transformer/PolaLinearAttention.py b/engine/extre_module/custom_nn/transformer/PolaLinearAttention.py new file mode 100644 index 00000000..14cef73a --- /dev/null +++ b/engine/extre_module/custom_nn/transformer/PolaLinearAttention.py @@ -0,0 +1,126 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/ICLR2025-PolaLinearAttention.png +论文链接:https://arxiv.org/abs/2501.15061 +''' + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn + +class PolaLinearAttention(nn.Module): + def __init__(self, dim, hw, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1, + kernel_size=5, alpha=4): + super().__init__() + assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." + + self.h = hw[0] + self.w = hw[1] + + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.head_dim = head_dim + + self.qg = nn.Linear(dim, 2 * dim, bias=qkv_bias) + self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.sr_ratio = sr_ratio + if sr_ratio > 1: + self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.norm = nn.LayerNorm(dim) + + self.dwc = nn.Conv2d(in_channels=head_dim, out_channels=head_dim, kernel_size=kernel_size, + groups=head_dim, padding=kernel_size // 2) + + self.power = nn.Parameter(torch.zeros(size=(1, self.num_heads, 1, self.head_dim))) + self.alpha = alpha + + self.scale = nn.Parameter(torch.zeros(size=(1, 1, dim))) + self.positional_encoding = nn.Parameter(torch.zeros(size=(1, (self.w * self.h) // (sr_ratio * sr_ratio), dim))) + + def forward(self, x): + B, N, C = x.shape + q, g = self.qg(x).reshape(B, N, 2, C).unbind(2) + + if self.sr_ratio > 1: + x_ = x.permute(0, 2, 1).reshape(B, C, self.h, self.w) + x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1) + x_ = self.norm(x_) + kv = self.kv(x_).reshape(B, -1, 2, C).permute(2, 0, 1, 3) + else: + kv = self.kv(x).reshape(B, -1, 2, C).permute(2, 0, 1, 3) + k, v = kv[0], kv[1] + n = k.shape[1] + + k = k + self.positional_encoding + kernel_function = nn.ReLU() + + scale = nn.Softplus()(self.scale) + power = 1 + self.alpha * nn.functional.sigmoid(self.power) + + q = q / scale + k = k / scale + q = q.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3).contiguous() + k = k.reshape(B, n, self.num_heads, -1).permute(0, 2, 1, 3).contiguous() + v = v.reshape(B, n, self.num_heads, -1).permute(0, 2, 1, 3).contiguous() + + q_pos = kernel_function(q) ** power + q_neg = kernel_function(-q) ** power + k_pos = kernel_function(k) ** power + k_neg = kernel_function(-k) ** power + + q_sim = torch.cat([q_pos, q_neg],dim=-1) + q_opp = torch.cat([q_neg, q_pos],dim=-1) + k = torch.cat([k_pos, k_neg],dim=-1) + + v1,v2 = torch.chunk(v,2,dim=-1) + + z = 1 / (q_sim @ k.mean(dim=-2, keepdim=True).transpose(-2, -1) + 1e-6) + kv = (k.transpose(-2, -1) * (n ** -0.5)) @ (v1 * (n ** -0.5)) + x_sim = q_sim @ kv * z + z = 1 / (q_opp @ k.mean(dim=-2, keepdim=True).transpose(-2, 
-1) + 1e-6) + kv = (k.transpose(-2, -1) * (n ** -0.5)) @ (v2 * (n ** -0.5)) + x_opp = q_opp @ kv * z + + x = torch.cat([x_sim, x_opp],dim=-1) + x = x.transpose(1, 2).reshape(B, N, C) + + if self.sr_ratio > 1: + v = nn.functional.interpolate(v.transpose(-2, -1).reshape(B * self.num_heads, -1, n), size=N, mode='linear').reshape(B, self.num_heads, -1, N).transpose(-2, -1) + + v = v.reshape(B * self.num_heads, self.h, self.w, -1).permute(0, 3, 1, 2) + v = self.dwc(v).reshape(B, C, N).permute(0, 2, 1) + x = x + v + x = x * g + + x = self.proj(x) + x = self.proj_drop(x) + + return x + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel, height, width = 1, 16, 32, 32 + inputs = torch.randn((batch_size, height * width, channel)).to(device) + + # 此模块不支持多尺度训练,hw的参数是一个元组,其为当前特征图的height,width。 + module = PolaLinearAttention(channel, hw=(height, width)).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, height * width, channel), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/transformer/biformer.py b/engine/extre_module/custom_nn/transformer/biformer.py new file mode 100644 index 00000000..8152625b --- /dev/null +++ b/engine/extre_module/custom_nn/transformer/biformer.py @@ -0,0 +1,481 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/CVPR2023-BiLevelRoutingAttention.png +论文链接:https://arxiv.org/pdf/2303.08810 +''' + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from torch import nn, Tensor, LongTensor +from typing import Tuple, Optional, List + +class TopkRouting(nn.Module): + """ + differentiable topk routing with scaling + Args: + qk_dim: int, feature dimension of query and key + topk: int, the 'topk' + qk_scale: int or None, temperature (multiply) of softmax activation + with_param: bool, wether inorporate learnable params in routing unit + diff_routing: bool, wether make routing differentiable + soft_routing: bool, wether make output value multiplied by routing weights + """ + def __init__(self, qk_dim, topk=4, qk_scale=None, param_routing=False, diff_routing=False): + super().__init__() + self.topk = topk + self.qk_dim = qk_dim + self.scale = qk_scale or qk_dim ** -0.5 + self.diff_routing = diff_routing + # TODO: norm layer before/after linear? 
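+        # (Added note) self.emb is an optional learnable projection applied to the
+        # window-level query/key descriptors; with param_routing=False it falls back to
+        # nn.Identity and the routing scores come purely from the pooled q/k similarity.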
+ self.emb = nn.Linear(qk_dim, qk_dim) if param_routing else nn.Identity() + # routing activation + self.routing_act = nn.Softmax(dim=-1) + + def forward(self, query:Tensor, key:Tensor)->Tuple[Tensor]: + """ + Args: + q, k: (n, p^2, c) tensor + Return: + r_weight, topk_index: (n, p^2, topk) tensor + """ + if not self.diff_routing: + query, key = query.detach(), key.detach() + query_hat, key_hat = self.emb(query), self.emb(key) # per-window pooling -> (n, p^2, c) + attn_logit = (query_hat*self.scale) @ key_hat.transpose(-2, -1) # (n, p^2, p^2) + topk_attn_logit, topk_index = torch.topk(attn_logit, k=self.topk, dim=-1) # (n, p^2, k), (n, p^2, k) + r_weight = self.routing_act(topk_attn_logit) # (n, p^2, k) + + return r_weight, topk_index + + +class KVGather(nn.Module): + def __init__(self, mul_weight='none'): + super().__init__() + assert mul_weight in ['none', 'soft', 'hard'] + self.mul_weight = mul_weight + + def forward(self, r_idx:Tensor, r_weight:Tensor, kv:Tensor): + """ + r_idx: (n, p^2, topk) tensor + r_weight: (n, p^2, topk) tensor + kv: (n, p^2, w^2, c_kq+c_v) + + Return: + (n, p^2, topk, w^2, c_kq+c_v) tensor + """ + # select kv according to routing index + n, p2, w2, c_kv = kv.size() + topk = r_idx.size(-1) + # print(r_idx.size(), r_weight.size()) + # FIXME: gather consumes much memory (topk times redundancy), write cuda kernel? + topk_kv = torch.gather(kv.view(n, 1, p2, w2, c_kv).expand(-1, p2, -1, -1, -1), # (n, p^2, p^2, w^2, c_kv) without mem cpy + dim=2, + index=r_idx.view(n, p2, topk, 1, 1).expand(-1, -1, -1, w2, c_kv) # (n, p^2, k, w^2, c_kv) + ) + + if self.mul_weight == 'soft': + topk_kv = r_weight.view(n, p2, topk, 1, 1) * topk_kv # (n, p^2, k, w^2, c_kv) + elif self.mul_weight == 'hard': + raise NotImplementedError('differentiable hard routing TBA') + # else: #'none' + # topk_kv = topk_kv # do nothing + + return topk_kv + +class QKVLinear(nn.Module): + def __init__(self, dim, qk_dim, bias=True): + super().__init__() + self.dim = dim + self.qk_dim = qk_dim + self.qkv = nn.Linear(dim, qk_dim + qk_dim + dim, bias=bias) + + def forward(self, x): + q, kv = self.qkv(x).split([self.qk_dim, self.qk_dim+self.dim], dim=-1) + return q, kv + +class BiLevelRoutingAttention(nn.Module): + """ + n_win: number of windows in one side (so the actual number of windows is n_win*n_win) + kv_per_win: for kv_downsample_mode='ada_xxxpool' only, number of key/values per window. Similar to n_win, the actual number is kv_per_win*kv_per_win. + topk: topk for window filtering + param_attention: 'qkvo'-linear for q,k,v and o, 'none': param free attention + param_routing: extra linear for routing + diff_routing: wether to set routing differentiable + soft_routing: wether to multiply soft routing weights + """ + def __init__(self, dim, num_heads=8, n_win=7, qk_dim=None, qk_scale=None, + kv_per_win=4, kv_downsample_ratio=4, kv_downsample_kernel=None, kv_downsample_mode='identity', + topk=4, param_attention="qkvo", param_routing=False, diff_routing=False, soft_routing=False, side_dwconv=3, + auto_pad=True): + super().__init__() + # local attention setting + self.dim = dim + self.n_win = n_win # Wh, Ww + self.num_heads = num_heads + self.qk_dim = qk_dim or dim + assert self.qk_dim % num_heads == 0 and self.dim % num_heads==0, 'qk_dim and dim must be divisible by num_heads!' + self.scale = qk_scale or self.qk_dim ** -0.5 + + + ################side_dwconv (i.e. 
LCE in ShuntedTransformer)########### + self.lepe = nn.Conv2d(dim, dim, kernel_size=side_dwconv, stride=1, padding=side_dwconv//2, groups=dim) if side_dwconv > 0 else \ + lambda x: torch.zeros_like(x) + + ################ global routing setting ################# + self.topk = topk + self.param_routing = param_routing + self.diff_routing = diff_routing + self.soft_routing = soft_routing + # router + assert not (self.param_routing and not self.diff_routing) # cannot be with_param=True and diff_routing=False + self.router = TopkRouting(qk_dim=self.qk_dim, + qk_scale=self.scale, + topk=self.topk, + diff_routing=self.diff_routing, + param_routing=self.param_routing) + if self.soft_routing: # soft routing, always diffrentiable (if no detach) + mul_weight = 'soft' + elif self.diff_routing: # hard differentiable routing + mul_weight = 'hard' + else: # hard non-differentiable routing + mul_weight = 'none' + self.kv_gather = KVGather(mul_weight=mul_weight) + + # qkv mapping (shared by both global routing and local attention) + self.param_attention = param_attention + if self.param_attention == 'qkvo': + self.qkv = QKVLinear(self.dim, self.qk_dim) + self.wo = nn.Linear(dim, dim) + elif self.param_attention == 'qkv': + self.qkv = QKVLinear(self.dim, self.qk_dim) + self.wo = nn.Identity() + else: + raise ValueError(f'param_attention mode {self.param_attention} is not surpported!') + + self.kv_downsample_mode = kv_downsample_mode + self.kv_per_win = kv_per_win + self.kv_downsample_ratio = kv_downsample_ratio + self.kv_downsample_kenel = kv_downsample_kernel + if self.kv_downsample_mode == 'ada_avgpool': + assert self.kv_per_win is not None + self.kv_down = nn.AdaptiveAvgPool2d(self.kv_per_win) + elif self.kv_downsample_mode == 'ada_maxpool': + assert self.kv_per_win is not None + self.kv_down = nn.AdaptiveMaxPool2d(self.kv_per_win) + elif self.kv_downsample_mode == 'maxpool': + assert self.kv_downsample_ratio is not None + self.kv_down = nn.MaxPool2d(self.kv_downsample_ratio) if self.kv_downsample_ratio > 1 else nn.Identity() + elif self.kv_downsample_mode == 'avgpool': + assert self.kv_downsample_ratio is not None + self.kv_down = nn.AvgPool2d(self.kv_downsample_ratio) if self.kv_downsample_ratio > 1 else nn.Identity() + elif self.kv_downsample_mode == 'identity': # no kv downsampling + self.kv_down = nn.Identity() + elif self.kv_downsample_mode == 'fracpool': + # assert self.kv_downsample_ratio is not None + # assert self.kv_downsample_kenel is not None + # TODO: fracpool + # 1. kernel size should be input size dependent + # 2. 
there is a random factor, need to avoid independent sampling for k and v + raise NotImplementedError('fracpool policy is not implemented yet!') + elif kv_downsample_mode == 'conv': + # TODO: need to consider the case where k != v so that need two downsample modules + raise NotImplementedError('conv policy is not implemented yet!') + else: + raise ValueError(f'kv_down_sample_mode {self.kv_downsaple_mode} is not surpported!') + + # softmax for local attention + self.attn_act = nn.Softmax(dim=-1) + + self.auto_pad=auto_pad + + def forward(self, x, ret_attn_mask=False): + """ + x: NHWC tensor + + Return: + NHWC tensor + """ + x = rearrange(x, "n c h w -> n h w c") + # NOTE: use padding for semantic segmentation + ################################################### + if self.auto_pad: + N, H_in, W_in, C = x.size() + + pad_l = pad_t = 0 + pad_r = (self.n_win - W_in % self.n_win) % self.n_win + pad_b = (self.n_win - H_in % self.n_win) % self.n_win + x = F.pad(x, (0, 0, # dim=-1 + pad_l, pad_r, # dim=-2 + pad_t, pad_b)) # dim=-3 + _, H, W, _ = x.size() # padded size + else: + N, H, W, C = x.size() + assert H%self.n_win == 0 and W%self.n_win == 0 # + ################################################### + + + # patchify, (n, p^2, w, w, c), keep 2d window as we need 2d pooling to reduce kv size + x = rearrange(x, "n (j h) (i w) c -> n (j i) h w c", j=self.n_win, i=self.n_win) + + #################qkv projection################### + # q: (n, p^2, w, w, c_qk) + # kv: (n, p^2, w, w, c_qk+c_v) + # NOTE: separte kv if there were memory leak issue caused by gather + q, kv = self.qkv(x) + + # pixel-wise qkv + # q_pix: (n, p^2, w^2, c_qk) + # kv_pix: (n, p^2, h_kv*w_kv, c_qk+c_v) + q_pix = rearrange(q, 'n p2 h w c -> n p2 (h w) c') + kv_pix = self.kv_down(rearrange(kv, 'n p2 h w c -> (n p2) c h w')) + kv_pix = rearrange(kv_pix, '(n j i) c h w -> n (j i) (h w) c', j=self.n_win, i=self.n_win) + + q_win, k_win = q.mean([2, 3]), kv[..., 0:self.qk_dim].mean([2, 3]) # window-wise qk, (n, p^2, c_qk), (n, p^2, c_qk) + + ##################side_dwconv(lepe)################## + # NOTE: call contiguous to avoid gradient warning when using ddp + lepe = self.lepe(rearrange(kv[..., self.qk_dim:], 'n (j i) h w c -> n c (j h) (i w)', j=self.n_win, i=self.n_win).contiguous()) + lepe = rearrange(lepe, 'n c (j h) (i w) -> n (j h) (i w) c', j=self.n_win, i=self.n_win) + + ############ gather q dependent k/v ################# + + r_weight, r_idx = self.router(q_win, k_win) # both are (n, p^2, topk) tensors + + kv_pix_sel = self.kv_gather(r_idx=r_idx, r_weight=r_weight, kv=kv_pix) #(n, p^2, topk, h_kv*w_kv, c_qk+c_v) + k_pix_sel, v_pix_sel = kv_pix_sel.split([self.qk_dim, self.dim], dim=-1) + # kv_pix_sel: (n, p^2, topk, h_kv*w_kv, c_qk) + # v_pix_sel: (n, p^2, topk, h_kv*w_kv, c_v) + + ######### do attention as normal #################### + k_pix_sel = rearrange(k_pix_sel, 'n p2 k w2 (m c) -> (n p2) m c (k w2)', m=self.num_heads) # flatten to BMLC, (n*p^2, m, topk*h_kv*w_kv, c_kq//m) transpose here? 
+ v_pix_sel = rearrange(v_pix_sel, 'n p2 k w2 (m c) -> (n p2) m (k w2) c', m=self.num_heads) # flatten to BMLC, (n*p^2, m, topk*h_kv*w_kv, c_v//m) + q_pix = rearrange(q_pix, 'n p2 w2 (m c) -> (n p2) m w2 c', m=self.num_heads) # to BMLC tensor (n*p^2, m, w^2, c_qk//m) + + # param-free multihead attention + attn_weight = (q_pix * self.scale) @ k_pix_sel # (n*p^2, m, w^2, c) @ (n*p^2, m, c, topk*h_kv*w_kv) -> (n*p^2, m, w^2, topk*h_kv*w_kv) + attn_weight = self.attn_act(attn_weight) + out = attn_weight @ v_pix_sel # (n*p^2, m, w^2, topk*h_kv*w_kv) @ (n*p^2, m, topk*h_kv*w_kv, c) -> (n*p^2, m, w^2, c) + out = rearrange(out, '(n j i) m (h w) c -> n (j h) (i w) (m c)', j=self.n_win, i=self.n_win, + h=H//self.n_win, w=W//self.n_win) + + out = out + lepe + # output linear + out = self.wo(out) + + # NOTE: use padding for semantic segmentation + # crop padded region + if self.auto_pad and (pad_r > 0 or pad_b > 0): + out = out[:, :H_in, :W_in, :].contiguous() + + if ret_attn_mask: + return out, r_weight, r_idx, attn_weight + else: + return rearrange(out, "n h w c -> n c h w") + +def _grid2seq(x:Tensor, region_size:Tuple[int], num_heads:int): + """ + Args: + x: BCHW tensor + region size: int + num_heads: number of attention heads + Return: + out: rearranged x, has a shape of (bs, nhead, nregion, reg_size, head_dim) + region_h, region_w: number of regions per col/row + """ + B, C, H, W = x.size() + region_h, region_w = H//region_size[0], W//region_size[1] + x = x.view(B, num_heads, C//num_heads, region_h, region_size[0], region_w, region_size[1]) + x = torch.einsum('bmdhpwq->bmhwpqd', x).flatten(2, 3).flatten(-3, -2) # (bs, nhead, nregion, reg_size, head_dim) + return x, region_h, region_w + + +def _seq2grid(x:Tensor, region_h:int, region_w:int, region_size:Tuple[int]): + """ + Args: + x: (bs, nhead, nregion, reg_size^2, head_dim) + Return: + x: (bs, C, H, W) + """ + bs, nhead, nregion, reg_size_square, head_dim = x.size() + x = x.view(bs, nhead, region_h, region_w, region_size[0], region_size[1], head_dim) + x = torch.einsum('bmhwpqd->bmdhpwq', x).reshape(bs, nhead*head_dim, + region_h*region_size[0], region_w*region_size[1]) + return x + + +def regional_routing_attention_torch( + query:Tensor, key:Tensor, value:Tensor, scale:float, + region_graph:LongTensor, region_size:Tuple[int], + kv_region_size:Optional[Tuple[int]]=None, + auto_pad=True)->Tensor: + """ + Args: + query, key, value: (B, C, H, W) tensor + scale: the scale/temperature for dot product attention + region_graph: (B, nhead, h_q*w_q, topk) tensor, topk <= h_k*w_k + region_size: region/window size for queries, (rh, rw) + key_region_size: optional, if None, key_region_size=region_size + auto_pad: required to be true if the input sizes are not divisible by the region_size + Return: + output: (B, C, H, W) tensor + attn: (bs, nhead, q_nregion, reg_size, topk*kv_region_size) attention matrix + """ + kv_region_size = kv_region_size or region_size + bs, nhead, q_nregion, topk = region_graph.size() + + # Auto pad to deal with any input size + q_pad_b, q_pad_r, kv_pad_b, kv_pad_r = 0, 0, 0, 0 + if auto_pad: + _, _, Hq, Wq = query.size() + q_pad_b = (region_size[0] - Hq % region_size[0]) % region_size[0] + q_pad_r = (region_size[1] - Wq % region_size[1]) % region_size[1] + if (q_pad_b > 0 or q_pad_r > 0): + query = F.pad(query, (0, q_pad_r, 0, q_pad_b)) # zero padding + + _, _, Hk, Wk = key.size() + kv_pad_b = (kv_region_size[0] - Hk % kv_region_size[0]) % kv_region_size[0] + kv_pad_r = (kv_region_size[1] - Wk % kv_region_size[1]) % 
kv_region_size[1] + if (kv_pad_r > 0 or kv_pad_b > 0): + key = F.pad(key, (0, kv_pad_r, 0, kv_pad_b)) # zero padding + value = F.pad(value, (0, kv_pad_r, 0, kv_pad_b)) # zero padding + + # to sequence format, i.e. (bs, nhead, nregion, reg_size, head_dim) + query, q_region_h, q_region_w = _grid2seq(query, region_size=region_size, num_heads=nhead) + key, _, _ = _grid2seq(key, region_size=kv_region_size, num_heads=nhead) + value, _, _ = _grid2seq(value, region_size=kv_region_size, num_heads=nhead) + + # gather key and values. + # TODO: is seperate gathering slower than fused one (our old version) ? + # torch.gather does not support broadcasting, hence we do it manually + bs, nhead, kv_nregion, kv_region_size, head_dim = key.size() + broadcasted_region_graph = region_graph.view(bs, nhead, q_nregion, topk, 1, 1).\ + expand(-1, -1, -1, -1, kv_region_size, head_dim) + key_g = torch.gather(key.view(bs, nhead, 1, kv_nregion, kv_region_size, head_dim).\ + expand(-1, -1, query.size(2), -1, -1, -1), dim=3, + index=broadcasted_region_graph) # (bs, nhead, q_nregion, topk, kv_region_size, head_dim) + value_g = torch.gather(value.view(bs, nhead, 1, kv_nregion, kv_region_size, head_dim).\ + expand(-1, -1, query.size(2), -1, -1, -1), dim=3, + index=broadcasted_region_graph) # (bs, nhead, q_nregion, topk, kv_region_size, head_dim) + + # token-to-token attention + # (bs, nhead, q_nregion, reg_size, head_dim) @ (bs, nhead, q_nregion, head_dim, topk*kv_region_size) + # -> (bs, nhead, q_nregion, reg_size, topk*kv_region_size) + # TODO: mask padding region + attn = (query * scale) @ key_g.flatten(-3, -2).transpose(-1, -2) + attn = torch.softmax(attn, dim=-1) + # (bs, nhead, q_nregion, reg_size, topk*kv_region_size) @ (bs, nhead, q_nregion, topk*kv_region_size, head_dim) + # -> (bs, nhead, q_nregion, reg_size, head_dim) + output = attn @ value_g.flatten(-3, -2) + + # to BCHW format + output = _seq2grid(output, region_h=q_region_h, region_w=q_region_w, region_size=region_size) + + # remove paddings if needed + if auto_pad and (q_pad_b > 0 or q_pad_r > 0): + output = output[:, :, :Hq, :Wq] + + return output, attn + +class BiLevelRoutingAttention_nchw(nn.Module): + """Bi-Level Routing Attention that takes nchw input + + Compared to legacy version, this implementation: + * removes unused args and components + * uses nchw input format to avoid frequent permutation + + When the size of inputs is not divisible by the region size, there is also a numerical difference + than legacy implementation, due to: + * different way to pad the input feature map (padding after linear projection) + * different pooling behavior (count_include_pad=False) + + Current implementation is more reasonable, hence we do not keep backward numerical compatiability + """ + def __init__(self, dim, num_heads=8, n_win=7, qk_scale=None, topk=4, side_dwconv=3, auto_pad=False, attn_backend='torch'): + super().__init__() + # local attention setting + self.dim = dim + self.num_heads = num_heads + assert self.dim % num_heads == 0, 'dim must be divisible by num_heads!' + self.head_dim = self.dim // self.num_heads + self.scale = qk_scale or self.dim ** -0.5 # NOTE: to be consistent with old models. + + ################side_dwconv (i.e. 
LCE in Shunted Transformer)########### + self.lepe = nn.Conv2d(dim, dim, kernel_size=side_dwconv, stride=1, padding=side_dwconv//2, groups=dim) if side_dwconv > 0 else \ + lambda x: torch.zeros_like(x) + + ################ regional routing setting ################# + self.topk = topk + self.n_win = n_win # number of windows per row/col + + ########################################## + + self.qkv_linear = nn.Conv2d(self.dim, 3*self.dim, kernel_size=1) + self.output_linear = nn.Conv2d(self.dim, self.dim, kernel_size=1) + + if attn_backend == 'torch': + self.attn_fn = regional_routing_attention_torch + else: + raise ValueError('CUDA implementation is not available yet. Please stay tuned.') + + def forward(self, x:Tensor, ret_attn_mask=False): + """ + Args: + x: NCHW tensor, better to be channel_last (https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html) + Return: + NCHW tensor + """ + N, C, H, W = x.size() + region_size = (H//self.n_win, W//self.n_win) + + # STEP 1: linear projection + qkv = self.qkv_linear.forward(x) # ncHW + q, k, v = qkv.chunk(3, dim=1) # ncHW + + # STEP 2: region-to-region routing + # NOTE: ceil_mode=True, count_include_pad=False = auto padding + # NOTE: gradients backward through token-to-token attention. See Appendix A for the intuition. + q_r = F.avg_pool2d(q.detach(), kernel_size=region_size, ceil_mode=True, count_include_pad=False) + k_r = F.avg_pool2d(k.detach(), kernel_size=region_size, ceil_mode=True, count_include_pad=False) # nchw + q_r:Tensor = q_r.permute(0, 2, 3, 1).flatten(1, 2) # n(hw)c + k_r:Tensor = k_r.flatten(2, 3) # nc(hw) + a_r = q_r @ k_r # n(hw)(hw), adj matrix of regional graph + _, idx_r = torch.topk(a_r, k=self.topk, dim=-1) # n(hw)k long tensor + idx_r:LongTensor = idx_r.unsqueeze_(1).expand(-1, self.num_heads, -1, -1) + + # STEP 3: token to token attention (non-parametric function) + output, attn_mat = self.attn_fn(query=q, key=k, value=v, scale=self.scale, + region_graph=idx_r, region_size=region_size + ) + + output = output + self.lepe(v) # ncHW + output = self.output_linear(output) # ncHW + + if ret_attn_mask: + return output, attn_mat + + return output + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel, height, width = 1, 16, 32, 32 + inputs = torch.randn((batch_size, channel, height, width)).to(device) + + module = BiLevelRoutingAttention_nchw(channel, num_heads=8).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/upsample/CARAFE.py b/engine/extre_module/custom_nn/upsample/CARAFE.py new file mode 100644 index 00000000..ad0ca71e --- /dev/null +++ b/engine/extre_module/custom_nn/upsample/CARAFE.py @@ -0,0 +1,76 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/ICCV2019-CARAFE.png +论文链接:https://arxiv.org/abs/1905.02188 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn + +from 
engine.extre_module.ultralytics_nn.conv import Conv + +class CARAFE(nn.Module): + def __init__(self, c, k_enc=3, k_up=5, c_mid=64, scale=2): + """ The unofficial implementation of the CARAFE module. + The details are in "https://arxiv.org/abs/1905.02188". + Args: + c: The channel number of the input and the output. + c_mid: The channel number after compression. + scale: The expected upsample scale. + k_up: The size of the reassembly kernel. + k_enc: The kernel size of the encoder. + Returns: + X: The upsampled feature map. + """ + super(CARAFE, self).__init__() + self.scale = scale + + self.comp = Conv(c, c_mid) + self.enc = Conv(c_mid, (scale*k_up)**2, k=k_enc, act=False) + self.pix_shf = nn.PixelShuffle(scale) + + self.upsmp = nn.Upsample(scale_factor=scale, mode='nearest') + self.unfold = nn.Unfold(kernel_size=k_up, dilation=scale, + padding=k_up//2*scale) + + def forward(self, X): + b, c, h, w = X.size() + h_, w_ = h * self.scale, w * self.scale + + W = self.comp(X) # b * m * h * w + W = self.enc(W) # b * 100 * h * w + W = self.pix_shf(W) # b * 25 * h_ * w_ + W = torch.softmax(W, dim=1) # b * 25 * h_ * w_ + + X = self.upsmp(X) # b * c * h_ * w_ + X = self.unfold(X) # b * 25c * h_ * w_ + X = X.view(b, c, -1, h_, w_) # b * 25 * c * h_ * w_ + + X = torch.einsum('bkhw,bckhw->bchw', [W, X]) # b * c * h_ * w_ + return X + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel, height, width = 1, 16, 32, 32 + inputs = torch.randn((batch_size, channel, height, width)).to(device) + + module = CARAFE(channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/upsample/DySample.py b/engine/extre_module/custom_nn/upsample/DySample.py new file mode 100644 index 00000000..f64a4c2e --- /dev/null +++ b/engine/extre_module/custom_nn/upsample/DySample.py @@ -0,0 +1,112 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/ICCV2023-DySample.png +论文链接:https://arxiv.org/abs/2308.15085 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from engine.extre_module.ultralytics_nn.conv import Conv + +class DySample(nn.Module): + def __init__(self, in_channels, scale=2, style='lp', groups=4, dyscope=True): + super().__init__() + self.scale = scale + self.style = style + self.groups = groups + assert style in ['lp', 'pl'] + if style == 'pl': + assert in_channels >= scale ** 2 and in_channels % scale ** 2 == 0 + assert in_channels >= groups and in_channels % groups == 0 + + if style == 'pl': + in_channels = in_channels // scale ** 2 + out_channels = 2 * groups + else: + out_channels = 2 * groups * scale ** 2 + + self.offset = nn.Conv2d(in_channels, out_channels, 1) + self.normal_init(self.offset, std=0.001) + if dyscope: + self.scope = nn.Conv2d(in_channels, out_channels, 1) + self.constant_init(self.scope, val=0.) 
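+        # (Added note) init_pos caches the fixed sub-pixel grid offsets of the scale x scale
+        # upsampled output; the offsets predicted by self.offset (optionally gated by
+        # self.scope) are added on top of this static grid before grid_sample resamples x.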
+ + self.register_buffer('init_pos', self._init_pos()) + + def normal_init(self, module, mean=0, std=1, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.normal_(module.weight, mean, std) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + def constant_init(self, module, val, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.constant_(module.weight, val) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + def _init_pos(self): + h = torch.arange((-self.scale + 1) / 2, (self.scale - 1) / 2 + 1) / self.scale + return torch.stack(torch.meshgrid([h, h])).transpose(1, 2).repeat(1, self.groups, 1).reshape(1, -1, 1, 1) + + def sample(self, x, offset): + B, _, H, W = offset.shape + offset = offset.view(B, 2, -1, H, W) + coords_h = torch.arange(H) + 0.5 + coords_w = torch.arange(W) + 0.5 + coords = torch.stack(torch.meshgrid([coords_w, coords_h]) + ).transpose(1, 2).unsqueeze(1).unsqueeze(0).type(x.dtype).to(x.device) + normalizer = torch.tensor([W, H], dtype=x.dtype, device=x.device).view(1, 2, 1, 1, 1) + coords = 2 * (coords + offset) / normalizer - 1 + coords = F.pixel_shuffle(coords.view(B, -1, H, W), self.scale).view( + B, 2, -1, self.scale * H, self.scale * W).permute(0, 2, 3, 4, 1).contiguous().flatten(0, 1) + return F.grid_sample(x.reshape(B * self.groups, -1, H, W), coords, mode='bilinear', + align_corners=False, padding_mode="border").reshape((B, -1, self.scale * H, self.scale * W)) + + def forward_lp(self, x): + if hasattr(self, 'scope'): + offset = self.offset(x) * self.scope(x).sigmoid() * 0.5 + self.init_pos + else: + offset = self.offset(x) * 0.25 + self.init_pos + return self.sample(x, offset) + + def forward_pl(self, x): + x_ = F.pixel_shuffle(x, self.scale) + if hasattr(self, 'scope'): + offset = F.pixel_unshuffle(self.offset(x_) * self.scope(x_).sigmoid(), self.scale) * 0.5 + self.init_pos + else: + offset = F.pixel_unshuffle(self.offset(x_), self.scale) * 0.25 + self.init_pos + return self.sample(x, offset) + + def forward(self, x): + if self.style == 'pl': + return self.forward_pl(x) + return self.forward_lp(x) + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel, height, width = 1, 16, 32, 32 + inputs = torch.randn((batch_size, channel, height, width)).to(device) + + module = DySample(channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/upsample/WaveletUnPool.py b/engine/extre_module/custom_nn/upsample/WaveletUnPool.py new file mode 100644 index 00000000..2f1559c7 --- /dev/null +++ b/engine/extre_module/custom_nn/upsample/WaveletUnPool.py @@ -0,0 +1,78 @@ +''' +本文件由BiliBili:魔傀面具整理 +论文链接:https://openreview.net/pdf?id=rkhlb8lCZ +''' + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +class WaveletUnPool(nn.Module): + def __init__(self): + """ + 小波反池化 (Wavelet 
Unpooling) 层,使用 Haar 小波基进行上采样。 + 该层的作用是将输入特征图进行 2x2 上采样,通过小波反变换重构特征。 + """ + super(WaveletUnPool, self).__init__() + + # 定义 Haar 小波的反变换滤波器(低频 LL、高频 LH、HL、HH 分量) + ll = np.array([[0.5, 0.5], [0.5, 0.5]]) # 低频成分 + lh = np.array([[-0.5, -0.5], [0.5, 0.5]]) # 垂直高频分量 + hl = np.array([[-0.5, 0.5], [-0.5, 0.5]]) # 水平高频分量 + hh = np.array([[0.5, -0.5], [-0.5, 0.5]]) # 对角高频分量 + + # 组合所有滤波器,并沿第 0 维堆叠 (输出通道数维度) + filts = np.stack([ + ll[None, ::-1, ::-1], # 低频分量 (LL) + lh[None, ::-1, ::-1], # 垂直高频分量 (LH) + hl[None, ::-1, ::-1], # 水平高频分量 (HL) + hh[None, ::-1, ::-1] # 对角高频分量 (HH) + ], axis=0) + + # 将滤波器转换为 PyTorch 张量,并设为不可训练参数 + self.weight = nn.Parameter( + torch.tensor(filts).to(torch.get_default_dtype()), # 转换为默认数据类型 + requires_grad=False # 该参数在训练过程中不进行更新 + ) + + def forward(self, x): + """ + 前向传播函数,执行小波反变换操作。 + :param x: 输入特征图,形状为 (B, C, H, W),其中 C 是通道数。 + :return: 上采样后的特征图,形状为 (B, C/4, 2H, 2W)。 + """ + + # 计算通道数 C,需要保证输入通道数是 4 的倍数,因为每 4 个通道组成一个小波分量 + C = torch.floor_divide(x.shape[1], 4) # 计算每个组的通道数 + + # 复制滤波器,使其适用于所有通道,并扩展到完整的通道数 + filters = torch.cat([self.weight, ] * C, dim=0) + + # 进行反卷积 (转置卷积) 操作,相当于小波反变换 + y = F.conv_transpose2d(x, filters, groups=C, stride=2) + + return y + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel, height, width = 1, 16, 32, 32 + inputs = torch.randn((batch_size, channel, height, width)).to(device) + + module = WaveletUnPool().to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/custom_nn/upsample/eucb.py b/engine/extre_module/custom_nn/upsample/eucb.py new file mode 100644 index 00000000..f65901fa --- /dev/null +++ b/engine/extre_module/custom_nn/upsample/eucb.py @@ -0,0 +1,64 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/CVPR2024-EUCB.png +论文链接:https://arxiv.org/abs/2405.06880 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn + +from engine.extre_module.ultralytics_nn.conv import Conv + +class EUCB(nn.Module): + def __init__(self, in_channels, kernel_size=3): + super(EUCB,self).__init__() + + self.in_channels = in_channels + self.out_channels = in_channels + self.up_dwc = nn.Sequential( + nn.Upsample(scale_factor=2), + Conv(self.in_channels, self.in_channels, kernel_size, g=self.in_channels) + ) + self.pwc = nn.Sequential( + nn.Conv2d(self.in_channels, self.out_channels, kernel_size=1, stride=1, padding=0, bias=True) + ) + + def forward(self, x): + x = self.up_dwc(x) + x = self.channel_shuffle(x, self.in_channels) + x = self.pwc(x) + return x + + def channel_shuffle(self, x, groups): + batchsize, num_channels, height, width = x.data.size() + channels_per_group = num_channels // groups + x = x.view(batchsize, groups, channels_per_group, height, width) + x = torch.transpose(x, 1, 2).contiguous() + x = x.view(batchsize, -1, height, width) + return x + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", 
"\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + batch_size, channel, height, width = 1, 16, 32, 32 + inputs = torch.randn((batch_size, channel, height, width)).to(device) + + module = EUCB(channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) \ No newline at end of file diff --git a/engine/extre_module/custom_nn/upsample/eucb_sc.py b/engine/extre_module/custom_nn/upsample/eucb_sc.py new file mode 100644 index 00000000..bec337a3 --- /dev/null +++ b/engine/extre_module/custom_nn/upsample/eucb_sc.py @@ -0,0 +1,97 @@ +''' +本文件由BiliBili:魔傀面具整理 +engine/extre_module/module_images/CVPR2024-EUCB.png +论文链接:https://arxiv.org/abs/2405.06880 +论文链接:https://arxiv.org/abs/2503.02394 +''' + +import os, sys +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../../../..') + +import warnings +warnings.filterwarnings('ignore') +from calflops import calculate_flops + +import torch +import torch.nn as nn + +from engine.extre_module.ultralytics_nn.conv import Conv + +# Shift_channel_mix 模块: +# 本研究提出了一种轻量级特征混合模块 Shift_channel_mix,旨在通过通道分割与空间偏移操作增强特征表达能力。 +# 具体而言,该模块首先沿着通道维度(dim=1)对输入特征图进行四等分分块(即 x_1, x_2, x_3, x_4),随后分别在水平方向(宽度维度)和垂直方向(高度维度)上施加正负方向的循环移位(circular shift)。 +# 其中,x_1 和 x_2 分别在高度方向进行正向和负向偏移,而 x_3 和 x_4 则在宽度方向进行正向和负向偏移。 +# 最终,偏移后的特征块通过通道拼接(channel concatenation)重新组合,以实现跨通道的信息交互与局部特征增强。 + +# 该设计的核心思想是利用通道内信息重分布的方式,引导不同通道特征感受不同的空间位置信息,从而提升网络的特征表达能力。 +# 此外,由于该操作仅涉及基本的通道切分与循环移位,计算复杂度极低,不引入额外的参数或显著的计算开销。 +# 因此,Shift_channel_mix 适用于对计算资源受限的任务,如嵌入式视觉系统或实时目标检测等场景。 +class Shift_channel_mix(nn.Module): + def __init__(self,shift_size): + super(Shift_channel_mix, self).__init__() + self.shift_size = shift_size + + def forward(self, x): + + x1, x2, x3, x4 = x.chunk(4, dim = 1) + + x1 = torch.roll(x1, self.shift_size, dims=2)#[:,:,1:,:] + + x2 = torch.roll(x2, -self.shift_size, dims=2)#[:,:,:-1,:] + + x3 = torch.roll(x3, self.shift_size, dims=3)#[:,:,:,1:] + + x4 = torch.roll(x4, -self.shift_size, dims=3)#[:,:,:,:-1] + + x = torch.cat([x1, x2, x3, x4], 1) + + return x + +class EUCB_SC(nn.Module): + def __init__(self, in_channels, kernel_size=3, stride=1): + super(EUCB_SC,self).__init__() + + self.in_channels = in_channels + self.out_channels = in_channels + self.up_dwc = nn.Sequential( + nn.Upsample(scale_factor=2), + Conv(self.in_channels, self.in_channels, kernel_size, g=self.in_channels, s=stride, act=nn.ReLU()) + ) + self.pwc = nn.Sequential( + nn.Conv2d(self.in_channels, self.out_channels, kernel_size=1, stride=1, padding=0, bias=True) + ) + self.shift_channel_mix = Shift_channel_mix(1) + + def forward(self, x): + x = self.up_dwc(x) + x = self.channel_shuffle(x, self.in_channels) + x = self.pwc(x) + return x + + def channel_shuffle(self, x, groups): + batchsize, num_channels, height, width = x.data.size() + channels_per_group = num_channels // groups + x = x.view(batchsize, groups, channels_per_group, height, width) + x = torch.transpose(x, 1, 2).contiguous() + x = x.view(batchsize, -1, height, width) + x = self.shift_channel_mix(x) + return x + +if __name__ == '__main__': + RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + device = torch.device('cuda') if 
torch.cuda.is_available() else torch.device('cpu') + batch_size, channel, height, width = 1, 16, 32, 32 + inputs = torch.randn((batch_size, channel, height, width)).to(device) + + module = EUCB_SC(channel).to(device) + + outputs = module(inputs) + print(GREEN + f'inputs.size:{inputs.size()} outputs.size:{outputs.size()}' + RESET) + + print(ORANGE) + flops, macs, _ = calculate_flops(model=module, + input_shape=(batch_size, channel, height, width), + output_as_string=True, + output_precision=4, + print_detailed=True) + print(RESET) diff --git a/engine/extre_module/module_images/AAAI2024-DynamicFilter.png b/engine/extre_module/module_images/AAAI2024-DynamicFilter.png new file mode 100644 index 00000000..d0da133b Binary files /dev/null and b/engine/extre_module/module_images/AAAI2024-DynamicFilter.png differ diff --git a/engine/extre_module/module_images/AAAI2025-PSConv.png b/engine/extre_module/module_images/AAAI2025-PSConv.png new file mode 100644 index 00000000..998aa9d5 Binary files /dev/null and b/engine/extre_module/module_images/AAAI2025-PSConv.png differ diff --git a/engine/extre_module/module_images/BMVC2024-MASAG.png b/engine/extre_module/module_images/BMVC2024-MASAG.png new file mode 100644 index 00000000..a08ed0fb Binary files /dev/null and b/engine/extre_module/module_images/BMVC2024-MASAG.png differ diff --git a/engine/extre_module/module_images/CVPR2021-Coordinate Attention.png b/engine/extre_module/module_images/CVPR2021-Coordinate Attention.png new file mode 100644 index 00000000..6986a62d Binary files /dev/null and b/engine/extre_module/module_images/CVPR2021-Coordinate Attention.png differ diff --git a/engine/extre_module/module_images/CVPR2021-Diverse Branch Block.png b/engine/extre_module/module_images/CVPR2021-Diverse Branch Block.png new file mode 100644 index 00000000..fc7c5565 Binary files /dev/null and b/engine/extre_module/module_images/CVPR2021-Diverse Branch Block.png differ diff --git a/engine/extre_module/module_images/CVPR2022-DAttention.png b/engine/extre_module/module_images/CVPR2022-DAttention.png new file mode 100644 index 00000000..909a9878 Binary files /dev/null and b/engine/extre_module/module_images/CVPR2022-DAttention.png differ diff --git a/engine/extre_module/module_images/CVPR2023-BiLevelRoutingAttention.png b/engine/extre_module/module_images/CVPR2023-BiLevelRoutingAttention.png new file mode 100644 index 00000000..cf23735d Binary files /dev/null and b/engine/extre_module/module_images/CVPR2023-BiLevelRoutingAttention.png differ diff --git a/engine/extre_module/module_images/CVPR2023-Cascaded Group Attention.png b/engine/extre_module/module_images/CVPR2023-Cascaded Group Attention.png new file mode 100644 index 00000000..33e081a5 Binary files /dev/null and b/engine/extre_module/module_images/CVPR2023-Cascaded Group Attention.png differ diff --git a/engine/extre_module/module_images/CVPR2023-FasterBlock.png b/engine/extre_module/module_images/CVPR2023-FasterBlock.png new file mode 100644 index 00000000..9793b82d Binary files /dev/null and b/engine/extre_module/module_images/CVPR2023-FasterBlock.png differ diff --git a/engine/extre_module/module_images/CVPR2023-SCConv.png b/engine/extre_module/module_images/CVPR2023-SCConv.png new file mode 100644 index 00000000..7eb70017 Binary files /dev/null and b/engine/extre_module/module_images/CVPR2023-SCConv.png differ diff --git a/engine/extre_module/module_images/CVPR2023-partial convolution.png b/engine/extre_module/module_images/CVPR2023-partial convolution.png new file mode 100644 index 00000000..6a4d13f1 
Binary files /dev/null and b/engine/extre_module/module_images/CVPR2023-partial convolution.png differ diff --git a/engine/extre_module/module_images/CVPR2024-CGLU.png b/engine/extre_module/module_images/CVPR2024-CGLU.png new file mode 100644 index 00000000..c2eedf6d Binary files /dev/null and b/engine/extre_module/module_images/CVPR2024-CGLU.png differ diff --git a/engine/extre_module/module_images/CVPR2024-DilatedReparamConv.png b/engine/extre_module/module_images/CVPR2024-DilatedReparamConv.png new file mode 100644 index 00000000..62fdc24e Binary files /dev/null and b/engine/extre_module/module_images/CVPR2024-DilatedReparamConv.png differ diff --git a/engine/extre_module/module_images/CVPR2024-EUCB.png b/engine/extre_module/module_images/CVPR2024-EUCB.png new file mode 100644 index 00000000..9f3f5d13 Binary files /dev/null and b/engine/extre_module/module_images/CVPR2024-EUCB.png differ diff --git a/engine/extre_module/module_images/CVPR2024-LEGM.png b/engine/extre_module/module_images/CVPR2024-LEGM.png new file mode 100644 index 00000000..6f53d2a6 Binary files /dev/null and b/engine/extre_module/module_images/CVPR2024-LEGM.png differ diff --git a/engine/extre_module/module_images/CVPR2024-MFM.png b/engine/extre_module/module_images/CVPR2024-MFM.png new file mode 100644 index 00000000..76879dcd Binary files /dev/null and b/engine/extre_module/module_images/CVPR2024-MFM.png differ diff --git a/engine/extre_module/module_images/CVPR2024-MSCB.png b/engine/extre_module/module_images/CVPR2024-MSCB.png new file mode 100644 index 00000000..0bcb59f0 Binary files /dev/null and b/engine/extre_module/module_images/CVPR2024-MSCB.png differ diff --git a/engine/extre_module/module_images/CVPR2024-StarBlock.png b/engine/extre_module/module_images/CVPR2024-StarBlock.png new file mode 100644 index 00000000..559b8c74 Binary files /dev/null and b/engine/extre_module/module_images/CVPR2024-StarBlock.png differ diff --git a/engine/extre_module/module_images/CVPR2024-UniRepLKNetBlock.png b/engine/extre_module/module_images/CVPR2024-UniRepLKNetBlock.png new file mode 100644 index 00000000..d8a7825e Binary files /dev/null and b/engine/extre_module/module_images/CVPR2024-UniRepLKNetBlock.png differ diff --git a/engine/extre_module/module_images/CVPR2025-Dynamic Tanh.png b/engine/extre_module/module_images/CVPR2025-Dynamic Tanh.png new file mode 100644 index 00000000..49460015 Binary files /dev/null and b/engine/extre_module/module_images/CVPR2025-Dynamic Tanh.png differ diff --git a/engine/extre_module/module_images/CVPR2025-EfficientVIM.png b/engine/extre_module/module_images/CVPR2025-EfficientVIM.png new file mode 100644 index 00000000..ea41feee Binary files /dev/null and b/engine/extre_module/module_images/CVPR2025-EfficientVIM.png differ diff --git a/engine/extre_module/module_images/CVPR2025-MambaOut-DRC.png b/engine/extre_module/module_images/CVPR2025-MambaOut-DRC.png new file mode 100644 index 00000000..7e2e2d89 Binary files /dev/null and b/engine/extre_module/module_images/CVPR2025-MambaOut-DRC.png differ diff --git a/engine/extre_module/module_images/CVPR2025-MambaOut-UniRepBlock.png b/engine/extre_module/module_images/CVPR2025-MambaOut-UniRepBlock.png new file mode 100644 index 00000000..e864f1ed Binary files /dev/null and b/engine/extre_module/module_images/CVPR2025-MambaOut-UniRepBlock.png differ diff --git a/engine/extre_module/module_images/CVPR2025-MambaOut.png b/engine/extre_module/module_images/CVPR2025-MambaOut.png new file mode 100644 index 00000000..40fd97e2 Binary files /dev/null and 
b/engine/extre_module/module_images/CVPR2025-MambaOut.png differ diff --git a/engine/extre_module/module_images/CVPR2025-ShiftwiseConv.png b/engine/extre_module/module_images/CVPR2025-ShiftwiseConv.png new file mode 100644 index 00000000..4c7b7d19 Binary files /dev/null and b/engine/extre_module/module_images/CVPR2025-ShiftwiseConv.png differ diff --git a/engine/extre_module/module_images/DRFD.png b/engine/extre_module/module_images/DRFD.png new file mode 100644 index 00000000..ec4ecf5b Binary files /dev/null and b/engine/extre_module/module_images/DRFD.png differ diff --git a/engine/extre_module/module_images/DWR.png b/engine/extre_module/module_images/DWR.png new file mode 100644 index 00000000..3b34dd49 Binary files /dev/null and b/engine/extre_module/module_images/DWR.png differ diff --git a/engine/extre_module/module_images/ECCV2024-WTConv2D.png b/engine/extre_module/module_images/ECCV2024-WTConv2D.png new file mode 100644 index 00000000..7ce69917 Binary files /dev/null and b/engine/extre_module/module_images/ECCV2024-WTConv2D.png differ diff --git a/engine/extre_module/module_images/HWD.png b/engine/extre_module/module_images/HWD.png new file mode 100644 index 00000000..7dda12ed Binary files /dev/null and b/engine/extre_module/module_images/HWD.png differ diff --git a/engine/extre_module/module_images/ICASSP2023-EMA.png b/engine/extre_module/module_images/ICASSP2023-EMA.png new file mode 100644 index 00000000..364dd873 Binary files /dev/null and b/engine/extre_module/module_images/ICASSP2023-EMA.png differ diff --git a/engine/extre_module/module_images/ICCV2019-CARAFE.png b/engine/extre_module/module_images/ICCV2019-CARAFE.png new file mode 100644 index 00000000..2aef7841 Binary files /dev/null and b/engine/extre_module/module_images/ICCV2019-CARAFE.png differ diff --git a/engine/extre_module/module_images/ICCV2023-DySample.png b/engine/extre_module/module_images/ICCV2023-DySample.png new file mode 100644 index 00000000..d1d22c84 Binary files /dev/null and b/engine/extre_module/module_images/ICCV2023-DySample.png differ diff --git a/engine/extre_module/module_images/ICCV2023-LSKBlock.png b/engine/extre_module/module_images/ICCV2023-LSKBlock.png new file mode 100644 index 00000000..03aee421 Binary files /dev/null and b/engine/extre_module/module_images/ICCV2023-LSKBlock.png differ diff --git a/engine/extre_module/module_images/ICCV2023-dynamic_snake_conv.png b/engine/extre_module/module_images/ICCV2023-dynamic_snake_conv.png new file mode 100644 index 00000000..28e15cbd Binary files /dev/null and b/engine/extre_module/module_images/ICCV2023-dynamic_snake_conv.png differ diff --git a/engine/extre_module/module_images/ICCV2023-iRMB.png b/engine/extre_module/module_images/ICCV2023-iRMB.png new file mode 100644 index 00000000..f1ff592c Binary files /dev/null and b/engine/extre_module/module_images/ICCV2023-iRMB.png differ diff --git a/engine/extre_module/module_images/ICLR2018-WaveletPool.png b/engine/extre_module/module_images/ICLR2018-WaveletPool.png new file mode 100644 index 00000000..806e998d Binary files /dev/null and b/engine/extre_module/module_images/ICLR2018-WaveletPool.png differ diff --git a/engine/extre_module/module_images/ICLR2022-DPBAttention.png b/engine/extre_module/module_images/ICLR2022-DPBAttention.png new file mode 100644 index 00000000..f5e3803a Binary files /dev/null and b/engine/extre_module/module_images/ICLR2022-DPBAttention.png differ diff --git a/engine/extre_module/module_images/ICLR2024-FATBlock.png b/engine/extre_module/module_images/ICLR2024-FATBlock.png 
new file mode 100644 index 00000000..ae41108e Binary files /dev/null and b/engine/extre_module/module_images/ICLR2024-FATBlock.png differ diff --git a/engine/extre_module/module_images/ICLR2024-FMFFN.png b/engine/extre_module/module_images/ICLR2024-FMFFN.png new file mode 100644 index 00000000..e04ee35b Binary files /dev/null and b/engine/extre_module/module_images/ICLR2024-FMFFN.png differ diff --git a/engine/extre_module/module_images/ICLR2025-PolaLinearAttention.png b/engine/extre_module/module_images/ICLR2025-PolaLinearAttention.png new file mode 100644 index 00000000..b9d3f139 Binary files /dev/null and b/engine/extre_module/module_images/ICLR2025-PolaLinearAttention.png differ diff --git a/engine/extre_module/module_images/ICML2021-SimAM.png b/engine/extre_module/module_images/ICML2021-SimAM.png new file mode 100644 index 00000000..ac1170e8 Binary files /dev/null and b/engine/extre_module/module_images/ICML2021-SimAM.png differ diff --git a/engine/extre_module/module_images/ICML2024-RepBN.png b/engine/extre_module/module_images/ICML2024-RepBN.png new file mode 100644 index 00000000..e7c85b9e Binary files /dev/null and b/engine/extre_module/module_images/ICML2024-RepBN.png differ diff --git a/engine/extre_module/module_images/IEEETGRS2024-ELGCA.png b/engine/extre_module/module_images/IEEETGRS2024-ELGCA.png new file mode 100644 index 00000000..abd5f58c Binary files /dev/null and b/engine/extre_module/module_images/IEEETGRS2024-ELGCA.png differ diff --git a/engine/extre_module/module_images/IEEETIP2020-ContextGuidedBlock_Down.png b/engine/extre_module/module_images/IEEETIP2020-ContextGuidedBlock_Down.png new file mode 100644 index 00000000..6ec46d4e Binary files /dev/null and b/engine/extre_module/module_images/IEEETIP2020-ContextGuidedBlock_Down.png differ diff --git a/engine/extre_module/module_images/IEEETIP2023-CSFCN.png b/engine/extre_module/module_images/IEEETIP2023-CSFCN.png new file mode 100644 index 00000000..e1bdc745 Binary files /dev/null and b/engine/extre_module/module_images/IEEETIP2023-CSFCN.png differ diff --git a/engine/extre_module/module_images/IEEETIP2024-DEConv.png b/engine/extre_module/module_images/IEEETIP2024-DEConv.png new file mode 100644 index 00000000..2d9511ff Binary files /dev/null and b/engine/extre_module/module_images/IEEETIP2024-DEConv.png differ diff --git a/engine/extre_module/module_images/IJCAI2024-DFFN.png b/engine/extre_module/module_images/IJCAI2024-DFFN.png new file mode 100644 index 00000000..2f42291e Binary files /dev/null and b/engine/extre_module/module_images/IJCAI2024-DFFN.png differ diff --git a/engine/extre_module/module_images/Mixed Local Channel Attention.png b/engine/extre_module/module_images/Mixed Local Channel Attention.png new file mode 100644 index 00000000..bcc4c89f Binary files /dev/null and b/engine/extre_module/module_images/Mixed Local Channel Attention.png differ diff --git a/engine/extre_module/module_images/RepHMS.png b/engine/extre_module/module_images/RepHMS.png new file mode 100644 index 00000000..cdaa7e8b Binary files /dev/null and b/engine/extre_module/module_images/RepHMS.png differ diff --git a/engine/extre_module/module_images/SEAM.png b/engine/extre_module/module_images/SEAM.png new file mode 100644 index 00000000..30b1ef82 Binary files /dev/null and b/engine/extre_module/module_images/SEAM.png differ diff --git a/engine/extre_module/module_images/SPDConv.png b/engine/extre_module/module_images/SPDConv.png new file mode 100644 index 00000000..a5f5d9f5 Binary files /dev/null and 
b/engine/extre_module/module_images/SPDConv.png differ diff --git a/engine/extre_module/module_images/SRFD.png b/engine/extre_module/module_images/SRFD.png new file mode 100644 index 00000000..6beeef90 Binary files /dev/null and b/engine/extre_module/module_images/SRFD.png differ diff --git a/engine/extre_module/module_images/StripBlock.png b/engine/extre_module/module_images/StripBlock.png new file mode 100644 index 00000000..e73559de Binary files /dev/null and b/engine/extre_module/module_images/StripBlock.png differ diff --git a/engine/extre_module/module_images/TPAMI2025-MSBlock.png b/engine/extre_module/module_images/TPAMI2025-MSBlock.png new file mode 100644 index 00000000..98bdda0b Binary files /dev/null and b/engine/extre_module/module_images/TPAMI2025-MSBlock.png differ diff --git a/engine/extre_module/module_images/WACV2024-DeformableLKA.png b/engine/extre_module/module_images/WACV2024-DeformableLKA.png new file mode 100644 index 00000000..49be3e1b Binary files /dev/null and b/engine/extre_module/module_images/WACV2024-DeformableLKA.png differ diff --git a/engine/extre_module/module_images/WideDBB.png b/engine/extre_module/module_images/WideDBB.png new file mode 100644 index 00000000..57f24dcb Binary files /dev/null and b/engine/extre_module/module_images/WideDBB.png differ diff --git a/engine/extre_module/module_images/gConv.png b/engine/extre_module/module_images/gConv.png new file mode 100644 index 00000000..a96aecaa Binary files /dev/null and b/engine/extre_module/module_images/gConv.png differ diff --git "a/engine/extre_module/module_images/\350\207\252\347\240\224\346\250\241\345\235\227-ContextGuideFusionModule.png" "b/engine/extre_module/module_images/\350\207\252\347\240\224\346\250\241\345\235\227-ContextGuideFusionModule.png" new file mode 100644 index 00000000..947a3d28 Binary files /dev/null and "b/engine/extre_module/module_images/\350\207\252\347\240\224\346\250\241\345\235\227-ContextGuideFusionModule.png" differ diff --git "a/engine/extre_module/module_images/\350\207\252\347\240\224\346\250\241\345\235\227-Light Adaptive-weight downsampling.png" "b/engine/extre_module/module_images/\350\207\252\347\240\224\346\250\241\345\235\227-Light Adaptive-weight downsampling.png" new file mode 100644 index 00000000..4c5c29b8 Binary files /dev/null and "b/engine/extre_module/module_images/\350\207\252\347\240\224\346\250\241\345\235\227-Light Adaptive-weight downsampling.png" differ diff --git "a/engine/extre_module/module_images/\350\207\252\347\240\224\346\250\241\345\235\227-RGCSPELAN.png" "b/engine/extre_module/module_images/\350\207\252\347\240\224\346\250\241\345\235\227-RGCSPELAN.png" new file mode 100644 index 00000000..6983e787 Binary files /dev/null and "b/engine/extre_module/module_images/\350\207\252\347\240\224\346\250\241\345\235\227-RGCSPELAN.png" differ diff --git a/engine/extre_module/ops.py b/engine/extre_module/ops.py new file mode 100644 index 00000000..d894ba1f --- /dev/null +++ b/engine/extre_module/ops.py @@ -0,0 +1,49 @@ +import contextlib, time +import torch + +class Profile(contextlib.ContextDecorator): + """ + YOLOv8 Profile class. Use as a decorator with @Profile() or as a context manager with 'with Profile():'. + + Example: + ```python + from ultralytics.utils.ops import Profile + + with Profile(device=device) as dt: + pass # slow operation here + + print(dt) # prints "Elapsed time is 9.5367431640625e-07 s" + ``` + """ + + def __init__(self, t=0.0, device: torch.device = None): + """ + Initialize the Profile class. 
+ + Args: + t (float): Initial time. Defaults to 0.0. + device (torch.device): Devices used for model inference. Defaults to None (cpu). + """ + self.t = t + self.device = device + self.cuda = bool(device and str(device).startswith("cuda")) + + def __enter__(self): + """Start timing.""" + self.start = self.time() + return self + + def __exit__(self, type, value, traceback): # noqa + """Stop timing.""" + self.dt = self.time() - self.start # delta-time + self.t += self.dt # accumulate dt + + def __str__(self): + """Returns a human-readable string representing the accumulated elapsed time in the profiler.""" + return f"Elapsed time is {self.t} s" + + def time(self): + """Get current time.""" + if self.cuda: + torch.cuda.synchronize(self.device) + return time.time() diff --git a/engine/extre_module/torch_utils.py b/engine/extre_module/torch_utils.py new file mode 100644 index 00000000..df669655 --- /dev/null +++ b/engine/extre_module/torch_utils.py @@ -0,0 +1,59 @@ +import torch, torchvision +import torch.nn as nn + +RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" + +def check_cuda(): + print(GREEN + f"PyTorch 版本: {torch.__version__}") + print(f"Torchvision 版本: {torchvision.__version__}") + cuda_available = torch.cuda.is_available() + print(f"CUDA 是否可用: {cuda_available}") + + if cuda_available: + device_count = torch.cuda.device_count() + print(f"GPU 数量: {device_count}") + + for i in range(device_count): + print(f"GPU {i}: {torch.cuda.get_device_name(i)}") + print(f" 显存: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.2f} GB") + print(f" 计算能力: {torch.cuda.get_device_capability(i)}") + + print(f"当前设备索引: {torch.cuda.current_device()}") + print(f"当前设备名称: {torch.cuda.get_device_name(torch.cuda.current_device())}" + RESET) + +def fuse_conv_and_bn(conv, bn): + """Fuse Conv2d() and BatchNorm2d() layers https://tehnokv.com/posts/fusing-batchnorm-and-conv/.""" + fusedconv = ( + nn.Conv2d( + conv.in_channels, + conv.out_channels, + kernel_size=conv.kernel_size, + stride=conv.stride, + padding=conv.padding, + dilation=conv.dilation, + groups=conv.groups, + bias=True, + ) + .requires_grad_(False) + .to(conv.weight.device) + ) + + # Prepare filters + w_conv = conv.weight.view(conv.out_channels, -1) + w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var))) + fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape)) + + # Prepare spatial bias + b_conv = torch.zeros(conv.weight.shape[0], device=conv.weight.device) if conv.bias is None else conv.bias + b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps)) + fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn) + + return fusedconv + +def model_fuse_test(model): + model.eval() + for name, m in model.named_modules(): + if hasattr(m, 'convert_to_deploy'): + print(BLUE + f"Converting module: {m.__class__}" + RESET) + m.convert_to_deploy() + return model diff --git a/engine/extre_module/ultralytics_nn/block.py b/engine/extre_module/ultralytics_nn/block.py new file mode 100644 index 00000000..23f64a1e --- /dev/null +++ b/engine/extre_module/ultralytics_nn/block.py @@ -0,0 +1,1249 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +"""Block modules.""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +from functools import partial + +from ..torch_utils import fuse_conv_and_bn + +from .conv import Conv, DWConv, GhostConv, LightConv, RepConv, autopad + +try: + from 
flash_attn.flash_attn_interface import flash_attn_func + FLASH_ATTN_FLAG = True +except ImportError as e: + # assert False, "import FlashAttention error! Please install FlashAttention first." + FLASH_ATTN_FLAG = False + +__all__ = ( + "C1", + "C2", + "C3", + "C2f", + "C2fAttn", + "ImagePoolingAttn", + "ContrastiveHead", + "BNContrastiveHead", + "C3x", + "C3Ghost", + "GhostBottleneck", + "Bottleneck", + "BottleneckCSP", + "Proto", + "RepC3", + "ResNetLayer", + "ELAN1", + "AConv", + "SPPELAN", + "C3k", + "C3k2", + "C2fPSA", + "C2PSA", + "RepVGGDW", + "CIB", + "C2fCIB", + "Attention", + "PSA", + "SCDown", + "ABlock", + "A2C2f" +) + +class C1(nn.Module): + """CSP Bottleneck with 1 convolution.""" + + def __init__(self, c1, c2, n=1): + """Initializes the CSP Bottleneck with configurations for 1 convolution with arguments ch_in, ch_out, number.""" + super().__init__() + self.cv1 = Conv(c1, c2, 1, 1) + self.m = nn.Sequential(*(Conv(c2, c2, 3) for _ in range(n))) + + def forward(self, x): + """Applies cross-convolutions to input in the C3 module.""" + y = self.cv1(x) + return self.m(y) + y + + +class C2(nn.Module): + """CSP Bottleneck with 2 convolutions.""" + + def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): + """Initializes a CSP Bottleneck with 2 convolutions and optional shortcut connection.""" + super().__init__() + self.c = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, 2 * self.c, 1, 1) + self.cv2 = Conv(2 * self.c, c2, 1) # optional act=FReLU(c2) + # self.attention = ChannelAttention(2 * self.c) # or SpatialAttention() + self.m = nn.Sequential(*(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))) + + def forward(self, x): + """Forward pass through the CSP bottleneck with 2 convolutions.""" + a, b = self.cv1(x).chunk(2, 1) + return self.cv2(torch.cat((self.m(a), b), 1)) + + +class C2f(nn.Module): + """Faster Implementation of CSP Bottleneck with 2 convolutions.""" + + def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5): + """Initializes a CSP bottleneck with 2 convolutions and n Bottleneck blocks for faster processing.""" + super().__init__() + self.c = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, 2 * self.c, 1, 1) + self.cv2 = Conv((2 + n) * self.c, c2, 1) # optional act=FReLU(c2) + self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n)) + + def forward(self, x): + """Forward pass through C2f layer.""" + y = list(self.cv1(x).chunk(2, 1)) + y.extend(m(y[-1]) for m in self.m) + return self.cv2(torch.cat(y, 1)) + + def forward_split(self, x): + """Forward pass using split() instead of chunk().""" + y = list(self.cv1(x).split((self.c, self.c), 1)) + y.extend(m(y[-1]) for m in self.m) + return self.cv2(torch.cat(y, 1)) + + +class C3(nn.Module): + """CSP Bottleneck with 3 convolutions.""" + + def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): + """Initialize the CSP Bottleneck with given channels, number, shortcut, groups, and expansion values.""" + super().__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, 1, 1) + self.cv2 = Conv(c1, c_, 1, 1) + self.cv3 = Conv(2 * c_, c2, 1) # optional act=FReLU(c2) + self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=((1, 1), (3, 3)), e=1.0) for _ in range(n))) + + def forward(self, x): + """Forward pass through the CSP bottleneck with 2 convolutions.""" + return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1)) + + +class C3x(C3): + """C3 module with cross-convolutions.""" + + def 
__init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): + """Initialize C3TR instance and set default parameters.""" + super().__init__(c1, c2, n, shortcut, g, e) + self.c_ = int(c2 * e) + self.m = nn.Sequential(*(Bottleneck(self.c_, self.c_, shortcut, g, k=((1, 3), (3, 1)), e=1) for _ in range(n))) + + +class RepC3(nn.Module): + """Rep C3.""" + + def __init__(self, c1, c2, n=3, e=1.0): + """Initialize CSP Bottleneck with a single convolution using input channels, output channels, and number.""" + super().__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c2, 1, 1) + self.cv2 = Conv(c1, c2, 1, 1) + self.m = nn.Sequential(*[RepConv(c_, c_) for _ in range(n)]) + self.cv3 = Conv(c_, c2, 1, 1) if c_ != c2 else nn.Identity() + + def forward(self, x): + """Forward pass of RT-DETR neck layer.""" + return self.cv3(self.m(self.cv1(x)) + self.cv2(x)) + + +class C3Ghost(C3): + """C3 module with GhostBottleneck().""" + + def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): + """Initialize 'SPP' module with various pooling sizes for spatial pyramid pooling.""" + super().__init__(c1, c2, n, shortcut, g, e) + c_ = int(c2 * e) # hidden channels + self.m = nn.Sequential(*(GhostBottleneck(c_, c_) for _ in range(n))) + + +class GhostBottleneck(nn.Module): + """Ghost Bottleneck https://github.com/huawei-noah/ghostnet.""" + + def __init__(self, c1, c2, k=3, s=1): + """Initializes GhostBottleneck module with arguments ch_in, ch_out, kernel, stride.""" + super().__init__() + c_ = c2 // 2 + self.conv = nn.Sequential( + GhostConv(c1, c_, 1, 1), # pw + DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(), # dw + GhostConv(c_, c2, 1, 1, act=False), # pw-linear + ) + self.shortcut = ( + nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1, act=False)) if s == 2 else nn.Identity() + ) + + def forward(self, x): + """Applies skip connection and concatenation to input tensor.""" + return self.conv(x) + self.shortcut(x) + + +class Bottleneck(nn.Module): + """Standard bottleneck.""" + + def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5): + """Initializes a standard bottleneck module with optional shortcut connection and configurable parameters.""" + super().__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, k[0], 1) + self.cv2 = Conv(c_, c2, k[1], 1, g=g) + self.add = shortcut and c1 == c2 + + def forward(self, x): + """Applies the YOLO FPN to input data.""" + return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) + + +class BottleneckCSP(nn.Module): + """CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks.""" + + def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): + """Initializes the CSP Bottleneck given arguments for ch_in, ch_out, number, shortcut, groups, expansion.""" + super().__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, 1, 1) + self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False) + self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False) + self.cv4 = Conv(2 * c_, c2, 1, 1) + self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3) + self.act = nn.SiLU() + self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n))) + + def forward(self, x): + """Applies a CSP bottleneck with 3 convolutions.""" + y1 = self.cv3(self.m(self.cv1(x))) + y2 = self.cv2(x) + return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1)))) + + +class ResNetBlock(nn.Module): + """ResNet block with standard convolution layers.""" + + def __init__(self, c1, c2, s=1, e=4): + """Initialize 
convolution with given parameters.""" + super().__init__() + c3 = e * c2 + self.cv1 = Conv(c1, c2, k=1, s=1, act=True) + self.cv2 = Conv(c2, c2, k=3, s=s, p=1, act=True) + self.cv3 = Conv(c2, c3, k=1, act=False) + self.shortcut = nn.Sequential(Conv(c1, c3, k=1, s=s, act=False)) if s != 1 or c1 != c3 else nn.Identity() + + def forward(self, x): + """Forward pass through the ResNet block.""" + return F.relu(self.cv3(self.cv2(self.cv1(x))) + self.shortcut(x)) + + +class ResNetLayer(nn.Module): + """ResNet layer with multiple ResNet blocks.""" + + def __init__(self, c1, c2, s=1, is_first=False, n=1, e=4): + """Initializes the ResNetLayer given arguments.""" + super().__init__() + self.is_first = is_first + + if self.is_first: + self.layer = nn.Sequential( + Conv(c1, c2, k=7, s=2, p=3, act=True), nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + ) + else: + blocks = [ResNetBlock(c1, c2, s, e=e)] + blocks.extend([ResNetBlock(e * c2, c2, 1, e=e) for _ in range(n - 1)]) + self.layer = nn.Sequential(*blocks) + + def forward(self, x): + """Forward pass through the ResNet layer.""" + return self.layer(x) + + +class MaxSigmoidAttnBlock(nn.Module): + """Max Sigmoid attention block.""" + + def __init__(self, c1, c2, nh=1, ec=128, gc=512, scale=False): + """Initializes MaxSigmoidAttnBlock with specified arguments.""" + super().__init__() + self.nh = nh + self.hc = c2 // nh + self.ec = Conv(c1, ec, k=1, act=False) if c1 != ec else None + self.gl = nn.Linear(gc, ec) + self.bias = nn.Parameter(torch.zeros(nh)) + self.proj_conv = Conv(c1, c2, k=3, s=1, act=False) + self.scale = nn.Parameter(torch.ones(1, nh, 1, 1)) if scale else 1.0 + + def forward(self, x, guide): + """Forward process.""" + bs, _, h, w = x.shape + + guide = self.gl(guide) + guide = guide.view(bs, -1, self.nh, self.hc) + embed = self.ec(x) if self.ec is not None else x + embed = embed.view(bs, self.nh, self.hc, h, w) + + aw = torch.einsum("bmchw,bnmc->bmhwn", embed, guide) + aw = aw.max(dim=-1)[0] + aw = aw / (self.hc**0.5) + aw = aw + self.bias[None, :, None, None] + aw = aw.sigmoid() * self.scale + + x = self.proj_conv(x) + x = x.view(bs, self.nh, -1, h, w) + x = x * aw.unsqueeze(2) + return x.view(bs, -1, h, w) + + +class C2fAttn(nn.Module): + """C2f module with an additional attn module.""" + + def __init__(self, c1, c2, n=1, ec=128, nh=1, gc=512, shortcut=False, g=1, e=0.5): + """Initializes C2f module with attention mechanism for enhanced feature extraction and processing.""" + super().__init__() + self.c = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, 2 * self.c, 1, 1) + self.cv2 = Conv((3 + n) * self.c, c2, 1) # optional act=FReLU(c2) + self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n)) + self.attn = MaxSigmoidAttnBlock(self.c, self.c, gc=gc, ec=ec, nh=nh) + + def forward(self, x, guide): + """Forward pass through C2f layer.""" + y = list(self.cv1(x).chunk(2, 1)) + y.extend(m(y[-1]) for m in self.m) + y.append(self.attn(y[-1], guide)) + return self.cv2(torch.cat(y, 1)) + + def forward_split(self, x, guide): + """Forward pass using split() instead of chunk().""" + y = list(self.cv1(x).split((self.c, self.c), 1)) + y.extend(m(y[-1]) for m in self.m) + y.append(self.attn(y[-1], guide)) + return self.cv2(torch.cat(y, 1)) + + +class ImagePoolingAttn(nn.Module): + """ImagePoolingAttn: Enhance the text embeddings with image-aware information.""" + + def __init__(self, ec=256, ch=(), ct=512, nh=8, k=3, scale=False): + """Initializes ImagePoolingAttn with specified arguments.""" 
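+        # Note: each backbone feature map in `ch` is projected to `ec` channels and adaptively
+        # max-pooled to a k x k grid; the pooled patches serve as keys/values while the text
+        # embeddings act as queries, and the attended output is added back onto the text features.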
+ super().__init__() + + nf = len(ch) + self.query = nn.Sequential(nn.LayerNorm(ct), nn.Linear(ct, ec)) + self.key = nn.Sequential(nn.LayerNorm(ec), nn.Linear(ec, ec)) + self.value = nn.Sequential(nn.LayerNorm(ec), nn.Linear(ec, ec)) + self.proj = nn.Linear(ec, ct) + self.scale = nn.Parameter(torch.tensor([0.0]), requires_grad=True) if scale else 1.0 + self.projections = nn.ModuleList([nn.Conv2d(in_channels, ec, kernel_size=1) for in_channels in ch]) + self.im_pools = nn.ModuleList([nn.AdaptiveMaxPool2d((k, k)) for _ in range(nf)]) + self.ec = ec + self.nh = nh + self.nf = nf + self.hc = ec // nh + self.k = k + + def forward(self, x, text): + """Executes attention mechanism on input tensor x and guide tensor.""" + bs = x[0].shape[0] + assert len(x) == self.nf + num_patches = self.k**2 + x = [pool(proj(x)).view(bs, -1, num_patches) for (x, proj, pool) in zip(x, self.projections, self.im_pools)] + x = torch.cat(x, dim=-1).transpose(1, 2) + q = self.query(text) + k = self.key(x) + v = self.value(x) + + # q = q.reshape(1, text.shape[1], self.nh, self.hc).repeat(bs, 1, 1, 1) + q = q.reshape(bs, -1, self.nh, self.hc) + k = k.reshape(bs, -1, self.nh, self.hc) + v = v.reshape(bs, -1, self.nh, self.hc) + + aw = torch.einsum("bnmc,bkmc->bmnk", q, k) + aw = aw / (self.hc**0.5) + aw = F.softmax(aw, dim=-1) + + x = torch.einsum("bmnk,bkmc->bnmc", aw, v) + x = self.proj(x.reshape(bs, -1, self.ec)) + return x * self.scale + text + + +class ContrastiveHead(nn.Module): + """Implements contrastive learning head for region-text similarity in vision-language models.""" + + def __init__(self): + """Initializes ContrastiveHead with specified region-text similarity parameters.""" + super().__init__() + # NOTE: use -10.0 to keep the init cls loss consistency with other losses + self.bias = nn.Parameter(torch.tensor([-10.0])) + self.logit_scale = nn.Parameter(torch.ones([]) * torch.tensor(1 / 0.07).log()) + + def forward(self, x, w): + """Forward function of contrastive learning.""" + x = F.normalize(x, dim=1, p=2) + w = F.normalize(w, dim=-1, p=2) + x = torch.einsum("bchw,bkc->bkhw", x, w) + return x * self.logit_scale.exp() + self.bias + + +class BNContrastiveHead(nn.Module): + """ + Batch Norm Contrastive Head for YOLO-World using batch norm instead of l2-normalization. + + Args: + embed_dims (int): Embed dimensions of text and image features. 
+ """ + + def __init__(self, embed_dims: int): + """Initialize ContrastiveHead with region-text similarity parameters.""" + super().__init__() + self.norm = nn.BatchNorm2d(embed_dims) + # NOTE: use -10.0 to keep the init cls loss consistency with other losses + self.bias = nn.Parameter(torch.tensor([-10.0])) + # use -1.0 is more stable + self.logit_scale = nn.Parameter(-1.0 * torch.ones([])) + + def forward(self, x, w): + """Forward function of contrastive learning.""" + x = self.norm(x) + w = F.normalize(w, dim=-1, p=2) + x = torch.einsum("bchw,bkc->bkhw", x, w) + return x * self.logit_scale.exp() + self.bias + + +class RepBottleneck(Bottleneck): + """Rep bottleneck.""" + + def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5): + """Initializes a RepBottleneck module with customizable in/out channels, shortcuts, groups and expansion.""" + super().__init__(c1, c2, shortcut, g, k, e) + c_ = int(c2 * e) # hidden channels + self.cv1 = RepConv(c1, c_, k[0], 1) + + +class RepCSP(C3): + """Repeatable Cross Stage Partial Network (RepCSP) module for efficient feature extraction.""" + + def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): + """Initializes RepCSP layer with given channels, repetitions, shortcut, groups and expansion ratio.""" + super().__init__(c1, c2, n, shortcut, g, e) + c_ = int(c2 * e) # hidden channels + self.m = nn.Sequential(*(RepBottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n))) + + +class RepNCSPELAN4(nn.Module): + """CSP-ELAN.""" + + def __init__(self, c1, c2, c3, c4, n=1): + """Initializes CSP-ELAN layer with specified channel sizes, repetitions, and convolutions.""" + super().__init__() + self.c = c3 // 2 + self.cv1 = Conv(c1, c3, 1, 1) + self.cv2 = nn.Sequential(RepCSP(c3 // 2, c4, n), Conv(c4, c4, 3, 1)) + self.cv3 = nn.Sequential(RepCSP(c4, c4, n), Conv(c4, c4, 3, 1)) + self.cv4 = Conv(c3 + (2 * c4), c2, 1, 1) + + def forward(self, x): + """Forward pass through RepNCSPELAN4 layer.""" + y = list(self.cv1(x).chunk(2, 1)) + y.extend((m(y[-1])) for m in [self.cv2, self.cv3]) + return self.cv4(torch.cat(y, 1)) + + def forward_split(self, x): + """Forward pass using split() instead of chunk().""" + y = list(self.cv1(x).split((self.c, self.c), 1)) + y.extend(m(y[-1]) for m in [self.cv2, self.cv3]) + return self.cv4(torch.cat(y, 1)) + + +class ELAN1(RepNCSPELAN4): + """ELAN1 module with 4 convolutions.""" + + def __init__(self, c1, c2, c3, c4): + """Initializes ELAN1 layer with specified channel sizes.""" + super().__init__(c1, c2, c3, c4) + self.c = c3 // 2 + self.cv1 = Conv(c1, c3, 1, 1) + self.cv2 = Conv(c3 // 2, c4, 3, 1) + self.cv3 = Conv(c4, c4, 3, 1) + self.cv4 = Conv(c3 + (2 * c4), c2, 1, 1) + + +class AConv(nn.Module): + """AConv.""" + + def __init__(self, c1, c2): + """Initializes AConv module with convolution layers.""" + super().__init__() + self.cv1 = Conv(c1, c2, 3, 2, 1) + + def forward(self, x): + """Forward pass through AConv layer.""" + x = torch.nn.functional.avg_pool2d(x, 2, 1, 0, False, True) + return self.cv1(x) + + +class ADown(nn.Module): + """ADown.""" + + def __init__(self, c1, c2): + """Initializes ADown module with convolution layers to downsample input from channels c1 to c2.""" + super().__init__() + self.c = c2 // 2 + self.cv1 = Conv(c1 // 2, self.c, 3, 2, 1) + self.cv2 = Conv(c1 // 2, self.c, 1, 1, 0) + + def forward(self, x): + """Forward pass through ADown layer.""" + x = torch.nn.functional.avg_pool2d(x, 2, 1, 0, False, True) + x1, x2 = x.chunk(2, 1) + x1 = self.cv1(x1) + x2 = torch.nn.functional.max_pool2d(x2, 3, 2, 
1) + x2 = self.cv2(x2) + return torch.cat((x1, x2), 1) + + +class SPPELAN(nn.Module): + """SPP-ELAN.""" + + def __init__(self, c1, c2, c3, k=5): + """Initializes SPP-ELAN block with convolution and max pooling layers for spatial pyramid pooling.""" + super().__init__() + self.c = c3 + self.cv1 = Conv(c1, c3, 1, 1) + self.cv2 = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2) + self.cv3 = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2) + self.cv4 = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2) + self.cv5 = Conv(4 * c3, c2, 1, 1) + + def forward(self, x): + """Forward pass through SPPELAN layer.""" + y = [self.cv1(x)] + y.extend(m(y[-1]) for m in [self.cv2, self.cv3, self.cv4]) + return self.cv5(torch.cat(y, 1)) + + +class CBLinear(nn.Module): + """CBLinear.""" + + def __init__(self, c1, c2s, k=1, s=1, p=None, g=1): + """Initializes the CBLinear module, passing inputs unchanged.""" + super().__init__() + self.c2s = c2s + self.conv = nn.Conv2d(c1, sum(c2s), k, s, autopad(k, p), groups=g, bias=True) + + def forward(self, x): + """Forward pass through CBLinear layer.""" + return self.conv(x).split(self.c2s, dim=1) + + +class CBFuse(nn.Module): + """CBFuse.""" + + def __init__(self, idx): + """Initializes CBFuse module with layer index for selective feature fusion.""" + super().__init__() + self.idx = idx + + def forward(self, xs): + """Forward pass through CBFuse layer.""" + target_size = xs[-1].shape[2:] + res = [F.interpolate(x[self.idx[i]], size=target_size, mode="nearest") for i, x in enumerate(xs[:-1])] + return torch.sum(torch.stack(res + xs[-1:]), dim=0) + + +class C3f(nn.Module): + """Faster Implementation of CSP Bottleneck with 2 convolutions.""" + + def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5): + """Initialize CSP bottleneck layer with two convolutions with arguments ch_in, ch_out, number, shortcut, groups, + expansion. 
+ """ + super().__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, 1, 1) + self.cv2 = Conv(c1, c_, 1, 1) + self.cv3 = Conv((2 + n) * c_, c2, 1) # optional act=FReLU(c2) + self.m = nn.ModuleList(Bottleneck(c_, c_, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n)) + + def forward(self, x): + """Forward pass through C2f layer.""" + y = [self.cv2(x), self.cv1(x)] + y.extend(m(y[-1]) for m in self.m) + return self.cv3(torch.cat(y, 1)) + + +class C3k2(C2f): + """Faster Implementation of CSP Bottleneck with 2 convolutions.""" + + def __init__(self, c1, c2, n=1, c3k=False, e=0.5, g=1, shortcut=True): + """Initializes the C3k2 module, a faster CSP Bottleneck with 2 convolutions and optional C3k blocks.""" + super().__init__(c1, c2, n, shortcut, g, e) + self.m = nn.ModuleList( + C3k(self.c, self.c, 2, shortcut, g) if c3k else Bottleneck(self.c, self.c, shortcut, g) for _ in range(n) + ) + + +class C3k(C3): + """C3k is a CSP bottleneck module with customizable kernel sizes for feature extraction in neural networks.""" + + def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, k=3): + """Initializes the C3k module with specified channels, number of layers, and configurations.""" + super().__init__(c1, c2, n, shortcut, g, e) + c_ = int(c2 * e) # hidden channels + # self.m = nn.Sequential(*(RepBottleneck(c_, c_, shortcut, g, k=(k, k), e=1.0) for _ in range(n))) + self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=(k, k), e=1.0) for _ in range(n))) + + +class RepVGGDW(torch.nn.Module): + """RepVGGDW is a class that represents a depth wise separable convolutional block in RepVGG architecture.""" + + def __init__(self, ed) -> None: + """Initializes RepVGGDW with depthwise separable convolutional layers for efficient processing.""" + super().__init__() + self.conv = Conv(ed, ed, 7, 1, 3, g=ed, act=False) + self.conv1 = Conv(ed, ed, 3, 1, 1, g=ed, act=False) + self.dim = ed + self.act = nn.SiLU() + + def forward(self, x): + """ + Performs a forward pass of the RepVGGDW block. + + Args: + x (torch.Tensor): Input tensor. + + Returns: + (torch.Tensor): Output tensor after applying the depth wise separable convolution. + """ + return self.act(self.conv(x) + self.conv1(x)) + + def forward_fuse(self, x): + """ + Performs a forward pass of the RepVGGDW block without fusing the convolutions. + + Args: + x (torch.Tensor): Input tensor. + + Returns: + (torch.Tensor): Output tensor after applying the depth wise separable convolution. + """ + return self.act(self.conv(x)) + + @torch.no_grad() + def fuse(self): + """ + Fuses the convolutional layers in the RepVGGDW block. + + This method fuses the convolutional layers and updates the weights and biases accordingly. + """ + conv = fuse_conv_and_bn(self.conv.conv, self.conv.bn) + conv1 = fuse_conv_and_bn(self.conv1.conv, self.conv1.bn) + + conv_w = conv.weight + conv_b = conv.bias + conv1_w = conv1.weight + conv1_b = conv1.bias + + conv1_w = torch.nn.functional.pad(conv1_w, [2, 2, 2, 2]) + + final_conv_w = conv_w + conv1_w + final_conv_b = conv_b + conv1_b + + conv.weight.data.copy_(final_conv_w) + conv.bias.data.copy_(final_conv_b) + + self.conv = conv + del self.conv1 + + +class CIB(nn.Module): + """ + Conditional Identity Block (CIB) module. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + shortcut (bool, optional): Whether to add a shortcut connection. Defaults to True. + e (float, optional): Scaling factor for the hidden channels. Defaults to 0.5. 
+ lk (bool, optional): Whether to use RepVGGDW for the third convolutional layer. Defaults to False. + """ + + def __init__(self, c1, c2, shortcut=True, e=0.5, lk=False): + """Initializes the custom model with optional shortcut, scaling factor, and RepVGGDW layer.""" + super().__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = nn.Sequential( + Conv(c1, c1, 3, g=c1), + Conv(c1, 2 * c_, 1), + RepVGGDW(2 * c_) if lk else Conv(2 * c_, 2 * c_, 3, g=2 * c_), + Conv(2 * c_, c2, 1), + Conv(c2, c2, 3, g=c2), + ) + + self.add = shortcut and c1 == c2 + + def forward(self, x): + """ + Forward pass of the CIB module. + + Args: + x (torch.Tensor): Input tensor. + + Returns: + (torch.Tensor): Output tensor. + """ + return x + self.cv1(x) if self.add else self.cv1(x) + + +class C2fCIB(C2f): + """ + C2fCIB class represents a convolutional block with C2f and CIB modules. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + n (int, optional): Number of CIB modules to stack. Defaults to 1. + shortcut (bool, optional): Whether to use shortcut connection. Defaults to False. + lk (bool, optional): Whether to use local key connection. Defaults to False. + g (int, optional): Number of groups for grouped convolution. Defaults to 1. + e (float, optional): Expansion ratio for CIB modules. Defaults to 0.5. + """ + + def __init__(self, c1, c2, n=1, shortcut=False, lk=False, g=1, e=0.5): + """Initializes the module with specified parameters for channel, shortcut, local key, groups, and expansion.""" + super().__init__(c1, c2, n, shortcut, g, e) + self.m = nn.ModuleList(CIB(self.c, self.c, shortcut, e=1.0, lk=lk) for _ in range(n)) + + +class Attention(nn.Module): + """ + Attention module that performs self-attention on the input tensor. + + Args: + dim (int): The input tensor dimension. + num_heads (int): The number of attention heads. + attn_ratio (float): The ratio of the attention key dimension to the head dimension. + + Attributes: + num_heads (int): The number of attention heads. + head_dim (int): The dimension of each attention head. + key_dim (int): The dimension of the attention key. + scale (float): The scaling factor for the attention scores. + qkv (Conv): Convolutional layer for computing the query, key, and value. + proj (Conv): Convolutional layer for projecting the attended values. + pe (Conv): Convolutional layer for positional encoding. + """ + + def __init__(self, dim, num_heads=8, attn_ratio=0.5): + """Initializes multi-head attention module with query, key, and value convolutions and positional encoding.""" + super().__init__() + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.key_dim = int(self.head_dim * attn_ratio) + self.scale = self.key_dim**-0.5 + nh_kd = self.key_dim * num_heads + h = dim + nh_kd * 2 + self.qkv = Conv(dim, h, 1, act=False) + self.proj = Conv(dim, dim, 1, act=False) + self.pe = Conv(dim, dim, 3, 1, g=dim, act=False) + + def forward(self, x): + """ + Forward pass of the Attention module. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + (torch.Tensor): The output tensor after self-attention. 
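+
+        Notes:
+            Queries and keys use key_dim channels per head while values use head_dim channels; a 3x3
+            depthwise convolution of the values is added as a positional encoding before projection.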
+ """ + B, C, H, W = x.shape + N = H * W + qkv = self.qkv(x) + q, k, v = qkv.view(B, self.num_heads, self.key_dim * 2 + self.head_dim, N).split( + [self.key_dim, self.key_dim, self.head_dim], dim=2 + ) + + attn = (q.transpose(-2, -1) @ k) * self.scale + attn = attn.softmax(dim=-1) + x = (v @ attn.transpose(-2, -1)).view(B, C, H, W) + self.pe(v.reshape(B, C, H, W)) + x = self.proj(x) + return x + + +class PSABlock(nn.Module): + """ + PSABlock class implementing a Position-Sensitive Attention block for neural networks. + + This class encapsulates the functionality for applying multi-head attention and feed-forward neural network layers + with optional shortcut connections. + + Attributes: + attn (Attention): Multi-head attention module. + ffn (nn.Sequential): Feed-forward neural network module. + add (bool): Flag indicating whether to add shortcut connections. + + Methods: + forward: Performs a forward pass through the PSABlock, applying attention and feed-forward layers. + + Examples: + Create a PSABlock and perform a forward pass + >>> psablock = PSABlock(c=128, attn_ratio=0.5, num_heads=4, shortcut=True) + >>> input_tensor = torch.randn(1, 128, 32, 32) + >>> output_tensor = psablock(input_tensor) + """ + + def __init__(self, c, attn_ratio=0.5, num_heads=4, shortcut=True) -> None: + """Initializes the PSABlock with attention and feed-forward layers for enhanced feature extraction.""" + super().__init__() + + self.attn = Attention(c, attn_ratio=attn_ratio, num_heads=num_heads) + self.ffn = nn.Sequential(Conv(c, c * 2, 1), Conv(c * 2, c, 1, act=False)) + self.add = shortcut + + def forward(self, x): + """Executes a forward pass through PSABlock, applying attention and feed-forward layers to the input tensor.""" + x = x + self.attn(x) if self.add else self.attn(x) + x = x + self.ffn(x) if self.add else self.ffn(x) + return x + + +class PSA(nn.Module): + """ + PSA class for implementing Position-Sensitive Attention in neural networks. + + This class encapsulates the functionality for applying position-sensitive attention and feed-forward networks to + input tensors, enhancing feature extraction and processing capabilities. + + Attributes: + c (int): Number of hidden channels after applying the initial convolution. + cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c. + cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c. + attn (Attention): Attention module for position-sensitive attention. + ffn (nn.Sequential): Feed-forward network for further processing. + + Methods: + forward: Applies position-sensitive attention and feed-forward network to the input tensor. 
+ + Examples: + Create a PSA module and apply it to an input tensor + >>> psa = PSA(c1=128, c2=128, e=0.5) + >>> input_tensor = torch.randn(1, 128, 64, 64) + >>> output_tensor = psa.forward(input_tensor) + """ + + def __init__(self, c1, c2, e=0.5): + """Initializes the PSA module with input/output channels and attention mechanism for feature extraction.""" + super().__init__() + assert c1 == c2 + self.c = int(c1 * e) + self.cv1 = Conv(c1, 2 * self.c, 1, 1) + self.cv2 = Conv(2 * self.c, c1, 1) + + self.attn = Attention(self.c, attn_ratio=0.5, num_heads=self.c // 64) + self.ffn = nn.Sequential(Conv(self.c, self.c * 2, 1), Conv(self.c * 2, self.c, 1, act=False)) + + def forward(self, x): + """Executes forward pass in PSA module, applying attention and feed-forward layers to the input tensor.""" + a, b = self.cv1(x).split((self.c, self.c), dim=1) + b = b + self.attn(b) + b = b + self.ffn(b) + return self.cv2(torch.cat((a, b), 1)) + + +class C2PSA(nn.Module): + """ + C2PSA module with attention mechanism for enhanced feature extraction and processing. + + This module implements a convolutional block with attention mechanisms to enhance feature extraction and processing + capabilities. It includes a series of PSABlock modules for self-attention and feed-forward operations. + + Attributes: + c (int): Number of hidden channels. + cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c. + cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c. + m (nn.Sequential): Sequential container of PSABlock modules for attention and feed-forward operations. + + Methods: + forward: Performs a forward pass through the C2PSA module, applying attention and feed-forward operations. + + Notes: + This module essentially is the same as PSA module, but refactored to allow stacking more PSABlock modules. + + Examples: + >>> c2psa = C2PSA(c1=256, c2=256, n=3, e=0.5) + >>> input_tensor = torch.randn(1, 256, 64, 64) + >>> output_tensor = c2psa(input_tensor) + """ + + def __init__(self, c1, c2, n=1, e=0.5): + """Initializes the C2PSA module with specified input/output channels, number of layers, and expansion ratio.""" + super().__init__() + assert c1 == c2 + self.c = int(c1 * e) + self.cv1 = Conv(c1, 2 * self.c, 1, 1) + self.cv2 = Conv(2 * self.c, c1, 1) + + self.m = nn.Sequential(*(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n))) + + def forward(self, x): + """Processes the input tensor 'x' through a series of PSA blocks and returns the transformed tensor.""" + a, b = self.cv1(x).split((self.c, self.c), dim=1) + b = self.m(b) + return self.cv2(torch.cat((a, b), 1)) + + +class C2fPSA(C2f): + """ + C2fPSA module with enhanced feature extraction using PSA blocks. + + This class extends the C2f module by incorporating PSA blocks for improved attention mechanisms and feature extraction. + + Attributes: + c (int): Number of hidden channels. + cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c. + cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c. + m (nn.ModuleList): List of PSA blocks for feature extraction. + + Methods: + forward: Performs a forward pass through the C2fPSA module. + forward_split: Performs a forward pass using split() instead of chunk(). 
+ + Examples: + >>> import torch + >>> from ultralytics.models.common import C2fPSA + >>> model = C2fPSA(c1=64, c2=64, n=3, e=0.5) + >>> x = torch.randn(1, 64, 128, 128) + >>> output = model(x) + >>> print(output.shape) + """ + + def __init__(self, c1, c2, n=1, e=0.5): + """Initializes the C2fPSA module, a variant of C2f with PSA blocks for enhanced feature extraction.""" + assert c1 == c2 + super().__init__(c1, c2, n=n, e=e) + self.m = nn.ModuleList(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n)) + + +class SCDown(nn.Module): + """ + SCDown module for downsampling with separable convolutions. + + This module performs downsampling using a combination of pointwise and depthwise convolutions, which helps in + efficiently reducing the spatial dimensions of the input tensor while maintaining the channel information. + + Attributes: + cv1 (Conv): Pointwise convolution layer that reduces the number of channels. + cv2 (Conv): Depthwise convolution layer that performs spatial downsampling. + + Methods: + forward: Applies the SCDown module to the input tensor. + + Examples: + >>> import torch + >>> from ultralytics import SCDown + >>> model = SCDown(c1=64, c2=128, k=3, s=2) + >>> x = torch.randn(1, 64, 128, 128) + >>> y = model(x) + >>> print(y.shape) + torch.Size([1, 128, 64, 64]) + """ + + def __init__(self, c1, c2, k, s): + """Initializes the SCDown module with specified input/output channels, kernel size, and stride.""" + super().__init__() + self.cv1 = Conv(c1, c2, 1, 1) + self.cv2 = Conv(c2, c2, k=k, s=s, g=c2, act=False) + + def forward(self, x): + """Applies convolution and downsampling to the input tensor in the SCDown module.""" + return self.cv2(self.cv1(x)) + +class AAttn(nn.Module): + """ + Area-attention module with the requirement of flash attention. + + Attributes: + dim (int): Number of hidden channels; + num_heads (int): Number of heads into which the attention mechanism is divided; + area (int, optional): Number of areas the feature map is divided. Defaults to 1. + + Methods: + forward: Performs a forward process of input tensor and outputs a tensor after the execution of the area attention mechanism. + + Examples: + >>> import torch + >>> from ultralytics.nn.modules import AAttn + >>> model = AAttn(dim=64, num_heads=2, area=4) + >>> x = torch.randn(2, 64, 128, 128) + >>> output = model(x) + >>> print(output.shape) + + Notes: + recommend that dim//num_heads be a multiple of 32 or 64. 
+ + """ + + def __init__(self, dim, num_heads, area=1): + """Initializes the area-attention module, a simple yet efficient attention module for YOLO.""" + super().__init__() + self.area = area + + self.num_heads = num_heads + self.head_dim = head_dim = dim // num_heads + all_head_dim = head_dim * self.num_heads + + self.qk = Conv(dim, all_head_dim * 2, 1, act=False) + self.v = Conv(dim, all_head_dim, 1, act=False) + self.proj = Conv(all_head_dim, dim, 1, act=False) + + self.pe = Conv(all_head_dim, dim, 5, 1, 2, g=dim, act=False) + + + def forward(self, x): + """Processes the input tensor 'x' through the area-attention""" + B, C, H, W = x.shape + N = H * W + + if x.is_cuda and FLASH_ATTN_FLAG: + qk = self.qk(x).flatten(2).transpose(1, 2) + v = self.v(x) + pp = self.pe(v) + v = v.flatten(2).transpose(1, 2) + + if self.area > 1: + qk = qk.reshape(B * self.area, N // self.area, C * 2) + v = v.reshape(B * self.area, N // self.area, C) + B, N, _ = qk.shape + q, k = qk.split([C, C], dim=2) + q = q.view(B, N, self.num_heads, self.head_dim) + k = k.view(B, N, self.num_heads, self.head_dim) + v = v.view(B, N, self.num_heads, self.head_dim) + + x = flash_attn_func( + q.contiguous().half(), + k.contiguous().half(), + v.contiguous().half() + ).to(q.dtype) + + if self.area > 1: + x = x.reshape(B // self.area, N * self.area, C) + B, N, _ = x.shape + x = x.reshape(B, H, W, C).permute(0, 3, 1, 2) + else: + qk = self.qk(x).flatten(2) + v = self.v(x) + pp = self.pe(v) + v = v.flatten(2) + if self.area > 1: + qk = qk.reshape(B * self.area, C * 2, N // self.area) + v = v.reshape(B * self.area, C, N // self.area) + B, _, N = qk.shape + + q, k = qk.split([C, C], dim=1) + q = q.view(B, self.num_heads, self.head_dim, N) + k = k.view(B, self.num_heads, self.head_dim, N) + v = v.view(B, self.num_heads, self.head_dim, N) + attn = (q.transpose(-2, -1) @ k) * (self.head_dim ** -0.5) + max_attn = attn.max(dim=-1, keepdim=True).values + exp_attn = torch.exp(attn - max_attn) + attn = exp_attn / exp_attn.sum(dim=-1, keepdim=True) + x = (v @ attn.transpose(-2, -1)) + + if self.area > 1: + x = x.reshape(B // self.area, C, N * self.area) + B, _, N = x.shape + x = x.reshape(B, C, H, W) + + return self.proj(x + pp) + + +class ABlock(nn.Module): + """ + Area-attention block module for efficient feature extraction in YOLO models. + + This module implements an area-attention mechanism combined with a feed-forward network for processing feature maps. + It uses a novel area-based attention approach that is more efficient than traditional self-attention while + maintaining effectiveness. + + Attributes: + attn (AAttn): Area-attention module for processing spatial features. + mlp (nn.Sequential): Multi-layer perceptron for feature transformation. + + Methods: + _init_weights: Initializes module weights using truncated normal distribution. + forward: Applies area-attention and feed-forward processing to input tensor. + + Examples: + >>> block = ABlock(dim=256, num_heads=8, mlp_ratio=1.2, area=1) + >>> x = torch.randn(1, 256, 32, 32) + >>> output = block(x) + >>> print(output.shape) + torch.Size([1, 256, 32, 32]) + """ + + def __init__(self, dim, num_heads, mlp_ratio=1.2, area=1): + """ + Initializes an Area-attention block module for efficient feature extraction in YOLO models. + + This module implements an area-attention mechanism combined with a feed-forward network for processing feature + maps. It uses a novel area-based attention approach that is more efficient than traditional self-attention + while maintaining effectiveness. 
+
+        Args:
+            dim (int): Number of input channels.
+            num_heads (int): Number of heads into which the attention mechanism is divided.
+            mlp_ratio (float): Expansion ratio for MLP hidden dimension.
+            area (int): Number of areas the feature map is divided into.
+        """
+        super().__init__()
+
+        self.attn = AAttn(dim, num_heads=num_heads, area=area)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = nn.Sequential(Conv(dim, mlp_hidden_dim, 1), Conv(mlp_hidden_dim, dim, 1, act=False))
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        """Initialize weights using a truncated normal distribution."""
+        if isinstance(m, nn.Conv2d):
+            nn.init.trunc_normal_(m.weight, std=0.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+
+    def forward(self, x):
+        """Forward pass through ABlock, applying area-attention and feed-forward layers to the input tensor."""
+        x = x + self.attn(x)
+        return x + self.mlp(x)
+
+
+class A2C2f(nn.Module):
+    """
+    Area-Attention C2f module for enhanced feature extraction with area-based attention mechanisms.
+
+    This module extends the C2f architecture by incorporating area-attention and ABlock layers for improved feature
+    processing. It supports both area-attention and standard convolution modes.
+
+    Attributes:
+        cv1 (Conv): Initial 1x1 convolution layer that reduces input channels to hidden channels.
+        cv2 (Conv): Final 1x1 convolution layer that processes concatenated features.
+        gamma (nn.Parameter | None): Learnable parameter for residual scaling when using area attention.
+        m (nn.ModuleList): List of either ABlock or C3k modules for feature processing.
+
+    Methods:
+        forward: Processes input through area-attention or standard convolution pathway.
+
+    Examples:
+        >>> m = A2C2f(512, 512, n=1, a2=True, area=1)
+        >>> x = torch.randn(1, 512, 32, 32)
+        >>> output = m(x)
+        >>> print(output.shape)
+        torch.Size([1, 512, 32, 32])
+    """
+
+    def __init__(self, c1, c2, n=1, a2=True, area=1, residual=False, mlp_ratio=2.0, e=0.5, g=1, shortcut=True):
+        """
+        Area-Attention C2f module for enhanced feature extraction with area-based attention mechanisms.
+
+        Args:
+            c1 (int): Number of input channels.
+            c2 (int): Number of output channels.
+            n (int): Number of ABlock or C3k modules to stack.
+            a2 (bool): Whether to use area attention blocks. If False, uses C3k blocks instead.
+            area (int): Number of areas the feature map is divided into.
+            residual (bool): Whether to use residual connections with learnable gamma parameter.
+            mlp_ratio (float): Expansion ratio for MLP hidden dimension.
+            e (float): Channel expansion ratio for hidden channels.
+            g (int): Number of groups for grouped convolutions.
+            shortcut (bool): Whether to use shortcut connections in C3k blocks.
+        """
+        super().__init__()
+        c_ = int(c2 * e)  # hidden channels
+        assert c_ % 32 == 0, "Dimension of ABlock must be a multiple of 32."
+ + self.cv1 = Conv(c1, c_, 1, 1) + self.cv2 = Conv((1 + n) * c_, c2, 1) + + self.gamma = nn.Parameter(0.01 * torch.ones(c2), requires_grad=True) if a2 and residual else None + self.m = nn.ModuleList( + nn.Sequential(*(ABlock(c_, c_ // 32, mlp_ratio, area) for _ in range(2))) + if a2 + else C3k(c_, c_, 2, shortcut, g) + for _ in range(n) + ) + # print(c1, c2, n, a2, area) + + def forward(self, x): + """Forward pass through R-ELAN layer.""" + y = [self.cv1(x)] + y.extend(m(y[-1]) for m in self.m) + y = self.cv2(torch.cat(y, 1)) + if self.gamma is not None: + return x + self.gamma.view(-1, len(self.gamma), 1, 1) * y + return y + +class C3_Block(nn.Module): + """CSP Bottleneck with 3 convolutions.""" + + def __init__(self, c1, c2, module=partial(Bottleneck, k=(1, 3), shortcut=True, e=0.5), n=1, e=0.5): + """Initialize the CSP Bottleneck with given channels, number, shortcut, groups, and expansion values.""" + super().__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, 1, 1) + self.cv2 = Conv(c1, c_, 1, 1) + self.cv3 = Conv(2 * c_, c2, 1) # optional act=FReLU(c2) + self.m = nn.Sequential(*(module(c_, c_) for _ in range(n))) + + def forward(self, x): + """Forward pass through the CSP bottleneck with 2 convolutions.""" + return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1)) + +class C2f_Block(nn.Module): + """Faster Implementation of CSP Bottleneck with 2 convolutions.""" + + def __init__(self, c1, c2, module=partial(Bottleneck, k=(3, 3), shortcut=True, e=0.5), n=1, e=0.5): + """Initializes a CSP bottleneck with 2 convolutions and n Bottleneck blocks for faster processing.""" + super().__init__() + self.c = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, 2 * self.c, 1, 1) + self.cv2 = Conv((2 + n) * self.c, c2, 1) # optional act=FReLU(c2) + self.m = nn.ModuleList(module(self.c, self.c) for _ in range(n)) + + def forward(self, x): + """Forward pass through C2f layer.""" + y = list(self.cv1(x).chunk(2, 1)) + y.extend(m(y[-1]) for m in self.m) + return self.cv2(torch.cat(y, 1)) + +class C3k_Block(nn.Module): + """C3k is a CSP bottleneck module with customizable kernel sizes for feature extraction in neural networks.""" + + def __init__(self, c1, c2, module=partial(Bottleneck, k=(3, 3), shortcut=True, e=1.0), n=1, e=0.5): + """Initialize the CSP Bottleneck with given channels, number, shortcut, groups, and expansion values.""" + super().__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, 1, 1) + self.cv2 = Conv(c1, c_, 1, 1) + self.cv3 = Conv(2 * c_, c2, 1) # optional act=FReLU(c2) + self.m = nn.Sequential(*(module(c_, c_) for _ in range(n))) + + def forward(self, x): + """Forward pass through the CSP bottleneck with 2 convolutions.""" + return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1)) + +class C3k2_Block(nn.Module): + def __init__(self, c1, c2, module=partial(Bottleneck, k=(3, 3), shortcut=True, e=0.5), n=1, c3k=True, e=0.5): + super().__init__() + self.c = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, 2 * self.c, 1, 1) + self.cv2 = Conv((2 + n) * self.c, c2, 1) # optional act=FReLU(c2) + self.m = nn.ModuleList( + C3k_Block(self.c, self.c, module, 2) if c3k else module(self.c, self.c) for _ in range(n) + ) + + def forward(self, x): + """Forward pass through C2f layer.""" + y = list(self.cv1(x).chunk(2, 1)) + y.extend(m(y[-1]) for m in self.m) + return self.cv2(torch.cat(y, 1)) diff --git a/engine/extre_module/ultralytics_nn/conv.py b/engine/extre_module/ultralytics_nn/conv.py new file mode 100644 index 
00000000..fbebcaca --- /dev/null +++ b/engine/extre_module/ultralytics_nn/conv.py @@ -0,0 +1,238 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+"""Convolution modules."""
+
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+from ..torch_utils import fuse_conv_and_bn
+
+__all__ = (
+    "Conv",
+    "LightConv",
+    "DWConv",
+    "DWConvTranspose2d",
+    "ConvTranspose",
+    "GhostConv",
+    "RepConv",
+    "DSConv"
+)
+
+
+def autopad(k, p=None, d=1):  # kernel, padding, dilation
+    """Pad to 'same' shape outputs."""
+    if d > 1:
+        k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k]  # actual kernel-size
+    if p is None:
+        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]  # auto-pad
+    return p
+
+
+class Conv(nn.Module):
+    """Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation)."""
+    default_act = nn.SiLU()  # default activation
+
+    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
+        """Initialize Conv layer with given arguments including activation."""
+        super().__init__()
+        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False)
+        self.bn = nn.BatchNorm2d(c2)
+        self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
+
+    def forward(self, x):
+        """Apply convolution, batch normalization and activation to input tensor."""
+        return self.act(self.bn(self.conv(x)))
+
+    def forward_fuse(self, x):
+        """Apply the fused convolution and activation (batch normalization already folded into the conv)."""
+        return self.act(self.conv(x))
+
+    def convert_to_deploy(self):
+        if hasattr(self, "bn"):
+            self.conv = fuse_conv_and_bn(self.conv, self.bn)  # update conv
+            delattr(self, "bn")  # remove batchnorm
+            self.forward = self.forward_fuse  # update forward
+
+class LightConv(nn.Module):
+    """
+    Light convolution with args(ch_in, ch_out, kernel).
+ + https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py + """ + + def __init__(self, c1, c2, k=1, act=nn.ReLU()): + """Initialize Conv layer with given arguments including activation.""" + super().__init__() + self.conv1 = Conv(c1, c2, 1, act=False) + self.conv2 = DWConv(c2, c2, k, act=act) + + def forward(self, x): + """Apply 2 convolutions to input tensor.""" + return self.conv2(self.conv1(x)) + +class DWConv(Conv): + """Depth-wise convolution.""" + + def __init__(self, c1, c2, k=1, s=1, d=1, act=True): # ch_in, ch_out, kernel, stride, dilation, activation + """Initialize Depth-wise convolution with given parameters.""" + super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), d=d, act=act) + +class DSConv(nn.Module): + """Depthwise Separable Convolution""" + def __init__(self, c1, c2, k=1, s=1, d=1, act=True) -> None: + super().__init__() + + self.dwconv = DWConv(c1, c1, 3) + self.pwconv = Conv(c1, c2, 1) + + def forward(self, x): + return self.pwconv(self.dwconv(x)) + +class DWConvTranspose2d(nn.ConvTranspose2d): + """Depth-wise transpose convolution.""" + + def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0): # ch_in, ch_out, kernel, stride, padding, padding_out + """Initialize DWConvTranspose2d class with given parameters.""" + super().__init__(c1, c2, k, s, p1, p2, groups=math.gcd(c1, c2)) + + +class ConvTranspose(nn.Module): + """Convolution transpose 2d layer.""" + + default_act = nn.SiLU() # default activation + + def __init__(self, c1, c2, k=2, s=2, p=0, bn=True, act=True): + """Initialize ConvTranspose2d layer with batch normalization and activation function.""" + super().__init__() + self.conv_transpose = nn.ConvTranspose2d(c1, c2, k, s, p, bias=not bn) + self.bn = nn.BatchNorm2d(c2) if bn else nn.Identity() + self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() + + def forward(self, x): + """Applies transposed convolutions, batch normalization and activation to input.""" + return self.act(self.bn(self.conv_transpose(x))) + + def forward_fuse(self, x): + """Applies activation and convolution transpose operation to input.""" + return self.act(self.conv_transpose(x)) + +class GhostConv(nn.Module): + """Ghost Convolution https://github.com/huawei-noah/ghostnet.""" + + def __init__(self, c1, c2, k=1, s=1, g=1, act=True): + """Initializes Ghost Convolution module with primary and cheap operations for efficient feature learning.""" + super().__init__() + c_ = c2 // 2 # hidden channels + self.cv1 = Conv(c1, c_, k, s, None, g, act=act) + self.cv2 = Conv(c_, c_, 5, 1, None, c_, act=act) + + def forward(self, x): + """Forward propagation through a Ghost Bottleneck layer with skip connection.""" + y = self.cv1(x) + return torch.cat((y, self.cv2(y)), 1) + + +class RepConv(nn.Module): + """ + RepConv is a basic rep-style block, including training and deploy status. + + This module is used in RT-DETR. 
+ Based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py + """ + + default_act = nn.SiLU() # default activation + + def __init__(self, c1, c2, k=3, s=1, p=1, g=1, d=1, act=True, bn=False, deploy=False): + """Initializes Light Convolution layer with inputs, outputs & optional activation function.""" + super().__init__() + assert k == 3 and p == 1 + self.g = g + self.c1 = c1 + self.c2 = c2 + self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() + + self.bn = nn.BatchNorm2d(num_features=c1) if bn and c2 == c1 and s == 1 else None + self.conv1 = Conv(c1, c2, k, s, p=p, g=g, act=False) + self.conv2 = Conv(c1, c2, 1, s, p=(p - k // 2), g=g, act=False) + + def forward_fuse(self, x): + """Forward process.""" + return self.act(self.conv(x)) + + def forward(self, x): + """Forward process.""" + id_out = 0 if self.bn is None else self.bn(x) + return self.act(self.conv1(x) + self.conv2(x) + id_out) + + def get_equivalent_kernel_bias(self): + """Returns equivalent kernel and bias by adding 3x3 kernel, 1x1 kernel and identity kernel with their biases.""" + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2) + kernelid, biasid = self._fuse_bn_tensor(self.bn) + return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid + + @staticmethod + def _pad_1x1_to_3x3_tensor(kernel1x1): + """Pads a 1x1 tensor to a 3x3 tensor.""" + if kernel1x1 is None: + return 0 + else: + return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch): + """Generates appropriate kernels and biases for convolution by fusing branches of the neural network.""" + if branch is None: + return 0, 0 + if isinstance(branch, Conv): + kernel = branch.conv.weight + running_mean = branch.bn.running_mean + running_var = branch.bn.running_var + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn.eps + elif isinstance(branch, nn.BatchNorm2d): + if not hasattr(self, "id_tensor"): + input_dim = self.c1 // self.g + kernel_value = np.zeros((self.c1, input_dim, 3, 3), dtype=np.float32) + for i in range(self.c1): + kernel_value[i, i % input_dim, 1, 1] = 1 + self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device) + kernel = self.id_tensor + running_mean = branch.running_mean + running_var = branch.running_var + gamma = branch.weight + beta = branch.bias + eps = branch.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + def convert_to_deploy(self): + """Combines two convolution layers into a single layer and removes unused attributes from the class.""" + if hasattr(self, "conv"): + return + kernel, bias = self.get_equivalent_kernel_bias() + self.conv = nn.Conv2d( + in_channels=self.conv1.conv.in_channels, + out_channels=self.conv1.conv.out_channels, + kernel_size=self.conv1.conv.kernel_size, + stride=self.conv1.conv.stride, + padding=self.conv1.conv.padding, + dilation=self.conv1.conv.dilation, + groups=self.conv1.conv.groups, + bias=True, + ).requires_grad_(False) + self.conv.weight.data = kernel + self.conv.bias.data = bias + for para in self.parameters(): + para.detach_() + self.__delattr__("conv1") + self.__delattr__("conv2") + if hasattr(self, "nm"): + self.__delattr__("nm") + if hasattr(self, "bn"): + self.__delattr__("bn") + if hasattr(self, "id_tensor"): + self.__delattr__("id_tensor") + self.forward = self.forward_fuse \ No newline at end of file 
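A minimal usage sketch for the PSA/C2PSA blocks added above: both keep the spatial size and channel count of their input, splitting channels once and running multi-head attention on one half. The import path below is an assumption (the classes are presumed to live in engine/extre_module/ultralytics_nn/block.py).

import torch
from engine.extre_module.ultralytics_nn.block import PSA, C2PSA  # assumed import path

x = torch.randn(1, 256, 20, 20)        # a P5-level feature map
psa = PSA(256, 256, e=0.5)             # hidden width 128 -> 2 attention heads (128 // 64)
c2psa = C2PSA(256, 256, n=2, e=0.5)    # same channel split, with 2 stacked PSABlock modules
print(psa(x).shape, c2psa(x).shape)    # both stay (1, 256, 20, 20)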
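A similar sketch for the area-attention path: A2C2f requires its hidden width (c2 * e) to be a multiple of 32, H * W must be divisible by area, and AAttn only takes the flash-attention branch on CUDA when FLASH_ATTN_FLAG is set, so on CPU the explicit softmax fallback is what runs. This assumes the optional flash-attn import at the top of block.py is guarded and the same import path as above.

import torch
from engine.extre_module.ultralytics_nn.block import A2C2f  # assumed import path

m = A2C2f(512, 512, n=1, a2=True, area=4, residual=True, mlp_ratio=2.0, e=0.5)
x = torch.randn(1, 512, 32, 32)   # 32 * 32 = 1024 tokens, divisible by area=4
y = m(x)                          # on CPU this exercises the manual-softmax branch of AAttn
print(y.shape)                    # torch.Size([1, 512, 32, 32])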
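The C3_Block / C2f_Block / C3k_Block / C3k2_Block wrappers take their inner unit as a functools.partial, so one CSP skeleton can be reused with different micro-blocks. A hedged example, assuming Bottleneck is defined (or imported) earlier in the same block.py:

from functools import partial
import torch
from engine.extre_module.ultralytics_nn.block import C2f_Block, Bottleneck  # assumed import path

blk = C2f_Block(128, 128, module=partial(Bottleneck, k=(3, 3), shortcut=True, e=0.5), n=2, e=0.5)
print(blk(torch.randn(1, 128, 40, 40)).shape)   # torch.Size([1, 128, 40, 40])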
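For the re-parameterizable RepConv added in conv.py, convert_to_deploy() folds the 3x3, 1x1 and identity-BN branches into a single 3x3 convolution, so the fused forward should match the training-time forward to numerical tolerance. A small sanity check (import path assumed as above):

import torch
from engine.extre_module.ultralytics_nn.conv import RepConv  # assumed import path

m = RepConv(64, 64, k=3, s=1, bn=True).eval()   # identity BN branch is built only when c1 == c2 and s == 1
x = torch.randn(1, 64, 40, 40)
with torch.no_grad():
    y_train = m(x)            # three-branch forward: 3x3 + 1x1 + identity BN
    m.convert_to_deploy()     # fuse the branches into one 3x3 conv
    y_deploy = m(x)
print(torch.allclose(y_train, y_deploy, atol=1e-4))   # expected: True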
diff --git a/engine/extre_module/utils.py b/engine/extre_module/utils.py new file mode 100644 index 00000000..9e937c21 --- /dev/null +++ b/engine/extre_module/utils.py @@ -0,0 +1,184 @@ +import os, contextlib, platform +import matplotlib.pyplot as plt +from pathlib import Path +from tqdm import tqdm as tqdm_original + +RANK = int(os.getenv("RANK", -1)) +VERBOSE = str(os.getenv("DEIM_VERBOSE", True)).lower() == "true" # global verbose mode +TQDM_BAR_FORMAT = "{l_bar}{bar:10}{r_bar}" if VERBOSE else None # tqdm bar format +MACOS, LINUX, WINDOWS = (platform.system() == x for x in ["Darwin", "Linux", "Windows"]) # environment booleans + +def emojis(string=""): + """Return platform-dependent emoji-safe version of string.""" + return string.encode().decode("ascii", "ignore") if WINDOWS else string + +class TryExcept(contextlib.ContextDecorator): + """ + Ultralytics TryExcept class. Use as @TryExcept() decorator or 'with TryExcept():' context manager. + + Examples: + As a decorator: + >>> @TryExcept(msg="Error occurred in func", verbose=True) + >>> def func(): + >>> # Function logic here + >>> pass + + As a context manager: + >>> with TryExcept(msg="Error occurred in block", verbose=True): + >>> # Code block here + >>> pass + """ + + def __init__(self, msg="", verbose=True): + """Initialize TryExcept class with optional message and verbosity settings.""" + self.msg = msg + self.verbose = verbose + + def __enter__(self): + """Executes when entering TryExcept context, initializes instance.""" + pass + + def __exit__(self, exc_type, value, traceback): + """Defines behavior when exiting a 'with' block, prints error message if necessary.""" + if self.verbose and value: + print(emojis(f"{self.msg}{': ' if self.msg else ''}{value}")) + return True + +def plt_settings(rcparams=None, backend="Agg"): + """ + Decorator to temporarily set rc parameters and the backend for a plotting function. + + Example: + decorator: @plt_settings({"font.size": 12}) + context manager: with plt_settings({"font.size": 12}): + + Args: + rcparams (dict): Dictionary of rc parameters to set. + backend (str, optional): Name of the backend to use. Defaults to 'Agg'. + + Returns: + (Callable): Decorated function with temporarily set rc parameters and backend. This decorator can be + applied to any function that needs to have specific matplotlib rc parameters and backend for its execution. + """ + if rcparams is None: + rcparams = {"font.size": 11} + + def decorator(func): + """Decorator to apply temporary rc parameters and backend to a function.""" + + def wrapper(*args, **kwargs): + """Sets rc parameters and backend, calls the original function, and restores the settings.""" + original_backend = plt.get_backend() + switch = backend.lower() != original_backend.lower() + if switch: + plt.close("all") # auto-close()ing of figures upon backend switching is deprecated since 3.8 + plt.switch_backend(backend) + + # Plot with backend and always revert to original backend + try: + with plt.rc_context(rcparams): + result = func(*args, **kwargs) + finally: + if switch: + plt.close("all") + plt.switch_backend(original_backend) + return result + + return wrapper + + return decorator + +def increment_path(path, exist_ok=False, sep="", mkdir=False): + """ + Increments a file or directory path, i.e., runs/exp --> runs/exp{sep}2, runs/exp{sep}3, ... etc. + + If the path exists and `exist_ok` is not True, the path will be incremented by appending a number and `sep` to + the end of the path. 
If the path is a file, the file extension will be preserved. If the path is a directory, the + number will be appended directly to the end of the path. If `mkdir` is set to True, the path will be created as a + directory if it does not already exist. + + Args: + path (str | pathlib.Path): Path to increment. + exist_ok (bool): If True, the path will not be incremented and returned as-is. + sep (str): Separator to use between the path and the incrementation number. + mkdir (bool): Create a directory if it does not exist. + + Returns: + (pathlib.Path): Incremented path. + + Examples: + Increment a directory path: + >>> from pathlib import Path + >>> path = Path("runs/exp") + >>> new_path = increment_path(path) + >>> print(new_path) + runs/exp2 + + Increment a file path: + >>> path = Path("runs/exp/results.txt") + >>> new_path = increment_path(path) + >>> print(new_path) + runs/exp/results2.txt + """ + path = Path(path) # os-agnostic + if path.exists() and not exist_ok: + path, suffix = (path.with_suffix(""), path.suffix) if path.is_file() else (path, "") + + # Method 1 + for n in range(2, 9999): + p = f"{path}{sep}{n}{suffix}" # increment path + if not os.path.exists(p): + break + path = Path(p) + + if mkdir: + path.mkdir(parents=True, exist_ok=True) # make directory + + return path + +class TQDM(tqdm_original): + """ + A custom TQDM progress bar class that extends the original tqdm functionality. + + This class modifies the behavior of the original tqdm progress bar based on global settings and provides + additional customization options. + + Attributes: + disable (bool): Whether to disable the progress bar. Determined by the global VERBOSE setting and + any passed 'disable' argument. + bar_format (str): The format string for the progress bar. Uses the global TQDM_BAR_FORMAT if not + explicitly set. + + Methods: + __init__: Initializes the TQDM object with custom settings. + + Examples: + >>> from ultralytics.utils import TQDM + >>> for i in TQDM(range(100)): + ... # Your processing code here + ... pass + """ + + def __init__(self, *args, **kwargs): + """ + Initializes a custom TQDM progress bar. + + This class extends the original tqdm class to provide customized behavior for Ultralytics projects. + + Args: + *args (Any): Variable length argument list to be passed to the original tqdm constructor. + **kwargs (Any): Arbitrary keyword arguments to be passed to the original tqdm constructor. + + Notes: + - The progress bar is disabled if VERBOSE is False or if 'disable' is explicitly set to True in kwargs. + - The default bar format is set to TQDM_BAR_FORMAT unless overridden in kwargs. + + Examples: + >>> from ultralytics.utils import TQDM + >>> for i in TQDM(range(100)): + ... # Your code here + ... 
pass + """ + # kwargs["disable"] = not VERBOSE or kwargs.get("disable", False) # logical 'and' with default value if passed + # kwargs.setdefault("bar_format", TQDM_BAR_FORMAT) # override default value if passed + super().__init__(*args, **kwargs) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 26cd97f2..01e0b1a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -torch>=2.0.1 -torchvision>=0.15.2 +# torch>=2.0.1 +# torchvision>=0.15.2 faster-coco-eval>=1.6.5 PyYAML tensorboard diff --git a/train.py b/train.py index 35e46eb1..14b5e030 100644 --- a/train.py +++ b/train.py @@ -7,6 +7,8 @@ """ import os +# os.environ["CUDA_VISIBLE_DEVICES"] = "0" +# os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" import sys sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')) diff --git a/train.sh b/train.sh new file mode 100755 index 00000000..9f121b37 --- /dev/null +++ b/train.sh @@ -0,0 +1,3 @@ +#CUDA_VISIBLE_DEVICES=0 torchrun --master_port=7777 --nproc_per_node=1 train.py -c configs/deim_dfine/deim_hgnetv2_s_visdrone.yml --seed=0 + +CUDA_VISIBLE_DEVICES=0 torchrun --master_port=7777 --nproc_per_node=1 train.py -c configs/dfine/dfine_hgnetv2_n_mal_custom.yml --seed=0 \ No newline at end of file