Changes from all commits
196 commits
b636310
add /flush_cache (#1108)
shihaobai Nov 14, 2025
60c379e
Aborted reqs (#1113)
shihaobai Nov 18, 2025
4095831
flush cache mulit node (#1116)
shihaobai Nov 19, 2025
ca9325f
[bugfix]: flush cache in single node (#1118)
shihaobai Nov 19, 2025
9948925
add pause and continue (#1120)
shihaobai Nov 19, 2025
4b32287
add launch_server and StartArgs (#1119)
sufubao Nov 21, 2025
27abcf5
Update weight (#1127)
kingder Dec 1, 2025
c210c82
release and resume (#1122)
shihaobai Dec 1, 2025
094df8c
use portpicker (#1142)
sufubao Dec 8, 2025
560be02
Rl weight (#1143)
shihaobai Dec 8, 2025
3d225d7
add_cli
sufubao Nov 25, 2025
499074a
add 30b moe configs
shihaobai Dec 8, 2025
f737585
update requirement
shihaobai Dec 9, 2025
8a67a47
add-neo-chat
Dec 26, 2025
fdc1369
add-neo-chat
Dec 30, 2025
e8e7416
add-neo-chat
Dec 31, 2025
ba44983
add-neo-chat
Dec 31, 2025
4d41a33
add-neo-chat
Dec 31, 2025
0e8845c
fix-neo-chat
Jan 1, 2026
b48cd49
fix-neo-chat-position-ids-h
Jan 5, 2026
7a904f3
add-neo-chat-dense
Jan 6, 2026
4b757dd
add-neo-chat-dense
Jan 6, 2026
e208733
support verl.
Jan 8, 2026
245357c
improve0108
Jan 8, 2026
6503ac8
add min/max pixels sampling parameters
Jan 8, 2026
07df460
fix fused_moe not installed use pip.
Jan 12, 2026
a6f00fb
add visual nccl port alloc
shihaobai Jan 15, 2026
9360197
fix0115
Jan 15, 2026
920a741
fix0115
Jan 15, 2026
3aa5e18
fp8 online quant for moe
shihaobai Jan 16, 2026
7cb890b
hotfix for fa3 of llama
shihaobai Jan 16, 2026
c242a75
fp8w8a8 triton config
shihaobai Jan 19, 2026
a0195aa
fp16 config
shihaobai Jan 19, 2026
7f0c437
release ipc tensor early.
Jan 21, 2026
5738d9e
bugfix: fix flattened_bucket update weights
yqyao Jan 21, 2026
e11bf58
bugfix: fix update_weights from tensor
yqyao Jan 22, 2026
f767609
merge main
shihaobai Jan 28, 2026
ce76f8a
fix start
shihaobai Jan 29, 2026
45259ec
add-merge-kv-mode
Jan 29, 2026
da3b53d
add-neo-chat0129
Jan 29, 2026
1e066d0
Merge branch 'add-neo-chat-rebase' into rl_verl
Jan 29, 2026
043e898
moe fused weight
shihaobai Jan 30, 2026
52085a4
Merge branch 'rl_verl_rebase' of https://github.com/ModelTC/lightllm …
shihaobai Jan 30, 2026
80cfcc4
fix neo
shihaobai Jan 30, 2026
6bbdb4f
fix launch
shihaobai Jan 30, 2026
e436ba5
fix launch
shihaobai Jan 30, 2026
aef65bc
fix tp slice for merged moe weight
shihaobai Jan 30, 2026
bc87692
fix fusemoe weight
shihaobai Jan 30, 2026
cf5bcbf
fa3 for neo
shihaobai Jan 30, 2026
a23288b
fix dead visual process
shihaobai Jan 30, 2026
f558540
auto visual dp
shihaobai Jan 30, 2026
12c6c6b
fix format
shihaobai Jan 30, 2026
fd91cad
fix decode scale
Feb 2, 2026
2681263
add new mode support text_ids+image_ids
Feb 2, 2026
fd17aa0
add new mode support text_ids+image_ids
Feb 2, 2026
e516bd9
add cuda empty cache
shihaobai Feb 2, 2026
81a0c12
add invalid token ids to sampling_param for rl training
shihaobai Feb 2, 2026
14132d5
add unitest for apply_invalid_tokens
shihaobai Feb 2, 2026
ed41960
add gc collect
shihaobai Feb 3, 2026
706ae2e
logit_bias
shihaobai Feb 3, 2026
f432f5a
logit_bias
shihaobai Feb 3, 2026
92bf83a
Merge branch 'main' into rl_verl_rebase
shihaobai Feb 3, 2026
8f8ed44
merge main
shihaobai Feb 4, 2026
cac2edf
neo moe inferece speedup
shihaobai Feb 6, 2026
02078ad
port random generate
shihaobai Feb 9, 2026
68954b0
feat: add MoE expert routing capture for R3 rollout replay
sufubao Feb 9, 2026
3569d53
fix
sufubao Feb 9, 2026
fe54253
add node-id for env_utils
shihaobai Feb 9, 2026
92470f7
Merge branch 'rl_verl_rebase' of https://github.com/ModelTC/lightllm …
shihaobai Feb 9, 2026
8eead2b
Revert "add node-id for env_utils"
sufubao Feb 9, 2026
27f9e87
Revert "port random generate"
sufubao Feb 9, 2026
6fa8f74
add assert none
shihaobai Feb 9, 2026
bf83078
set_unique_server_name
shihaobai Feb 10, 2026
3eab5a7
fix return_routed_experts
sufubao Feb 10, 2026
14cfc95
fix r3
sufubao Feb 10, 2026
e8ed8b5
add-neo++
Feb 12, 2026
77b73c2
feat: add Qwen3Next linear attention model support
sufubao Feb 19, 2026
a4ab210
refactor: simplify mamba buffer copy and integrate Triton kernels
sufubao Feb 20, 2026
1686d34
fix conv3d
sufubao Feb 21, 2026
dd9b611
[draft] qwen3.5 dense
sufubao Feb 26, 2026
6a3a17c
split dense and moe
sufubao Feb 26, 2026
e1cdfb4
feat: add mamba_cache_ratio for automatic memory allocation
sufubao Feb 26, 2026
f2e148e
refactor: simplify mamba_cache_ratio to direct percentage
sufubao Feb 26, 2026
b4fe201
add H100 config
sufubao Feb 26, 2026
e2ce9c0
refactor: align radix_cache_class with infer_state_class style
sufubao Feb 27, 2026
b1adbf3
fix: add missing attention_chunk param to flashattention_nopad.py
sufubao Feb 27, 2026
c744ebd
refactor: clarify naming in mamba_buffer_copy
sufubao Feb 27, 2026
9def697
clean
sufubao Feb 27, 2026
2b3deb8
fix
sufubao Feb 27, 2026
61f8945
clean
sufubao Feb 27, 2026
f7280a3
split
sufubao Feb 27, 2026
86d3bfb
Merge origin/qwen3.5_clean into rl_verl_qwen35
sufubao Feb 28, 2026
c05838e
fix: lazy-initialize SHM name constants to avoid import-time crash
sufubao Feb 28, 2026
243c6a0
fix: revert weight slicing and rmsnorm precision regressions
sufubao Feb 28, 2026
711e30c
fix
sufubao Feb 28, 2026
7734c21
feat: add Qwen3Next linear attention model support
sufubao Feb 19, 2026
c757b06
refactor: simplify mamba buffer copy and integrate Triton kernels
sufubao Feb 20, 2026
340d11c
fix conv3d
sufubao Feb 21, 2026
a6a2435
[draft] qwen3.5 dense
sufubao Feb 26, 2026
054035d
split dense and moe
sufubao Feb 26, 2026
01b112a
feat: add mamba_cache_ratio for automatic memory allocation
sufubao Feb 26, 2026
174757d
refactor: simplify mamba_cache_ratio to direct percentage
sufubao Feb 26, 2026
dd2516e
add H100 config
sufubao Feb 26, 2026
326ae22
refactor: align radix_cache_class with infer_state_class style
sufubao Feb 27, 2026
e996cd2
fix: add missing attention_chunk param to flashattention_nopad.py
sufubao Feb 27, 2026
5e5cdbe
refactor: clarify naming in mamba_buffer_copy
sufubao Feb 27, 2026
9cf783c
clean
sufubao Feb 27, 2026
e120edb
fix
sufubao Feb 27, 2026
f3330cf
clean
sufubao Feb 27, 2026
d030a67
split
sufubao Feb 27, 2026
e1f6129
style: apply black formatting to mamba_buffer_copy
sufubao Mar 1, 2026
74f82d1
perf: add autotune configs for mamba_buffer_copy/fork kernels on H200
sufubao Mar 1, 2026
c1ea769
refactor: rename buffer copy methods for clarity
sufubao Mar 2, 2026
b81baaa
clean the code
sufubao Mar 2, 2026
52b422a
vlm tokenizer support token list
shihaobai Mar 2, 2026
aa442a4
fix
shihaobai Mar 2, 2026
0fd0202
clean code
sufubao Mar 2, 2026
eed0a9c
qwen35 qkv improve
shihaobai Mar 6, 2026
b9a386e
code simplify
shihaobai Mar 9, 2026
86f17b6
clean code
shihaobai Mar 9, 2026
a1849e6
fix
shihaobai Mar 13, 2026
61f74ac
remove contiguous
shihaobai Mar 16, 2026
bf0f254
remove gemma rms norm config
shihaobai Mar 16, 2026
76782c2
clean code
sufubao Mar 17, 2026
fdd2052
add get_radix_class
sufubao Mar 17, 2026
733e851
fix acc of mamba cache
shihaobai Mar 17, 2026
b1f8233
fix acc of mamba cache
shihaobai Mar 17, 2026
90120b0
fix warmup
shihaobai Mar 17, 2026
4ef6091
merge main
shihaobai Mar 18, 2026
13edba2
simplify the qwen3next layer_infer
shihaobai Mar 18, 2026
ec499ce
openai api simplify
shihaobai Mar 18, 2026
3c8597d
simplify mem manager
shihaobai Mar 18, 2026
20edcc1
slime code
shihaobai Mar 19, 2026
eed9863
remove mtp of base_backend
shihaobai Mar 19, 2026
90df4f1
slime mode_backend
shihaobai Mar 19, 2026
3b832af
merge qwen3.5 and main
shihaobai Mar 19, 2026
91edf3b
fix invalid memory of release_memory
shihaobai Mar 19, 2026
711667a
flush_cache for hybrid cache
shihaobai Mar 19, 2026
b181c0a
fix rpyc
shihaobai Mar 19, 2026
c6a6dda
fix: node is None
sufubao Mar 20, 2026
ee3a7d5
fix resume invalid memory
shihaobai Mar 20, 2026
32d795d
fix reqs queue
shihaobai Mar 20, 2026
a0937a9
fix
shihaobai Mar 20, 2026
1de0e53
fix
shihaobai Mar 20, 2026
2dbd2f7
pop weight after load
shihaobai Mar 20, 2026
33bbfda
async update weight
shihaobai Mar 20, 2026
6017484
model.norm.weight: add 1 during runtime
shihaobai Mar 23, 2026
b98f6d7
fix r3
sufubao Mar 23, 2026
c0cebba
fix qwen35 nrom
sufubao Mar 23, 2026
5f4fa78
Revert "fix qwen35 nrom"
sufubao Mar 23, 2026
64506b3
fix
shihaobai Mar 23, 2026
9da13c1
Merge branch 'qwen3.5_clean' of https://github.com/ModelTC/lightllm i…
shihaobai Mar 23, 2026
73b10ca
remove unused log
shihaobai Mar 23, 2026
1d46601
Merge branch 'rl_verl_qwen35' of https://github.com/ModelTC/lightllm …
shihaobai Mar 23, 2026
b2ab0bf
Merge remote-tracking branch 'origin/main' into qwen3.5_clean
sufubao Mar 23, 2026
a93509f
fix mamba_len
shihaobai Mar 23, 2026
1115543
fix
sufubao Mar 23, 2026
d562b7b
fix
sufubao Mar 23, 2026
8f11f08
fix and remove unused code
shihaobai Mar 23, 2026
709075a
Merge branch 'qwen3.5_clean' of https://github.com/ModelTC/lightllm i…
shihaobai Mar 23, 2026
267412d
fix format
shihaobai Mar 23, 2026
f7bee08
gatermsnorm weight and mamba profile_size
shihaobai Mar 23, 2026
b85b6ca
simpliy code
shihaobai Mar 24, 2026
3965845
update tp param
shihaobai Mar 24, 2026
ef41d77
fix: restore tool_calls arguments JSON string to dict conversion
sufubao Mar 24, 2026
7d0458f
fix: restore tool_calls arguments JSON string to dict conversion
sufubao Mar 24, 2026
8f1212a
fix build_prompt too
sufubao Mar 25, 2026
77bfcba
fix
sufubao Mar 25, 2026
3585432
fix buffer idx
shihaobai Mar 26, 2026
334e3c4
fix
shihaobai Mar 26, 2026
2f34bac
merge the update of qwen3.5_clean
shihaobai Mar 26, 2026
0974ba9
fix
shihaobai Mar 27, 2026
fe91aa3
add instance_id with improved robustness and code quality
sufubao Mar 31, 2026
f4a0cb7
fix: occasional accuracy drop in rollout
shihaobai Apr 1, 2026
f4caa8f
reset req manager
shihaobai Apr 1, 2026
8794f43
fix typo
shihaobai Apr 2, 2026
1abf95a
add fp8 rl for qwen35
shihaobai Apr 10, 2026
901bd13
fix abort
shihaobai Apr 15, 2026
8de8baf
add logs for detoken
shihaobai Apr 16, 2026
2dc39fa
fix decode overflow
shihaobai Apr 17, 2026
8c20369
fix bytes decode
shihaobai Apr 18, 2026
6cd300c
merge main
shihaobai May 8, 2026
1f466c7
remove neo
shihaobai May 8, 2026
9e54f20
remove unused code
shihaobai May 8, 2026
46d2ee2
remove unused code
shihaobai May 8, 2026
f2c1a3e
remove unused code
shihaobai May 8, 2026
e7c1475
slime code
shihaobai May 8, 2026
1ecf015
slim code
shihaobai May 8, 2026
a93dcb6
slime code
shihaobai May 9, 2026
ccc8832
slime radix cache
shihaobai May 9, 2026
11ea37a
slime radixcache
shihaobai May 9, 2026
f446e5b
slim code
shihaobai May 9, 2026
998020a
remove unused code
shihaobai May 9, 2026
5a745e5
fix
shihaobai May 9, 2026
90ed556
lazy init cache dir
shihaobai May 9, 2026
eb42e5b
fix linear flush_cache
shihaobai May 9, 2026
1 change: 1 addition & 0 deletions .gitignore
@@ -7,3 +7,4 @@ dist
.vscode
tmp/
requirements-musa.txt
CLAUDE.md
104 changes: 91 additions & 13 deletions lightllm/common/basemodel/basemodel.py
@@ -7,7 +7,7 @@
import torch
import torch.nn.functional as F
import triton
from typing import final, List
from typing import final, List, Optional
from tqdm import tqdm

from lightllm.common.basemodel.layer_weights.hf_load_utils import load_hf_weights
@@ -33,6 +33,10 @@
from lightllm.utils.envs_utils import set_model_init_status, enable_diverse_mode_gqa_decode_fast_kernel
from lightllm.common.triton_utils.autotuner import Autotuner
from lightllm.utils.infer_utils import post_empty_cache
from lightllm.utils.torch_memory_saver_utils import (
TorchMemorySaverWrapper,
MemoryTag,
)
from .attention import get_prefill_att_backend_class, get_decode_att_backend_class
from .attention import BaseAttBackend

@@ -91,6 +95,7 @@ def __init__(self, kvargs):
self.tp_world_size_ = get_dp_world_size()
self.enable_tpsp_mix_mode = get_env_start_args().enable_tpsp_mix_mode

self.torch_memory_saver = TorchMemorySaverWrapper(self.args.enable_torch_memory_saver)
self.is_mtp_mode = self.args.mtp_mode in [
"vanilla_with_att",
"eagle_with_att",
@@ -104,19 +109,21 @@
self._verify_params()
self._init_quant()

self._init_weights()
self._init_req_manager()
self._init_mem_manager()
with self.torch_memory_saver.region(tag=MemoryTag.WEIGHT, enable_cpu_backup=self.args.enable_weight_cpu_backup):
self._init_weights()
with self.torch_memory_saver.region(tag=MemoryTag.KV_CACHE):
self._init_req_manager()
self._init_mem_manager()
self._init_kv_move_buffer()

# Linear-attention models such as qwen3.5 keep a large amount of runtime linear state in the req_manager,
# which can consume a lot of GPU memory, so req_manager.mem_manager is assigned only after the mem manager is initialized.
self.req_manager.mem_manager = self.mem_manager

self._init_kv_move_buffer()
self._check_mem_size()
self._init_infer_layer()
self._init_some_value()
self._init_custom()
self._load_hf_weights()
self.load_weights(self.weight_dict)
# The wait must come before cuda graph init to avoid erroneous capture into the graph.
self._wait_other_modules_ready()

@@ -181,17 +188,15 @@ def _init_weights(self, start_layer_index=0):
]
return

def _load_hf_weights(self):
def load_weights(self, weight_dict: dict):
assert weight_dict is None or isinstance(weight_dict, dict), "weight_dict must be a dict or None"
load_hf_weights(
self.data_type,
weight_dir=self.weight_dir_,
self.weight_dir_,
pre_post_layer=self.pre_post_weight,
transformer_layer_list=self.trans_layers_weight,
weight_dict=self.weight_dict,
weight_dict=weight_dict,
)
self.pre_post_weight.verify_load()
[weight.verify_load() for weight in self.trans_layers_weight]
return

def _init_mem_manager(self):
assert self.config["num_attention_heads"] % self.tp_world_size_ == 0
@@ -999,6 +1004,7 @@ def _check_max_len_infer(self):
)
logger.error(exception_str)
raise Exception(exception_str)
torch.cuda.empty_cache()
return

def autotune_layers(self):
@@ -1133,6 +1139,9 @@ def _init_padded_req(self):
del b_seq_len
del b_ready_cache_len
del model_output
del b_mtp_index
del b_prefill_start_loc
del b_q_seq_len
torch.cuda.empty_cache()
return

@@ -1153,3 +1162,72 @@ def _gen_special_model_input(self, token_num: int):
special_model_input["mtp_draft_input_hiddens"] = None

return special_model_input

def release_memory_occupation(self, tags: Optional[List[MemoryTag]]):
torch.cuda.synchronize()
if tags is None:
self.release_all()
return
if MemoryTag.WEIGHT in tags:
self.release_weight()
if MemoryTag.KV_CACHE in tags:
self.release_kv_cache()
if MemoryTag.GRAPH in tags:
self.release_graph()
return

def resume_memory_occupation(self, tags: Optional[List[MemoryTag]]):
if tags is None:
self.resume_all()
return
if MemoryTag.WEIGHT in tags:
self.resume_weight()
if MemoryTag.KV_CACHE in tags:
self.resume_kv_cache()
if MemoryTag.GRAPH in tags:
self.resume_graph()
return

def release_weight(self):
self.torch_memory_saver.pause(tag=MemoryTag.WEIGHT)
torch.cuda.empty_cache()
gc.collect()

def release_kv_cache(self):
self.torch_memory_saver.pause(tag=MemoryTag.KV_CACHE)
torch.cuda.empty_cache()
gc.collect()

def release_graph(self):
self.torch_memory_saver.pause(tag=MemoryTag.GRAPH)
torch.cuda.empty_cache()
gc.collect()

def release_all(self):
self.torch_memory_saver.pause(tag=MemoryTag.WEIGHT)
self.torch_memory_saver.pause(tag=MemoryTag.KV_CACHE)
self.torch_memory_saver.pause(tag=MemoryTag.GRAPH)
torch.cuda.empty_cache()
gc.collect()

def resume_weight(self):
torch.cuda.empty_cache()
gc.collect()
self.torch_memory_saver.resume(tag=MemoryTag.WEIGHT)

def resume_kv_cache(self):
torch.cuda.empty_cache()
gc.collect()
self.torch_memory_saver.resume(tag=MemoryTag.KV_CACHE)

def resume_graph(self):
torch.cuda.empty_cache()
gc.collect()
self.torch_memory_saver.resume(tag=MemoryTag.GRAPH)

def resume_all(self):
torch.cuda.empty_cache()
gc.collect()
self.torch_memory_saver.resume(tag=MemoryTag.WEIGHT)
self.torch_memory_saver.resume(tag=MemoryTag.KV_CACHE)
self.torch_memory_saver.resume(tag=MemoryTag.GRAPH)
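The release/resume API above is what the RL integration uses to hand GPU memory back and forth between rollout and training. Below is a minimal sketch (not part of the diff) of how it could be driven, assuming `model` is an instance of the base model class patched above; `load_new_weights` is a hypothetical callback supplied by the trainer and not part of this PR.

```python
from typing import Callable, Dict

import torch

from lightllm.utils.torch_memory_saver_utils import MemoryTag


def rl_weight_sync_step(model, load_new_weights: Callable[[], Dict[str, torch.Tensor]]):
    # Hand weights and KV cache back to the allocator so the trainer can use the GPU.
    model.release_memory_occupation(tags=[MemoryTag.WEIGHT, MemoryTag.KV_CACHE])

    # Hypothetical: obtain updated weights, e.g. broadcast from the training process.
    new_weight_dict = load_new_weights()

    # Map the paused regions back in, then reload weights through the new public hook.
    model.resume_memory_occupation(tags=[MemoryTag.WEIGHT, MemoryTag.KV_CACHE])
    model.load_weights(new_weight_dict)
```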
9 changes: 7 additions & 2 deletions lightllm/common/basemodel/cuda_graph.py
@@ -8,6 +8,10 @@
from lightllm.utils.envs_utils import get_env_start_args
from lightllm.distributed import dist_group_manager, lightllm_capture_graph, CustomProcessGroup
from lightllm.common.basemodel.batch_objs import ModelInput, ModelOutput
from lightllm.utils.torch_memory_saver_utils import (
TorchMemorySaverWrapper,
MemoryTag,
)
from .infer_struct import InferStateInfo


@@ -26,6 +30,7 @@ def __init__(self, max_batch_size=8, max_len_in_batch=8192, tp_world_size: int =
self.max_batch_size = max_batch_size
self.graph_max_len_in_batch = max_len_in_batch
self.enable_decode_microbatch_overlap = self.args.enable_decode_microbatch_overlap
self.torch_memory_saver = TorchMemorySaverWrapper(self.args.enable_torch_memory_saver)

# gen cuda graph batch_sizes
# cuda graph gen for batch size = [1, 2, 3, ..., graph_split_batch_size]
@@ -96,7 +101,7 @@ def _capture_decode(self, decode_func, infer_state: InferStateInfo):
delattr(infer_state, param_name)

with lightllm_capture_graph(dist_group):
with torch.cuda.graph(graph_obj, pool=self.mempool):
with self.torch_memory_saver.cuda_graph(graph_obj, pool=self.mempool):
model_output = decode_func(infer_state)
self.graph[batch_size] = (graph_obj, infer_state, model_output)
graph_obj.replay()
@@ -134,7 +139,7 @@ def _capture_decode_overlap(

with lightllm_capture_graph(dist_group1):
with lightllm_capture_graph(dist_group):
with torch.cuda.graph(graph_obj, pool=self.mempool):
with self.torch_memory_saver.cuda_graph(graph_obj, pool=self.mempool):
model_output, model_output1 = decode_func(infer_state, infer_state1)
self.graph[batch_size] = (
graph_obj,
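`TorchMemorySaverWrapper.cuda_graph` replaces the direct `torch.cuda.graph(...)` call sites above; the wrapper itself lives in `lightllm/utils/torch_memory_saver_utils.py`, which is not part of this diff. The following is a minimal sketch of the fallback behaviour those call sites imply; everything here other than the call signature is an assumption.

```python
import contextlib

import torch


class MemorySaverWrapperSketch:
    """Illustrative stand-in for TorchMemorySaverWrapper, not the real implementation."""

    def __init__(self, enabled: bool):
        self.enabled = enabled

    @contextlib.contextmanager
    def cuda_graph(self, graph_obj: torch.cuda.CUDAGraph, pool=None):
        if not self.enabled:
            # Fall back to the behaviour of the code this PR replaces.
            with torch.cuda.graph(graph_obj, pool=pool):
                yield
            return
        # When the saver is enabled, the capture would presumably also be registered under
        # MemoryTag.GRAPH (hypothetical here) so release_graph()/resume_graph() can later
        # pause and restore the graph's memory.
        with torch.cuda.graph(graph_obj, pool=pool):
            yield
```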
Changes in another file (file path not shown in this view)
@@ -33,6 +33,7 @@ def __init__(
num_fused_shared_experts: int = 0,
layer_num: int = 0,
network_config: Dict[str, Any] = None,
moe_layer_index: int = 0,
) -> None:
super().__init__(data_type=data_type)
self.w1_weight_name = gate_proj_name
@@ -50,6 +51,7 @@
self.enable_ep_moe = get_env_start_args().enable_ep_moe
self.n_routed_experts = n_routed_experts
self.num_fused_shared_experts = num_fused_shared_experts
self.moe_layer_index = moe_layer_index
self._init_config(network_config)
self._init_redundancy_expert_params()
self._init_parallel_params()
@@ -130,6 +132,7 @@ def experts(
topk_group: int,
num_expert_group: int,
is_prefill: Optional[bool] = None,
microbatch_index: int = 0,
) -> torch.Tensor:
"""Backward compatible method that routes to platform-specific implementation."""
return self.fuse_moe_impl(
@@ -145,6 +148,8 @@
topk_group=topk_group,
num_expert_group=num_expert_group,
is_prefill=is_prefill,
moe_layer_index=self.moe_layer_index,
microbatch_index=microbatch_index,
)

def low_latency_dispatch(
@@ -295,6 +300,7 @@ def _create_weight(self):
device_id=self.device_id_,
num_experts=self.local_n_routed_experts,
)
self.w1, self.w3 = w13_param_list
self.w1_list: List[WeightPack] = self._get_expert_weight_list(w13_param_list[0])
self.w3_list: List[WeightPack] = self._get_expert_weight_list(w13_param_list[1])
self.w2_list: List[WeightPack] = self._get_expert_weight_list(self.w2)
@@ -307,7 +313,8 @@ def _get_expert_weight_list(self, weight_pack: WeightPack):
return weight_list

def _load_weight(self, expert_idx_to_local_idx: Dict[int, int], weights: Dict[str, torch.Tensor]):

# for merged weights
self._load_merge_weight(weights)
# Load each expert with TP slicing
for expert_idx, local_expert_idx in expert_idx_to_local_idx.items():
with self.lock:
@@ -332,6 +339,7 @@ def _load_expert(
w1_weight = f"{self.weight_prefix}.{expert_idx}.{self.w1_weight_name}.{self.quant_method.weight_suffix}"
w2_weight = f"{self.weight_prefix}.{expert_idx}.{self.w2_weight_name}.{self.quant_method.weight_suffix}"
w3_weight = f"{self.weight_prefix}.{expert_idx}.{self.w3_weight_name}.{self.quant_method.weight_suffix}"

row_slice_func = self.row_slicer._slice_weight
col_slice_func = self.col_slicer._slice_weight
if w1_weight in weights:
@@ -341,6 +349,19 @@
if w2_weight in weights:
self.quant_method.load_weight(col_slice_func(weights[w2_weight]), self.w2_list[local_expert_idx])

def _load_merge_weight(self, weights: Dict[str, torch.Tensor]):
w1_merge_weight = f"{self.weight_prefix}.{self.w1_weight_name}"
w2_merge_weight = f"{self.weight_prefix}.{self.w2_weight_name}"
w3_merge_weight = f"{self.weight_prefix}.{self.w3_weight_name}"
row_slice_func = self.row_slicer._slice_weight
col_slice_func = self.col_slicer._slice_weight
if w1_merge_weight in weights:
self.quant_method.load_weight(row_slice_func(weights[w1_merge_weight]), self.w1)
if w2_merge_weight in weights:
self.quant_method.load_weight(col_slice_func(weights[w2_merge_weight]), self.w2)
if w3_merge_weight in weights:
self.quant_method.load_weight(row_slice_func(weights[w3_merge_weight]), self.w3)

def _load_expert_scale(
self,
expert_idx: int,
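The new `_load_merge_weight` path accepts checkpoints where each layer's expert projections are already fused into a single tensor, alongside the existing per-expert layout handled by `_load_expert`. A small illustration of the two key patterns the loader now matches; the concrete prefix below is a hypothetical example, not taken from this diff.

```python
# Per-expert layout, handled by _load_expert (note the expert index and quant weight suffix):
#   f"{weight_prefix}.{expert_idx}.{w1_weight_name}.{quant_method.weight_suffix}"
per_expert_key = "model.layers.3.mlp.experts.7.gate_proj.weight"

# Merged layout, handled by _load_merge_weight (no expert index, no suffix):
#   f"{weight_prefix}.{w1_weight_name}"
merged_key = "model.layers.3.mlp.experts.gate_proj"
```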
Changes in another file (file path not shown in this view)
@@ -8,6 +8,7 @@
from lightllm.common.quantization import Quantcfg
from lightllm.common.quantization.quantize_method import QuantizationMethod
from lightllm.utils.log_utils import init_logger
from lightllm.common.basemodel import routing_manager as _routing_mgr

logger = init_logger(__name__)

@@ -46,6 +47,7 @@ def __init__(
num_fused_shared_experts: int = 0,
layer_num: int = 0,
network_config: Dict[str, Any] = None,
moe_layer_index: int = 0,
) -> None:
network_config["norm_topk_prob"] = None
super().__init__(
@@ -62,6 +64,7 @@
num_fused_shared_experts=num_fused_shared_experts,
layer_num=layer_num,
network_config=network_config,
moe_layer_index=moe_layer_index,
)

self.hidden_size = network_config["hidden_size"]
@@ -144,10 +147,15 @@ def experts(
topk_group: int,
num_expert_group: int,
is_prefill: Optional[bool] = None,
microbatch_index: int = 0,
):

topk_weights, topk_ids = self._router(router_logits, top_k)

# Rollout router replay
if _routing_mgr.g_routing_capture_manager is not None:
_routing_mgr.g_routing_capture_manager.capture(self.moe_layer_index, topk_ids, microbatch_index)

w1, w1_scale = self.w1
w2, w2_scale = self.w2
use_fp8_w8a8 = self.quant_method is not None
Changes in another file (file path not shown in this view)
@@ -62,5 +62,7 @@ def __call__(
topk_group: int,
num_expert_group: int,
is_prefill: Optional[bool] = None,
moe_layer_index: Optional[int] = None,
microbatch_index: int = 0,
) -> torch.Tensor:
pass
Changes in another file (file path not shown in this view)
@@ -3,6 +3,7 @@
from lightllm.common.quantization.no_quant import WeightPack
from lightllm.common.quantization.quantize_method import QuantizationMethod
from .base_impl import FuseMoeBaseImpl
from lightllm.common.basemodel import routing_manager as _routing_mgr


class FuseMoeTriton(FuseMoeBaseImpl):
@@ -125,6 +126,8 @@ def __call__(
topk_group: int,
num_expert_group: int,
is_prefill: Optional[bool] = None,
moe_layer_index: Optional[int] = None,
microbatch_index: int = 0,
):
topk_weights, topk_ids = self._select_experts(
input_tensor=input_tensor,
@@ -137,6 +140,10 @@
num_expert_group=num_expert_group,
scoring_func=scoring_func,
)

if _routing_mgr.g_routing_capture_manager is not None and moe_layer_index is not None:
_routing_mgr.g_routing_capture_manager.capture(moe_layer_index, topk_ids, microbatch_index)

output = self._fused_experts(
input_tensor=input_tensor,
w13=w13,
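`routing_manager` and its global `g_routing_capture_manager` are added elsewhere in this PR and are not shown in this diff. The following is a minimal sketch of the capture contract the call sites above imply for R3 rollout replay: a process-global object that records the selected expert ids per MoE layer and microbatch. Everything below except the `capture(moe_layer_index, topk_ids, microbatch_index)` signature is an assumption.

```python
from collections import defaultdict
from typing import Dict, List, Tuple

import torch


class RoutingCaptureManagerSketch:
    """Illustrative stand-in for the routing capture manager used by the hooks above."""

    def __init__(self):
        # (moe_layer_index, microbatch_index) -> captured top-k expert id tensors, in call order.
        self._records: Dict[Tuple[int, int], List[torch.Tensor]] = defaultdict(list)

    def capture(self, moe_layer_index: int, topk_ids: torch.Tensor, microbatch_index: int = 0):
        # Copy off-device so the record survives past the forward pass.
        self._records[(moe_layer_index, microbatch_index)].append(topk_ids.detach().cpu())

    def export(self) -> Dict[Tuple[int, int], List[torch.Tensor]]:
        # Hypothetical accessor the rollout replay side could read from.
        return dict(self._records)
```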