Skip to content

Commit dba4208

Browse files
committed
feat: Add PRESHARDED LoadFormat for zero-disk P2P RDMA weight loading
Add LoadFormat.PRESHARDED for loading model weights that are already sharded
per TP rank, enabling zero-disk P2P RDMA weight transfers where each MPI
worker receives only its own shard directly into GPU memory via ModelExpress.

Changes:
- llm_args.py: Add PRESHARDED = 3 to LoadFormat enum
- model_loader.py: PRESHARDED branch with _weights_presharded flag, publish
  hook before post_load_weights (auto-detect via MODEL_EXPRESS_URL)
- linear.py: Override tp_size to 1 when _weights_presharded=True
- worker.py: publish_from_worker hook in setup_engine (auto-detect)

Source publishes weights before post_load_weights so targets receive
pre-processed weights and run their own transforms independently.
Auto-detects source role when MODEL_EXPRESS_URL is set and
MODEL_EXPRESS_TARGET is not set.

Validated: Kimi K2.5 (TP=8, MoE, nvfp4) on GCP GB200 at 365-509 Gbps.

Signed-off-by: Kavin Krishnan <kavink@nvidia.com>
Made-with: Cursor
1 parent 889b81c commit dba4208

4 files changed

Lines changed: 81 additions & 14 deletions

File tree

tensorrt_llm/_torch/modules/linear.py

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -157,11 +157,17 @@ def maybe_convert_to_torch_tensor(
157157

158158

159159
def copy_weight(dst: Parameter, src: torch.Tensor):
    """Transfer ``src`` into the parameter ``dst``, avoiding copies when possible.

    Converts ``src`` to ``dst``'s dtype if they differ, then picks the
    cheapest transfer path:

    1. No-op when both tensors already share the same storage pointer
       (e.g., a transport such as NIXL wrote directly into the param buffer).
    2. Zero-copy pointer swap when ``src`` is already on the correct device
       with a matching shape and both tensors are contiguous. The swapped-in
       tensor is detached so the parameter never inherits the source's
       autograd graph; note the storage is shared afterwards, so the caller
       must not mutate ``src`` in place.
    3. Element-wise ``copy_`` otherwise.

    Args:
        dst: Destination parameter whose data is replaced or overwritten.
        src: Source tensor holding the weight values.
    """
    if dst.dtype != src.dtype:
        # ``to`` allocates a converted tensor; the original ``src`` is untouched.
        src = src.to(dst.dtype)
    assert dst.dtype == src.dtype, f"Incompatible dtype. dst: {dst.dtype}, src: {src.dtype}"
    if src.data_ptr() == dst.data_ptr():
        return  # Already in place (e.g., NIXL wrote directly into param buffer)
    if (src.device == dst.device and src.shape == dst.shape and src.is_contiguous()
            and dst.is_contiguous()):
        # Zero-copy pointer swap. ``detach()`` shares storage (still no copy)
        # but strips any autograd history the source tensor might carry.
        dst.data = src.detach()
    else:
        dst.data.copy_(src)
165171

166172

167173
def copy_weight_shard(dst: Parameter, src: torch.Tensor, shard_offset: int,
@@ -183,8 +189,10 @@ def load_weights_vanilla_helper(module: Linear,
183189
if module.bias is not None:
184190
assert "bias" in weights[0]
185191
device = torch.device('cuda')
192+
# Skip TP slicing for pre-sharded weights (e.g., from P2P RDMA)
193+
tp_size = 1 if getattr(module, '_weights_presharded', False) else module.tp_size
186194

187-
weight = load_weight_shard(weights[0]['weight'], module.tp_size,
195+
weight = load_weight_shard(weights[0]['weight'], tp_size,
188196
module.tp_rank, module.tp_mode,
189197
device) if "weight" in weights[0] else None
190198

@@ -201,7 +209,7 @@ def load_weights_vanilla_helper(module: Linear,
201209
copy_weight(module.weight, weight_transform(weight))
202210

203211
if module.bias is not None:
204-
bias = load_weight_shard(weights[0]['bias'], module.tp_size,
212+
bias = load_weight_shard(weights[0]['bias'], tp_size,
205213
module.tp_rank, module.tp_mode,
206214
device) if "bias" in weights[0] else None
207215
if bias is not None:
@@ -224,25 +232,27 @@ def load_weights_fused_qkv_helper(
224232
module, "fused_weight_shard_indices_mapping", None
225233
) is not None, "Fused weight shard indices mapping is required in partial loading"
226234
device = torch.device('cuda')
235+
# Skip TP slicing for pre-sharded weights (e.g., from P2P RDMA)
236+
tp_size = 1 if getattr(module, '_weights_presharded', False) else module.tp_size
227237

228-
q_weight = load_weight_shard(weights[0]['weight'], module.tp_size,
238+
q_weight = load_weight_shard(weights[0]['weight'], tp_size,
229239
module.tp_rank, module.tp_mode,
230240
device) if "weight" in weights[0] else None
231-
k_weight = load_weight_shard(weights[1]['weight'], module.tp_size,
241+
k_weight = load_weight_shard(weights[1]['weight'], tp_size,
232242
module.tp_rank, module.tp_mode,
233243
device) if "weight" in weights[1] else None
234-
v_weight = load_weight_shard(weights[2]['weight'], module.tp_size,
244+
v_weight = load_weight_shard(weights[2]['weight'], tp_size,
235245
module.tp_rank, module.tp_mode,
236246
device) if "weight" in weights[2] else None
237247

238248
if module.bias is not None:
239-
q_bias = load_weight_shard(weights[0]['bias'], module.tp_size,
249+
q_bias = load_weight_shard(weights[0]['bias'], tp_size,
240250
module.tp_rank, module.tp_mode,
241251
device) if "bias" in weights[0] else None
242-
k_bias = load_weight_shard(weights[1]['bias'], module.tp_size,
252+
k_bias = load_weight_shard(weights[1]['bias'], tp_size,
243253
module.tp_rank, module.tp_mode,
244254
device) if "bias" in weights[1] else None
245-
v_bias = load_weight_shard(weights[2]['bias'], module.tp_size,
255+
v_bias = load_weight_shard(weights[2]['bias'], tp_size,
246256
module.tp_rank, module.tp_mode,
247257
device) if "bias" in weights[2] else None
248258
if not allow_partial_loading:
@@ -277,18 +287,20 @@ def load_weights_fused_gate_up_helper(
277287
module, "fused_weight_shard_indices_mapping", None
278288
) is not None, "Fused weight shard indices mapping is required in partial loading"
279289
device = torch.device('cuda')
290+
# Skip TP slicing for pre-sharded weights (e.g., from P2P RDMA)
291+
tp_size = 1 if getattr(module, '_weights_presharded', False) else module.tp_size
280292

281-
gate_weight = load_weight_shard(weights[0]['weight'], module.tp_size,
293+
gate_weight = load_weight_shard(weights[0]['weight'], tp_size,
282294
module.tp_rank, module.tp_mode,
283295
device) if "weight" in weights[0] else None
284-
up_weight = load_weight_shard(weights[1]['weight'], module.tp_size,
296+
up_weight = load_weight_shard(weights[1]['weight'], tp_size,
285297
module.tp_rank, module.tp_mode,
286298
device) if "weight" in weights[1] else None
287299
if module.bias is not None:
288-
gate_bias = load_weight_shard(weights[0]['bias'], module.tp_size,
300+
gate_bias = load_weight_shard(weights[0]['bias'], tp_size,
289301
module.tp_rank, module.tp_mode,
290302
device) if "bias" in weights[0] else None
291-
up_bias = load_weight_shard(weights[1]['bias'], module.tp_size,
303+
up_bias = load_weight_shard(weights[1]['bias'], tp_size,
292304
module.tp_rank, module.tp_mode,
293305
device) if "bias" in weights[1] else None
294306
if not allow_partial_loading:

tensorrt_llm/_torch/pyexecutor/model_loader.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,32 @@ def init_meta_tensor(t: torch.Tensor):
410410
self._call_load_weights(model.load_draft_weights, weights,
411411
draft_weight_mapper)
412412

413+
elif load_format == LoadFormat.PRESHARDED:
414+
# P2P RDMA target: source published weights BEFORE
415+
# post_load_weights (pre-processed state). The checkpoint
416+
# loader injects directly into model params via RDMA.
417+
# If it returns empty dict, weights are already in GPU
418+
# memory — skip model.load_weights() but DO run
419+
# post_load_weights() to apply kernel-ready transforms.
420+
from tensorrt_llm._torch.modules.linear import Linear
421+
422+
for m in model.modules():
423+
if isinstance(m, Linear):
424+
m._weights_presharded = True
425+
426+
ckpt_dir = model.llm_checkpoint_dir if hasattr(
427+
model, 'llm_checkpoint_dir') else checkpoint_dir
428+
weights = checkpoint_loader.load_weights(
429+
ckpt_dir, mapping=self.mapping, model=model)
430+
431+
if weights:
432+
self.weight_mapper = checkpoint_loader.get_initialized_weight_mapper(
433+
model, config)
434+
self._call_load_weights(model.load_weights, weights,
435+
self.weight_mapper)
436+
else:
437+
logger.info("PRESHARDED: weights injected via P2P RDMA, skipping load_weights()")
438+
413439
elif load_format == LoadFormat.DUMMY:
414440
self.weight_mapper = checkpoint_loader.get_initialized_weight_mapper(
415441
model, config)
@@ -428,6 +454,17 @@ def init_meta_tensor(t: torch.Tensor):
428454
raise NotImplementedError(
429455
f"No load support for load format: {load_format}")
430456

457+
# ModelExpress source: publish pre-processed weights BEFORE
458+
# post_load_weights so targets receive raw loaded state and can
459+
# run their own post_load_weights() transforms.
460+
if os.environ.get("MODEL_EXPRESS_URL") and not os.environ.get("MODEL_EXPRESS_TARGET"):
461+
try:
462+
from modelexpress.trtllm_live_transfer import publish_model_params
463+
publish_model_params(model)
464+
model._mx_source_published = True
465+
except Exception as e:
466+
logger.warning("ModelExpress publish failed: %s", e)
467+
431468
for module in model.modules():
432469
if hasattr(module, 'post_load_weights') and not getattr(
433470
module, '_weights_removed', False):

tensorrt_llm/executor/worker.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,19 @@ def notify_proxy_threads_to_quit():
300300
logger.error("Failed to deliver error message to proxy")
301301
return
302302

303+
# ModelExpress source: publish this rank's model params via NIXL.
304+
# Skip if already published from ModelLoader.load() (pre-post_load_weights).
305+
if os.environ.get("MODEL_EXPRESS_URL") and not os.environ.get("MODEL_EXPRESS_TARGET"):
306+
model = getattr(getattr(getattr(worker, 'engine', None), 'model_engine', None), 'model', None)
307+
if model and getattr(model, '_mx_source_published', False):
308+
logger.info("ModelExpress: already published from model_loader, skipping worker publish")
309+
else:
310+
try:
311+
from modelexpress.trtllm_live_transfer import publish_from_worker
312+
publish_from_worker(worker)
313+
except Exception as e:
314+
logger.warning("ModelExpress publish_from_worker failed on rank %d: %s", mpi_rank(), e)
315+
303316
# Optionally disable GC (default: not disabled)
304317
if os.getenv("TRTLLM_WORKER_DISABLE_GC", "0") == "1":
305318
gc.disable()

tensorrt_llm/llmapi/llm_args.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3306,6 +3306,11 @@ class LoadFormat(Enum):
33063306
DUMMY = 1
33073307
# Only load the multimodal(vision) encoder weights
33083308
VISION_ONLY = 2
3309+
# Weights are already sharded per TP rank — skip TP slicing during loading.
3310+
# The weight mapper still handles name mapping and fusing (q+k+v → qkv),
3311+
# but load_weight_shard() returns weights as-is without TP slicing.
3312+
# Use case: P2P RDMA transfers where each worker receives its own shard.
3313+
PRESHARDED = 3
33093314

33103315

33113316
class SamplerType(StrEnum):

0 commit comments

Comments
 (0)