# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

| 5 | +""" |
| 6 | +Example demonstrating all-gather and matrix multiplication in a single kernel. |
| 7 | +
|
| 8 | +Run with: |
| 9 | + torchrun --nproc-per-node 4 --standalone all_gather_matmul.py |
| 10 | +""" |

import torch
import torch.distributed as dist
import torch.distributed._symmetric_memory as symm_mem
import cuda.tile as ct

# cuTile kernel that fuses the all-gather with the matmul
# Limitation:
#   Only supports 4 ranks, because cuTile does not support tuples as kernel inputs
@ct.kernel
def gather_matmul_kernel(
    inp_0, inp_1, inp_2, inp_3,
    w,
    out,
    tile_m: ct.Constant[int],
    tile_n: ct.Constant[int],
    tile_k: ct.Constant[int],
):
    # Number of m tiles in each peer's shard
    num_tiles_m_per_peer = ct.cdiv(inp_0.shape[0], tile_m)
    # Number of k tiles along the reduction dimension of w
    num_tiles_k = ct.num_tiles(w, axis=0, shape=(tile_k, tile_n))

    # Grid dim 0 maps to the m tile index, dim 1 to the n tile index
    m_tile_idx = ct.bid(0)
    n_tile_idx = ct.bid(1)

    # Which peer's shard does this m tile fall in?
    peer = m_tile_idx // num_tiles_m_per_peer
    peer_inp = inp_0 if peer == 0 else inp_1 if peer == 1 else inp_2 if peer == 2 else inp_3
    m_tile_idx_in_peer = m_tile_idx % num_tiles_m_per_peer

    # Initialize accumulator
    accumulator = ct.full((tile_m, tile_n), 0, dtype=ct.float32)
    zero_pad = ct.PaddingMode.ZERO

    # Compute fp32 inputs in tf32 so the matmul can use tensor cores
    dtype = ct.tfloat32 if peer_inp.dtype == ct.float32 else peer_inp.dtype

    for k in range(num_tiles_k):
        # Load this peer's input tile
        a = ct.load(peer_inp, index=(m_tile_idx_in_peer, k), shape=(tile_m, tile_k), padding_mode=zero_pad).astype(dtype)
        # Load the matching weight tile
        b = ct.load(w, index=(k, n_tile_idx), shape=(tile_k, tile_n), padding_mode=zero_pad).astype(dtype)
        # Multiply-accumulate into the fp32 accumulator
        accumulator = ct.mma(a, b, accumulator)

    # Cast the result back to the output dtype
    accumulator = ct.astype(accumulator, out.dtype)

    # Store the result tile
    ct.store(out, index=(m_tile_idx, n_tile_idx), tile=accumulator)

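
# Worked illustration (using the sizes from main() below: 4 ranks, 256 rows per
# rank, tile_m = 128): num_tiles_m_per_peer = 2, so global m tiles 0 and 1 read
# rank 0's buffer, tiles 2 and 3 read rank 1's, and so on. Each block pulls its
# tile straight from the owning peer; the gathered matrix is never built.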

# Host-side launcher for the fused all-gather + matmul
def cutile_gather_matmul(
    inp: torch.Tensor,
    w: torch.Tensor,
    group: dist.ProcessGroup,
):
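    # Rendezvous over the symmetric-memory tensor yields views of every peer's
    # copy of `inp`, letting the kernel read remote tiles in place.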
    handle = symm_mem.rendezvous(inp, group.group_name)
    world_size = handle.world_size
    inp_tuple = tuple(
        handle.get_buffer(rank, inp.shape, inp.dtype, 0) for rank in range(world_size)
    )
    assert world_size == 4, "gather_matmul_kernel is specialized for exactly 4 ranks"

    # Allocate output tensor
    M = inp.shape[0]
    M_out = M * world_size
    N = w.shape[1]
    out = torch.empty(M_out, N, dtype=inp.dtype, device=inp.device)

    assert inp.shape[1] == w.shape[0], "reduction dimension mismatch"
    K = inp.shape[1]
    tile_m = 128
    tile_n = 128
    tile_k = 128
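    # Keep every tile full: this example requires M, N, and K to be multiples
    # of the tile sizes (enforced below).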
    assert M % tile_m == 0
    assert N % tile_n == 0
    assert K % tile_k == 0

    # Map each output tile to a block
    grid = (ct.cdiv(M_out, tile_m), ct.cdiv(N, tile_n))
    ct.launch(
        torch.cuda.current_stream(),
        grid,
        gather_matmul_kernel,
        (*inp_tuple, w, out, tile_m, tile_n, tile_k),
    )

    return out

# Reference implementation: all-gather followed by a separate matmul
def ref_gather_matmul(
    inp: torch.Tensor,
    w: torch.Tensor,
    group: dist.ProcessGroup,
):
    world_size = dist.get_world_size(group)
    ag_scratch = torch.empty((world_size * inp.shape[0], inp.shape[1]), dtype=inp.dtype, device=inp.device)
    dist.all_gather_into_tensor(ag_scratch, inp, group=group)
    out = ag_scratch @ w
    return out


def main():
    dist.init_process_group(backend="nccl")
    rank = dist.get_rank()
    world_size = dist.get_world_size()
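    # One process per GPU: bind rank i to cuda:i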
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    group = dist.group.WORLD
    torch.manual_seed(rank + 52)
    print(f"Rank {rank} of {world_size} is initializing")

    bs = 256
    hid = 1024
    out_hid = 512
    ref_inp = torch.rand((bs, hid), device=device)
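    # The kernel reads inputs from peers, so `inp` must live in symmetric
    # memory; allocate it there and copy in the reference data.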
    inp = symm_mem.empty(bs, hid, device=device).copy_(ref_inp)
    w = torch.rand((hid, out_hid), device=device)

    expected_out = ref_gather_matmul(ref_inp, w, group)

    out = cutile_gather_matmul(inp, w, group)

    # tf32 tensor-core math is less precise than fp32, hence the relaxed tolerances
    torch.testing.assert_close(
        out,
        expected_out,
        atol=1e-3,
        rtol=1e-3,
    )

    print(f"Rank {rank} of {world_size}: correct")
    dist.destroy_process_group()


if __name__ == "__main__":
    main()