From 1fd1e17d568279213c3e2c7e6da4dada02f9263f Mon Sep 17 00:00:00 2001
From: zhangtaoshan
Date: Mon, 26 Jan 2026 15:25:08 +0800
Subject: [PATCH 1/9] Initial commit

---
 .../att/prefill_att/context_flashattention_nopad.py             | 1 +
 lightllm/server/router/model_infer/mode_backend/base_backend.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/lightllm/common/basemodel/triton_kernel/att/prefill_att/context_flashattention_nopad.py b/lightllm/common/basemodel/triton_kernel/att/prefill_att/context_flashattention_nopad.py
index 5ba6d0beb..8cd60e76b 100644
--- a/lightllm/common/basemodel/triton_kernel/att/prefill_att/context_flashattention_nopad.py
+++ b/lightllm/common/basemodel/triton_kernel/att/prefill_att/context_flashattention_nopad.py
@@ -124,6 +124,7 @@ def context_attention_fwd(
     q, k, v, o, b_req_idx, b_start_loc, b_seq_len, b_prompt_cache_len, max_input_len, req_to_token_indexs
 ):
     BLOCK_M = 128 if not is_tesla() else 64
+    BLOCK_M = 64
     # shape constraints
     Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
     assert Lq == Lk and Lk == Lv
diff --git a/lightllm/server/router/model_infer/mode_backend/base_backend.py b/lightllm/server/router/model_infer/mode_backend/base_backend.py
index 64310d6b0..2a2d462a0 100644
--- a/lightllm/server/router/model_infer/mode_backend/base_backend.py
+++ b/lightllm/server/router/model_infer/mode_backend/base_backend.py
@@ -132,6 +132,7 @@ def init_model(self, kvargs):
         self.infer_state_lock = g_infer_state_lock
         # Prevent the globally shared state in InferStateLock from being re-initialized unexpectedly, which breaks synchronization.
         # So do a barrier wait here.
+        time.sleep(0.1)
         dist.barrier()
 
         wait_events = []

From 65ca5d8bb009a788ef0a9b070f9a003e6b36101b Mon Sep 17 00:00:00 2001
From: zhangtaoshan
Date: Mon, 26 Jan 2026 16:44:09 +0800
Subject: [PATCH 2/9] Add Dockerfile

---
 docker/Dockerfile.metax | 5 +++++
 requirements_metax.txt  | 8 ++++++++
 2 files changed, 13 insertions(+)
 create mode 100644 docker/Dockerfile.metax
 create mode 100644 requirements_metax.txt

diff --git a/docker/Dockerfile.metax b/docker/Dockerfile.metax
new file mode 100644
index 000000000..7dc5f2167
--- /dev/null
+++ b/docker/Dockerfile.metax
@@ -0,0 +1,5 @@
+FROM cr.metax-tech.com/public-ai-release/maca/vllm:maca.ai3.1.0.7-torch2.6-py310-ubuntu22.04-amd64
+
+ENV PATH=/opt/conda/bin:/opt/conda/condabin:${PATH}
+COPY . /lightllm
+RUN pip install -r /lightllm/requirements_metax.txt && pip install -e /lightllm --no-cache-dir
diff --git a/requirements_metax.txt b/requirements_metax.txt
new file mode 100644
index 000000000..55c5e93dc
--- /dev/null
+++ b/requirements_metax.txt
@@ -0,0 +1,8 @@
+rpyc==6.0.2
+setproctitle==1.3.7
+easydict==1.13
+atomics==1.0.3
+sortedcontainers==2.4.0
+librosa==0.11.0
+gunicorn==24.0.0
+ujson==5.11.0

From 79737eee998b1409699582264d1c872676582d00 Mon Sep 17 00:00:00 2001
From: zhangtaoshan
Date: Thu, 29 Jan 2026 14:34:45 +0800
Subject: [PATCH 3/9] Fix openai interface in benchmark

---
 docker/Dockerfile.metax                    | 1 +
 test/benchmark/service/benchmark_client.py | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile.metax b/docker/Dockerfile.metax
index 7dc5f2167..a1f1d3228 100644
--- a/docker/Dockerfile.metax
+++ b/docker/Dockerfile.metax
@@ -1,5 +1,6 @@
 FROM cr.metax-tech.com/public-ai-release/maca/vllm:maca.ai3.1.0.7-torch2.6-py310-ubuntu22.04-amd64
 
+ENV MACA_PATH=/opt/maca
 ENV PATH=/opt/conda/bin:/opt/conda/condabin:${PATH}
 COPY . /lightllm
 RUN pip install -r /lightllm/requirements_metax.txt && pip install -e /lightllm --no-cache-dir
diff --git a/test/benchmark/service/benchmark_client.py b/test/benchmark/service/benchmark_client.py
index 09009fc9e..57b16abcf 100644
--- a/test/benchmark/service/benchmark_client.py
+++ b/test/benchmark/service/benchmark_client.py
@@ -96,7 +96,7 @@ def post_stream_lightllm(url: str, text_input: str, max_new_tokens: int) -> List[float]
 def post_stream_openai(url: str, text_input: str, max_new_tokens: int) -> List[float]:
     data = {
         "model": model_name[0],
-        "prompt": text_input,
+        "messages": [{"role": "user", "content": text_input}],
         "n": 1,
         "ignore_eos": True,
         "max_tokens": max_new_tokens,
@@ -115,7 +115,7 @@ def post_stream_openai(url: str, text_input: str, max_new_tokens: int) -> List[f
             if line == "[DONE]":
                 continue
             data = json.loads(line)
-            if not data["choices"][0]["text"]:
+            if not data["choices"][0]["delta"]["content"]:
                 continue
             current_time = time.time()
             elapsed_time = current_time - last_time

From 1f45c5783a44c5af8c23158527c001ed57d0fab6 Mon Sep 17 00:00:00 2001
From: zhangtaoshan
Date: Thu, 29 Jan 2026 16:37:28 +0800
Subject: [PATCH 4/9] Add is_metax()

---
 .../att/prefill_att/context_flashattention_nopad.py | 7 ++++---
 lightllm/utils/device_utils.py                      | 5 +++++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/lightllm/common/basemodel/triton_kernel/att/prefill_att/context_flashattention_nopad.py b/lightllm/common/basemodel/triton_kernel/att/prefill_att/context_flashattention_nopad.py
index 8cd60e76b..6c3c297ad 100644
--- a/lightllm/common/basemodel/triton_kernel/att/prefill_att/context_flashattention_nopad.py
+++ b/lightllm/common/basemodel/triton_kernel/att/prefill_att/context_flashattention_nopad.py
@@ -7,7 +7,7 @@
 import math
 import torch.nn.functional as F
 
-from lightllm.utils.device_utils import is_tesla
+from lightllm.utils.device_utils import is_metax, is_tesla
 
 
 @triton.jit
@@ -123,8 +123,9 @@ def _fwd_kernel(
 def context_attention_fwd(
     q, k, v, o, b_req_idx, b_start_loc, b_seq_len, b_prompt_cache_len, max_input_len, req_to_token_indexs
 ):
-    BLOCK_M = 128 if not is_tesla() else 64
-    BLOCK_M = 64
+    BLOCK_M = 128
+    if is_tesla() or is_metax():
+        BLOCK_M = 64
     # shape constraints
     Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
     assert Lq == Lk and Lk == Lv
diff --git a/lightllm/utils/device_utils.py b/lightllm/utils/device_utils.py
index a1ed6ed95..719c8017a 100644
--- a/lightllm/utils/device_utils.py
+++ b/lightllm/utils/device_utils.py
@@ -31,6 +31,11 @@ def is_4090():
     return "4090" in torch.cuda.get_device_name(0) or "RTX 4090" in torch.cuda.get_device_name(0)
 
 
+@lru_cache(maxsize=None)
+def is_metax():
+    return torch.cuda.is_available() and "MetaX" in torch.cuda.get_device_name(0)
+
+
 @lru_cache(maxsize=None)
 def get_device_sm_count():
     import triton

From e11d2d7b5681c08726c5971efe20d24af9a27e37 Mon Sep 17 00:00:00 2001
From: zhangtaoshan
Date: Fri, 30 Jan 2026 17:48:06 +0800
Subject: [PATCH 5/9] fix hang of multimodal server

---
 .../basemodel/attention_vit/xformers/fp.py |  7 ++++-
 lightllm/utils/dist_utils.py               |  6 ++++
 lightllm/utils/kv_cache_utils.py           | 29 ++++++++++++++++++-
 3 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/lightllm/common/basemodel/attention_vit/xformers/fp.py b/lightllm/common/basemodel/attention_vit/xformers/fp.py
index 361b5db05..643fc7d4b 100644
--- a/lightllm/common/basemodel/attention_vit/xformers/fp.py
+++ b/lightllm/common/basemodel/attention_vit/xformers/fp.py
@@ -1,3 +1,4 @@
+from lightllm.utils.device_utils import is_metax
 import torch
 import torch.nn.functional as F
 
@@ -34,7 +35,11 @@ def _vit_att_fwd(
 
     if max_seqlen:
         assert max(seqlens) <= max_seqlen
-    attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens, device=q.device)
+    # The xformers version on MetaX is 0.0.22 (NV uses 0.0.32.post1); its from_seqlens has no device param
+    if is_metax():
+        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
+    else:
+        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens, device=q.device)
 
     q_ = q.unsqueeze(0)  # [1, T, H, D]
     k_ = k.unsqueeze(0)  # [1, T, H, D]
diff --git a/lightllm/utils/dist_utils.py b/lightllm/utils/dist_utils.py
index 65ac401d4..197aa0d1f 100644
--- a/lightllm/utils/dist_utils.py
+++ b/lightllm/utils/dist_utils.py
@@ -1,3 +1,4 @@
+from lightllm.utils.device_utils import is_metax
 import torch.distributed as dist
 import os
 import torch
@@ -61,10 +62,15 @@ def init_vision_distributed_env(kvargs):
     set_dp_size(dp_size)
     set_dp_world_size(tp_world_size)
     set_current_rank_in_dp(tp_rank_id)
+
     visual_gpu_ids = kvargs["visual_gpu_ids"]
     device_id = visual_gpu_ids[kvargs["vit_rank_id"]]
     set_current_device_id(device_id)
     torch.cuda.set_device(device_id)
+
+    if is_metax():
+        return
+
     dist.init_process_group(
         "nccl",
         init_method=f'tcp://127.0.0.1:{kvargs["visual_nccl_port"]}',
diff --git a/lightllm/utils/kv_cache_utils.py b/lightllm/utils/kv_cache_utils.py
index 3256fdd1f..52c0a5ac1 100644
--- a/lightllm/utils/kv_cache_utils.py
+++ b/lightllm/utils/kv_cache_utils.py
@@ -1,3 +1,4 @@
+from lightllm.utils.device_utils import is_metax
 import torch
 import ctypes
 import dataclasses
@@ -265,7 +266,33 @@ def _worker():
             assert host_ptr.value == device_ptr.value
         handle.tasks_finished.set()
 
-    th = threading.Thread(target=_worker, name=f"cpu_cache_register_{shm_ptr}", daemon=True)
+    def _metax_worker():
+        mc = ctypes.CDLL("/opt/maca/lib/libmcruntime.so")
+        mc.mcHostRegister.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint]
+        mc.mcHostRegister.restype = ctypes.c_int
+        mc.mcHostGetDevicePointer.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_void_p, ctypes.c_int]
+        mc.mcHostGetDevicePointer.restype = ctypes.c_int
+
+        cudaHostRegisterFlag = 3
+
+        torch.cuda.set_device(get_current_device_id())
+        # TODO: check whether registering the host memory in chunks here is valid and reasonable.
+        for offset, seg_len in tasks:
+            ptr = ctypes.c_void_p(shm_ptr + offset)
+            r = mc.mcHostRegister(ptr, ctypes.c_size_t(seg_len), cudaHostRegisterFlag)
+            if r != 0:
+                raise Exception(f"cudaHostRegister failed with error code {r}, prefer to use hugetlb")
+            handle.task_count += 1
+
+        device_ptr = ctypes.c_void_p()
+        host_ptr = ctypes.c_void_p(shm_ptr)
+        res = mc.mcHostGetDevicePointer(ctypes.byref(device_ptr), host_ptr, 0)
+        if res != 0:
+            raise Exception(f"cudaHostGetDevicePointer failed with error code {res}")
+        handle.tasks_finished.set()
+
+    _worker_func = _metax_worker() if is_metax() else _worker()
+    th = threading.Thread(target=_worker_func, name=f"cpu_cache_register_{shm_ptr}", daemon=True)
     handle.thread = th
     th.start()
     return handle

From c4ffeb628e7abc28c6b510175c53c6720c1b16bb Mon Sep 17 00:00:00 2001
From: zhangtaoshan
Date: Fri, 30 Jan 2026 19:07:26 +0800
Subject: [PATCH 6/9] misc

---
 lightllm/utils/dist_utils.py     | 2 +-
 lightllm/utils/kv_cache_utils.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lightllm/utils/dist_utils.py b/lightllm/utils/dist_utils.py
index 197aa0d1f..9a51036eb 100644
--- a/lightllm/utils/dist_utils.py
+++ b/lightllm/utils/dist_utils.py
@@ -62,12 +62,12 @@ def init_vision_distributed_env(kvargs):
     set_dp_size(dp_size)
     set_dp_world_size(tp_world_size)
     set_current_rank_in_dp(tp_rank_id)
-
     visual_gpu_ids = kvargs["visual_gpu_ids"]
     device_id = visual_gpu_ids[kvargs["vit_rank_id"]]
     set_current_device_id(device_id)
     torch.cuda.set_device(device_id)
 
+    # The process group can't be initialized twice on the same device, so we skip initializing it for the vision env
     if is_metax():
         return
 
diff --git a/lightllm/utils/kv_cache_utils.py b/lightllm/utils/kv_cache_utils.py
index 52c0a5ac1..6377e29e3 100644
--- a/lightllm/utils/kv_cache_utils.py
+++ b/lightllm/utils/kv_cache_utils.py
@@ -267,7 +267,7 @@ def _worker():
         handle.tasks_finished.set()
 
     def _metax_worker():
-        mc = ctypes.CDLL("/opt/maca/lib/libmcruntime.so")
+        mc = ctypes.CDLL(os.path.join(os.getenv("MACA_PATH", "/opt/maca"), "lib/libmcruntime.so"))
         mc.mcHostRegister.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint]
         mc.mcHostRegister.restype = ctypes.c_int
         mc.mcHostGetDevicePointer.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_void_p, ctypes.c_int]
@@ -291,7 +291,7 @@ def _metax_worker():
             raise Exception(f"cudaHostGetDevicePointer failed with error code {res}")
         handle.tasks_finished.set()
 
-    _worker_func = _metax_worker() if is_metax() else _worker()
+    _worker_func = _metax_worker if is_metax() else _worker
     th = threading.Thread(target=_worker_func, name=f"cpu_cache_register_{shm_ptr}", daemon=True)
     handle.thread = th
     th.start()

From b3cb03180729748779bc2b574165697012602939 Mon Sep 17 00:00:00 2001
From: zhangtaoshan
Date: Tue, 3 Feb 2026 11:08:01 +0800
Subject: [PATCH 7/9] Fix backend of nccl on MetaX

---
 lightllm/utils/dist_utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lightllm/utils/dist_utils.py b/lightllm/utils/dist_utils.py
index 9a51036eb..ec78e9f18 100644
--- a/lightllm/utils/dist_utils.py
+++ b/lightllm/utils/dist_utils.py
@@ -105,8 +105,12 @@ def init_distributed_env(kvargs):
     device_id = kvargs["rank_id"] % get_node_world_size()
     set_current_device_id(device_id)
     torch.cuda.set_device(device_id)
+    backend = "nccl"
+    # NCCL hits an internal error when using 8 or 16 GPUs on MetaX, so add a gloo backend for CPU tensors.
+    if is_metax():
+        backend = "cpu:gloo,cuda:nccl"
     dist.init_process_group(
-        "nccl",
+        backend=backend,
         init_method=f'tcp://{kvargs["nccl_host"]}:{kvargs["nccl_port"]}',
         rank=kvargs["rank_id"],
         world_size=kvargs["world_size"],

From f1774e949a2228e62292bb95dee8a65b1feb3f27 Mon Sep 17 00:00:00 2001
From: zhangtaoshan
Date: Wed, 4 Feb 2026 14:34:49 +0800
Subject: [PATCH 8/9] remove time.sleep

---
 lightllm/server/router/model_infer/mode_backend/base_backend.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lightllm/server/router/model_infer/mode_backend/base_backend.py b/lightllm/server/router/model_infer/mode_backend/base_backend.py
index 2a2d462a0..64310d6b0 100644
--- a/lightllm/server/router/model_infer/mode_backend/base_backend.py
+++ b/lightllm/server/router/model_infer/mode_backend/base_backend.py
@@ -132,7 +132,6 @@ def init_model(self, kvargs):
         self.infer_state_lock = g_infer_state_lock
         # Prevent the globally shared state in InferStateLock from being re-initialized unexpectedly, which breaks synchronization.
         # So do a barrier wait here.
-        time.sleep(0.1)
         dist.barrier()
 
         wait_events = []

From c85dc5ed8748f66571d8bd0c72245917d8208221 Mon Sep 17 00:00:00 2001
From: zhangtaoshan
Date: Wed, 4 Feb 2026 14:47:35 +0800
Subject: [PATCH 9/9] add note for dockerfile

---
 docker/Dockerfile.metax | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker/Dockerfile.metax b/docker/Dockerfile.metax
index a1f1d3228..7e30dd806 100644
--- a/docker/Dockerfile.metax
+++ b/docker/Dockerfile.metax
@@ -1,3 +1,4 @@
+# The base image below can be pulled from https://sw-download.metax-tech.com/docker
 FROM cr.metax-tech.com/public-ai-release/maca/vllm:maca.ai3.1.0.7-torch2.6-py310-ubuntu22.04-amd64
 
 ENV MACA_PATH=/opt/maca
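
A quick way to sanity-check the device detection that patch 4/9 relies on is to print the CUDA device name that is_metax() matches against. This is only an illustrative sketch, not part of the series, and it assumes torch can see the accelerator:

# Illustrative sketch (not part of the patch series): show the device name that
# lightllm.utils.device_utils.is_metax() tests for the "MetaX" substring.
import torch

if torch.cuda.is_available():
    name = torch.cuda.get_device_name(0)
    print(f"device 0: {name} -> is_metax() would return {'MetaX' in name}")
else:
    print("no CUDA-visible device; is_metax() would return False")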