From 1fd1e17d568279213c3e2c7e6da4dada02f9263f Mon Sep 17 00:00:00 2001
From: zhangtaoshan
Date: Mon, 26 Jan 2026 15:25:08 +0800
Subject: [PATCH 1/9] Initial commit

---
 .../att/prefill_att/context_flashattention_nopad.py             | 1 +
 lightllm/server/router/model_infer/mode_backend/base_backend.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/lightllm/common/basemodel/triton_kernel/att/prefill_att/context_flashattention_nopad.py b/lightllm/common/basemodel/triton_kernel/att/prefill_att/context_flashattention_nopad.py
index 5ba6d0beb..8cd60e76b 100644
--- a/lightllm/common/basemodel/triton_kernel/att/prefill_att/context_flashattention_nopad.py
+++ b/lightllm/common/basemodel/triton_kernel/att/prefill_att/context_flashattention_nopad.py
@@ -124,6 +124,7 @@ def context_attention_fwd(
     q, k, v, o, b_req_idx, b_start_loc, b_seq_len, b_prompt_cache_len, max_input_len, req_to_token_indexs
 ):
     BLOCK_M = 128 if not is_tesla() else 64
+    BLOCK_M = 64
     # shape constraints
     Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
     assert Lq == Lk and Lk == Lv
diff --git a/lightllm/server/router/model_infer/mode_backend/base_backend.py b/lightllm/server/router/model_infer/mode_backend/base_backend.py
index 64310d6b0..2a2d462a0 100644
--- a/lightllm/server/router/model_infer/mode_backend/base_backend.py
+++ b/lightllm/server/router/model_infer/mode_backend/base_backend.py
@@ -132,6 +132,7 @@ def init_model(self, kvargs):
         self.infer_state_lock = g_infer_state_lock
         # Prevent the globally shared state in InferStateLock from being re-initialized unexpectedly, which breaks synchronization.
         # So do a barrier wait here.
+        time.sleep(0.1)
         dist.barrier()
 
         wait_events = []

From 65ca5d8bb009a788ef0a9b070f9a003e6b36101b Mon Sep 17 00:00:00 2001
From: zhangtaoshan
Date: Mon, 26 Jan 2026 16:44:09 +0800
Subject: [PATCH 2/9] Add Dockerfile

---
 docker/Dockerfile.metax | 5 +++++
 requirements_metax.txt  | 8 ++++++++
 2 files changed, 13 insertions(+)
 create mode 100644 docker/Dockerfile.metax
 create mode 100644 requirements_metax.txt

diff --git a/docker/Dockerfile.metax b/docker/Dockerfile.metax
new file mode 100644
index 000000000..7dc5f2167
--- /dev/null
+++ b/docker/Dockerfile.metax
@@ -0,0 +1,5 @@
+FROM cr.metax-tech.com/public-ai-release/maca/vllm:maca.ai3.1.0.7-torch2.6-py310-ubuntu22.04-amd64
+
+ENV PATH=/opt/conda/bin:/opt/conda/condabin:${PATH}
+COPY . /lightllm
+RUN pip install -r /lightllm/requirements_metax.txt && pip install -e /lightllm --no-cache-dir
diff --git a/requirements_metax.txt b/requirements_metax.txt
new file mode 100644
index 000000000..55c5e93dc
--- /dev/null
+++ b/requirements_metax.txt
@@ -0,0 +1,8 @@
+rpyc==6.0.2
+setproctitle==1.3.7
+easydict==1.13
+atomics==1.0.3
+sortedcontainers==2.4.0
+librosa==0.11.0
+gunicorn==24.0.0
+ujson==5.11.0

From 79737eee998b1409699582264d1c872676582d00 Mon Sep 17 00:00:00 2001
From: zhangtaoshan
Date: Thu, 29 Jan 2026 14:34:45 +0800
Subject: [PATCH 3/9] Fix openai interface in benchmark

---
 docker/Dockerfile.metax                    | 1 +
 test/benchmark/service/benchmark_client.py | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile.metax b/docker/Dockerfile.metax
index 7dc5f2167..a1f1d3228 100644
--- a/docker/Dockerfile.metax
+++ b/docker/Dockerfile.metax
@@ -1,5 +1,6 @@
 FROM cr.metax-tech.com/public-ai-release/maca/vllm:maca.ai3.1.0.7-torch2.6-py310-ubuntu22.04-amd64
 
+ENV MACA_PATH=/opt/maca
 ENV PATH=/opt/conda/bin:/opt/conda/condabin:${PATH}
 COPY . /lightllm
 RUN pip install -r /lightllm/requirements_metax.txt && pip install -e /lightllm --no-cache-dir
diff --git a/test/benchmark/service/benchmark_client.py b/test/benchmark/service/benchmark_client.py
index 09009fc9e..57b16abcf 100644
--- a/test/benchmark/service/benchmark_client.py
+++ b/test/benchmark/service/benchmark_client.py
@@ -96,7 +96,7 @@ def post_stream_lightllm(url: str, text_input: str, max_new_tokens: int) -> List[float]
 def post_stream_openai(url: str, text_input: str, max_new_tokens: int) -> List[float]:
     data = {
         "model": model_name[0],
-        "prompt": text_input,
+        "messages": [{"role": "user", "content": text_input}],
         "n": 1,
         "ignore_eos": True,
         "max_tokens": max_new_tokens,
@@ -115,7 +115,7 @@ def post_stream_openai(url: str, text_input: str, max_new_tokens: int) -> List[f
             if line == "[DONE]":
                 continue
             data = json.loads(line)
-            if not data["choices"][0]["text"]:
+            if not data["choices"][0]["delta"]["content"]:
                 continue
             current_time = time.time()
             elapsed_time = current_time - last_time

From 1f45c5783a44c5af8c23158527c001ed57d0fab6 Mon Sep 17 00:00:00 2001
From: zhangtaoshan
Date: Thu, 29 Jan 2026 16:37:28 +0800
Subject: [PATCH 4/9] Add is_metax()

---
 .../att/prefill_att/context_flashattention_nopad.py | 7 ++++---
 lightllm/utils/device_utils.py                      | 5 +++++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/lightllm/common/basemodel/triton_kernel/att/prefill_att/context_flashattention_nopad.py b/lightllm/common/basemodel/triton_kernel/att/prefill_att/context_flashattention_nopad.py
index 8cd60e76b..6c3c297ad 100644
--- a/lightllm/common/basemodel/triton_kernel/att/prefill_att/context_flashattention_nopad.py
+++ b/lightllm/common/basemodel/triton_kernel/att/prefill_att/context_flashattention_nopad.py
@@ -7,7 +7,7 @@
 import math
 import torch.nn.functional as F
 
-from lightllm.utils.device_utils import is_tesla
+from lightllm.utils.device_utils import is_metax, is_tesla
 
 
 @triton.jit
@@ -123,8 +123,9 @@ def _fwd_kernel(
 def context_attention_fwd(
     q, k, v, o, b_req_idx, b_start_loc, b_seq_len, b_prompt_cache_len, max_input_len, req_to_token_indexs
 ):
-    BLOCK_M = 128 if not is_tesla() else 64
-    BLOCK_M = 64
+    BLOCK_M = 128
+    if is_tesla() or is_metax():
+        BLOCK_M = 64
     # shape constraints
     Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
     assert Lq == Lk and Lk == Lv
diff --git a/lightllm/utils/device_utils.py b/lightllm/utils/device_utils.py
index a1ed6ed95..719c8017a 100644
--- a/lightllm/utils/device_utils.py
+++ b/lightllm/utils/device_utils.py
@@ -31,6 +31,11 @@ def is_4090():
     return "4090" in torch.cuda.get_device_name(0) or "RTX 4090" in torch.cuda.get_device_name(0)
 
 
+@lru_cache(maxsize=None)
+def is_metax():
+    return torch.cuda.is_available() and "MetaX" in torch.cuda.get_device_name(0)
+
+
 @lru_cache(maxsize=None)
 def get_device_sm_count():
     import triton

From e11d2d7b5681c08726c5971efe20d24af9a27e37 Mon Sep 17 00:00:00 2001
From: zhangtaoshan
Date: Fri, 30 Jan 2026 17:48:06 +0800
Subject: [PATCH 5/9] fix hang of multimodal server

---
 .../basemodel/attention_vit/xformers/fp.py |  7 ++++-
 lightllm/utils/dist_utils.py               |  6 ++++
 lightllm/utils/kv_cache_utils.py           | 29 ++++++++++++++++++-
 3 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/lightllm/common/basemodel/attention_vit/xformers/fp.py b/lightllm/common/basemodel/attention_vit/xformers/fp.py
index 361b5db05..643fc7d4b 100644
--- a/lightllm/common/basemodel/attention_vit/xformers/fp.py
+++ b/lightllm/common/basemodel/attention_vit/xformers/fp.py
@@ -1,3 +1,4 @@
+from lightllm.utils.device_utils import is_metax
 import torch
 import torch.nn.functional as F
 
@@ -34,7 +35,11 @@ def _vit_att_fwd(
 
     if max_seqlen:
         assert max(seqlens) <= max_seqlen
-    attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens, device=q.device)
+    # The xformers version on MetaX is 0.0.22 (NV uses 0.0.32.post1); its from_seqlens has no device param
+    if is_metax():
+        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
+    else:
+        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens, device=q.device)
 
     q_ = q.unsqueeze(0)  # [1, T, H, D]
     k_ = k.unsqueeze(0)  # [1, T, H, D]
diff --git a/lightllm/utils/dist_utils.py b/lightllm/utils/dist_utils.py
index 65ac401d4..197aa0d1f 100644
--- a/lightllm/utils/dist_utils.py
+++ b/lightllm/utils/dist_utils.py
@@ -1,3 +1,4 @@
+from lightllm.utils.device_utils import is_metax
 import torch.distributed as dist
 import os
 import torch
@@ -61,10 +62,15 @@ def init_vision_distributed_env(kvargs):
     set_dp_size(dp_size)
     set_dp_world_size(tp_world_size)
     set_current_rank_in_dp(tp_rank_id)
+
     visual_gpu_ids = kvargs["visual_gpu_ids"]
     device_id = visual_gpu_ids[kvargs["vit_rank_id"]]
     set_current_device_id(device_id)
     torch.cuda.set_device(device_id)
+
+    if is_metax():
+        return
+
     dist.init_process_group(
         "nccl",
         init_method=f'tcp://127.0.0.1:{kvargs["visual_nccl_port"]}',
diff --git a/lightllm/utils/kv_cache_utils.py b/lightllm/utils/kv_cache_utils.py
index 3256fdd1f..52c0a5ac1 100644
--- a/lightllm/utils/kv_cache_utils.py
+++ b/lightllm/utils/kv_cache_utils.py
@@ -1,3 +1,4 @@
+from lightllm.utils.device_utils import is_metax
 import torch
 import ctypes
 import dataclasses
@@ -265,7 +266,33 @@ def _worker():
             assert host_ptr.value == device_ptr.value
         handle.tasks_finished.set()
 
-    th = threading.Thread(target=_worker, name=f"cpu_cache_register_{shm_ptr}", daemon=True)
+    def _metax_worker():
+        mc = ctypes.CDLL("/opt/maca/lib/libmcruntime.so")
+        mc.mcHostRegister.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint]
+        mc.mcHostRegister.restype = ctypes.c_int
+        mc.mcHostGetDevicePointer.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_void_p, ctypes.c_int]
+        mc.mcHostGetDevicePointer.restype = ctypes.c_int
+
+        cudaHostRegisterFlag = 3
+
+        torch.cuda.set_device(get_current_device_id())
+        # TODO: check whether registering the host memory in chunks here is valid and reasonable.
+        for offset, seg_len in tasks:
+            ptr = ctypes.c_void_p(shm_ptr + offset)
+            r = mc.mcHostRegister(ptr, ctypes.c_size_t(seg_len), cudaHostRegisterFlag)
+            if r != 0:
+                raise Exception(f"cudaHostRegister failed with error code {r}, prefer to use hugetlb")
+            handle.task_count += 1
+
+        device_ptr = ctypes.c_void_p()
+        host_ptr = ctypes.c_void_p(shm_ptr)
+        res = mc.mcHostGetDevicePointer(ctypes.byref(device_ptr), host_ptr, 0)
+        if res != 0:
+            raise Exception(f"cudaHostGetDevicePointer failed with error code {res}")
+        handle.tasks_finished.set()
+
+    _worker_func = _metax_worker() if is_metax() else _worker()
+    th = threading.Thread(target=_worker_func, name=f"cpu_cache_register_{shm_ptr}", daemon=True)
     handle.thread = th
     th.start()
     return handle

From c4ffeb628e7abc28c6b510175c53c6720c1b16bb Mon Sep 17 00:00:00 2001
From: zhangtaoshan
Date: Fri, 30 Jan 2026 19:07:26 +0800
Subject: [PATCH 6/9] misc

---
 lightllm/utils/dist_utils.py     | 2 +-
 lightllm/utils/kv_cache_utils.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lightllm/utils/dist_utils.py b/lightllm/utils/dist_utils.py
index 197aa0d1f..9a51036eb 100644
--- a/lightllm/utils/dist_utils.py
+++ b/lightllm/utils/dist_utils.py
@@ -62,12 +62,12 @@ def init_vision_distributed_env(kvargs):
     set_dp_size(dp_size)
     set_dp_world_size(tp_world_size)
     set_current_rank_in_dp(tp_rank_id)
-
     visual_gpu_ids = kvargs["visual_gpu_ids"]
     device_id = visual_gpu_ids[kvargs["vit_rank_id"]]
     set_current_device_id(device_id)
     torch.cuda.set_device(device_id)
 
+    # The process group can't be initialized twice on the same device, so we skip initializing it for the vision env
     if is_metax():
         return
 
diff --git a/lightllm/utils/kv_cache_utils.py b/lightllm/utils/kv_cache_utils.py
index 52c0a5ac1..6377e29e3 100644
--- a/lightllm/utils/kv_cache_utils.py
+++ b/lightllm/utils/kv_cache_utils.py
@@ -267,7 +267,7 @@ def _worker():
         handle.tasks_finished.set()
 
     def _metax_worker():
-        mc = ctypes.CDLL("/opt/maca/lib/libmcruntime.so")
+        mc = ctypes.CDLL(os.path.join(os.getenv("MACA_PATH", "/opt/maca"), "lib/libmcruntime.so"))
         mc.mcHostRegister.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint]
         mc.mcHostRegister.restype = ctypes.c_int
         mc.mcHostGetDevicePointer.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_void_p, ctypes.c_int]
@@ -291,7 +291,7 @@ def _metax_worker():
             raise Exception(f"cudaHostGetDevicePointer failed with error code {res}")
         handle.tasks_finished.set()
 
-    _worker_func = _metax_worker() if is_metax() else _worker()
+    _worker_func = _metax_worker if is_metax() else _worker
     th = threading.Thread(target=_worker_func, name=f"cpu_cache_register_{shm_ptr}", daemon=True)
     handle.thread = th
     th.start()

From b3cb03180729748779bc2b574165697012602939 Mon Sep 17 00:00:00 2001
From: zhangtaoshan
Date: Tue, 3 Feb 2026 11:08:01 +0800
Subject: [PATCH 7/9] Fix backend of nccl on MetaX

---
 lightllm/utils/dist_utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lightllm/utils/dist_utils.py b/lightllm/utils/dist_utils.py
index 9a51036eb..ec78e9f18 100644
--- a/lightllm/utils/dist_utils.py
+++ b/lightllm/utils/dist_utils.py
@@ -105,8 +105,12 @@ def init_distributed_env(kvargs):
     device_id = kvargs["rank_id"] % get_node_world_size()
     set_current_device_id(device_id)
     torch.cuda.set_device(device_id)
+    backend = "nccl"
+    # NCCL hits an internal error when using 8 or 16 GPUs on MetaX, so add a gloo backend for CPU tensors.
+    if is_metax():
+        backend = "cpu:gloo,cuda:nccl"
     dist.init_process_group(
-        "nccl",
+        backend=backend,
         init_method=f'tcp://{kvargs["nccl_host"]}:{kvargs["nccl_port"]}',
         rank=kvargs["rank_id"],
         world_size=kvargs["world_size"],

From f1774e949a2228e62292bb95dee8a65b1feb3f27 Mon Sep 17 00:00:00 2001
From: zhangtaoshan
Date: Wed, 4 Feb 2026 14:34:49 +0800
Subject: [PATCH 8/9] remove time.sleep

---
 lightllm/server/router/model_infer/mode_backend/base_backend.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lightllm/server/router/model_infer/mode_backend/base_backend.py b/lightllm/server/router/model_infer/mode_backend/base_backend.py
index 2a2d462a0..64310d6b0 100644
--- a/lightllm/server/router/model_infer/mode_backend/base_backend.py
+++ b/lightllm/server/router/model_infer/mode_backend/base_backend.py
@@ -132,7 +132,6 @@ def init_model(self, kvargs):
         self.infer_state_lock = g_infer_state_lock
         # Prevent the globally shared state in InferStateLock from being re-initialized unexpectedly, which breaks synchronization.
         # So do a barrier wait here.
-        time.sleep(0.1)
         dist.barrier()
 
         wait_events = []

From c85dc5ed8748f66571d8bd0c72245917d8208221 Mon Sep 17 00:00:00 2001
From: zhangtaoshan
Date: Wed, 4 Feb 2026 14:47:35 +0800
Subject: [PATCH 9/9] add note for dockerfile

---
 docker/Dockerfile.metax | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker/Dockerfile.metax b/docker/Dockerfile.metax
index a1f1d3228..7e30dd806 100644
--- a/docker/Dockerfile.metax
+++ b/docker/Dockerfile.metax
@@ -1,3 +1,4 @@
+# The base image below can be pulled from https://sw-download.metax-tech.com/docker
 FROM cr.metax-tech.com/public-ai-release/maca/vllm:maca.ai3.1.0.7-torch2.6-py310-ubuntu22.04-amd64
 
 ENV MACA_PATH=/opt/maca
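
A quick way to sanity-check the device detection that patch 4/9 relies on is to print the CUDA device name that is_metax() matches against. This is only an illustrative sketch, not part of the series, and it assumes torch can see the accelerator:

# Illustrative sketch (not part of the patch series): show the device name that
# lightllm.utils.device_utils.is_metax() tests for the "MetaX" substring.
import torch

if torch.cuda.is_available():
    name = torch.cuda.get_device_name(0)
    print(f"device 0: {name} -> is_metax() would return {'MetaX' in name}")
else:
    print("no CUDA-visible device; is_metax() would return False")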