From 2fc31f0bbcd8bd82ffcb353dc3407b0705c220df Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Sat, 9 May 2026 07:34:36 +0000 Subject: [PATCH 1/9] refactor(kv-cache): embed KvCacheAllocator in MemoryManager as allocator --- .../common/kv_cache_mem_manager/__init__.py | 2 + .../kv_cache_mem_manager/mem_manager.py | 84 ++----------------- .../dynamic_prompt/linear_att_radix_cache.py | 4 +- .../router/dynamic_prompt/radix_cache.py | 10 +-- .../server/router/model_infer/infer_batch.py | 6 +- .../decode_node_impl/decode_infer_rpyc.py | 4 +- 6 files changed, 19 insertions(+), 91 deletions(-) diff --git a/lightllm/common/kv_cache_mem_manager/__init__.py b/lightllm/common/kv_cache_mem_manager/__init__.py index 2609cfb8ab..05544e149a 100644 --- a/lightllm/common/kv_cache_mem_manager/__init__.py +++ b/lightllm/common/kv_cache_mem_manager/__init__.py @@ -1,3 +1,4 @@ +from .allocator import KvCacheAllocator from .mem_manager import MemoryManager, ReadOnlyStaticsMemoryManager from .ppl_int8kv_mem_manager import PPLINT8KVMemoryManager from .ppl_int4kv_mem_manager import PPLINT4KVMemoryManager @@ -9,6 +10,7 @@ from .qwen3next_mem_manager import Qwen3NextMemManager __all__ = [ + "KvCacheAllocator", "MemoryManager", "ReadOnlyStaticsMemoryManager", "PPLINT4KVMemoryManager", diff --git a/lightllm/common/kv_cache_mem_manager/mem_manager.py b/lightllm/common/kv_cache_mem_manager/mem_manager.py index 0cc84db322..023be8af9b 100755 --- a/lightllm/common/kv_cache_mem_manager/mem_manager.py +++ b/lightllm/common/kv_cache_mem_manager/mem_manager.py @@ -3,10 +3,11 @@ import torch import torch.distributed as dist import torch.multiprocessing as mp -from typing import List, Union, Tuple, Any +from typing import List, Tuple, Any, Union from lightllm.server.pd_io_struct import KVMoveTask from lightllm.utils.log_utils import init_logger from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt +from .allocator import KvCacheAllocator from lightllm.utils.profile_max_tokens import get_available_gpu_memory, get_total_gpu_memory from lightllm.common.kv_trans_kernel.kv_trans import kv_trans from lightllm.utils.dist_utils import get_current_rank_in_node, get_node_world_size @@ -38,27 +39,8 @@ def __init__(self, size, dtype, head_num, head_dim, layer_num, always_copy=False # profile the max total token num if the size is None self.profile_size(mem_fraction) - self.mem_state = torch.arange( - 0, self.size, dtype=torch.int32, device="cpu", requires_grad=False, pin_memory=True - ) - self._mem_state_return = torch.arange( - 0, self.size * 3, dtype=torch.int32, device="cpu", requires_grad=False, pin_memory=True - ) - self._return_start = 0 - self.mark_start = 0 - self.mark_end = self.size - - self.can_use_mem_size = self.size - - # 用共享内存进行共享,router 模块读取进行精确的调度估计, nccl port 作为一个单机中单实列的标记。防止冲突。 - from lightllm.utils.envs_utils import get_unique_server_name + self.allocator = KvCacheAllocator(self.size) - rank_in_node = get_current_rank_in_node() - self.shared_can_use_token_num = SharedInt( - f"{get_unique_server_name()}_mem_manger_can_use_token_num_{rank_in_node}" - ) - - self.shared_can_use_token_num.set_value(self.can_use_mem_size) self._init_buffers( self.size, dtype, @@ -338,57 +320,13 @@ def _free_buffers(self): self.kv_buffer = None def alloc(self, need_size) -> torch.Tensor: - if need_size > self.mark_end - self.mark_start: - logger.error(f"warn no enough cache need_size {need_size} left_size {self.can_use_mem_size}") - assert False, "error alloc state" - - start = self.mark_start - end = self.mark_start + need_size 
- self.mark_start += need_size - - self.can_use_mem_size -= need_size - self.shared_can_use_token_num.set_value(self.can_use_mem_size) - - # 利用缓冲区返回,避免异步情况下的内存竞争 - if self._return_start + need_size > self._mem_state_return.shape[0]: - self._return_start = 0 - ans = self._mem_state_return[self._return_start : self._return_start + need_size] - ans.copy_(self.mem_state[start:end]) - self._return_start += need_size - return ans - - def free(self, free_index: Union[torch.Tensor, List[int]]): - """_summary_ - - Args: - free_index (torch.Tensor): _description_ - """ + return self.allocator.alloc(need_size) - end = self.mark_start - start = self.mark_start - len(free_index) - assert start >= 0, f"error free state start: {self.mark_start} free len {len(free_index)}" - - if isinstance(free_index, list): - self.mem_state.numpy()[start:end] = free_index - else: - # 从 gpu 到 cpu 的拷贝操作是流内阻塞操作 - self.mem_state[start:end] = free_index - - self.mark_start -= len(free_index) - - self.can_use_mem_size += len(free_index) - self.shared_can_use_token_num.set_value(self.can_use_mem_size) - - if self.can_use_mem_size == len(self.mem_state): - logger.debug(f"freed all gpu mem size {self.can_use_mem_size}") - return + def free(self, free_index: Union[torch.Tensor, List[int]]) -> None: + self.allocator.free(free_index) def free_all(self): - self.can_use_mem_size = len(self.mem_state) - self.shared_can_use_token_num.set_value(self.can_use_mem_size) - self.mem_state.numpy()[:] = list(range(0, len(self.mem_state))) - self.mark_start = 0 - self.mark_end = len(self.mem_state) + self.allocator.free_all() def resize_mem(self, new_size): """ @@ -401,13 +339,7 @@ def resize_mem(self, new_size): layer_num = self.layer_num self.size = new_size - self.mem_state = torch.arange( - 0, self.size, dtype=torch.int32, device="cpu", requires_grad=False, pin_memory=True - ) - self.mark_start = 0 - self.mark_end = self.size - self.can_use_mem_size = self.size - self.shared_can_use_token_num.set_value(self.can_use_mem_size) + self.allocator.resize(new_size) self._free_buffers() self._init_buffers(size, dtype, head_num, head_dim, layer_num) return diff --git a/lightllm/server/router/dynamic_prompt/linear_att_radix_cache.py b/lightllm/server/router/dynamic_prompt/linear_att_radix_cache.py index 73c6dba54d..c7408add39 100644 --- a/lightllm/server/router/dynamic_prompt/linear_att_radix_cache.py +++ b/lightllm/server/router/dynamic_prompt/linear_att_radix_cache.py @@ -568,8 +568,8 @@ def _print_helper(self, node: LinearAttPagedTreeNode, indent): def free_radix_cache_to_get_enough_token(self, need_token_num): assert self.mem_manager is not None - if need_token_num > self.mem_manager.can_use_mem_size: - need_evict_token_num = need_token_num - self.mem_manager.can_use_mem_size + if need_token_num > self.mem_manager.allocator.can_use_mem_size: + need_evict_token_num = need_token_num - self.mem_manager.allocator.can_use_mem_size release_mems = [] small_page_buffer_ids = [] diff --git a/lightllm/server/router/dynamic_prompt/radix_cache.py b/lightllm/server/router/dynamic_prompt/radix_cache.py index 88b099459b..21e26c5854 100644 --- a/lightllm/server/router/dynamic_prompt/radix_cache.py +++ b/lightllm/server/router/dynamic_prompt/radix_cache.py @@ -401,12 +401,6 @@ def merge_unreferenced_nodes(self): if merged_node: worklist.append(merged_node) - def assert_leafs_is_right(self): - for node in self.evict_tree_set: - if node.is_leaf() and node.ref_counter == 0: - a = node.token_mem_index_value.cuda() - assert (self.mem_manager.mem_state[a] == 
1).sum().item() == len(a) - def clear_tree_nodes(self): """ 该函数只在测试时调用 @@ -497,8 +491,8 @@ def _print_helper(self, node: TreeNode, indent): def free_radix_cache_to_get_enough_token(self, need_token_num): assert self.mem_manager is not None - if need_token_num > self.mem_manager.can_use_mem_size: - need_evict_token_num = need_token_num - self.mem_manager.can_use_mem_size + if need_token_num > self.mem_manager.allocator.can_use_mem_size: + need_evict_token_num = need_token_num - self.mem_manager.allocator.can_use_mem_size release_mems = [] def release_mem(mem_index): diff --git a/lightllm/server/router/model_infer/infer_batch.py b/lightllm/server/router/model_infer/infer_batch.py index 3a27c082de..1a5d05784f 100644 --- a/lightllm/server/router/model_infer/infer_batch.py +++ b/lightllm/server/router/model_infer/infer_batch.py @@ -280,8 +280,8 @@ def _filter(self, finished_request_ids: List[int]): f"free a batch state:\n" f"radix refed token num {self.radix_cache.get_refed_tokens_num()}\n" f"radix hold token num {self.radix_cache.get_tree_total_tokens_num()}\n" - f"mem manager can alloc token num {self.req_manager.mem_manager.can_use_mem_size}\n" - f"mem manager total size {self.req_manager.mem_manager.size}" + f"mem manager can alloc token num {self.req_manager.mem_manager.allocator.can_use_mem_size}\n" + f"mem manager total size {self.req_manager.mem_manager.allocator.size}\n" ) return @@ -348,7 +348,7 @@ def get_can_alloc_token_num(self): radix_cache_unref_token_num = ( self.radix_cache.get_tree_total_tokens_num() - self.radix_cache.get_refed_tokens_num() ) - return self.req_manager.mem_manager.can_use_mem_size + radix_cache_unref_token_num + return self.req_manager.mem_manager.allocator.can_use_mem_size + radix_cache_unref_token_num def copy_linear_att_state_to_cache_buffer(self, b_req_idx: torch.Tensor, reqs: List["InferReq"]): """ diff --git a/lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/decode_infer_rpyc.py b/lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/decode_infer_rpyc.py index 696452b419..6d97714b8c 100644 --- a/lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/decode_infer_rpyc.py +++ b/lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/decode_infer_rpyc.py @@ -80,8 +80,8 @@ def _alloc_to_frozen_some_tokens(self, move_task: KVMoveTask): logger.debug( f"radix refed token num {self.backend.radix_cache.get_refed_tokens_num()}\n" f"radix hold token num {self.backend.radix_cache.get_tree_total_tokens_num()}\n" - f"mem manager can alloc token num {self.backend.model.mem_manager.can_use_mem_size}\n" - f"mem manager total size {self.backend.model.mem_manager.size}" + f"mem manager can alloc token num {self.backend.model.mem_manager.allocator.can_use_mem_size}\n" + f"mem manager total size {self.backend.model.mem_manager.allocator.size}\n" f"frozened token num {frozen_token_num}\n" f"estimated peak token num {estimated_peak_token_num}\n" ) From 5696d120fb1eb885928cef294d97b87e499b3b56 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Sat, 9 May 2026 14:27:18 +0000 Subject: [PATCH 2/9] fix --- lightllm/server/api_start.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lightllm/server/api_start.py b/lightllm/server/api_start.py index 8c6af128c8..affddabb4b 100644 --- a/lightllm/server/api_start.py +++ b/lightllm/server/api_start.py @@ -130,6 +130,7 @@ def normal_or_p_d_start(args): args.running_max_req_size = 3 
args.batch_max_tokens = 2048 args.chunked_prefill_size = 1024 + args.embed_cache_storage_size = 0.8 if args.mem_fraction > 0.82: args.mem_fraction = 0.82 args.graph_max_batch_size = 32 From 88c34bbf010ccbfddd993187a5ef2cc9e11cc155 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Sun, 10 May 2026 01:11:12 +0000 Subject: [PATCH 3/9] fix --- lightllm/server/build_prompt.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lightllm/server/build_prompt.py b/lightllm/server/build_prompt.py index 84044fccce..3418277139 100644 --- a/lightllm/server/build_prompt.py +++ b/lightllm/server/build_prompt.py @@ -94,6 +94,15 @@ async def build_prompt(request, tools) -> str: if request.chat_template_kwargs: kwargs.update(request.chat_template_kwargs) + # 修复一些类型是默认打开thinking,但是 tokenizer有时候不知道打开了thinking。导致 + # 构建的reasoning parser 和 tokenizer 的行为不对齐导致的问题。 + from .api_openai import _get_reasoning_from_request + + thinking = _get_reasoning_from_request(request) + + kwargs["thinking"] = thinking + kwargs["enable_thinking"] = thinking + try: input_str = tokenizer.apply_chat_template(**kwargs, tokenize=False, add_generation_prompt=True, tools=tools) except BaseException as e: From dd80a5e29bc9fbf35424db6f9e8e4178a1700da1 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Sun, 10 May 2026 01:11:59 +0000 Subject: [PATCH 4/9] fix --- lightllm/server/httpserver/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 4c049f77c0..d54de63f36 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -529,7 +529,7 @@ async def _encode( if self.args.detail_log: logger.debug( - f"req_id: {sampling_params.group_request_id} prompt: {prompt},\n" + f"req_id: {sampling_params.group_request_id} prompt: {prompt}\n" f"samplingparmas: {sampling_params.to_dict()}\n" f"token_ids: {prompt_ids}" ) From df1d7bed445791e8ca021b5a8587d4596e36d7f0 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Sun, 10 May 2026 05:25:26 +0000 Subject: [PATCH 5/9] fix --- lightllm/server/build_prompt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightllm/server/build_prompt.py b/lightllm/server/build_prompt.py index 3418277139..de893800fd 100644 --- a/lightllm/server/build_prompt.py +++ b/lightllm/server/build_prompt.py @@ -94,7 +94,7 @@ async def build_prompt(request, tools) -> str: if request.chat_template_kwargs: kwargs.update(request.chat_template_kwargs) - # 修复一些类型是默认打开thinking,但是 tokenizer有时候不知道打开了thinking。导致 + # 修复一些parser类型是默认打开thinking,但是 tokenizer有时候不知道打开了thinking。导致 # 构建的reasoning parser 和 tokenizer 的行为不对齐导致的问题。 from .api_openai import _get_reasoning_from_request From eb468035f9c6a564516bce6920b9dbd88f0611ca Mon Sep 17 00:00:00 2001 From: wzj Date: Sun, 10 May 2026 10:26:54 +0000 Subject: [PATCH 6/9] fix mem_fraction calcu --- docs/CN/source/tutorial/api_server_args.rst | 1 - docs/EN/source/tutorial/api_server_args.rst | 1 - lightllm/common/kv_cache_mem_manager/mem_manager.py | 5 +++-- lightllm/server/api_start.py | 4 +--- 4 files changed, 4 insertions(+), 7 deletions(-) diff --git a/docs/CN/source/tutorial/api_server_args.rst b/docs/CN/source/tutorial/api_server_args.rst index 1472f0fc43..6d372fee27 100644 --- a/docs/CN/source/tutorial/api_server_args.rst +++ b/docs/CN/source/tutorial/api_server_args.rst @@ -27,7 +27,6 @@ APIServer 参数详解 - ``running_max_req_size`` 为 3 - ``batch_max_tokens`` 为 2048 (2k) - ``chunked_prefill_size`` 为 1024 (1k) - - ``mem_fraction`` 为 0.85 .. 
option:: --host diff --git a/docs/EN/source/tutorial/api_server_args.rst b/docs/EN/source/tutorial/api_server_args.rst index 66b01c2ffc..ab5143a476 100644 --- a/docs/EN/source/tutorial/api_server_args.rst +++ b/docs/EN/source/tutorial/api_server_args.rst @@ -27,7 +27,6 @@ Basic Configuration Parameters - ``running_max_req_size`` to 3 - ``batch_max_tokens`` to 2048 (2k) - ``chunked_prefill_size`` to 1024 (1k) - - ``mem_fraction`` to 0.85 .. option:: --host diff --git a/lightllm/common/kv_cache_mem_manager/mem_manager.py b/lightllm/common/kv_cache_mem_manager/mem_manager.py index 023be8af9b..f901b72a9e 100755 --- a/lightllm/common/kv_cache_mem_manager/mem_manager.py +++ b/lightllm/common/kv_cache_mem_manager/mem_manager.py @@ -65,9 +65,10 @@ def profile_size(self, mem_fraction): if self.size is not None: return + torch.cuda.empty_cache() world_size = dist.get_world_size() - total_memory = get_total_gpu_memory() - available_memory = get_available_gpu_memory(world_size) - total_memory * (1 - mem_fraction) + + available_memory = get_available_gpu_memory(world_size) * mem_fraction cell_size = self.get_cell_size() self.size = int(available_memory * 1024 ** 3 / cell_size) if world_size > 1: diff --git a/lightllm/server/api_start.py b/lightllm/server/api_start.py index affddabb4b..967112d036 100644 --- a/lightllm/server/api_start.py +++ b/lightllm/server/api_start.py @@ -131,12 +131,10 @@ def normal_or_p_d_start(args): args.batch_max_tokens = 2048 args.chunked_prefill_size = 1024 args.embed_cache_storage_size = 0.8 - if args.mem_fraction > 0.82: - args.mem_fraction = 0.82 args.graph_max_batch_size = 32 logger.info( f"performance_mode is personal, set running_max_req_size to 3," - f"batch_max_tokens to 2048, chunked_prefill_size to 1024, mem_fraction to 0.82," + f"batch_max_tokens to 2048, chunked_prefill_size to 1024," f"graph_max_batch_size to 32" ) From f5fb520d1d233b84d3cd61a37a47a3423fb6e1c1 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Mon, 11 May 2026 01:54:41 +0000 Subject: [PATCH 7/9] fix --- skills/test_model/deepseekr1-mtp-ep/SKILL.md | 145 +++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 skills/test_model/deepseekr1-mtp-ep/SKILL.md diff --git a/skills/test_model/deepseekr1-mtp-ep/SKILL.md b/skills/test_model/deepseekr1-mtp-ep/SKILL.md new file mode 100644 index 0000000000..0b5c8debc6 --- /dev/null +++ b/skills/test_model/deepseekr1-mtp-ep/SKILL.md @@ -0,0 +1,145 @@ +--- +name: test-model-deepseekr1-mtp-ep +description: >- + Runs LightLLM DeepSeek-R1 EP MoE + MTP (EAGLE) server variants and GSM8K lm_eval + against localhost. Requires each full run to use a dedicated log directory: persist every + api_server process log under that tree (per-variant subdirectories recommended), + write the consolidated summary to summary.txt in that same log directory, and keep artifacts + separated from other test runs. Use when running DeepSeek-R1 MTP EP accuracy workflows + or when the user asks to run these four server configurations one-by-one with logged results. 
+--- + +# DeepSeek-R1 MTP + EP MoE 串行评测流程 + +按固定顺序依次启动四种 `api_server` 配置;每次待服务就绪后执行 `lm_eval`。整轮评测须落在**同一日志目录**内归档日志与最终结论(见「日志目录」);具体操作见「启动说明」。 + +## 日志目录(含 `summary.txt`) + +- 每次完整评测(四种变体串行)先选定或新建**一个日志目录**(例如带时间戳或任务名),与其它测试轮次分开,便于区分管理。 +- **所有 `api_server` 进程的标准输出/错误**须写入该目录下文件(建议每种变体单独子目录,如 `variant_01_baseline/`、`variant_02_tpsp_mix/`;或同级命名 `server_01_baseline.log` 等,团队任选其一,保持可追溯)。 +- **`summary.txt` 固定放在该日志目录下**,汇总整轮测试:各变体启动参数摘要、`lm_eval` 关键结果、失败原因与最终对比;**不再**把「最终总结」散落在当前工作目录或其它路径。 +- `lm_eval` 终端输出也要有单独的日志文件(如 `eval_gsm8k.log`),**`summary.txt`** 仍承担**总览结论**角色。 + +## 启动说明 + +本节包含:启动前检查 → 启动服务的命令模板(可变项说明)→ 四种完整 server 命令 → 评测命令。 + +### 启动前检查 + +开跑前先确认资源可用;**不满足则先清理相关进程,再进入后续变体**。 + +1. **显卡占用**:用 `nvidia-smi`(或与集群一致的占用查看方式)检查目标 GPU 是否被无关任务占满;若有冲突进程,结束后再启动本评测。 +2. **端口**:服务固定 **`8089`**;用 `ss -tlnp`、`lsof -i :8089` 等确认**无进程监听**该端口;若已被占用,查出 PID 并结束占用进程后再启动。 + +### 启动服务的命令模板(可变项) + +下列命令中出现的可变项含义如下(其余为固定写法): + +| 可变项 | 含义 | +|--------|------| +| `LOG_DIR` | 本轮评测日志目录,建议**绝对路径**;执行前 `export LOG_DIR=…`。 | +| `MODEL_DIR` | 主模型目录,对应 `--model_dir`;与 `lm_eval` 的 `tokenizer` 必须一致。 | +| `MTP_DRAFT_DIR` | MTP 草稿模型目录,对应 `--mtp_draft_model_dir`。 | +| `server_*.log`、`eval_*.log` | 仅文件名示例,可按变体重命名。 | + +开跑前在同一 shell 中导出三类路径(将引号内整段替换为本机绝对路径;**勿写死下文未给出的机器路径**): + +```bash +export LOG_DIR='〈日志根目录〉' +export MODEL_DIR='〈主模型目录,对应 --model_dir〉' +export MTP_DRAFT_DIR='〈MTP 草稿目录,对应 --mtp_draft_model_dir〉' +``` + +首次试跑可用的**默认路径组合**见「执行约定」;与当前环境不符时再改为用户提供的目录。 + +### 四种 server 启动命令(按顺序逐个测) + +每条 **单独** 跑完「启动 → 等就绪 → 评测 → 写入日志目录下的日志 → 停服务」再进入下一条,不要并行多个 server。以下为**可直接执行**的后台启动形式(已含 `nohup` 与日志重定向);若暂时不需落盘,可自行去掉 `nohup`、`>> … 2>&1 &` 并在前台调试。命令中 **`${MODEL_DIR}`、`${MTP_DRAFT_DIR}`** 须已由上文 `export` 赋值。 + +#### 变体 1:基线(EP + MTP) + +```bash +LOADWORKER=18 NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 \ +nohup python -m lightllm.server.api_server \ + --enable_ep_moe --model_dir "${MODEL_DIR}" --tp 8 --dp 8 --port 8089 \ + --max_total_token_num 60000 --graph_max_batch_size 16 --batch_max_tokens 6000 \ + --mtp_mode eagle_with_att --mtp_draft_model_dir "${MTP_DRAFT_DIR}" --mtp_step 2 \ + >> "${LOG_DIR}/server_01_baseline.log" 2>&1 & +``` + +#### 变体 2:`--enable_tpsp_mix_mode` + +```bash +LOADWORKER=18 NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 \ +nohup python -m lightllm.server.api_server \ + --enable_ep_moe --model_dir "${MODEL_DIR}" --tp 8 --dp 8 --port 8089 \ + --max_total_token_num 60000 --graph_max_batch_size 16 --batch_max_tokens 6000 \ + --mtp_mode eagle_with_att --mtp_draft_model_dir "${MTP_DRAFT_DIR}" --mtp_step 2 \ + --enable_tpsp_mix_mode \ + >> "${LOG_DIR}/server_02_tpsp_mix.log" 2>&1 & +``` + +#### 变体 3:prefill / decode microbatch overlap + +```bash +LOADWORKER=18 NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 \ +nohup python -m lightllm.server.api_server \ + --enable_ep_moe --model_dir "${MODEL_DIR}" --tp 8 --dp 8 --port 8089 \ + --max_total_token_num 60000 --graph_max_batch_size 16 --batch_max_tokens 6000 \ + --mtp_mode eagle_with_att --mtp_draft_model_dir "${MTP_DRAFT_DIR}" --mtp_step 2 \ + --enable_prefill_microbatch_overlap --enable_decode_microbatch_overlap \ + >> "${LOG_DIR}/server_03_overlap.log" 2>&1 & +``` + +#### 变体 4:overlap + `--enable_dp_prefill_balance` + +```bash +LOADWORKER=18 NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 \ +nohup python -m lightllm.server.api_server \ + --enable_ep_moe --model_dir "${MODEL_DIR}" --tp 8 --dp 8 --port 8089 \ + --max_total_token_num 60000 --graph_max_batch_size 16 --batch_max_tokens 6000 \ + --mtp_mode eagle_with_att --mtp_draft_model_dir "${MTP_DRAFT_DIR}" --mtp_step 2 \ + 
--enable_prefill_microbatch_overlap --enable_decode_microbatch_overlap \ + --enable_dp_prefill_balance \ + >> "${LOG_DIR}/server_04_overlap_dp_balance.log" 2>&1 & +``` + +### 评测命令(每个变体各执行一次) + +服务就绪后执行(本地回环走代理时用 `no_proxy` / `NO_PROXY` 排除本机)。**`model_args` 中 `tokenizer` 必须与本次 server 的 `--model_dir`(即 **`${MODEL_DIR}`**)为同一字符串路径**。以下为带日志落盘的**完整命令**(`--model_args` 使用双引号以便展开 **`${MODEL_DIR}`**): + +```bash +HF_ALLOW_CODE_EVAL=1 HF_DATASETS_OFFLINE=0 \ +no_proxy=127.0.0.1,localhost,::1 \ +lm_eval --model local-completions \ + --model_args "{\"model\":\"deepseek-ai/DeepSeek-R1\", \"base_url\":\"http://localhost:8089/v1/completions\", \"max_length\": 16384, \"tokenizer\":\"${MODEL_DIR}\"}" \ + --tasks gsm8k --batch_size 32 --confirm_run_unsafe_code \ + >> "${LOG_DIR}/eval_gsm8k.log" 2>&1 +``` + +- **`LOG_DIR`**:与启动服务一节相同;若仅调试不重定向,去掉 `\` 续行及最后的 `>> "${LOG_DIR}/eval_gsm8k.log" 2>&1` 即可在前台查看输出。 +- **`MODEL_DIR`**:须与 server 启动命令中的 `--model_dir` 一致;路径随环境变化时的默认试跑与向用户确认见「执行约定」。 +- 若环境需要,可同时设置 `NO_PROXY=127.0.0.1,localhost,::1`(或与团队约定一致的列表)。 + +## 执行约定(不要额外写“专用启动脚本”) + +**模型与 MTP 目录(随环境变化)**:`MODEL_DIR`(主模型)、`MTP_DRAFT_DIR`(MTP 草稿)在不同机器上路径不同。**首轮试跑**可先用下列默认组合(与本文档常见部署对应;若本机不存在则跳过默认、直接执行下一步「向用户确认」): + +```bash +export MODEL_DIR=/mtc/models/DeepSeek-R1 +export MTP_DRAFT_DIR=/mtc/models/DeepSeek-R1-NextN +``` + +若按默认路径 **export** 后仍无法启动服务,或日志中出现**明确的模型路径 / 权重加载 / 文件不存在**等错误,**不要反复盲试**:根据日志判断为路径问题时,**请用户提供**当前环境下实际的主模型目录与 MTP 草稿目录,更新 `export MODEL_DIR=…`、`export MTP_DRAFT_DIR=…` 后再执行(且保证 **`MODEL_DIR` 与 `lm_eval` 的 `tokenizer` 仍为同一路径**)。 + +1. **后台启动 server**:用 shell 后台或终端任务跑当前变体的 `python -m lightllm.server.api_server ...`,**并将该进程输出重定向到本轮日志目录下的日志文件**(见上文「日志目录(含 summary.txt)」);排查问题时 tail 该文件,而不是依赖未落盘的终端缓冲。 +2. **不要用 health 接口** 判断就绪;改为探测 **端口 8089 是否处于 listen**(例如 `ss -tlnp` / `lsof -i :8089` 等,与系统一致即可)。 +3. **等待启动**:若端口未就绪,约 **每 20 秒** 查看一次**该变体对应的服务日志文件**,区分仍在启动还是已报错退出;报错则写入日志目录下的 `summary.txt`(或先写变体日志再在该汇总文件中引用)并停止该变体,不要继续盲等。 +4. **维护 `summary.txt`**:位于**日志目录**;随进度追加每个变体的标记块——**本条使用的完整启动命令**(或等价摘要)、**端口检测结果**、**lm_eval 关键输出**;全部结束后在该文件内写**最终汇总**(各配置成败、指标对比或失败原因)。可与用户口头摘要对照,但以日志目录中 **`summary.txt`** 为归档准绳。 +5. **变体之间**:停止上一进程的 server,再启动下一变体(避免端口占用)。 +6. 
**全部完成后**:确认日志目录下的 **`summary.txt`** 已包含完整最终总结;原始 server / eval 日志保留在同目录(或子目录)中备查。 + +## 输出文件 + +- **`summary.txt`**:仅位于**本轮日志目录**,作为整次四变体测试的**最终总结**文档。 +- **服务与评测日志**:全部落在**同一日志目录**(建议按变体分子目录或分文件名),不得与未指定目录混写。 From c7e701a58ca249dafe0b43ed36c3fd6d3a635be9 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Mon, 11 May 2026 02:05:59 +0000 Subject: [PATCH 8/9] fix --- skills/test_model/deepseekr1-mtp-ep/SKILL.md | 4 ++++ test/acc/test_deepseekr1_mtp_ep.sh | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/skills/test_model/deepseekr1-mtp-ep/SKILL.md b/skills/test_model/deepseekr1-mtp-ep/SKILL.md index 0b5c8debc6..dd6da77c3b 100644 --- a/skills/test_model/deepseekr1-mtp-ep/SKILL.md +++ b/skills/test_model/deepseekr1-mtp-ep/SKILL.md @@ -63,6 +63,7 @@ LOADWORKER=18 NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 \ nohup python -m lightllm.server.api_server \ --enable_ep_moe --model_dir "${MODEL_DIR}" --tp 8 --dp 8 --port 8089 \ --max_total_token_num 60000 --graph_max_batch_size 16 --batch_max_tokens 6000 \ + --max_req_total_len 56000 \ --mtp_mode eagle_with_att --mtp_draft_model_dir "${MTP_DRAFT_DIR}" --mtp_step 2 \ >> "${LOG_DIR}/server_01_baseline.log" 2>&1 & ``` @@ -74,6 +75,7 @@ LOADWORKER=18 NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 \ nohup python -m lightllm.server.api_server \ --enable_ep_moe --model_dir "${MODEL_DIR}" --tp 8 --dp 8 --port 8089 \ --max_total_token_num 60000 --graph_max_batch_size 16 --batch_max_tokens 6000 \ + --max_req_total_len 56000 \ --mtp_mode eagle_with_att --mtp_draft_model_dir "${MTP_DRAFT_DIR}" --mtp_step 2 \ --enable_tpsp_mix_mode \ >> "${LOG_DIR}/server_02_tpsp_mix.log" 2>&1 & @@ -86,6 +88,7 @@ LOADWORKER=18 NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 \ nohup python -m lightllm.server.api_server \ --enable_ep_moe --model_dir "${MODEL_DIR}" --tp 8 --dp 8 --port 8089 \ --max_total_token_num 60000 --graph_max_batch_size 16 --batch_max_tokens 6000 \ + --max_req_total_len 56000 \ --mtp_mode eagle_with_att --mtp_draft_model_dir "${MTP_DRAFT_DIR}" --mtp_step 2 \ --enable_prefill_microbatch_overlap --enable_decode_microbatch_overlap \ >> "${LOG_DIR}/server_03_overlap.log" 2>&1 & @@ -98,6 +101,7 @@ LOADWORKER=18 NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 \ nohup python -m lightllm.server.api_server \ --enable_ep_moe --model_dir "${MODEL_DIR}" --tp 8 --dp 8 --port 8089 \ --max_total_token_num 60000 --graph_max_batch_size 16 --batch_max_tokens 6000 \ + --max_req_total_len 56000 \ --mtp_mode eagle_with_att --mtp_draft_model_dir "${MTP_DRAFT_DIR}" --mtp_step 2 \ --enable_prefill_microbatch_overlap --enable_decode_microbatch_overlap \ --enable_dp_prefill_balance \ diff --git a/test/acc/test_deepseekr1_mtp_ep.sh b/test/acc/test_deepseekr1_mtp_ep.sh index c1dd3bc508..29c7515b27 100644 --- a/test/acc/test_deepseekr1_mtp_ep.sh +++ b/test/acc/test_deepseekr1_mtp_ep.sh @@ -1,19 +1,19 @@ -LOADWORKER=18 NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 python -m lightllm.server.api_server --enable_ep_moe --model_dir /mtc/models/DeepSeek-R1 --tp 8 --dp 8 --port 8089 --max_total_token_num 60000 --graph_max_batch_size 16 --batch_max_tokens 6000 --mtp_mode eagle_with_att --mtp_draft_model_dir /mtc/models/DeepSeek-R1-NextN --mtp_step 2 +LOADWORKER=18 NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 python -m lightllm.server.api_server --enable_ep_moe --model_dir /mtc/models/DeepSeek-R1 --tp 8 --dp 8 --port 8089 --max_total_token_num 60000 --graph_max_batch_size 16 --batch_max_tokens 6000 --max_req_total_len 56000 --mtp_mode eagle_with_att --mtp_draft_model_dir /mtc/models/DeepSeek-R1-NextN --mtp_step 2 
HF_ALLOW_CODE_EVAL=1 HF_DATASETS_OFFLINE=0 lm_eval --model local-completions --model_args '{"model":"deepseek-ai/DeepSeek-R1", "base_url":"http://localhost:8089/v1/completions", "max_length": 16384}' --tasks gsm8k --batch_size 32 --confirm_run_unsafe_code -LOADWORKER=18 NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 python -m lightllm.server.api_server --enable_ep_moe --model_dir /mtc/models/DeepSeek-R1 --tp 8 --dp 8 --port 8089 --max_total_token_num 60000 --graph_max_batch_size 16 --batch_max_tokens 6000 --mtp_mode eagle_with_att --mtp_draft_model_dir /mtc/models/DeepSeek-R1-NextN --mtp_step 2 --enable_tpsp_mix_mode +LOADWORKER=18 NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 python -m lightllm.server.api_server --enable_ep_moe --model_dir /mtc/models/DeepSeek-R1 --tp 8 --dp 8 --port 8089 --max_total_token_num 60000 --graph_max_batch_size 16 --batch_max_tokens 6000 --max_req_total_len 56000 --mtp_mode eagle_with_att --mtp_draft_model_dir /mtc/models/DeepSeek-R1-NextN --mtp_step 2 --enable_tpsp_mix_mode HF_ALLOW_CODE_EVAL=1 HF_DATASETS_OFFLINE=0 lm_eval --model local-completions --model_args '{"model":"deepseek-ai/DeepSeek-R1", "base_url":"http://localhost:8089/v1/completions", "max_length": 16384}' --tasks gsm8k --batch_size 32 --confirm_run_unsafe_code -LOADWORKER=18 NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 python -m lightllm.server.api_server --enable_ep_moe --model_dir /mtc/models/DeepSeek-R1 --tp 8 --dp 8 --port 8089 --max_total_token_num 60000 --graph_max_batch_size 16 --batch_max_tokens 6000 --mtp_mode eagle_with_att --mtp_draft_model_dir /mtc/models/DeepSeek-R1-NextN --mtp_step 2 --enable_prefill_microbatch_overlap --enable_decode_microbatch_overlap +LOADWORKER=18 NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 python -m lightllm.server.api_server --enable_ep_moe --model_dir /mtc/models/DeepSeek-R1 --tp 8 --dp 8 --port 8089 --max_total_token_num 60000 --graph_max_batch_size 16 --batch_max_tokens 6000 --max_req_total_len 56000 --mtp_mode eagle_with_att --mtp_draft_model_dir /mtc/models/DeepSeek-R1-NextN --mtp_step 2 --enable_prefill_microbatch_overlap --enable_decode_microbatch_overlap HF_ALLOW_CODE_EVAL=1 HF_DATASETS_OFFLINE=0 lm_eval --model local-completions --model_args '{"model":"deepseek-ai/DeepSeek-R1", "base_url":"http://localhost:8089/v1/completions", "max_length": 16384}' --tasks gsm8k --batch_size 32 --confirm_run_unsafe_code -LOADWORKER=18 NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 python -m lightllm.server.api_server --enable_ep_moe --model_dir /mtc/models/DeepSeek-R1 --tp 8 --dp 8 --port 8089 --max_total_token_num 60000 --graph_max_batch_size 16 --batch_max_tokens 6000 --mtp_mode eagle_with_att --mtp_draft_model_dir /mtc/models/DeepSeek-R1-NextN --mtp_step 2 --enable_prefill_microbatch_overlap --enable_decode_microbatch_overlap --enable_dp_prefill_balance +LOADWORKER=18 NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 python -m lightllm.server.api_server --enable_ep_moe --model_dir /mtc/models/DeepSeek-R1 --tp 8 --dp 8 --port 8089 --max_total_token_num 60000 --graph_max_batch_size 16 --batch_max_tokens 6000 --max_req_total_len 56000 --mtp_mode eagle_with_att --mtp_draft_model_dir /mtc/models/DeepSeek-R1-NextN --mtp_step 2 --enable_prefill_microbatch_overlap --enable_decode_microbatch_overlap --enable_dp_prefill_balance HF_ALLOW_CODE_EVAL=1 HF_DATASETS_OFFLINE=0 lm_eval --model local-completions --model_args '{"model":"deepseek-ai/DeepSeek-R1", "base_url":"http://localhost:8089/v1/completions", "max_length": 16384}' --tasks gsm8k --batch_size 32 --confirm_run_unsafe_code From 4507e5a3f0c1d4c4f27787a39e902f1a0b61fc36 Mon 
Sep 17 00:00:00 2001 From: wangzaijun Date: Mon, 11 May 2026 02:24:20 +0000 Subject: [PATCH 9/9] fix --- skills/test_model/deepseekr1-mtp-tp/SKILL.md | 104 +++++++++++++++++++ test/acc/test_deepseekr1_mtp.sh | 2 +- 2 files changed, 105 insertions(+), 1 deletion(-) create mode 100644 skills/test_model/deepseekr1-mtp-tp/SKILL.md diff --git a/skills/test_model/deepseekr1-mtp-tp/SKILL.md b/skills/test_model/deepseekr1-mtp-tp/SKILL.md new file mode 100644 index 0000000000..275d9316a6 --- /dev/null +++ b/skills/test_model/deepseekr1-mtp-tp/SKILL.md @@ -0,0 +1,104 @@ +--- +name: test-model-deepseekr1-mtp-tp +description: >- + DeepSeek-R1 MTP-TP test: LightLLM api_server with MTP (EAGLE) draft, tensor parallel + only (--tp 8, no --dp, no EP MoE), plus GSM8K lm_eval on localhost. Distinct from the + MTP-EP-TPDP skill which uses --tp 8 --dp 8 and EP MoE. Requires a dedicated log directory, + summary.txt, tokenizer aligned with MODEL_DIR. Use for TP-only MTP gsm8k accuracy runs. +--- + +# DeepSeek-R1 **MTP–TP**(仅张量并行 `--tp 8`,无 DP / 无 EP)本地 GSM8K 评测 + +**测试标识**:并行方式为 **`--tp 8` 单路 TP**,不包含 **`--dp`** 与 **`--enable_ep_moe`**。用于与 **MTP–EP–TPDP**(`--tp 8 --dp 8` + EP MoE)流水线区分。 + +启动一组 `api_server`(`eagle_with_att` MTP),待就绪后对同一进程执行一次 `lm_eval`(任务 `gsm8k`)。全过程产物落在**同一日志目录**(见「日志目录」);命令与流程见「启动说明」。 + +## 日志目录(含 `summary.txt`) + +- 先选定或新建**一个日志目录**(例如带时间戳或任务名),与其它测试轮次分开。 +- **`api_server` 的标准输出/错误**写入该目录下文件(示例文件名 `server_mtp_tp.log`;可按团队习惯改名或分子目录)。 +- **`summary.txt` 固定放在该日志目录下**,写入本轮启动参数摘要、`lm_eval` 关键结果与简要结论。 +- `lm_eval` 终端输出建议单独落盘(如 `eval_gsm8k.log`);**`summary.txt`** 仍为整次任务的**总览结论**。 + +## 启动说明 + +本节包含:启动前检查 → 启动服务的命令模板(可变项说明)→ 一条完整 server 命令 → 评测命令。 + +### 启动前检查 + +开跑前先确认资源可用;**不满足则先清理相关进程**。 + +1. **显卡占用**:用 `nvidia-smi`(或与集群一致的占用查看方式)检查目标 GPU 是否被无关任务占满;若有冲突进程,结束后再启动本评测。 +2. 
**端口**:服务固定 **`8089`**;用 `ss -tlnp`、`lsof -i :8089` 等确认**无进程监听**该端口;若已被占用,查出 PID 并结束占用进程后再启动。 + +### 启动服务的命令模板(可变项) + +下列符号与 EP–TPDP 版评测共用含义: + +| 可变项 | 含义 | +|--------|------| +| `LOG_DIR` | 本轮评测日志目录,建议**绝对路径**;执行前 `export LOG_DIR=…`。 | +| `MODEL_DIR` | 主模型目录,对应 `--model_dir`;与 `lm_eval` 的 `tokenizer` 必须一致。 | +| `MTP_DRAFT_DIR` | MTP 草稿模型目录,对应 `--mtp_draft_model_dir`。 | + +开跑前在同一 shell 中导出路径(引号内替换为本机绝对路径): + +```bash +export LOG_DIR='〈日志根目录〉' +export MODEL_DIR='〈主模型目录,对应 --model_dir〉' +export MTP_DRAFT_DIR='〈MTP 草稿目录,对应 --mtp_draft_model_dir〉' +``` + +首次试跑可用的**默认路径组合**见「执行约定」。 + +### 一条 server 启动命令(后台落盘) + +以下为 **MTP–TP** 固定形态:**`--tp 8`**,**无 `--dp`**。可直接执行的后台形式(已含 `nohup` 与日志重定向);调试时可去掉 `nohup` 与 `>> … 2>&1 &` 改前台。**`${MODEL_DIR}`、`${MTP_DRAFT_DIR}`、`${LOG_DIR}`** 须已由上文 `export` 赋值。 + +```bash +LOADWORKER=18 \ +nohup python -m lightllm.server.api_server \ + --model_dir "${MODEL_DIR}" --tp 8 --port 8089 \ + --mem_fraction 0.75 --batch_max_tokens 6000 \ + --mtp_mode eagle_with_att --mtp_draft_model_dir "${MTP_DRAFT_DIR}" --mtp_step 2 \ + >> "${LOG_DIR}/server_mtp_tp.log" 2>&1 & +``` + +### 评测命令(服务就绪后执行一次) + +本地回环需排除代理:`no_proxy` / `NO_PROXY`。**`tokenizer` 与 `--model_dir`(`${MODEL_DIR}`)须为同一路径**。以下为带日志落盘的**完整命令**: + +```bash +HF_ALLOW_CODE_EVAL=1 HF_DATASETS_OFFLINE=0 \ +no_proxy=127.0.0.1,localhost,::1 \ +lm_eval --model local-completions \ + --model_args "{\"model\":\"deepseek-ai/DeepSeek-R1\", \"base_url\":\"http://localhost:8089/v1/completions\", \"max_length\": 16384, \"tokenizer\":\"${MODEL_DIR}\"}" \ + --tasks gsm8k --batch_size 500 --confirm_run_unsafe_code \ + >> "${LOG_DIR}/eval_gsm8k.log" 2>&1 +``` + +- **`LOG_DIR`**:与 server 一节一致;若仅调试不重定向,可去掉末尾 `>> "${LOG_DIR}/eval_gsm8k.log" 2>&1`。 +- **`MODEL_DIR`**:与 server 的 `--model_dir` 一致;默认试跑与用户确认路径见「执行约定」。 +- 若环境需要,可同时设置 `NO_PROXY=127.0.0.1,localhost,::1`。 + +## 执行约定(不要额外写“专用启动脚本”) + +**模型与 MTP 目录(随环境变化)**:`MODEL_DIR`、`MTP_DRAFT_DIR` 在不同机器上路径不同。**首轮试跑**可先使用: + +```bash +export MODEL_DIR=/mtc/models/DeepSeek-R1 +export MTP_DRAFT_DIR=/mtc/models/DeepSeek-R1-NextN +``` + +若默认路径不存在或服务报错指向路径/权重加载失败,**请用户提供**本机实际目录并更新两个 `export`;**保持 `MODEL_DIR` 与 `lm_eval` 中 `tokenizer` 一致**。 + +1. **后台启动 server**:将 `api_server` 输出重定向到日志目录下文件(见「日志目录」);排查时用 `tail` 查看该日志。 +2. **不要用 health 接口** 判断就绪;改为探测 **端口 8089 是否 listen**(例如 `ss -tlnp` / `lsof -i :8089`)。 +3. **等待启动**:端口未就绪时约 **每 20 秒** 查看服务日志,区分仍在启动或已报错;路径类错误按上文向用户确认目录。 +4. **维护 `summary.txt`**:记录完整启动命令摘要(须能看出 **`--tp 8`、无 `--dp`**)、端口检测结果、`lm_eval` 关键输出与最终结论。 +5. 
**全部完成后**:确认 **`summary.txt`** 完整;server / eval 原始日志保留在同一日志目录备查。 + +## 输出文件 + +- **`summary.txt`**:位于**本轮日志目录**,作为本次 **MTP–TP** 评测的**最终总结**。 +- **服务与评测日志**:与 **`summary.txt`** 落在**同一日志目录**。 diff --git a/test/acc/test_deepseekr1_mtp.sh b/test/acc/test_deepseekr1_mtp.sh index ec4ce71984..7b511e41ac 100644 --- a/test/acc/test_deepseekr1_mtp.sh +++ b/test/acc/test_deepseekr1_mtp.sh @@ -1,6 +1,6 @@ LOADWORKER=18 python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-R1 --tp 8 --port 8089 --mem_fraction 0.75 --batch_max_tokens 6000 --mtp_mode eagle_with_att --mtp_draft_model_dir /mtc/models/DeepSeek-R1-NextN --mtp_step 2 -HF_ALLOW_CODE_EVAL=1 HF_DATASETS_OFFLINE=0 lm_eval --model local-completions --model_args '{"model":"deepseek-ai/DeepSeek-R1", "base_url":"http://localhost:8089/v1/completions", "max_length": 16384}' --tasks gsm8k --batch_size 500 --confirm_run_unsafe_code +HF_ALLOW_CODE_EVAL=1 HF_DATASETS_OFFLINE=0 lm_eval --model local-completions --model_args '{"model":"deepseek-ai/DeepSeek-R1", "base_url":"http://localhost:8089/v1/completions", "max_length": 16384, "tokenizer":"/mtc/models/DeepSeek-R1"}' --tasks gsm8k --batch_size 500 --confirm_run_unsafe_code # 帮我写一段提示词,告诉AI单独一个一个的进行上述测试的启动服务,然后再执行评测脚本,将结果写入out.txt 中,注意需要标记启动的参数和结果信息。不要用health 接口去判断服务是否启动,直接探测端口是否处于listen状态即可, 执行评测命令的时候,需要用no_proxy 将本地local ip 排除。 # 不要写额外的脚本来启动服务,就是单独一个一个的按照上面的描述启动服务,然后再执行评测脚本,然后注意等待服务启动完成,可以20s检测一次其控制台输出,看是否启动完成,还是启动报错。
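
Reviewer note on PATCH 1/9: `lightllm/common/kv_cache_mem_manager/allocator.py` is not included in this series, so the `KvCacheAllocator` that `MemoryManager.allocator` now delegates to has to be inferred. The sketch below is a minimal, assumption-laden reconstruction: it simply absorbs the bookkeeping the patch removes from `MemoryManager` (the pinned `mem_state` free list, the `_mem_state_return` ring buffer, and the `SharedInt` counter). The constructor signature and the `resize()` method name are guesses taken from the call sites `KvCacheAllocator(self.size)`, `allocator.alloc/free/free_all/resize`, and the `allocator.can_use_mem_size` / `allocator.size` reads in the radix caches and the decode RPyC handler; the real file may differ.

```python
# Hypothetical sketch of lightllm/common/kv_cache_mem_manager/allocator.py (NOT part of this diff).
# It mirrors the state and methods that PATCH 1/9 removes from MemoryManager.
from typing import List, Union

import torch

from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt
from lightllm.utils.dist_utils import get_current_rank_in_node
from lightllm.utils.envs_utils import get_unique_server_name


class KvCacheAllocator:
    def __init__(self, size: int):
        self.size = size
        # Free list of token slot indices, kept in pinned CPU memory.
        self.mem_state = torch.arange(0, size, dtype=torch.int32, device="cpu", pin_memory=True)
        # Ring buffer used to hand out index slices without racing asynchronous consumers.
        self._mem_state_return = torch.arange(0, size * 3, dtype=torch.int32, device="cpu", pin_memory=True)
        self._return_start = 0
        self.mark_start = 0
        self.mark_end = size
        self.can_use_mem_size = size
        # Shared counter read by the router for scheduling estimates (key kept verbatim
        # from the original MemoryManager code so existing readers still find it).
        rank_in_node = get_current_rank_in_node()
        self.shared_can_use_token_num = SharedInt(
            f"{get_unique_server_name()}_mem_manger_can_use_token_num_{rank_in_node}"
        )
        self.shared_can_use_token_num.set_value(self.can_use_mem_size)

    def alloc(self, need_size: int) -> torch.Tensor:
        assert need_size <= self.mark_end - self.mark_start, "error alloc state"
        start, end = self.mark_start, self.mark_start + need_size
        self.mark_start += need_size
        self.can_use_mem_size -= need_size
        self.shared_can_use_token_num.set_value(self.can_use_mem_size)
        # Return through the ring buffer to avoid memory races in async use.
        if self._return_start + need_size > self._mem_state_return.shape[0]:
            self._return_start = 0
        ans = self._mem_state_return[self._return_start : self._return_start + need_size]
        ans.copy_(self.mem_state[start:end])
        self._return_start += need_size
        return ans

    def free(self, free_index: Union[torch.Tensor, List[int]]) -> None:
        end = self.mark_start
        start = self.mark_start - len(free_index)
        assert start >= 0, "error free state"
        if isinstance(free_index, list):
            self.mem_state.numpy()[start:end] = free_index
        else:
            # GPU -> CPU copy; blocks within the current stream.
            self.mem_state[start:end] = free_index
        self.mark_start -= len(free_index)
        self.can_use_mem_size += len(free_index)
        self.shared_can_use_token_num.set_value(self.can_use_mem_size)

    def free_all(self) -> None:
        self.mem_state.numpy()[:] = list(range(0, self.size))
        self.mark_start = 0
        self.mark_end = self.size
        self.can_use_mem_size = self.size
        self.shared_can_use_token_num.set_value(self.can_use_mem_size)

    def resize(self, new_size: int) -> None:
        # Rebuild the free list for the new capacity, mirroring the old resize_mem path.
        self.size = new_size
        self.mem_state = torch.arange(0, new_size, dtype=torch.int32, device="cpu", pin_memory=True)
        self._mem_state_return = torch.arange(0, new_size * 3, dtype=torch.int32, device="cpu", pin_memory=True)
        self._return_start = 0
        self.mark_start = 0
        self.mark_end = new_size
        self.can_use_mem_size = new_size
        self.shared_can_use_token_num.set_value(self.can_use_mem_size)
```

If the sketch is close to the real implementation, the refactor is behavior-preserving: the `SharedInt` update stays inside the allocator, so the router's can-use-token estimate is maintained exactly as before, and callers only change from `mem_manager.can_use_mem_size` to `mem_manager.allocator.can_use_mem_size`.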