diff --git a/docs/CN/source/tutorial/api_server_args.rst b/docs/CN/source/tutorial/api_server_args.rst index 1472f0fc43..6d372fee27 100644 --- a/docs/CN/source/tutorial/api_server_args.rst +++ b/docs/CN/source/tutorial/api_server_args.rst @@ -27,7 +27,6 @@ APIServer 参数详解 - ``running_max_req_size`` 为 3 - ``batch_max_tokens`` 为 2048 (2k) - ``chunked_prefill_size`` 为 1024 (1k) - - ``mem_fraction`` 为 0.85 .. option:: --host diff --git a/docs/EN/source/tutorial/api_server_args.rst b/docs/EN/source/tutorial/api_server_args.rst index 66b01c2ffc..ab5143a476 100644 --- a/docs/EN/source/tutorial/api_server_args.rst +++ b/docs/EN/source/tutorial/api_server_args.rst @@ -27,7 +27,6 @@ Basic Configuration Parameters - ``running_max_req_size`` to 3 - ``batch_max_tokens`` to 2048 (2k) - ``chunked_prefill_size`` to 1024 (1k) - - ``mem_fraction`` to 0.85 .. option:: --host diff --git a/lightllm/common/kv_cache_mem_manager/__init__.py b/lightllm/common/kv_cache_mem_manager/__init__.py index 2609cfb8ab..05544e149a 100644 --- a/lightllm/common/kv_cache_mem_manager/__init__.py +++ b/lightllm/common/kv_cache_mem_manager/__init__.py @@ -1,3 +1,4 @@ +from .allocator import KvCacheAllocator from .mem_manager import MemoryManager, ReadOnlyStaticsMemoryManager from .ppl_int8kv_mem_manager import PPLINT8KVMemoryManager from .ppl_int4kv_mem_manager import PPLINT4KVMemoryManager @@ -9,6 +10,7 @@ from .qwen3next_mem_manager import Qwen3NextMemManager __all__ = [ + "KvCacheAllocator", "MemoryManager", "ReadOnlyStaticsMemoryManager", "PPLINT4KVMemoryManager", diff --git a/lightllm/common/kv_cache_mem_manager/mem_manager.py b/lightllm/common/kv_cache_mem_manager/mem_manager.py index 0cc84db322..f901b72a9e 100755 --- a/lightllm/common/kv_cache_mem_manager/mem_manager.py +++ b/lightllm/common/kv_cache_mem_manager/mem_manager.py @@ -3,10 +3,11 @@ import torch import torch.distributed as dist import torch.multiprocessing as mp -from typing import List, Union, Tuple, Any +from typing import List, Tuple, Any, Union from lightllm.server.pd_io_struct import KVMoveTask from lightllm.utils.log_utils import init_logger from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt +from .allocator import KvCacheAllocator from lightllm.utils.profile_max_tokens import get_available_gpu_memory, get_total_gpu_memory from lightllm.common.kv_trans_kernel.kv_trans import kv_trans from lightllm.utils.dist_utils import get_current_rank_in_node, get_node_world_size @@ -38,27 +39,8 @@ def __init__(self, size, dtype, head_num, head_dim, layer_num, always_copy=False # profile the max total token num if the size is None self.profile_size(mem_fraction) - self.mem_state = torch.arange( - 0, self.size, dtype=torch.int32, device="cpu", requires_grad=False, pin_memory=True - ) - self._mem_state_return = torch.arange( - 0, self.size * 3, dtype=torch.int32, device="cpu", requires_grad=False, pin_memory=True - ) - self._return_start = 0 - self.mark_start = 0 - self.mark_end = self.size - - self.can_use_mem_size = self.size - - # 用共享内存进行共享,router 模块读取进行精确的调度估计, nccl port 作为一个单机中单实列的标记。防止冲突。 - from lightllm.utils.envs_utils import get_unique_server_name + self.allocator = KvCacheAllocator(self.size) - rank_in_node = get_current_rank_in_node() - self.shared_can_use_token_num = SharedInt( - f"{get_unique_server_name()}_mem_manger_can_use_token_num_{rank_in_node}" - ) - - self.shared_can_use_token_num.set_value(self.can_use_mem_size) self._init_buffers( self.size, dtype, @@ -83,9 +65,10 @@ def profile_size(self, mem_fraction): if self.size is not None: 
return + torch.cuda.empty_cache() world_size = dist.get_world_size() - total_memory = get_total_gpu_memory() - available_memory = get_available_gpu_memory(world_size) - total_memory * (1 - mem_fraction) + + available_memory = get_available_gpu_memory(world_size) * mem_fraction cell_size = self.get_cell_size() self.size = int(available_memory * 1024 ** 3 / cell_size) if world_size > 1: @@ -338,57 +321,13 @@ def _free_buffers(self): self.kv_buffer = None def alloc(self, need_size) -> torch.Tensor: - if need_size > self.mark_end - self.mark_start: - logger.error(f"warn no enough cache need_size {need_size} left_size {self.can_use_mem_size}") - assert False, "error alloc state" - - start = self.mark_start - end = self.mark_start + need_size - self.mark_start += need_size - - self.can_use_mem_size -= need_size - self.shared_can_use_token_num.set_value(self.can_use_mem_size) - - # 利用缓冲区返回,避免异步情况下的内存竞争 - if self._return_start + need_size > self._mem_state_return.shape[0]: - self._return_start = 0 - ans = self._mem_state_return[self._return_start : self._return_start + need_size] - ans.copy_(self.mem_state[start:end]) - self._return_start += need_size - return ans - - def free(self, free_index: Union[torch.Tensor, List[int]]): - """_summary_ - - Args: - free_index (torch.Tensor): _description_ - """ - - end = self.mark_start - start = self.mark_start - len(free_index) - assert start >= 0, f"error free state start: {self.mark_start} free len {len(free_index)}" - - if isinstance(free_index, list): - self.mem_state.numpy()[start:end] = free_index - else: - # 从 gpu 到 cpu 的拷贝操作是流内阻塞操作 - self.mem_state[start:end] = free_index - - self.mark_start -= len(free_index) + return self.allocator.alloc(need_size) - self.can_use_mem_size += len(free_index) - self.shared_can_use_token_num.set_value(self.can_use_mem_size) - - if self.can_use_mem_size == len(self.mem_state): - logger.debug(f"freed all gpu mem size {self.can_use_mem_size}") - return + def free(self, free_index: Union[torch.Tensor, List[int]]) -> None: + self.allocator.free(free_index) def free_all(self): - self.can_use_mem_size = len(self.mem_state) - self.shared_can_use_token_num.set_value(self.can_use_mem_size) - self.mem_state.numpy()[:] = list(range(0, len(self.mem_state))) - self.mark_start = 0 - self.mark_end = len(self.mem_state) + self.allocator.free_all() def resize_mem(self, new_size): """ @@ -401,13 +340,7 @@ def resize_mem(self, new_size): layer_num = self.layer_num self.size = new_size - self.mem_state = torch.arange( - 0, self.size, dtype=torch.int32, device="cpu", requires_grad=False, pin_memory=True - ) - self.mark_start = 0 - self.mark_end = self.size - self.can_use_mem_size = self.size - self.shared_can_use_token_num.set_value(self.can_use_mem_size) + self.allocator.resize(new_size) self._free_buffers() self._init_buffers(size, dtype, head_num, head_dim, layer_num) return diff --git a/lightllm/server/api_start.py b/lightllm/server/api_start.py index 8c6af128c8..967112d036 100644 --- a/lightllm/server/api_start.py +++ b/lightllm/server/api_start.py @@ -130,12 +130,11 @@ def normal_or_p_d_start(args): args.running_max_req_size = 3 args.batch_max_tokens = 2048 args.chunked_prefill_size = 1024 - if args.mem_fraction > 0.82: - args.mem_fraction = 0.82 + args.embed_cache_storage_size = 0.8 args.graph_max_batch_size = 32 logger.info( f"performance_mode is personal, set running_max_req_size to 3," - f"batch_max_tokens to 2048, chunked_prefill_size to 1024, mem_fraction to 0.82," + f"batch_max_tokens to 2048, chunked_prefill_size to 1024," 
f"graph_max_batch_size to 32" ) diff --git a/lightllm/server/build_prompt.py b/lightllm/server/build_prompt.py index 84044fccce..de893800fd 100644 --- a/lightllm/server/build_prompt.py +++ b/lightllm/server/build_prompt.py @@ -94,6 +94,15 @@ async def build_prompt(request, tools) -> str: if request.chat_template_kwargs: kwargs.update(request.chat_template_kwargs) + # 修复一些parser类型是默认打开thinking,但是 tokenizer有时候不知道打开了thinking。导致 + # 构建的reasoning parser 和 tokenizer 的行为不对齐导致的问题。 + from .api_openai import _get_reasoning_from_request + + thinking = _get_reasoning_from_request(request) + + kwargs["thinking"] = thinking + kwargs["enable_thinking"] = thinking + try: input_str = tokenizer.apply_chat_template(**kwargs, tokenize=False, add_generation_prompt=True, tools=tools) except BaseException as e: diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 4c049f77c0..d54de63f36 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -529,7 +529,7 @@ async def _encode( if self.args.detail_log: logger.debug( - f"req_id: {sampling_params.group_request_id} prompt: {prompt},\n" + f"req_id: {sampling_params.group_request_id} prompt: {prompt}\n" f"samplingparmas: {sampling_params.to_dict()}\n" f"token_ids: {prompt_ids}" ) diff --git a/lightllm/server/router/dynamic_prompt/linear_att_radix_cache.py b/lightllm/server/router/dynamic_prompt/linear_att_radix_cache.py index 73c6dba54d..c7408add39 100644 --- a/lightllm/server/router/dynamic_prompt/linear_att_radix_cache.py +++ b/lightllm/server/router/dynamic_prompt/linear_att_radix_cache.py @@ -568,8 +568,8 @@ def _print_helper(self, node: LinearAttPagedTreeNode, indent): def free_radix_cache_to_get_enough_token(self, need_token_num): assert self.mem_manager is not None - if need_token_num > self.mem_manager.can_use_mem_size: - need_evict_token_num = need_token_num - self.mem_manager.can_use_mem_size + if need_token_num > self.mem_manager.allocator.can_use_mem_size: + need_evict_token_num = need_token_num - self.mem_manager.allocator.can_use_mem_size release_mems = [] small_page_buffer_ids = [] diff --git a/lightllm/server/router/dynamic_prompt/radix_cache.py b/lightllm/server/router/dynamic_prompt/radix_cache.py index 88b099459b..21e26c5854 100644 --- a/lightllm/server/router/dynamic_prompt/radix_cache.py +++ b/lightllm/server/router/dynamic_prompt/radix_cache.py @@ -401,12 +401,6 @@ def merge_unreferenced_nodes(self): if merged_node: worklist.append(merged_node) - def assert_leafs_is_right(self): - for node in self.evict_tree_set: - if node.is_leaf() and node.ref_counter == 0: - a = node.token_mem_index_value.cuda() - assert (self.mem_manager.mem_state[a] == 1).sum().item() == len(a) - def clear_tree_nodes(self): """ 该函数只在测试时调用 @@ -497,8 +491,8 @@ def _print_helper(self, node: TreeNode, indent): def free_radix_cache_to_get_enough_token(self, need_token_num): assert self.mem_manager is not None - if need_token_num > self.mem_manager.can_use_mem_size: - need_evict_token_num = need_token_num - self.mem_manager.can_use_mem_size + if need_token_num > self.mem_manager.allocator.can_use_mem_size: + need_evict_token_num = need_token_num - self.mem_manager.allocator.can_use_mem_size release_mems = [] def release_mem(mem_index): diff --git a/lightllm/server/router/model_infer/infer_batch.py b/lightllm/server/router/model_infer/infer_batch.py index 3a27c082de..1a5d05784f 100644 --- a/lightllm/server/router/model_infer/infer_batch.py +++ b/lightllm/server/router/model_infer/infer_batch.py @@ 
-280,8 +280,8 @@ def _filter(self, finished_request_ids: List[int]): f"free a batch state:\n" f"radix refed token num {self.radix_cache.get_refed_tokens_num()}\n" f"radix hold token num {self.radix_cache.get_tree_total_tokens_num()}\n" - f"mem manager can alloc token num {self.req_manager.mem_manager.can_use_mem_size}\n" - f"mem manager total size {self.req_manager.mem_manager.size}" + f"mem manager can alloc token num {self.req_manager.mem_manager.allocator.can_use_mem_size}\n" + f"mem manager total size {self.req_manager.mem_manager.allocator.size}\n" ) return @@ -348,7 +348,7 @@ def get_can_alloc_token_num(self): radix_cache_unref_token_num = ( self.radix_cache.get_tree_total_tokens_num() - self.radix_cache.get_refed_tokens_num() ) - return self.req_manager.mem_manager.can_use_mem_size + radix_cache_unref_token_num + return self.req_manager.mem_manager.allocator.can_use_mem_size + radix_cache_unref_token_num def copy_linear_att_state_to_cache_buffer(self, b_req_idx: torch.Tensor, reqs: List["InferReq"]): """ diff --git a/lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/decode_infer_rpyc.py b/lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/decode_infer_rpyc.py index 696452b419..6d97714b8c 100644 --- a/lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/decode_infer_rpyc.py +++ b/lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/decode_infer_rpyc.py @@ -80,8 +80,8 @@ def _alloc_to_frozen_some_tokens(self, move_task: KVMoveTask): logger.debug( f"radix refed token num {self.backend.radix_cache.get_refed_tokens_num()}\n" f"radix hold token num {self.backend.radix_cache.get_tree_total_tokens_num()}\n" - f"mem manager can alloc token num {self.backend.model.mem_manager.can_use_mem_size}\n" - f"mem manager total size {self.backend.model.mem_manager.size}" + f"mem manager can alloc token num {self.backend.model.mem_manager.allocator.can_use_mem_size}\n" + f"mem manager total size {self.backend.model.mem_manager.allocator.size}\n" f"frozened token num {frozen_token_num}\n" f"estimated peak token num {estimated_peak_token_num}\n" )
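
Note: the refactor above imports KvCacheAllocator from .allocator, but that file is not part of this diff. Below is a minimal sketch of what the allocator plausibly looks like, reconstructed from the bookkeeping code removed from MemoryManager in this patch. The class name and method signatures (alloc, free, free_all, resize) are inferred from the call sites in the diff, the file contents are an assumption rather than the actual lightllm implementation, and the double-buffered return copy used by the original alloc (the _mem_state_return ring buffer) is omitted here for brevity, so this sketch hands back a view of the index pool directly.

from typing import List, Union

import torch

from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt
from lightllm.utils.dist_utils import get_current_rank_in_node
from lightllm.utils.envs_utils import get_unique_server_name


class KvCacheAllocator:
    """Hypothetical sketch: owns the KV-cache token index pool MemoryManager used to manage inline."""

    def __init__(self, size: int):
        self.size = size
        # pinned CPU pool of token indices; a stack-style mark pointer tracks what has been handed out
        self.mem_state = torch.arange(0, size, dtype=torch.int32, device="cpu", pin_memory=True)
        self.mark_start = 0
        self.mark_end = size
        self.can_use_mem_size = size
        # shared counter so the router process can read the free token count for scheduling estimates
        rank_in_node = get_current_rank_in_node()
        self.shared_can_use_token_num = SharedInt(
            f"{get_unique_server_name()}_mem_manger_can_use_token_num_{rank_in_node}"
        )
        self.shared_can_use_token_num.set_value(self.can_use_mem_size)

    def alloc(self, need_size: int) -> torch.Tensor:
        assert need_size <= self.mark_end - self.mark_start, "error alloc state"
        ans = self.mem_state[self.mark_start : self.mark_start + need_size]
        self.mark_start += need_size
        self.can_use_mem_size -= need_size
        self.shared_can_use_token_num.set_value(self.can_use_mem_size)
        return ans

    def free(self, free_index: Union[torch.Tensor, List[int]]) -> None:
        start = self.mark_start - len(free_index)
        assert start >= 0, "error free state"
        if isinstance(free_index, list):
            self.mem_state.numpy()[start : self.mark_start] = free_index
        else:
            # copying from GPU back into the pinned CPU pool is a stream-blocking operation
            self.mem_state[start : self.mark_start] = free_index
        self.mark_start -= len(free_index)
        self.can_use_mem_size += len(free_index)
        self.shared_can_use_token_num.set_value(self.can_use_mem_size)

    def free_all(self) -> None:
        self.mark_start = 0
        self.mark_end = self.size
        self.can_use_mem_size = self.size
        self.mem_state.numpy()[:] = list(range(0, self.size))
        self.shared_can_use_token_num.set_value(self.can_use_mem_size)

    def resize(self, new_size: int) -> None:
        self.size = new_size
        self.mem_state = torch.arange(0, new_size, dtype=torch.int32, device="cpu", pin_memory=True)
        self.free_all()

Centralizing this state in a single object is what lets call sites such as radix_cache.py, linear_att_radix_cache.py, infer_batch.py and decode_infer_rpyc.py read mem_manager.allocator.can_use_mem_size and mem_manager.allocator.size instead of reaching into MemoryManager internals.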