Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion docs/CN/source/tutorial/api_server_args.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ APIServer 参数详解
- ``running_max_req_size`` 为 3
- ``batch_max_tokens`` 为 2048 (2k)
- ``chunked_prefill_size`` 为 1024 (1k)
- ``mem_fraction`` 为 0.85

.. option:: --host

Expand Down
1 change: 0 additions & 1 deletion docs/EN/source/tutorial/api_server_args.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ Basic Configuration Parameters
- ``running_max_req_size`` to 3
- ``batch_max_tokens`` to 2048 (2k)
- ``chunked_prefill_size`` to 1024 (1k)
- ``mem_fraction`` to 0.85

.. option:: --host

Expand Down
2 changes: 2 additions & 0 deletions lightllm/common/kv_cache_mem_manager/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .allocator import KvCacheAllocator
from .mem_manager import MemoryManager, ReadOnlyStaticsMemoryManager
from .ppl_int8kv_mem_manager import PPLINT8KVMemoryManager
from .ppl_int4kv_mem_manager import PPLINT4KVMemoryManager
Expand All @@ -9,6 +10,7 @@
from .qwen3next_mem_manager import Qwen3NextMemManager

__all__ = [
"KvCacheAllocator",
"MemoryManager",
"ReadOnlyStaticsMemoryManager",
"PPLINT4KVMemoryManager",
Expand Down
89 changes: 11 additions & 78 deletions lightllm/common/kv_cache_mem_manager/mem_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from typing import List, Union, Tuple, Any
from typing import List, Tuple, Any, Union
from lightllm.server.pd_io_struct import KVMoveTask
from lightllm.utils.log_utils import init_logger
from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt
from .allocator import KvCacheAllocator
from lightllm.utils.profile_max_tokens import get_available_gpu_memory, get_total_gpu_memory
from lightllm.common.kv_trans_kernel.kv_trans import kv_trans
from lightllm.utils.dist_utils import get_current_rank_in_node, get_node_world_size
Expand Down Expand Up @@ -38,27 +39,8 @@ def __init__(self, size, dtype, head_num, head_dim, layer_num, always_copy=False
# profile the max total token num if the size is None
self.profile_size(mem_fraction)

self.mem_state = torch.arange(
0, self.size, dtype=torch.int32, device="cpu", requires_grad=False, pin_memory=True
)
self._mem_state_return = torch.arange(
0, self.size * 3, dtype=torch.int32, device="cpu", requires_grad=False, pin_memory=True
)
self._return_start = 0
self.mark_start = 0
self.mark_end = self.size

self.can_use_mem_size = self.size

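# Share the value through shared memory so the router module can read it for precise scheduling estimates; the nccl port serves as the marker of a single instance on a single machine, to prevent conflicts.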
from lightllm.utils.envs_utils import get_unique_server_name
self.allocator = KvCacheAllocator(self.size)

rank_in_node = get_current_rank_in_node()
self.shared_can_use_token_num = SharedInt(
f"{get_unique_server_name()}_mem_manger_can_use_token_num_{rank_in_node}"
)

self.shared_can_use_token_num.set_value(self.can_use_mem_size)
self._init_buffers(
self.size,
dtype,
Expand All @@ -83,9 +65,10 @@ def profile_size(self, mem_fraction):
if self.size is not None:
return

torch.cuda.empty_cache()
world_size = dist.get_world_size()
total_memory = get_total_gpu_memory()
available_memory = get_available_gpu_memory(world_size) - total_memory * (1 - mem_fraction)

available_memory = get_available_gpu_memory(world_size) * mem_fraction
cell_size = self.get_cell_size()
self.size = int(available_memory * 1024 ** 3 / cell_size)
if world_size > 1:
Expand Down Expand Up @@ -338,57 +321,13 @@ def _free_buffers(self):
self.kv_buffer = None

def alloc(self, need_size) -> torch.Tensor:
if need_size > self.mark_end - self.mark_start:
logger.error(f"warn no enough cache need_size {need_size} left_size {self.can_use_mem_size}")
assert False, "error alloc state"

start = self.mark_start
end = self.mark_start + need_size
self.mark_start += need_size

self.can_use_mem_size -= need_size
self.shared_can_use_token_num.set_value(self.can_use_mem_size)

# 利用缓冲区返回,避免异步情况下的内存竞争
if self._return_start + need_size > self._mem_state_return.shape[0]:
self._return_start = 0
ans = self._mem_state_return[self._return_start : self._return_start + need_size]
ans.copy_(self.mem_state[start:end])
self._return_start += need_size
return ans

def free(self, free_index: Union[torch.Tensor, List[int]]):
"""_summary_

Args:
free_index (torch.Tensor): _description_
"""

end = self.mark_start
start = self.mark_start - len(free_index)
assert start >= 0, f"error free state start: {self.mark_start} free len {len(free_index)}"

if isinstance(free_index, list):
self.mem_state.numpy()[start:end] = free_index
else:
# 从 gpu 到 cpu 的拷贝操作是流内阻塞操作
self.mem_state[start:end] = free_index

self.mark_start -= len(free_index)
return self.allocator.alloc(need_size)

self.can_use_mem_size += len(free_index)
self.shared_can_use_token_num.set_value(self.can_use_mem_size)

if self.can_use_mem_size == len(self.mem_state):
logger.debug(f"freed all gpu mem size {self.can_use_mem_size}")
return
def free(self, free_index: Union[torch.Tensor, List[int]]) -> None:
self.allocator.free(free_index)

def free_all(self):
self.can_use_mem_size = len(self.mem_state)
self.shared_can_use_token_num.set_value(self.can_use_mem_size)
self.mem_state.numpy()[:] = list(range(0, len(self.mem_state)))
self.mark_start = 0
self.mark_end = len(self.mem_state)
self.allocator.free_all()

def resize_mem(self, new_size):
Comment on lines 323 to 332
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

For consistency with the free method, please add type hints for the parameters and return values of alloc, free_all, and resize_mem.

Suggested change
def alloc(self, need_size) -> torch.Tensor:
if need_size > self.mark_end - self.mark_start:
logger.error(f"warn no enough cache need_size {need_size} left_size {self.can_use_mem_size}")
assert False, "error alloc state"
start = self.mark_start
end = self.mark_start + need_size
self.mark_start += need_size
self.can_use_mem_size -= need_size
self.shared_can_use_token_num.set_value(self.can_use_mem_size)
# 利用缓冲区返回,避免异步情况下的内存竞争
if self._return_start + need_size > self._mem_state_return.shape[0]:
self._return_start = 0
ans = self._mem_state_return[self._return_start : self._return_start + need_size]
ans.copy_(self.mem_state[start:end])
self._return_start += need_size
return ans
def free(self, free_index: Union[torch.Tensor, List[int]]):
"""_summary_
Args:
free_index (torch.Tensor): _description_
"""
return self.allocator.alloc(need_size)
end = self.mark_start
start = self.mark_start - len(free_index)
assert start >= 0, f"error free state start: {self.mark_start} free len {len(free_index)}"
if isinstance(free_index, list):
self.mem_state.numpy()[start:end] = free_index
else:
# 从 gpu 到 cpu 的拷贝操作是流内阻塞操作
self.mem_state[start:end] = free_index
self.mark_start -= len(free_index)
self.can_use_mem_size += len(free_index)
self.shared_can_use_token_num.set_value(self.can_use_mem_size)
if self.can_use_mem_size == len(self.mem_state):
logger.debug(f"freed all gpu mem size {self.can_use_mem_size}")
return
def free(self, free_index: Union[torch.Tensor, List[int]]) -> None:
self.allocator.free(free_index)
def free_all(self):
self.can_use_mem_size = len(self.mem_state)
self.shared_can_use_token_num.set_value(self.can_use_mem_size)
self.mem_state.numpy()[:] = list(range(0, len(self.mem_state)))
self.mark_start = 0
self.mark_end = len(self.mem_state)
self.allocator.free_all()
def resize_mem(self, new_size):
def alloc(self, need_size: int) -> torch.Tensor:
return self.allocator.alloc(need_size)
def free(self, free_index: Union[torch.Tensor, List[int]]) -> None:
self.allocator.free(free_index)
def free_all(self) -> None:
self.allocator.free_all()
def resize_mem(self, new_size: int) -> None:

"""
Expand All @@ -401,13 +340,7 @@ def resize_mem(self, new_size):
layer_num = self.layer_num

self.size = new_size
self.mem_state = torch.arange(
0, self.size, dtype=torch.int32, device="cpu", requires_grad=False, pin_memory=True
)
self.mark_start = 0
self.mark_end = self.size
self.can_use_mem_size = self.size
self.shared_can_use_token_num.set_value(self.can_use_mem_size)
self.allocator.resize(new_size)
Comment on lines 342 to +343
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The HOLD_TOKEN_MEMINDEX attribute is initialized to self.size in __init__. When resizing the memory, this attribute should also be updated to reflect the new size, ensuring consistency for any components relying on this marker.

Suggested change
self.size = new_size
self.mem_state = torch.arange(
0, self.size, dtype=torch.int32, device="cpu", requires_grad=False, pin_memory=True
)
self.mark_start = 0
self.mark_end = self.size
self.can_use_mem_size = self.size
self.shared_can_use_token_num.set_value(self.can_use_mem_size)
self.allocator.resize(new_size)
self.size = new_size
self.allocator.resize(new_size)
self.HOLD_TOKEN_MEMINDEX = self.size

self._free_buffers()
self._init_buffers(size, dtype, head_num, head_dim, layer_num)
return
Expand Down
5 changes: 2 additions & 3 deletions lightllm/server/api_start.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,12 +130,11 @@ def normal_or_p_d_start(args):
args.running_max_req_size = 3
args.batch_max_tokens = 2048
args.chunked_prefill_size = 1024
if args.mem_fraction > 0.82:
args.mem_fraction = 0.82
args.embed_cache_storage_size = 0.8
args.graph_max_batch_size = 32
logger.info(
f"performance_mode is personal, set running_max_req_size to 3,"
f"batch_max_tokens to 2048, chunked_prefill_size to 1024, mem_fraction to 0.82,"
f"batch_max_tokens to 2048, chunked_prefill_size to 1024,"
f"graph_max_batch_size to 32"
)

Expand Down
9 changes: 9 additions & 0 deletions lightllm/server/build_prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,15 @@ async def build_prompt(request, tools) -> str:
if request.chat_template_kwargs:
kwargs.update(request.chat_template_kwargs)

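# Some reasoning parser types enable thinking by default, but the tokenizer is sometimes not told that
# thinking is on, so the reasoning parser and the tokenizer end up with misaligned behavior. Fix that here.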
from .api_openai import _get_reasoning_from_request

thinking = _get_reasoning_from_request(request)

kwargs["thinking"] = thinking
kwargs["enable_thinking"] = thinking

try:
input_str = tokenizer.apply_chat_template(**kwargs, tokenize=False, add_generation_prompt=True, tools=tools)
except BaseException as e:
Expand Down
2 changes: 1 addition & 1 deletion lightllm/server/httpserver/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,7 @@ async def _encode(

if self.args.detail_log:
logger.debug(
f"req_id: {sampling_params.group_request_id} prompt: {prompt},\n"
f"req_id: {sampling_params.group_request_id} prompt: {prompt}\n"
f"samplingparmas: {sampling_params.to_dict()}\n"
f"token_ids: {prompt_ids}"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -568,8 +568,8 @@ def _print_helper(self, node: LinearAttPagedTreeNode, indent):

def free_radix_cache_to_get_enough_token(self, need_token_num):
assert self.mem_manager is not None
if need_token_num > self.mem_manager.can_use_mem_size:
need_evict_token_num = need_token_num - self.mem_manager.can_use_mem_size
if need_token_num > self.mem_manager.allocator.can_use_mem_size:
need_evict_token_num = need_token_num - self.mem_manager.allocator.can_use_mem_size
release_mems = []
small_page_buffer_ids = []

Expand Down
10 changes: 2 additions & 8 deletions lightllm/server/router/dynamic_prompt/radix_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,12 +401,6 @@ def merge_unreferenced_nodes(self):
if merged_node:
worklist.append(merged_node)

def assert_leafs_is_right(self):
for node in self.evict_tree_set:
if node.is_leaf() and node.ref_counter == 0:
a = node.token_mem_index_value.cuda()
assert (self.mem_manager.mem_state[a] == 1).sum().item() == len(a)

def clear_tree_nodes(self):
"""
该函数只在测试时调用
Expand Down Expand Up @@ -497,8 +491,8 @@ def _print_helper(self, node: TreeNode, indent):

def free_radix_cache_to_get_enough_token(self, need_token_num):
assert self.mem_manager is not None
if need_token_num > self.mem_manager.can_use_mem_size:
need_evict_token_num = need_token_num - self.mem_manager.can_use_mem_size
if need_token_num > self.mem_manager.allocator.can_use_mem_size:
need_evict_token_num = need_token_num - self.mem_manager.allocator.can_use_mem_size
release_mems = []

def release_mem(mem_index):
Expand Down
6 changes: 3 additions & 3 deletions lightllm/server/router/model_infer/infer_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,8 +280,8 @@ def _filter(self, finished_request_ids: List[int]):
f"free a batch state:\n"
f"radix refed token num {self.radix_cache.get_refed_tokens_num()}\n"
f"radix hold token num {self.radix_cache.get_tree_total_tokens_num()}\n"
f"mem manager can alloc token num {self.req_manager.mem_manager.can_use_mem_size}\n"
f"mem manager total size {self.req_manager.mem_manager.size}"
f"mem manager can alloc token num {self.req_manager.mem_manager.allocator.can_use_mem_size}\n"
f"mem manager total size {self.req_manager.mem_manager.allocator.size}\n"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The addition of a trailing newline \n at the end of the log message string will result in an extra empty line in the output. It is better to keep the log message concise without the trailing newline, consistent with the original implementation.

Suggested change
f"mem manager total size {self.req_manager.mem_manager.allocator.size}\n"
f"mem manager total size {self.req_manager.mem_manager.allocator.size}"

)

return
Expand Down Expand Up @@ -348,7 +348,7 @@ def get_can_alloc_token_num(self):
radix_cache_unref_token_num = (
self.radix_cache.get_tree_total_tokens_num() - self.radix_cache.get_refed_tokens_num()
)
return self.req_manager.mem_manager.can_use_mem_size + radix_cache_unref_token_num
return self.req_manager.mem_manager.allocator.can_use_mem_size + radix_cache_unref_token_num

def copy_linear_att_state_to_cache_buffer(self, b_req_idx: torch.Tensor, reqs: List["InferReq"]):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,8 @@ def _alloc_to_frozen_some_tokens(self, move_task: KVMoveTask):
logger.debug(
f"radix refed token num {self.backend.radix_cache.get_refed_tokens_num()}\n"
f"radix hold token num {self.backend.radix_cache.get_tree_total_tokens_num()}\n"
f"mem manager can alloc token num {self.backend.model.mem_manager.can_use_mem_size}\n"
f"mem manager total size {self.backend.model.mem_manager.size}"
f"mem manager can alloc token num {self.backend.model.mem_manager.allocator.can_use_mem_size}\n"
f"mem manager total size {self.backend.model.mem_manager.allocator.size}\n"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Similar to the change in infer_batch.py, the trailing newline \n here adds an unnecessary empty line to the debug log output. Please remove it for cleaner log formatting.

Suggested change
f"mem manager total size {self.backend.model.mem_manager.allocator.size}\n"
f"mem manager total size {self.backend.model.mem_manager.allocator.size}"

f"frozened token num {frozen_token_num}\n"
f"estimated peak token num {estimated_peak_token_num}\n"
)
Expand Down
Loading