diff --git a/tests/engine/test_common_engine.py b/tests/engine/test_common_engine.py
index ac30f26d9ab..936c7ea76cd 100644
--- a/tests/engine/test_common_engine.py
+++ b/tests/engine/test_common_engine.py
@@ -1,4 +1,3 @@
-"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,3820 +11,1751 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-"""
-import asyncio
-import os
-import sys
-import tempfile
import threading
import time
-import types
-import unittest
-from unittest.mock import ANY, AsyncMock, MagicMock, Mock, patch
-
-sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", ".."))
+from types import SimpleNamespace
import numpy as np
-import paddle
-from e2e.utils.serving_utils import clean_ports
-
-if not hasattr(paddle, "enable_compat"):
- paddle.enable_compat = lambda scope=None: None
-
-from fastdeploy.cache_manager.cache_data import CacheStatus
-from fastdeploy.engine.args_utils import EngineArgs
-from fastdeploy.engine.common_engine import (
- EngineService,
- _format_worker_launch_failure_message,
- _read_latest_worker_traceback,
-)
-from fastdeploy.engine.request import (
- ControlRequest,
- ControlResponse,
- Request,
- RequestOutput,
- RequestStatus,
- RequestType,
-)
-from fastdeploy.utils import EngineError
-
-MODEL_NAME = os.getenv("MODEL_PATH", "/workspace/wenlei/models") + "/ERNIE-4.5-0.3B-Paddle"
-
-_STUB_PRETRAINED_CONFIG = {
- "architectures": ["StubForCausalLM"],
- "hidden_size": 64,
- "num_attention_heads": 8,
- "num_hidden_layers": 2,
- "vocab_size": 1000,
-}
-
-
-def _fake_model_post_init(self):
- self.is_unified_ckpt = False
- self.runner_type = "generate"
- self.convert_type = "auto"
- self.supported_tasks = []
- if not hasattr(self, "enable_mm"):
- self.enable_mm = False
-
-
-def _create_engine_config(args):
- with patch(
- "fastdeploy.config.PretrainedConfig.get_config_dict",
- return_value=(_STUB_PRETRAINED_CONFIG, None),
- ):
- with patch("fastdeploy.config.ModelConfig._post_init", _fake_model_post_init):
- return args.create_engine_config()
-
-
-class TestCommonEngine(unittest.TestCase):
- """Test case for EngineService functionality (lines 1215-1664)"""
-
- @classmethod
- def setUpClass(cls):
- """Set up EngineService for testing"""
- try:
- # Clean ports before starting the engine
- print("Pre-test port cleanup...")
- clean_ports()
-
- # Create engine args for testing
- engine_args = EngineArgs(
- model=MODEL_NAME,
- max_model_len=8192,
- tensor_parallel_size=1,
- engine_worker_queue_port=int(os.getenv("FD_ENGINE_QUEUE_PORT", "6778")),
- cache_queue_port=int(os.getenv("FD_CACHE_QUEUE_PORT", "6779")),
- )
-
- # Create and start the engine service
- cls.cfg = _create_engine_config(engine_args)
-
- with (
- patch(
- "fastdeploy.engine.common_engine.EngineWorkerQueue",
- TestCommonEngineAdditionalCoverage._make_full_dummy_q_cls(),
- ),
- patch("fastdeploy.engine.common_engine.EngineCacheQueue"),
- ):
- cls.engine = EngineService(cls.cfg, start_queue=False, use_async_llm=True)
-
- cls.engine.running = True
- cls.engine.ipc_signal_suffix = cls.cfg.parallel_config.local_engine_worker_queue_port
-
- cls.engine.worker_ready_signal = TestCommonEngineAdditionalCoverage._Sig(1)
- cls.engine.loaded_model_signal = TestCommonEngineAdditionalCoverage._Sig(1)
- cls.engine.worker_healthy_live_signal = TestCommonEngineAdditionalCoverage._Sig(int(time.time()))
- cls.engine.worker_proc = Mock(pid=12345)
-
- except Exception as e:
- print(f"Setting up EngineService failed: {e}")
- raise
-
- @classmethod
- def tearDownClass(cls):
- """Clean up after all tests"""
- if hasattr(cls, "engine") and cls.engine is not None:
- try:
- if hasattr(cls.engine, "resource_manager") and hasattr(cls.engine.resource_manager, "cache_manager"):
- cache_manager = cls.engine.resource_manager.cache_manager
- if not hasattr(cache_manager, "shm_cache_task_flag_broadcast"):
- cache_manager.shm_cache_task_flag_broadcast = Mock(clear=Mock())
- if not hasattr(cache_manager, "cache_ready_signal"):
- cache_manager.cache_ready_signal = Mock(clear=Mock())
- if getattr(cls.engine, "cache_manager_processes", None) is None:
- cls.engine.cache_manager_processes = []
- if hasattr(cls.engine, "_finalizer"):
- cls.engine._finalizer.detach()
- cls.engine.worker_proc = None
- cls.engine._exit_sub_services()
- print("Engine cleanup completed")
- except Exception as e:
- print(f"Error during engine cleanup: {e}")
-
- def setUp(self):
- """Set up before each test method"""
- print(f"Starting test: {self._testMethodName}")
-
- def tearDown(self):
- """Clean up after each test method"""
- print(f"Completed test: {self._testMethodName}")
-
- def test_engine_has_expected_attributes(self):
- """Consolidated lightweight attribute/callable checks."""
- expected_methods = [
- "_exit_sub_services",
- "_start_worker_service",
- "_stop_profile",
- "launch_components",
- "check_worker_initialize_status",
- ]
- for name in expected_methods:
- self.assertTrue(hasattr(self.engine, name))
- self.assertTrue(callable(getattr(self.engine, name)))
-
- if hasattr(self.engine, "worker_proc"):
- self.assertIsNotNone(self.engine.worker_proc)
-
- if hasattr(self.engine, "scheduler"):
- self.assertIsNotNone(self.engine.scheduler)
-
- if hasattr(self.engine, "worker_init_status"):
- self.assertIsInstance(self.engine.worker_init_status, dict)
-
- self.assertTrue(hasattr(self.engine, "do_profile"))
- self.assertTrue(self.engine.running)
-
- def test_worker_processes_ready(self):
- """Test _worker_processes_ready method (lines 1292-1299)"""
- # Test with real engine that should have worker_ready_signal
- if hasattr(self.engine, "worker_ready_signal"):
- result = self.engine._worker_processes_ready()
- # Result should be boolean
- self.assertIsInstance(result, bool)
- else:
- self.skipTest("worker_ready_signal not available")
-
- def test_init_worker_signals(self):
- """Test _init_worker_signals method (lines 1301-1361)"""
- # Since engine is already started, signals should be initialized
- self.assertTrue(hasattr(self.engine, "worker_ready_signal"))
- self.assertTrue(hasattr(self.engine, "loaded_model_signal"))
-
- # Test that signals have expected properties
- if hasattr(self.engine, "worker_ready_signal"):
- self.assertIsNotNone(self.engine.worker_ready_signal)
-
- if hasattr(self.engine, "loaded_model_signal"):
- self.assertIsNotNone(self.engine.loaded_model_signal)
-
- def test_setting_environ_variables(self):
- """Test _setting_environ_variables method (lines 1362-1408)"""
- result = self.engine._setting_environ_variables()
-
- # Check that result is a string and contains expected variables
- self.assertIsInstance(result, str)
- self.assertIn("ENABLE_FASTDEPLOY_LOAD_MODEL_CONCURRENCY=0", result)
- self.assertIn("PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python", result)
- self.assertIn("FLAGS_use_append_attn=1", result)
- self.assertIn("NCCL_ALGO=Ring", result)
-
- def test_check_health(self):
- """Test check_health method (lines 1533-1544)"""
- if hasattr(self.engine, "worker_healthy_live_signal"):
- is_healthy, message = self.engine.check_health(time_interval_threashold=30)
-
- # Should return tuple of (bool, str)
- self.assertIsInstance(is_healthy, bool)
- self.assertIsInstance(message, str)
- else:
- self.skipTest("worker_healthy_live_signal not available")
-
- def test_engine_started_successfully(self):
- """Test that engine started successfully and has expected state"""
- # Verify engine is running
- self.assertTrue(self.engine.running)
-
- # Verify data processor was created
- if hasattr(self.engine, "data_processor"):
- self.assertIsNotNone(self.engine.data_processor)
-
- # Verify IPC signal suffix is set
- if hasattr(self.engine, "ipc_signal_suffix"):
- self.assertIsNotNone(self.engine.ipc_signal_suffix)
-
-
-if __name__ == "__main__":
- unittest.main()
-
-
-class TestCommonEngineAdditionalCoverage(unittest.TestCase):
- """Additional unit tests focusing on branch coverage for common_engine.py
-
- These tests heavily mock subprocess/threading/IPC to avoid starting real workers
- and to drive specific code paths that were previously uncovered.
- """
-
- def setUp(self):
- cache_queue_patcher = patch("fastdeploy.engine.common_engine.EngineCacheQueue")
- cache_queue_patcher.start()
- self.addCleanup(cache_queue_patcher.stop)
-
- class _Sig:
- def __init__(self, v=0):
- self.value = np.array([v], dtype=np.int32)
-
- def clear(self):
- pass
-
-    @staticmethod
- def _make_full_dummy_q_cls():
- class DummyQ:
- def __init__(self, *a, **k):
- self.available_prefill_instances = type("X", (), {"put": lambda *_: None})()
-
- def get_server_port(self):
- return 0
-
- def cleanup(self):
- pass
-
- def num_tasks(self):
- return 0
-
- def num_cache_infos(self):
- return 0
+import pytest
- def disaggregate_queue_empty(self):
- return True
+_noop = lambda *a, **kw: None
+_ns = SimpleNamespace
- def get_disaggregated_tasks(self):
- return []
-        return DummyQ
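+# Minimal IPCSignal stand-in: holds a numpy "value" and records whether clear() was called.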
+class _Sig:
+ def __init__(self, v=None):
+ self.value = v if v is not None else np.zeros([1], dtype=np.int32)
+ self.cleared = False
- @staticmethod
- def _make_dummy_executor(eng):
- class DummyExecutor:
- def __init__(self, max_workers=None):
- pass
-
- def submit(self, fn):
- try:
- fn()
- finally:
- eng.running = False
-
- return DummyExecutor
-
- def _make_mixed_engine(self):
- cfg = self._make_cfg(splitwise_role="mixed", num_gpu_blocks_override=4)
- return self._make_engine(cfg)
-
- def _setup_v1_engine(self, eng):
- eng.running = True
- eng.is_paused = False
- eng._pause_cond = threading.Condition()
- self.addCleanup(lambda: setattr(eng, "running", False))
-
- @staticmethod
- def _make_v1_decode_rm(eng, schedule_result, with_add_request=False):
- class DummyRM:
- def __init__(self):
- self.abort_req_ids_set = set()
- self.waiting = []
- self.real_bsz = 1
- if with_add_request:
- self.add_request = Mock()
-
- def available_batch(self):
- return 1
-
- def schedule(self):
- eng.running = False
- return schedule_result
-
- def get_real_bsz(self):
- return self.real_bsz
-
- return DummyRM()
-
- @staticmethod
- def _make_v1_prefill_continuous_rm(eng, waiting_async_result=False):
- class DummyRM:
- def __init__(self):
- self.abort_req_ids_set = set()
- self.waiting = []
- self.real_bsz = 1
- self.add_request_in_p = Mock()
- self.pre_recycle_resource = Mock()
-
- def available_batch(self):
- return 1
-
- def apply_async_preprocess(self, _task):
- return None
-
- def preallocate_resource_in_p(self, _task):
- return True
-
- def waiting_async_process(self, _task):
- return waiting_async_result
-
- def schedule(self):
- eng.running = False
- return ([], [])
+ def clear(self):
+ self.cleared = True
- def get_real_bsz(self):
- return self.real_bsz
- return DummyRM()
-
- @staticmethod
- def _make_insert_tasks_rm(n=1):
- class DummyRM:
- def __init__(self):
- self.stop_flags = np.array([1] * n, dtype=np.int32)
- self.real_bsz = 1
-
- def check_and_free_block_tables(self):
- pass
-
- def allocate_resources_for_new_tasks(self, tasks):
- return tasks
-
- return DummyRM()
-
- @staticmethod
- def _make_scheduler_with_output(eng, token_ids, decode_type, finished, fmt="dict", include_raw=False):
- class DummyOutput:
- def __init__(self):
- self.token_ids = token_ids
- self.decode_type = decode_type
- self.tool_calls = None
-
- output = RequestOutput(
- request_id="rid",
- outputs=DummyOutput(),
- finished=finished,
- metrics=Mock(),
- )
-
- def get_results():
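+# Build a SimpleNamespace tree that mirrors the FDConfig attributes common_engine
+# reads; any keyword overrides are attached as top-level attributes.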
+def _make_cfg(**kw):
+ c = _ns(
+ parallel_config=_ns(
+ data_parallel_size=1,
+ local_data_parallel_id=0,
+ tensor_parallel_size=1,
+ local_engine_worker_queue_port=12345,
+ engine_worker_queue_port=[12345],
+ device_ids="0",
+ enable_expert_parallel=False,
+ expert_parallel_size=1,
+ chunked_moe_size=1,
+ disable_custom_all_reduce=False,
+ use_internode_ll_two_stage=False,
+ disable_sequence_parallel_moe=False,
+ ),
+ model_config=_ns(
+ model="test",
+ max_model_len=2048,
+ num_hidden_layers=2,
+ enable_mm=False,
+ quantization=None,
+ enable_logprob=False,
+ lm_head_fp32=False,
+ moe_gate_fp32=False,
+ enable_entropy=False,
+ runner="default",
+ convert="default",
+ override_pooler_config=None,
+ logprobs_mode="default",
+ max_logprobs=5,
+ ),
+ cache_config=_ns(
+ enable_prefix_caching=False,
+ enable_chunked_prefill=False,
+ block_size=16,
+ gpu_memory_utilization=0.9,
+ enc_dec_block_num=0,
+ num_gpu_blocks_override=None,
+ local_cache_queue_port=0,
+ max_block_num_per_seq=128,
+ kv_cache_ratio=1.0,
+ cache_transfer_protocol="shm",
+ kvcache_storage_backend=None,
+ num_cpu_blocks=0,
+ ),
+ scheduler_config=_ns(
+ max_num_seqs=32,
+ max_num_batched_tokens=4096,
+ splitwise_role="mixed",
+ name="local",
+ enable_overlap_schedule=False,
+ ),
+ master_ip="127.0.0.1",
+ host_ip="127.0.0.1",
+ worker_num_per_node=1,
+ max_prefill_batch=1,
+ max_num_partial_prefills=1,
+ nnode=1,
+ ips=None,
+ node_rank=0,
+ router_config=_ns(router=None, api_server_host="localhost", api_server_port=8080),
+ register_info={},
+ structured_outputs_config=_ns(
+ guided_decoding_backend="off",
+ disable_any_whitespace=False,
+ reasoning_parser="default",
+ logits_processors=None,
+ ),
+ load_config=_ns(load_strategy="default", rsync_config={}, dynamic_load_weight=False, load_choices="default"),
+ early_stop_config=_ns(to_json_string=lambda: "{}"),
+ speculative_config=_ns(method="none", to_json_string=lambda: "{}"),
+ graph_opt_config=_ns(to_json_string=lambda: "{}"),
+ plas_attention_config=_ns(to_json_string=lambda: "{}"),
+ eplb_config=_ns(enable_eplb=False, to_json_string=lambda: "{}"),
+ limit_mm_per_prompt=None,
+ mm_processor_kwargs=None,
+ tool_parser=None,
+ )
+ for k, v in kw.items():
+ setattr(c, k, v)
+ return c
+
+
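+# Construct an EngineService via object.__new__ (skipping __init__) and hand-wire only
+# the attributes the exercised code paths touch: logger, resource manager, scheduler,
+# signals and the worker queue.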
+def _eng(mp, **co):
+ from fastdeploy.engine.common_engine import EngineService
+
+ e = object.__new__(EngineService)
+ e.cfg = _make_cfg(**co)
+ e.use_async_llm = e.is_paused = False
+ e.running = True
+ e._pause_cond = threading.Condition()
+ log = _ns(info=_noop, debug=_noop, error=_noop, warning=_noop, exception=_noop)
+ e.llm_logger = log
+ e.resource_manager = _ns(
+ stop_flags=np.array([True] * 4, dtype=bool),
+ check_and_free_block_tables=_noop,
+ cache_manager=_ns(
+ launch_cache_manager=lambda **kw: [], shm_cache_task_flag_broadcast=_Sig(), cache_ready_signal=_Sig()
+ ),
+ )
+ e.scheduler = _ns(
+ put_requests=lambda *a: [],
+ get_requests=lambda **kw: [],
+ put_results=_noop,
+ get_results=lambda: [],
+ start=_noop,
+ reset=_noop,
+ name="local",
+ )
+ for s in (
+ "exist_task",
+ "exist_swapped_task",
+ "exist_prefill_task",
+ "worker_healthy_live",
+ "cache_ready",
+ "swap_space_ready",
+ "cache_transfer_inited",
+ "model_weights_status",
+ "prefix_tree_status",
+ "kv_cache_status",
+ "loaded_model",
+ ):
+ setattr(e, f"{s}_signal", _Sig())
+ e.worker_ready_signal = _Sig(np.array([0], dtype=np.int32))
+ e.token_processor = _ns(clear_data=_noop, number_of_tasks=0, number_of_input_tokens=0)
+ e.engine_worker_queue = _ns(clear_data=_noop, put_tasks=_noop, exist_tasks=lambda: False)
+ e.split_connector = _ns(start_receiver=_noop)
+ e.partial_chunked_tokens = [0, e.cfg.scheduler_config.max_num_batched_tokens]
+ e._ctrl_worker_output_queues = []
+ return e
+
+
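+# Fake request exposing just the fields the scheduling/insert paths inspect
+# (request_id, metrics timestamps, disaggregate_info, preemption flag).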
+def _task(rid="r1", preempted=False, disagg=None, carrier=None):
+ return _ns(
+ request_id=rid,
+ trace_carrier=carrier,
+ prompt_token_ids_len=32,
+ metrics=_ns(
+ inference_start_time=0,
+ scheduler_recv_req_time=time.time(),
+ add_req_to_resource_manager_time=0,
+ ask_decode_resource_start_time=0,
+ ask_decode_resource_finish_time=0,
+ decode_inference_start_time=0,
+ engine_get_req_time=0,
+ decode_recv_req_time=0,
+ decode_preallocate_req_time=0,
+ ),
+ disaggregate_info=disagg,
+ has_been_preempted_before=preempted,
+ set=lambda k, v: None,
+ user="test",
+ )
+
+
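+# Neutralize tracing hooks so tests emit no spans and skip propagation-context handling.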
+def _ptr(mp):
+ mp.setattr("fastdeploy.engine.common_engine.trace_print", _noop)
+ mp.setattr("fastdeploy.engine.common_engine.tracing.trace_report_span", _noop)
+ mp.setattr("fastdeploy.engine.common_engine.tracing.trace_set_proc_propagate_context", _noop)
+ mp.setattr("fastdeploy.engine.common_engine.tracing.trace_get_proc_propagate_context", _noop)
+ mp.setattr("fastdeploy.engine.common_engine.tracing.trace_set_thread_info", _noop)
+
+
+def _zmq_recv(eng, items):
+ """Feed items to _insert_zmq_task_to_scheduler, then stop."""
+ idx = [0]
+
+ def recv(block):
+ if idx[0] >= len(items):
eng.running = False
- if fmt == "list":
- return [[output]]
- if include_raw:
- return {"rid": [output, "raw"]}
- return {"rid": [output]}
-
- eng.scheduler = Mock(get_results=get_results)
- return output
-
- @staticmethod
- def _make_ctrl_queue(name, payload, payload_wrapped=True):
- class DummyQueue:
- def __init__(self):
- self.name = name
-
- async def get(self, timeout=None):
- if payload_wrapped:
- return Mock(payload=payload)
- return payload
-
- return DummyQueue()
-
- @staticmethod
- def _make_dummy_recv(eng, payload=None, error=None):
- class DummyRecv:
- def receive_json_once(self, block):
- eng.running = False
- return error, payload
-
- def receive_pyobj_once(self, block):
- eng.running = False
- return error, payload
-
- def close(self):
- pass
-
- return DummyRecv()
-
- @staticmethod
- def _make_zmq_server_cls():
- class DummyServer:
- def __init__(self, *args, **kwargs):
- self.args = args
- self.kwargs = kwargs
-
- def recv_result_handle(self):
- return None
+ return "Context was terminated", None
+ r = items[idx[0]]
+ idx[0] += 1
+ return r
- return DummyServer
+ eng.recv_request_server = _ns(receive_json_once=recv)
+ eng._insert_zmq_task_to_scheduler()
- @staticmethod
- def _make_zmq_thread_cls(counter=None):
- class DummyThread:
- def __init__(self, target=None, daemon=None):
- self.target = target
- self.daemon = daemon
- def start(self):
- if counter is not None:
- counter["threads"] += 1
+# ---------------------------------------------------------------------------
- return DummyThread
- @staticmethod
- def _make_simple_dummy_q_cls():
- class DummyQ:
- def __init__(self, *a, **k):
- pass
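+# EngineService.__init__ coverage: v0/v1 scheduler, guided decoding, data parallelism, EPLB.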
+class TestInit:
+ def _deps(self, mp, v1=False, guided=False, dp=1):
+ from fastdeploy.engine.common_engine import EngineService
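+        # Stub the heavy collaborators (resource manager, FMQ, worker queue, splitwise
+        # connector, token processor, IPC signals) so __init__ runs in-process.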
- return DummyQ
-
- @staticmethod
- def _make_mm_stub_module():
- stub_module = types.ModuleType("fastdeploy.model_executor.ops.gpu")
- stub_module.get_mm_split_fuse = lambda *args, **kwargs: (
- np.array([1], dtype="int64"),
- np.array([4], dtype="int64"),
- )
- return stub_module
-
- class _DummyPbar:
- def __init__(self):
- self.n = 0
-
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc, tb):
- return False
-
- def update(self, delta=0, *args, **kwargs):
- try:
- self.n += int(delta)
- except Exception:
- self.n = 0
-
- def refresh(self):
- pass
-
- @staticmethod
- def _detach_finalizer(engine):
- if hasattr(engine, "_finalizer"):
- try:
- engine._finalizer.detach()
- except Exception:
- pass
-
- def _make_cfg(self, **kwargs):
- # If DP > 1, we must provide enough engine_worker_queue_port for each dp index
- dp = kwargs.get("data_parallel_size", 1)
- nnode = len(kwargs.get("ips", ["127.0.0.1"]))
- engine_worker_queue_port = int(os.getenv("FD_ENGINE_QUEUE_PORT", "6778"))
- cache_queue_port = int(os.getenv("FD_CACHE_QUEUE_PORT", "6779"))
- if dp and dp > 1:
- engine_worker_queue_port = [engine_worker_queue_port + 21 + i for i in range(dp // nnode)]
- cache_queue_port = [cache_queue_port + 21 + i for i in range(dp // nnode)]
-
- if kwargs.get("num_gpu_blocks_override") is not None and "kv_cache_ratio" not in kwargs:
- kwargs["kv_cache_ratio"] = 1
-
- args = EngineArgs(
- model=MODEL_NAME,
- max_model_len=128,
- tensor_parallel_size=1,
- # give unique ports to avoid collision with other tests
- engine_worker_queue_port=engine_worker_queue_port,
- cache_queue_port=cache_queue_port,
- enable_prefix_caching=True,
- **kwargs,
+ mp.setattr(EngineService, "_exit_sub_services", lambda self: None)
+ cfg = _make_cfg()
+ cfg.scheduler_config.scheduler = lambda: _ns(
+ put_requests=lambda *a: [],
+ get_requests=lambda **kw: [],
+ put_results=_noop,
+ get_results=lambda: [],
+ start=_noop,
)
- # Keep batch tokens small to satisfy FDConfig checks:
- # max_num_batched_tokens <= max_model_len * max_num_seqs
- if getattr(args, "max_num_batched_tokens", None) is None:
- args.max_num_batched_tokens = 128
- # Always enable chunked prefill in tests to avoid another strict check
- args.enable_chunked_prefill = True
-
- return _create_engine_config(args)
-
- def _stub_processor(self):
- class _Tok:
- def __init__(self):
- self.vocab = {"": 42, "\n": 10, "<|IMAGE_PLACEHOLDER|>": 9}
-
- def get_vocab(self):
- return self.vocab
-
- class _Proc:
- def __init__(self):
- self.tokenizer = _Tok()
- self.eos_token_id_len = 1
- self.pad_token_id = 0
-
- return _Proc()
-
- def _make_engine(self, cfg):
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", self._make_full_dummy_q_cls()):
- eng = EngineService(cfg, start_queue=False, use_async_llm=False)
- return eng
-
- def test_start_prefill_branch_cache_manager_and_worker_dead(self):
- """Cover lines 184-185, 194-197, 221, 226-227 in start()."""
- # For prefill + local scheduler the core code now requires a router.
- # Also, with the newer CacheConfig semantics we must ensure that
- # prefill_kvcache_block_num (num_gpu_blocks_override * kv_cache_ratio)
- # is >= max_block_num_per_seq; use 3 blocks so that with the default
- # kv_cache_ratio=0.75 we still satisfy the assertion.
- with patch("fastdeploy.engine.args_utils.envs.ENABLE_V1_KVCACHE_SCHEDULER", 0):
- cfg = self._make_cfg(
- splitwise_role="prefill",
- num_gpu_blocks_override=4,
- router="0.0.0.0:30000",
- kv_cache_ratio=1,
- )
-
- # Patch EngineWorkerQueue before EngineService ctor to avoid real IPC
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", self._make_simple_dummy_q_cls()):
- eng = EngineService(cfg, start_queue=False, use_async_llm=True)
-
- # Patch heavy pieces
- eng.create_data_processor = lambda: setattr(eng, "data_processor", self._stub_processor())
- eng._process_splitwise_task = lambda: None
- eng._schedule_request_to_worker = lambda: None
- eng._schedule_request_to_worker_v1 = lambda: None
-
- started_cache = {}
-
- def fake_start_cache(device_ids, suffix):
- started_cache["called"] = True
- # return a list to mimic processes
- return [object()]
-
- eng.start_cache_service = fake_start_cache
-
- # Signals: make loaded_model_signal ready immediately; include launched_cache_manager_signal
- def fake_init_signals():
- eng.worker_ready_signal = self._Sig(0)
- eng.loaded_model_signal = self._Sig(1) # ready -> skip wait loop
- eng.launched_cache_manager_signal = self._Sig(0)
-
- eng._init_worker_signals = fake_init_signals
-
- # Worker start stub and initialization status -> False to trigger error path
- eng._start_worker_service = lambda: Mock(stdout=Mock(), poll=lambda: None)
- eng.check_worker_initialize_status = lambda: False
-
- with patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None):
- # Avoid starting token processor loop
- eng.token_processor.run = lambda: None
- ok = eng.start(async_llm_pid=12345)
-
- # start() returns False on failure
- self.assertFalse(ok)
- # cache manager started before workers (lines 184-185)
- self.assertTrue(started_cache.get("called", False))
- # avoid atexit finalizer
- self._detach_finalizer(eng)
-
- def test_start_mixed_branch_cache_after_load_and_zmq(self):
- """Cover lines 215-217 and 231 in start()."""
- cfg = self._make_cfg(splitwise_role="mixed", num_gpu_blocks_override=4)
-
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", self._make_simple_dummy_q_cls()):
- eng = EngineService(cfg, start_queue=False, use_async_llm=True)
-
- eng.create_data_processor = lambda: setattr(eng, "data_processor", self._stub_processor())
- eng._process_splitwise_task = lambda: None
- eng._schedule_request_to_worker = lambda: None
- eng._schedule_request_to_worker_v1 = lambda: None
-
- started_cache = {}
-
- def fake_start_cache(device_ids, suffix):
- started_cache["called"] = True
- return [object()]
-
- eng.start_cache_service = fake_start_cache
-
- def fake_init_signals():
- eng.worker_ready_signal = self._Sig(0)
- eng.loaded_model_signal = self._Sig(1)
- eng.launched_cache_manager_signal = self._Sig(0)
-
- eng._init_worker_signals = fake_init_signals
-
- eng._start_worker_service = lambda: Mock(stdout=Mock(), poll=lambda: None)
- eng.check_worker_initialize_status = lambda: True
- eng.do_profile = 0
- eng.cfg.cache_config.enable_prefix_caching = True
-
- zmq_called = {}
- eng.start_zmq_service = lambda pid: zmq_called.setdefault("pid", pid)
-
- with patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None):
- eng.token_processor.run = lambda: None
- eng.start(async_llm_pid=8888)
-
- self.assertTrue(started_cache.get("called", False)) # lines 215-217
- self.assertEqual(zmq_called.get("pid"), 8888) # line 231
- self._detach_finalizer(eng)
-
- def test_update_requests_chunk_size_assigns_chunks(self):
- eng = self._make_mixed_engine()
- eng.partial_chunked_tokens = [0, 32, 16, 8]
- eng.cfg.scheduler_config.max_num_batched_tokens = 32
- eng.cfg.cache_config.block_size = 8
- eng.cfg.cache_config.enable_chunked_prefill = True
-
- requests = [
- Request(request_id="r0", prompt_token_ids=[1] * 24, prompt_token_ids_len=24),
- Request(request_id="r1", prompt_token_ids=[1] * 8, prompt_token_ids_len=8),
- ]
-
- eng.update_requests_chunk_size(requests)
-
- for req in requests:
- chunk_info = req.get("prefill_chunk_info")
- self.assertIsInstance(chunk_info, list)
- self.assertGreater(len(chunk_info), 0)
- self.assertEqual(sum(chunk_info), req.prompt_token_ids_len)
- self._detach_finalizer(eng)
-
- def test_update_mm_requests_chunk_size_with_stub_fuse(self):
- eng = self._make_mixed_engine()
- eng.cfg.cache_config.enable_chunked_prefill = True
- eng.partial_chunked_tokens = [0, 16]
- eng.data_processor = type("DP", (), {"image_patch_id": 9})()
-
- inputs = {
- "input_ids": np.array([9, 1, 2, 3], dtype="int64"),
- "token_type_ids": np.array([0, 0, 0, 0], dtype="int64"),
- "image_type_ids": np.array([1], dtype="int32"),
- "grid_thw": np.array([[1, 2, 2]], dtype="int64"),
- "images": np.ones((4,), dtype="uint8"),
- "position_ids": np.array([0, 1, 2, 3], dtype="int64"),
- }
- req = Request(request_id="mm0", multimodal_inputs=inputs)
-
- with patch.dict("sys.modules", {"fastdeploy.model_executor.ops.gpu": self._make_mm_stub_module()}):
- eng.update_mm_requests_chunk_size([req])
-
- chunk_info = req.get("prefill_chunk_info")
- self.assertIsInstance(chunk_info, list)
- self.assertEqual(len(chunk_info), 1)
- self.assertEqual(chunk_info[0]["input_ids"].tolist(), inputs["input_ids"].tolist())
- self.assertIsNotNone(chunk_info[0]["images"])
- self._detach_finalizer(eng)
-
- def test_send_error_response_routes(self):
- eng = self._make_mixed_engine()
- eng.send_response_server = Mock()
-
- with (
- patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False),
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", False),
- ):
- eng._send_error_response("rid0", "boom", error_code=400)
- eng.send_response_server.send_response.assert_called_with("rid0", [ANY])
-
- eng.send_response_server.reset_mock()
- with (
- patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False),
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", True),
- ):
- eng._send_error_response("rid2", "boom", error_code=400)
- eng.send_response_server.send_response.assert_called_with(None, [ANY], worker_pid=None)
-
- eng.send_response_server.reset_mock()
- with patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", True):
- eng._send_error_response("rid1", "boom", error_code=500)
- eng.send_response_server.send_response.assert_called_with(None, [ANY])
-
- self._detach_finalizer(eng)
-
- def test_decode_token_with_return_text(self):
- eng = self._make_mixed_engine()
-
- class DummyProcessor:
- def __init__(self):
- self.decode_status = {"rid": (0, 2)}
-
- def ids2tokens(self, token_ids, req_id):
- return "hi", [101, 102], None
-
- eng.data_processor = DummyProcessor()
-
- with patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_RETURN_TEXT", True):
- delta, token_ids = eng._decode_token([101, 102], "rid", is_end=True)
-
- self.assertEqual(delta, "hi")
- self.assertEqual(token_ids, [101, 102])
- self.assertNotIn("rid", eng.data_processor.decode_status)
- self._detach_finalizer(eng)
-
- def test_decode_token_without_return_text(self):
- eng = self._make_mixed_engine()
-
- with patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_RETURN_TEXT", False):
- delta, token_ids = eng._decode_token([9, 10], "rid", is_end=False)
-
- self.assertEqual(delta, "")
- self.assertEqual(token_ids, [9, 10])
- self._detach_finalizer(eng)
-
- def test_decode_token_return_text_empty_delta(self):
- eng = self._make_mixed_engine()
-
- class DummyProcessor:
- def __init__(self):
- self.decode_status = {"rid": (0, 1)}
-
- def ids2tokens(self, token_ids, req_id):
- return "", [7], None
-
- eng.data_processor = DummyProcessor()
-
- with patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_RETURN_TEXT", True):
- delta, token_ids = eng._decode_token([7], "rid", is_end=True)
-
- self.assertEqual(delta, "")
- self.assertEqual(token_ids, [])
- self.assertNotIn("rid", eng.data_processor.decode_status)
- self._detach_finalizer(eng)
-
- def test_clear_data_success_and_failure(self):
- eng = self._make_mixed_engine()
- eng.token_processor = Mock()
- eng.engine_worker_queue = Mock()
- eng.send_response_server = Mock(req_dict={"a": 1})
- eng.recv_request_server = Mock(req_dict={"b": 2})
-
- self.assertTrue(eng.clear_data())
- self.assertEqual(eng.send_response_server.req_dict, {})
- self.assertEqual(eng.recv_request_server.req_dict, {})
-
- eng.token_processor.clear_data.side_effect = RuntimeError("boom")
- self.assertFalse(eng.clear_data())
- self._detach_finalizer(eng)
-
- def test_insert_prefilled_requests_recycles_and_dispatches(self):
- cfg = self._make_cfg(splitwise_role="decode", num_gpu_blocks_override=4, router="0.0.0.0:30000")
- cfg.speculative_config.method = "mtp"
- eng = self._make_engine(cfg)
-
- class DummyRM:
- def __init__(self):
- self.req_dict = {"r0": 0, "r1": 1, "r2": 2}
- self.tasks_list = [
- Request(request_id="r0", prompt_token_ids=[0], prompt_token_ids_len=1),
- Request(request_id="r1", prompt_token_ids=[0], prompt_token_ids_len=1),
- Request(request_id="r2", prompt_token_ids=[0], prompt_token_ids_len=1),
- ]
- self.stop_flags = np.array([False, False, False])
- self.real_bsz = 1
- self.recycled = []
-
- def _recycle_block_tables(self, req):
- self.recycled.append(req.request_id)
-
- eng.resource_manager = DummyRM()
- eng.token_processor = Mock()
- eng.token_processor.tokens_counter = {"r0": 1, "r1": 1}
- eng.scheduler = Mock()
- eng.engine_worker_queue = Mock()
-
- class DummyOutputs:
- def __init__(self, token_ids, draft_token_ids=None):
- self.token_ids = token_ids
- self.draft_token_ids = draft_token_ids or []
- self.tool_calls = None
-
- outputs_empty = DummyOutputs([])
- outputs_error = DummyOutputs([1], [9])
- outputs_ok = DummyOutputs([2], [8])
- req_out_empty = RequestOutput(request_id="r0", outputs=outputs_empty, metrics=Mock(), num_cached_tokens=0)
- req_out_error = RequestOutput(
- request_id="r1",
- outputs=outputs_error,
- metrics=Mock(),
- num_cached_tokens=0,
- error_code=500,
- error_msg="bad",
+ mp.setattr("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", v1)
+ mp.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_CACHE_TASK", "0")
+ rm = "ResourceManagerV1" if v1 else "ResourceManager"
+ mp.setattr(
+ f"fastdeploy.engine.common_engine.{rm}",
+ lambda *a, **kw: _ns(
+ scheduler_metrics_logger=None,
+ cache_manager=_ns(shm_cache_task_flag_broadcast=_Sig(), cache_ready_signal=_Sig()),
+ ),
)
- req_out_ok = RequestOutput(request_id="r2", outputs=outputs_ok, metrics=Mock(), num_cached_tokens=3)
-
- with patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", True):
- eng._insert_prefilled_requests([req_out_empty, req_out_error, req_out_ok])
-
- self.assertIn("r0", eng.resource_manager.recycled)
- self.assertIn("r1", eng.resource_manager.recycled)
- self.assertIn("r2", eng.token_processor.tokens_counter)
- eng.engine_worker_queue.put_tasks.assert_called()
- self._detach_finalizer(eng)
-
- def test_task_finished_helpers(self):
- eng = self._make_mixed_engine()
-
- class DummyRM:
- def __init__(self):
- self.stop_flags = np.array([True, False, True])
-
- eng.resource_manager = DummyRM()
-
- self.assertTrue(eng.task_is_finished(0))
- self.assertFalse(eng.task_is_finished(1))
- self.assertFalse(eng.all_tasks_finished())
- eng.resource_manager.stop_flags = np.array([True, True])
- self.assertTrue(eng.all_tasks_finished())
- self._detach_finalizer(eng)
-
- def test_start_worker_queue_service_with_servers(self):
- cfg = self._make_cfg(splitwise_role="mixed", num_gpu_blocks_override=4)
-
- class DummyQueue:
- def __init__(self, *args, **kwargs):
- self.kwargs = kwargs
-
- def get_server_port(self):
- return 12345
-
- def cleanup(self):
- pass
-
- class DummyCacheQueue(DummyQueue):
- pass
-
- eng = self._make_engine(cfg)
- with (
- patch("fastdeploy.engine.common_engine.EngineWorkerQueue", DummyQueue),
- patch("fastdeploy.engine.common_engine.EngineCacheQueue", DummyCacheQueue),
- patch("fastdeploy.engine.common_engine.envs.FD_ENGINE_TASK_QUEUE_WITH_SHM", False),
- ):
- eng.start_worker_queue_service(start_queue=True)
-
- self.assertEqual(eng.cfg.parallel_config.local_engine_worker_queue_port, 12345)
- self._detach_finalizer(eng)
-
- def test_init_worker_monitor_signals_creates_ipc(self):
- cfg = self._make_cfg(splitwise_role="mixed", num_gpu_blocks_override=4)
-
- created = []
-
- class DummySignal:
- def __init__(self, name, array, dtype, suffix, create):
- self.name = name
- self.array = array
- self.dtype = dtype
- self.suffix = suffix
- self.create = create
- created.append(name)
-
- with (
- patch("fastdeploy.engine.common_engine.EngineWorkerQueue", self._make_full_dummy_q_cls()),
- patch("fastdeploy.engine.common_engine.IPCSignal", DummySignal),
- ):
- eng = EngineService(cfg, start_queue=False, use_async_llm=True)
-
- self.assertIn("exist_task_signal", created)
- self.assertIn("worker_healthy_live_signal", created)
- self.assertTrue(hasattr(eng, "kv_cache_status_signal"))
- self._detach_finalizer(eng)
-
- def test_init_worker_signals_with_profile(self):
- eng = self._make_mixed_engine()
- eng.ipc_signal_suffix = 7777
- eng.do_profile = 1
-
- class DummySignal:
- def __init__(self, *args, **kwargs):
- self.value = np.zeros([1], dtype=np.int32)
-
- def clear(self):
- pass
-
- with patch("fastdeploy.engine.common_engine.IPCSignal", DummySignal):
- eng._init_worker_signals()
-
- self.assertIsNotNone(eng.worker_ready_signal)
- self.assertIsNotNone(eng.loaded_model_signal)
- self.assertTrue(hasattr(eng, "get_profile_block_num_signal"))
- self._detach_finalizer(eng)
-
- def test_worker_processes_ready_and_health(self):
- eng = self._make_mixed_engine()
- eng.worker_ready_signal = type("Sig", (), {"value": np.array([1], dtype=np.int32)})()
- eng.cfg.worker_num_per_node = 1
- self.assertTrue(eng._worker_processes_ready())
-
- eng.worker_healthy_live_signal = type("Sig", (), {"value": np.array([time.time() - 100])})()
- is_healthy, message = eng.check_health(time_interval_threashold=1)
- self.assertFalse(is_healthy)
- self.assertIn("Not Healthy", message)
- self._detach_finalizer(eng)
-
- def test_stop_profile_resets_cache(self):
- cfg = self._make_cfg(splitwise_role="prefill", num_gpu_blocks_override=4, router="0.0.0.0:30000")
- eng = self._make_engine(cfg)
- eng.ipc_signal_suffix = 9999
- eng.do_profile = 1
- eng.get_profile_block_num_signal = type("Sig", (), {"value": np.array([8])})()
- eng.resource_manager = Mock()
- eng.start_cache_service = Mock(return_value=[Mock()])
-
- eng._stop_profile()
-
- self.assertEqual(eng.do_profile, 0)
- eng.resource_manager.reset_cache_config.assert_called_once()
- self.assertIsNotNone(eng.cache_manager_processes)
- self._detach_finalizer(eng)
-
- def test_start_worker_queue_service_with_shm_address(self):
- cfg = self._make_cfg(splitwise_role="mixed", num_gpu_blocks_override=4)
-
- class DummyQueue:
- def __init__(self, *args, **kwargs):
- self.kwargs = kwargs
-
- def get_server_port(self):
- return 22222
-
- def cleanup(self):
- pass
-
- class DummyCacheQueue(DummyQueue):
- pass
-
- eng = self._make_engine(cfg)
- with (
- patch("fastdeploy.engine.common_engine.EngineWorkerQueue", DummyQueue),
- patch("fastdeploy.engine.common_engine.EngineCacheQueue", DummyCacheQueue),
- patch("fastdeploy.engine.common_engine.envs.FD_ENGINE_TASK_QUEUE_WITH_SHM", True),
- ):
- eng.start_worker_queue_service(start_queue=True)
-
- address = eng.engine_worker_queue.kwargs["address"]
- self.assertTrue(isinstance(address, str))
- self.assertIn("/dev/shm/fd_task_queue_", address)
- self._detach_finalizer(eng)
-
- def test_start_worker_service_builds_command(self):
- eng = self._make_mixed_engine()
- eng.do_profile = 0
- eng.data_processor = type(
- "DP",
- (),
- {
- "tokenizer": type(
- "Tok",
- (),
- {
- "vocab": {"": 5, "<|IMAGE_PLACEHOLDER|>": 9, "\n": 10},
- "get_vocab": lambda self: self.vocab,
- },
- )(),
- "eos_token_id_len": 1,
- "pad_token_id": 0,
- },
- )()
-
- with patch("fastdeploy.engine.common_engine.subprocess.Popen") as popen_mock:
- popen_mock.return_value = Mock()
- proc = eng._start_worker_service()
-
- popen_mock.assert_called_once()
- self.assertIs(proc, popen_mock.return_value)
- self._detach_finalizer(eng)
-
- def test_exit_sub_services_cleans_up(self):
- eng = self._make_mixed_engine()
- eng.use_async_llm = True
- eng.worker_proc = Mock(pid=1234)
- eng.cache_manager_processes = [Mock(pid=2345)]
- eng.cache_task_queue = Mock(cleanup=Mock())
- eng.resource_manager = Mock(
- cache_manager=Mock(
- shm_cache_task_flag_broadcast=Mock(clear=Mock()),
- cache_ready_signal=Mock(clear=Mock()),
- )
+ mp.setattr("fastdeploy.engine.common_engine.FMQ", lambda: _ns(queue=lambda n, r: _ns()))
+ mp.setattr(
+ EngineService, "start_worker_queue_service", lambda self, sq: setattr(self, "engine_worker_queue", _ns())
)
- eng.worker_ready_signal = Mock(clear=Mock())
- eng.loaded_model_signal = Mock(clear=Mock())
- eng.exist_task_signal = Mock(clear=Mock())
- eng.exist_swapped_task_signal = Mock(clear=Mock())
- eng.worker_healthy_live_signal = Mock(clear=Mock())
- eng.cache_ready_signal = Mock(clear=Mock())
- eng.swap_space_ready_signal = Mock(clear=Mock())
- eng.cache_transfer_inited_signal = Mock(clear=Mock())
- eng.exist_prefill_task_signal = Mock(clear=Mock())
- eng.model_weights_status_signal = Mock(clear=Mock())
- eng.prefix_tree_status_signal = Mock(clear=Mock())
- eng.kv_cache_status_signal = Mock(clear=Mock())
- eng.engine_worker_queue_server = Mock(cleanup=Mock())
- eng.send_response_server = Mock(close=Mock())
- eng.recv_request_server = Mock(close=Mock())
- eng.recv_control_cmd_server = Mock(close=Mock())
-
- with (
- patch("fastdeploy.engine.common_engine.os.getpgid", return_value=1111),
- patch("fastdeploy.engine.common_engine.os.killpg"),
- ):
- eng._exit_sub_services()
-
- eng.cache_task_queue.cleanup.assert_called_once()
- eng.engine_worker_queue_server.cleanup.assert_called_once()
- eng.send_response_server.close.assert_called_once()
-
- def test_setting_environ_variables_splitwise_and_mm(self):
- cfg = self._make_cfg(
- splitwise_role="prefill",
- num_gpu_blocks_override=4,
- router="0.0.0.0:30000",
+ mp.setattr("fastdeploy.engine.common_engine.SplitwiseConnector", lambda *a, **kw: _ns(start_receiver=_noop))
+ mp.setattr(
+ "fastdeploy.engine.common_engine.TokenProcessor",
+ lambda *a, **kw: _ns(set_resource_manager=_noop, set_scheduler_metrics_logger=_noop),
)
- cfg.model_config.enable_mm = True
- eng = self._make_engine(cfg)
-
- with patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", True):
- result = eng._setting_environ_variables()
-
- self.assertIn("FLAGS_use_pd_disaggregation_per_chunk=1", result)
- self.assertIn("FLAGS_fmt_write_cache_completed_signal=1", result)
- self.assertIn("FLAGS_max_partition_size=1024", result)
- self._detach_finalizer(eng)
-
- def test_start_cache_service_forwards_args(self):
- eng = self._make_mixed_engine()
- eng.resource_manager.cache_manager = Mock()
- eng.resource_manager.cache_manager.launch_cache_manager = Mock(return_value=["proc"])
-
- result = eng.start_cache_service(["0"], 9999)
-
- eng.resource_manager.cache_manager.launch_cache_manager.assert_called_once()
- self.assertEqual(result, ["proc"])
- self._detach_finalizer(eng)
-
- def test_control_update_weights_success(self):
- eng = self._make_mixed_engine()
- eng.is_paused = True
- eng._pause_cond = threading.Condition()
- eng._call_worker = Mock(return_value={"ok": True})
-
- result = eng._control_update_weights(ControlRequest(request_id="ctrl", method="update_weights"))
- self.assertEqual(result, {"ok": True})
- self._detach_finalizer(eng)
-
- def test_control_update_weights_updates_cfg_version(self):
- eng = self._make_mixed_engine()
- eng.is_paused = True
- eng._pause_cond = threading.Condition()
- eng.cfg.model_config.version = "old-version"
- eng._call_worker = Mock(return_value=[{"version": "new-version"}, {"ok": True}])
-
- result = eng._control_update_weights(ControlRequest(request_id="ctrl", method="update_weights"))
-
- self.assertEqual(result, [{"version": "new-version"}, {"ok": True}])
- self.assertEqual(eng.cfg.model_config.version, "new-version")
- self._detach_finalizer(eng)
-
- def test_control_update_weights_updates_cache_transfer_metadata(self):
- eng = self._make_mixed_engine()
- eng.is_paused = True
- eng._pause_cond = threading.Condition()
- eng.cfg.cache_config.num_cpu_blocks = 1
- eng._call_worker = Mock(return_value=[{"version": "new-version"}])
- eng.cache_task_queue = Mock(put_transfer_task=Mock())
- eng._wait_for_control_responses = AsyncMock(return_value=[{"ok": True}])
-
- result = eng._control_update_weights(ControlRequest(request_id="ctrl", method="update_weights"))
-
- self.assertEqual(result, [{"version": "new-version"}])
- payload = eng.cache_task_queue.put_transfer_task.call_args.args[0]
- self.assertEqual(payload[0], CacheStatus.CTRL)
- self.assertEqual(payload[1].method, "update_weights")
- self.assertIn("update_weights", payload[1].request_id)
- eng._wait_for_control_responses.assert_awaited_once_with(
- payload[1].request_id, 60, executors=["cache_transfer"]
+ mp.setattr("fastdeploy.engine.common_engine.SchedulerMetricsLogger", lambda *a, **kw: _ns())
+ mp.setattr("fastdeploy.engine.common_engine.IPCSignal", lambda **kw: _Sig(kw.get("array")))
+ if dp > 1:
+ cfg.parallel_config.data_parallel_size = dp
+ mp.setattr("fastdeploy.engine.common_engine.get_logger", lambda *a, **kw: _ns(info=_noop))
+ if guided:
+ cfg.structured_outputs_config.guided_decoding_backend = "xgrammar"
+ mp.setattr("fastdeploy.engine.common_engine.schema_checker", lambda *a, **kw: _ns())
+ return cfg
+
+ def test_init_v0_and_v1(self, monkeypatch):
+ from fastdeploy.engine.common_engine import EngineService
+
+ cfg = self._deps(monkeypatch, v1=False)
+ e = EngineService(cfg, start_queue=False, use_async_llm=False)
+ assert e.is_paused is False and e.guided_decoding_checker is None
+ cfg = self._deps(monkeypatch, v1=True)
+ e = EngineService(cfg, start_queue=False, use_async_llm=True)
+ assert e.use_async_llm is True and e.do_profile == 1
+
+ def test_init_options(self, monkeypatch):
+ from fastdeploy.engine.common_engine import EngineService
+
+ cfg = self._deps(monkeypatch, dp=2)
+ e = EngineService(cfg, start_queue=False, use_async_llm=False)
+ assert e.cfg.parallel_config.data_parallel_size == 2
+ cfg = self._deps(monkeypatch, guided=True)
+ e = EngineService(cfg, start_queue=False, use_async_llm=False)
+ assert e.guided_decoding_checker is not None
+ # eplb enabled (L176-177)
+ cfg2 = self._deps(monkeypatch)
+ cfg2.eplb_config.enable_eplb = True
+ monkeypatch.setattr("fastdeploy.engine.common_engine.init_eplb_signals", _noop)
+ e2 = EngineService(cfg2, start_queue=False, use_async_llm=False)
+ assert e2.cfg.eplb_config.enable_eplb
+
+
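+# start()/router registration and worker/queue service launch paths.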
+class TestLifecycle:
+ def test_start_and_register(self, monkeypatch):
+ e = _eng(monkeypatch)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", False)
+ e.token_processor.tasks_queue = None
+ e.token_processor.run = _noop
+ e.cfg.router_config.router = None
+ e._schedule_request_to_worker = _noop
+ e.start()
+ assert e.running and hasattr(e, "insert_task_to_worker_thread")
+ # v1
+ e2 = _eng(monkeypatch)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", True)
+ e2.token_processor.tasks_queue = None
+ e2.token_processor.run = _noop
+ e2.cfg.router_config.router = None
+ e2._schedule_request_to_worker_v1 = _noop
+ e2.start()
+ # decode role
+ e3 = _eng(monkeypatch)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", False)
+ e3.token_processor.tasks_queue = None
+ e3.token_processor.run = _noop
+ e3.cfg.scheduler_config.splitwise_role = "decode"
+ e3.cfg.router_config.router = None
+ e3._schedule_request_to_worker = _noop
+ dc = []
+ e3._decode_process_splitwise_requests = lambda: dc.append(1)
+ e3.start()
+ assert len(dc) == 1
+ # async_llm → start_worker_service (L193)
+ e4 = _eng(monkeypatch)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", False)
+ e4.use_async_llm = True
+ e4.start_worker_service = _noop
+ e4.token_processor.tasks_queue = None
+ e4.token_processor.run = _noop
+ e4.cfg.router_config.router = None
+ e4._schedule_request_to_worker = _noop
+ e4.start()
+ # register_to_router with router enabled (L1748-1749)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.time.sleep", _noop)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.check_service_health", lambda u: False)
+ e5 = _eng(monkeypatch)
+ e5.cfg.router_config.router = "http://test:9999"
+ e5._register_to_router()
+ # daemon thread started — give it a tick then stop engine
+ e5.running = False
+ # register_to_router: success + error + exception paths (L1741-1759)
+ import threading as _thr
+
+ _done = _thr.Event()
+ _post_n = [0]
+
+ def _mock_post(*a, **kw):
+ _post_n[0] += 1
+ if _post_n[0] == 1:
+ return _ns(ok=True)
+ if _post_n[0] == 2:
+ return _ns(ok=False, status_code=500, text="err")
+ raise ConnectionError("mock")
+
+ _sleep_n = [0]
+
+ def _mock_sleep(secs):
+ _sleep_n[0] += 1
+ if _sleep_n[0] >= 3:
+ _done.set()
+ raise OSError("stop")
+
+ monkeypatch.setattr("fastdeploy.engine.common_engine.time.sleep", _mock_sleep)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.check_service_health", lambda u: True)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.requests.post", _mock_post)
+ e6 = _eng(monkeypatch)
+ e6.cfg.router_config.router = "http://test:9999"
+ e6.cfg.register_info = {"host": "test"}
+ e6._register_to_router()
+ _done.wait(timeout=5)
+
+ def test_worker_service(self, monkeypatch):
+ EWQ = lambda **kw: _ns(get_server_port=lambda: 12345, cleanup=_noop)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.EngineWorkerQueue", EWQ)
+ monkeypatch.setattr(
+ "fastdeploy.engine.common_engine.EngineCacheQueue", lambda **kw: _ns(get_server_port=lambda: 9999)
)
- self._detach_finalizer(eng)
-
- def test_control_pause_and_resume_paths(self):
- eng = self._make_mixed_engine()
- eng.is_paused = False
- eng._pause_cond = threading.Condition()
- eng.engine_worker_queue = Mock(exist_tasks=Mock(return_value=False), put_tasks=Mock())
- eng.resource_manager = Mock(
- preempted_all=Mock(return_value=[Request(request_id="r1", prompt_token_ids=[1], prompt_token_ids_len=1)]),
- get_real_bsz=Mock(),
- wait_worker_inflight_requests_finish=Mock(),
- log_status=Mock(),
- cache_manager=Mock(reset=Mock()),
- real_bsz=1,
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENGINE_TASK_QUEUE_WITH_SHM", False)
+ # queue service
+ e = _eng(monkeypatch)
+ e.start_worker_queue_service(start_queue=True)
+ assert hasattr(e, "engine_worker_queue_server")
+ e2 = _eng(monkeypatch)
+ e2.cfg.cache_config.enable_prefix_caching = True
+ e2.start_worker_queue_service(start_queue=True)
+ assert hasattr(e2, "cache_task_queue")
+ # SHM queue address (L391)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENGINE_TASK_QUEUE_WITH_SHM", True)
+ e_shm = _eng(monkeypatch)
+ e_shm.start_worker_queue_service(start_queue=True)
+ assert hasattr(e_shm, "engine_worker_queue")
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENGINE_TASK_QUEUE_WITH_SHM", False)
+ # cmd building
+ cap = {}
+ monkeypatch.setattr(
+ "fastdeploy.engine.common_engine.subprocess.Popen",
+ lambda cmd, **kw: (cap.update(cmd=cmd), _ns(pid=9, poll=lambda: None, stdout=iter([])))[1],
)
- eng.token_processor = Mock(clear_data=Mock())
- eng.scheduler = Mock(get_inflight_requests=Mock(return_value=[]), reset=Mock())
- eng._send_error_response = Mock()
-
- with patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", True):
- eng._control_pause(ControlRequest(request_id="ctrl1", method="pause"))
- self.assertTrue(eng.is_paused)
-
- eng._control_resume(ControlRequest(request_id="ctrl2", method="resume"))
- self.assertFalse(eng.is_paused)
-
- status = eng._control_is_paused(ControlRequest(request_id="ctrl3", method="is_paused"))
- self.assertEqual(status, {"is_paused": False})
- self._detach_finalizer(eng)
-
- def test_run_control_method_unknown_and_success(self):
- eng = self._make_mixed_engine()
- eng.send_response_server = Mock()
- eng._pause_cond = threading.Condition()
-
- eng.run_control_method(ControlRequest(request_id="bad", method="nope"))
- self.assertTrue(eng.send_response_server.send_response.called)
-
- eng.send_response_server.reset_mock()
- eng.is_paused = True
- eng.run_control_method(ControlRequest(request_id="good", method="is_paused"))
- eng.send_response_server.send_response.assert_called()
- self._detach_finalizer(eng)
-
- def test_run_control_method_handler_exception(self):
- eng = self._make_mixed_engine()
- eng.send_response_server = Mock()
- eng._pause_cond = threading.Condition()
-
- with patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", False):
- eng.run_control_method(ControlRequest(request_id="pause", method="pause"))
-
- eng.send_response_server.send_response.assert_called()
- self._detach_finalizer(eng)
-
- def test_call_worker_puts_tasks_and_returns(self):
- eng = self._make_mixed_engine()
- eng.engine_worker_queue = Mock()
-
- class DummyQueue:
- def __init__(self):
- self.name = "q0"
-
- async def get(self, timeout=None):
- return Mock(payload=ControlResponse(request_id="req", result={"ok": True}, error_code=200))
-
- eng._ctrl_output_queues = {"ctrl_w2e_rank0_6778": DummyQueue()}
- result = eng._call_worker(ControlRequest(request_id="req", method="noop"), timeout=1)
- self.assertEqual(result, [{"ok": True}])
- eng.engine_worker_queue.put_tasks.assert_called_once()
- self._detach_finalizer(eng)
-
- def test_control_sleep_defaults_tags_and_dispatches_cache_transfer(self):
- cfg = self._make_cfg(splitwise_role="mixed", num_gpu_blocks_override=4)
- eng = self._make_engine(cfg)
- eng.cfg.cache_config.num_cpu_blocks = 1
- eng.engine_worker_queue = Mock()
- eng.cache_task_queue = Mock()
- eng.resource_manager.cache_manager.reset = Mock()
- eng._control_pause = Mock()
- eng._wait_for_control_responses = AsyncMock(return_value=[{"ok": True}])
-
- result = eng._control_sleep(ControlRequest(request_id="sleep", method="sleep", args={}))
-
- self.assertEqual(result, [{"ok": True}])
- eng._control_pause.assert_called_once_with(None)
- eng.resource_manager.cache_manager.reset.assert_called_once()
- eng.engine_worker_queue.put_tasks.assert_called_once()
- eng.cache_task_queue.put_transfer_task.assert_called_once()
- sleep_req = eng.engine_worker_queue.put_tasks.call_args.args[0][0][0]
- self.assertEqual(sleep_req.args["tags"], "weight,kv_cache")
- self._detach_finalizer(eng)
-
- def test_control_wakeup_resumes_after_wait(self):
- cfg = self._make_cfg(splitwise_role="mixed", num_gpu_blocks_override=4)
- eng = self._make_engine(cfg)
- eng.cfg.cache_config.num_cpu_blocks = 1
- eng.engine_worker_queue = Mock()
- eng.cache_task_queue = Mock()
- eng._control_resume = Mock()
- eng._wait_for_control_responses = AsyncMock(return_value=[{"ok": True}])
-
- result = eng._control_wakeup(ControlRequest(request_id="wakeup", method="wakeup", args={"tags": "kv_cache"}))
-
- self.assertEqual(result, [{"ok": True}])
- eng.engine_worker_queue.put_tasks.assert_called_once()
- eng.cache_task_queue.put_transfer_task.assert_called_once()
- eng._control_resume.assert_called_once_with(None)
- self._detach_finalizer(eng)
-
- def test_control_update_weights_requires_pause(self):
- eng = self._make_mixed_engine()
- eng.is_paused = False
- eng._pause_cond = threading.Condition()
-
- with self.assertRaises(Exception):
- eng._control_update_weights(ControlRequest(request_id="ctrl", method="update_weights"))
- self._detach_finalizer(eng)
-
- def test_insert_zmq_task_to_scheduler_normal_request(self):
- eng = self._make_mixed_engine()
- eng.running = True
- eng.is_paused = False
- eng.guided_decoding_checker = None
- eng.resource_manager = Mock(abort_req_ids_set=set(), requests={})
- eng.scheduler = Mock()
- eng.engine_worker_queue = Mock()
-
- class DummyMetrics:
- def __init__(self):
- self.requests_number = Mock(inc=Mock())
- self.num_requests_waiting = Mock(inc=Mock())
-
- class DummyRecv:
- def __init__(self):
- self.calls = 0
-
- def receive_json_once(self, block):
- self.calls += 1
- if self.calls == 1:
- return None, {"request_id": "ctrl", "method": "is_paused", "args": {}}
- if self.calls == 2:
- return None, {
- "request_id": "req1",
- "prompt_token_ids": [1, 2],
- "prompt_token_ids_len": 2,
- "temperature": 1.0,
- }
- eng.running = False
- return None, None
-
- eng.recv_request_server = DummyRecv()
- eng.run_control_method = Mock()
- eng.scheduler.put_requests.return_value = [("req1", None)]
-
- with (
- patch("fastdeploy.engine.common_engine.main_process_metrics", DummyMetrics()),
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", False),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._insert_zmq_task_to_scheduler()
-
- eng.run_control_method.assert_called_once()
- eng.scheduler.put_requests.assert_called()
- self._detach_finalizer(eng)
-
- def test_insert_zmq_task_to_scheduler_internal_adapter_decode_returns(self):
- cfg = self._make_cfg(
- splitwise_role="decode",
- num_gpu_blocks_override=4,
- router="0.0.0.0:30000",
+ dp = _ns(
+ tokenizer=_ns(vocab={"a": 0}, get_vocab=lambda: {}, encode=lambda *a, **k: [10]),
+ eos_token_id_len=1,
+ pad_token_id=0,
+ image_patch_id=-1,
)
- eng = self._make_engine(cfg)
- eng.running = True
-
- with patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", True):
- eng._insert_zmq_task_to_scheduler()
-
- self._detach_finalizer(eng)
-
- def test_schedule_request_to_worker_sends_tasks(self):
- cfg = self._make_cfg(
- splitwise_role="prefill",
- num_gpu_blocks_override=4,
- router="0.0.0.0:30000",
+ e3 = _eng(monkeypatch)
+ e3.data_processor = dp
+ e3.mm_max_tokens_per_item = None
+ e3.do_profile = 0
+ e3.ipc_signal_suffix = 12345
+ e3._start_worker_service()
+ assert "--max_num_seqs" in cap["cmd"]
+ # sp_model + think tokens + line_break via encode
+ dp_sp = _ns(
+ tokenizer=_ns(
+ sp_model=[0] * 50,
+ vocab={"a": 0},
+                get_vocab=lambda: {"<think>": 10, "</think>": 11},
+ encode=lambda *a, **k: {"input_ids": [[42]]},
+ ),
+ eos_token_id_len=1,
+ pad_token_id=0,
+ image_patch_id=-1,
)
- eng = self._make_engine(cfg)
- eng.running = True
-
- eng.exist_prefill_task_signal = self._Sig(0)
- eng.engine_worker_queue = Mock(exist_tasks=Mock(return_value=False), num_cache_infos=Mock(return_value=0))
-
- class DummyRM:
- def __init__(self):
- self.abort_req_ids_set = set()
-
- def available_batch(self):
- return 1
-
- def available_block_num(self):
- return 32
-
- def check_and_free_block_tables(self):
- pass
-
- eng.resource_manager = DummyRM()
- eng.split_connector = Mock(current_request_ids=[], has_splitwise_tasks=Mock(return_value=False))
- eng.scheduler = Mock()
- task = Request(request_id="r0", prompt_token_ids=[1], prompt_token_ids_len=1)
- eng.scheduler.get_requests.return_value = [task]
-
- def insert_tasks(tasks, current_id):
- eng.running = False
- return True
-
- eng.insert_tasks = Mock(side_effect=insert_tasks)
-
- with patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None):
- eng._schedule_request_to_worker()
-
- eng.split_connector.send_splitwise_tasks.assert_called_once()
- eng.insert_tasks.assert_called_once()
- self._detach_finalizer(eng)
-
- def test_schedule_request_to_worker_waits_for_capacity(self):
- eng = self._make_mixed_engine()
- eng.running = True
-
- class DummyRM:
- def available_batch(self):
- eng.running = False
- return 0
-
- eng.resource_manager = DummyRM()
- eng.engine_worker_queue = Mock(exist_tasks=Mock(return_value=False))
-
- with patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None):
- eng._schedule_request_to_worker()
-
- self._detach_finalizer(eng)
-
- def test_schedule_request_to_worker_v1_mixed_single_iteration(self):
- eng = self._make_mixed_engine()
- self._setup_v1_engine(eng)
-
- task = Request(request_id="v1_r0", prompt_token_ids=[1], prompt_token_ids_len=1)
- task.metrics.scheduler_recv_req_time = time.time()
-
- eng.scheduler = Mock(get_requests=Mock(return_value=[task]), put_results=Mock())
- eng.engine_worker_queue = Mock(exist_tasks=Mock(return_value=False), put_tasks=Mock())
-
- eng.resource_manager = self._make_v1_decode_rm(eng, ([], []), with_add_request=True)
-
- try:
- with (
- patch("fastdeploy.engine.common_engine.ThreadPoolExecutor", self._make_dummy_executor(eng)),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._schedule_request_to_worker_v1()
- finally:
- eng.running = False
-
- eng.resource_manager.add_request.assert_called_once_with(task)
- self._detach_finalizer(eng)
-
- def test_schedule_request_to_worker_v1_prefill_decode_alloc_error_safe(self):
- cfg = self._make_cfg(
- splitwise_role="prefill",
- num_gpu_blocks_override=4,
- router="0.0.0.0:30000",
- kv_cache_ratio=1,
+ e3b = _eng(monkeypatch)
+ e3b.data_processor = dp_sp
+ e3b.mm_max_tokens_per_item = None
+ e3b.do_profile = 0
+ e3b.ipc_signal_suffix = 12345
+ e3b._start_worker_service()
+ assert "--think_start_id 10" in cap["cmd"]
+ # line_break via .input_ids attr
+ dp_lb = _ns(
+ tokenizer=_ns(
+ vocab={"a": 0},
+ get_vocab=lambda: {},
+ encode=lambda *a, **k: _ns(input_ids=[99]),
+ ),
+ eos_token_id_len=1,
+ pad_token_id=0,
+ image_patch_id=-1,
)
- eng = self._make_engine(cfg)
- self._setup_v1_engine(eng)
-
- task = Request(request_id="v1_p0", prompt_token_ids=[2], prompt_token_ids_len=1)
- task.idx = 0
- task.metrics.scheduler_recv_req_time = time.time()
-
- eng.scheduler = Mock(get_requests=Mock(return_value=[task]), put_results=Mock())
- eng.engine_worker_queue = Mock(
- exist_tasks=Mock(return_value=False),
- get_finished_add_cache_task_req=Mock(return_value=[]),
+ e3c = _eng(monkeypatch)
+ e3c.data_processor = dp_lb
+ e3c.mm_max_tokens_per_item = None
+ e3c.do_profile = 0
+ e3c.ipc_signal_suffix = 12345
+ e3c._start_worker_service()
+ assert "--line_break_id 99" in cap["cmd"]
+ # line_break via raw int from encode (L1997)
+ dp_int = _ns(
+ tokenizer=_ns(
+ vocab={"a": 0},
+ get_vocab=lambda: {},
+ encode=lambda *a, **k: 77,
+ ),
+ eos_token_id_len=1,
+ pad_token_id=0,
+ image_patch_id=-1,
)
-
- eng.resource_manager = self._make_v1_prefill_continuous_rm(eng, waiting_async_result=False)
- eng.split_connector = Mock(
- send_splitwise_tasks=Mock(),
- check_decode_allocated=Mock(return_value=(False, "decode failed")),
- send_cache_info_to_messager=Mock(),
+ e3e = _eng(monkeypatch)
+ e3e.data_processor = dp_int
+ e3e.mm_max_tokens_per_item = None
+ e3e.do_profile = 0
+ e3e.ipc_signal_suffix = 12345
+ e3e._start_worker_service()
+ assert "--line_break_id 77" in cap["cmd"]
+ # nnode > 1
+ e3d = _eng(monkeypatch)
+ e3d.data_processor = dp
+ e3d.mm_max_tokens_per_item = None
+ e3d.do_profile = 0
+ e3d.ipc_signal_suffix = 12345
+ e3d.cfg.nnode = 2
+ e3d.cfg.ips = ["10.0.0.1", "10.0.0.2"]
+ e3d._start_worker_service()
+ assert "--nnodes 2" in cap["cmd"]
+ # logits_processors + mm_max_tokens_per_item (L2051, L2053)
+ e3f = _eng(monkeypatch)
+ e3f.data_processor = dp
+ e3f.mm_max_tokens_per_item = {"image": 512}
+ e3f.do_profile = 0
+ e3f.ipc_signal_suffix = 12345
+ e3f.cfg.structured_outputs_config.logits_processors = ["xgrammar"]
+ e3f.cfg.cache_config.num_gpu_blocks_override = 64
+ e3f._start_worker_service()
+ assert "--logits-processors xgrammar" in cap["cmd"]
+ assert "--mm_max_tokens_per_item" in cap["cmd"]
+ # full flow with profiling
+ e4 = _eng(monkeypatch)
+ e4.use_async_llm = True
+ e4.do_profile = 1
+ monkeypatch.setattr("fastdeploy.engine.common_engine.IPCSignal", lambda **kw: _Sig(kw.get("array")))
+ e4.ipc_signal_suffix = 12345
+ e4.data_processor = dp
+ e4.mm_max_tokens_per_item = None
+
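+        # fake IPC signals: worker ready, model loaded, and 100 blocks reported by profiling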
+ def init4():
+ e4.worker_ready_signal = _Sig(np.array([1], dtype=np.int32))
+ e4.loaded_model_signal = _Sig(np.array([1], dtype=np.int32))
+ e4.get_profile_block_num_signal = _Sig(np.array([100], dtype=np.int32))
+
+ e4._init_worker_signals = init4
+ e4.check_worker_initialize_status = lambda: True
+ e4.cfg.cache_config.reset = _noop
+ e4.resource_manager.reset_cache_config = _noop
+ monkeypatch.setattr("fastdeploy.engine.common_engine.time.sleep", _noop)
+ e4.start_worker_service(async_llm_pid=None)
+ assert e4.do_profile == 0
+ # worker dies
+ e5 = _eng(monkeypatch)
+ e5.use_async_llm = True
+ e5.do_profile = 0
+ e5.ipc_signal_suffix = 12345
+ e5.data_processor = dp
+ e5.mm_max_tokens_per_item = None
+ monkeypatch.setattr(
+ "fastdeploy.engine.common_engine.subprocess.Popen",
+ lambda cmd, **kw: _ns(pid=9, poll=lambda: 1, stdout=iter([])),
)
- try:
- with (
- patch("fastdeploy.engine.common_engine.envs.PREFILL_CONTINUOUS_REQUEST_DECODE_RESOURCES", False),
- patch("fastdeploy.engine.common_engine.ThreadPoolExecutor", self._make_dummy_executor(eng)),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._schedule_request_to_worker_v1()
- finally:
- eng.running = False
-
- eng.scheduler.put_results.assert_called_once()
- eng.resource_manager.add_request_in_p.assert_not_called()
- self._detach_finalizer(eng)
-
- def test_schedule_request_to_worker_v1_decode_preempted_and_errors(self):
- cfg = self._make_cfg(
- splitwise_role="decode",
- num_gpu_blocks_override=4,
- router="0.0.0.0:30000",
+ def init5():
+ e5.worker_ready_signal = _Sig(np.array([0], dtype=np.int32))
+ e5.loaded_model_signal = _Sig(np.array([0], dtype=np.int32))
+
+ e5._init_worker_signals = init5
+ e5.check_worker_initialize_status = lambda: False
+ assert e5.start_worker_service(async_llm_pid=None) is False
+
+ def test_exit_sub_services(self, monkeypatch):
+ e = _eng(monkeypatch)
+ closed = []
+ e.send_response_server = _ns(close=lambda: closed.append("s"))
+ e.recv_request_server = _ns(close=lambda: closed.append("r"))
+ e._exit_sub_services()
+ assert not e.running and e.exist_task_signal.cleared and "s" in closed
+ # async + worker + cache
+ e2 = _eng(monkeypatch)
+ e2.use_async_llm = True
+ killed = []
+ e2.worker_proc = _ns(pid=100)
+ monkeypatch.setattr("os.getpgid", lambda pid: pid)
+ monkeypatch.setattr("os.killpg", lambda pgid, sig: killed.append(pgid))
+ e2.worker_ready_signal = _Sig()
+ e2.loaded_model_signal = _Sig()
+ e2.cache_manager_processes = [_ns(pid=200)]
+ e2.resource_manager.cache_manager.shm_cache_task_flag_broadcast = _Sig()
+ e2.resource_manager.cache_manager.cache_ready_signal = _Sig()
+ e2.cache_task_queue = _ns(cleanup=_noop)
+ e2.recv_control_cmd_server = _ns(close=_noop)
+ e2.get_profile_block_num_signal = _Sig()
+ e2._exit_sub_services()
+ assert 100 in killed and 200 in killed
+ # cache_task_queue with manager (no cleanup attr)
+ e3 = _eng(monkeypatch)
+ e3.use_async_llm = True
+ e3.worker_proc = None
+ e3.worker_ready_signal = _Sig()
+ e3.loaded_model_signal = _Sig()
+ shut = []
+ e3.cache_task_queue = _ns(manager=_ns(shutdown=lambda: shut.append(1)))
+ e3._exit_sub_services()
+ assert len(shut) == 1
+ # dp_processed join + cleanup
+ e4 = _eng(monkeypatch)
+ e4.use_async_llm = True
+ e4.worker_proc = None
+ e4.worker_ready_signal = _Sig()
+ e4.loaded_model_signal = _Sig()
+ joined = []
+ cleaned = []
+ e4.dp_processed = [_ns(pid=300, join=lambda: joined.append(1))]
+ e4.dp_engine_worker_queue_server = [_ns(cleanup=lambda: cleaned.append(1))]
+ e4._exit_sub_services()
+ assert len(joined) == 1 and len(cleaned) == 1
+ # worker kill exception (OSError)
+ e5 = _eng(monkeypatch)
+ e5.use_async_llm = True
+ e5.worker_proc = _ns(pid=999)
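+        # the (_ for _ in ()).throw(...) idiom makes the lambda raise OSError when os.getpgid is called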
+ monkeypatch.setattr("os.getpgid", lambda pid: (_ for _ in ()).throw(OSError("no pid")))
+ e5.worker_ready_signal = _Sig()
+ e5.loaded_model_signal = _Sig()
+ e5._exit_sub_services() # should not raise
+ # engine_worker_queue_server cleanup
+ e6 = _eng(monkeypatch)
+ qc = []
+ e6.engine_worker_queue_server = _ns(cleanup=lambda: qc.append(1))
+ e6._exit_sub_services()
+ assert len(qc) == 1
+
+ def test_signals_and_setup(self, monkeypatch):
+ monkeypatch.setattr("fastdeploy.engine.common_engine.IPCSignal", lambda **kw: _Sig(kw.get("array")))
+ monkeypatch.setattr("fastdeploy.engine.common_engine.paddle.is_compiled_with_custom_device", lambda x: False)
+ # monitor signals
+ e = _eng(monkeypatch)
+ e._init_worker_monitor_signals()
+ assert hasattr(e, "exist_task_signal")
+ # worker signals: basic + profile + prefix caching
+ e.ipc_signal_suffix = 12345
+ e.do_profile = 0
+ e._init_worker_signals()
+ assert hasattr(e, "worker_ready_signal")
+ e.do_profile = 1
+ e._init_worker_signals()
+ assert hasattr(e, "get_profile_block_num_signal")
+ e2 = _eng(monkeypatch)
+ e2.ipc_signal_suffix = 12345
+ e2.do_profile = 0
+ e2.cfg.cache_config.enable_prefix_caching = True
+ e2._init_worker_signals()
+ assert hasattr(e2, "launched_cache_manager_signal")
+ # expert parallel
+ e3 = _eng(monkeypatch)
+ e3.ipc_signal_suffix = 12345
+ e3.do_profile = 0
+ e3.cfg.parallel_config.enable_expert_parallel = True
+ e3.cfg.parallel_config.data_parallel_size = 2
+ e3.cfg.nnode = 1
+ e3._init_worker_signals()
+ assert hasattr(e3, "launched_expert_service_signal")
+ # create_data_processor
+ e4 = _eng(monkeypatch)
+ monkeypatch.setattr(
+ "fastdeploy.engine.common_engine.InputPreprocessor",
+ lambda *a, **kw: _ns(create_processor=lambda: _ns(get_mm_max_tokens_per_item=lambda ml: {"image": 128})),
)
- eng = self._make_engine(cfg)
- self._setup_v1_engine(eng)
-
- task = Request(request_id="v1_d0", prompt_token_ids=[3], prompt_token_ids_len=1)
- task.task_type = RequestType.PREEMPTED
- task.metrics.scheduler_recv_req_time = time.time()
-
- eng.scheduler = Mock(get_requests=Mock(return_value=[]), put_results=Mock())
- eng.engine_worker_queue = Mock(
- exist_tasks=Mock(return_value=False), put_tasks=Mock(), num_tasks=Mock(return_value=0)
+ e4.cfg.get_max_chunk_tokens = lambda mm: 256
+ e4.cfg.cache_config.postprocess = _noop
+ e4.create_data_processor()
+ assert e4.mm_max_tokens_per_item == {"image": 128}
+ # launch_components
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_MULTI_API_SERVER", False)
+ e5 = _eng(monkeypatch)
+ e5.cfg.scheduler_config.splitwise_role = "prefill"
+ e5.cfg.scheduler_config.name = "local"
+ e5.launch_components()
+ assert hasattr(e5, "splitwise_receive_thread")
+ # launch_components: splitwise scheduler
+ e5b = _eng(monkeypatch)
+ e5b.cfg.scheduler_config.splitwise_role = "prefill"
+ e5b.cfg.scheduler_config.name = "splitwise"
+ e5b.scheduler.start = lambda role, host, info: None
+ e5b.launch_components()
+ # launch_components: dp scheduler
+ e5c = _eng(monkeypatch)
+ e5c.cfg.scheduler_config.splitwise_role = "mixed"
+ e5c.cfg.scheduler_config.name = "dp"
+ e5c.scheduler.start = lambda rank: None
+ e5c.launch_components()
+ # stop_profile
+ e6 = _eng(monkeypatch)
+ e6.do_profile = 1
+ e6.get_profile_block_num_signal = _Sig(np.array([100], dtype=np.int32))
+ e6.worker_proc = None
+ e6.cfg.cache_config.reset = _noop
+ e6.resource_manager.reset_cache_config = _noop
+ e6.ipc_signal_suffix = 12345
+ e6._stop_profile()
+ assert e6.do_profile == 0
+ # stop_profile with prefix_caching → starts cache service
+ e6b = _eng(monkeypatch)
+ e6b.do_profile = 1
+ e6b.get_profile_block_num_signal = _Sig(np.array([100], dtype=np.int32))
+ e6b.worker_proc = None
+ e6b.cfg.cache_config.reset = _noop
+ e6b.cfg.cache_config.enable_prefix_caching = True
+ e6b.resource_manager.reset_cache_config = _noop
+ e6b.ipc_signal_suffix = 12345
+ started_cache = []
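+        # stub start_cache_service: record the call and return an empty process list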
+ e6b.start_cache_service = lambda d, s: (started_cache.append(1), [])[1]
+ e6b._stop_profile()
+ assert len(started_cache) == 1
+ # stop_profile: worker_proc dies during profiling
+ e6c = _eng(monkeypatch)
+ e6c.do_profile = 1
+ e6c.get_profile_block_num_signal = _Sig(np.array([0], dtype=np.int32))
+ e6c.worker_proc = _ns(poll=lambda: 1)
+ with pytest.raises(RuntimeError, match="Worker process failed"):
+ e6c._stop_profile()
+ # check_worker_initialize_status: success
+ e7 = _eng(monkeypatch)
+ e7.cfg.worker_num_per_node = 1
+ e7.worker_ready_signal = _Sig(np.array([0], dtype=np.int32))
+ e7.worker_init_status = {}
+ e7.worker_proc = _ns(stdout=iter([b"Loading checkpoint shards: 100\n"]), poll=lambda: None)
+
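+        # flip the worker-ready flag from a background thread while check_worker_initialize_status polls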
+ def ready7():
+ time.sleep(0.05)
+ e7.worker_ready_signal.value[0] = 1
+
+ threading.Thread(target=ready7, daemon=True).start()
+ assert e7.check_worker_initialize_status()
+ # check_worker_initialize_status: layer_loading (covers L2219, 2237)
+ e7b = _eng(monkeypatch)
+ e7b.cfg.worker_num_per_node = 1
+ e7b.cfg.model_config.num_hidden_layers = 4
+ e7b.worker_ready_signal = _Sig(np.array([0], dtype=np.int32))
+ e7b.worker_init_status = {}
+ e7b.worker_proc = _ns(
+ stdout=iter([b"Start load layer 0\n", b"Start load layer 1\n", b"set state for layer 2\n"]),
+ poll=lambda: None,
)
- eng._send_error_response = Mock()
-
- eng.resource_manager = self._make_v1_decode_rm(eng, ([task], [("rid_x", None), ("rid_y", "bad")]))
-
- try:
- with (
- patch("fastdeploy.engine.common_engine.ThreadPoolExecutor", self._make_dummy_executor(eng)),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._schedule_request_to_worker_v1()
- finally:
- eng.running = False
-
- eng.scheduler.put_results.assert_called_once()
- eng.engine_worker_queue.put_tasks.assert_called_once()
- eng._send_error_response.assert_called_once_with("rid_y", "bad")
- self._detach_finalizer(eng)
- def test_schedule_request_to_worker_v1_decode_prefill_task_path(self):
- cfg = self._make_cfg(
- splitwise_role="decode",
- num_gpu_blocks_override=4,
- router="0.0.0.0:30000",
+ def ready7b():
+ time.sleep(0.1)
+ e7b.worker_ready_signal.value[0] = 1
+
+ threading.Thread(target=ready7b, daemon=True).start()
+ assert e7b.check_worker_initialize_status()
+ # check_worker_initialize_status: proc dies
+ e8 = _eng(monkeypatch)
+ e8.cfg.worker_num_per_node = 1
+ e8.worker_ready_signal = _Sig(np.array([0], dtype=np.int32))
+ e8.worker_init_status = {}
+ e8.worker_proc = _ns(stdout=iter([]), poll=lambda: 1)
+ assert not e8.check_worker_initialize_status()
+ # custom device path in _init_worker_signals
+ monkeypatch.setattr("fastdeploy.engine.common_engine.paddle.is_compiled_with_custom_device", lambda x: True)
+ e9 = _eng(monkeypatch)
+ e9.ipc_signal_suffix = 12345
+ e9.do_profile = 0
+ e9._init_worker_signals()
+ assert hasattr(e9, "worker_ready_signal")
+
+
+class TestQueryControlMisc:
+ def test_health_and_queries(self, monkeypatch):
+ e = _eng(monkeypatch)
+ e.resource_manager.stop_flags = np.array([True, False], dtype=bool)
+ assert e.task_is_finished(0) and not e.task_is_finished(1)
+ assert not e.all_tasks_finished()
+ e.resource_manager.stop_flags[:] = True
+ assert e.all_tasks_finished()
+ # unhandled request num
+ e.scheduler.get_unhandled_request_num = lambda: 5
+ assert e._get_scheduler_unhandled_request_num() == 5
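+        # non-callable attributes and negative counts both fall back to 0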
+ e.scheduler.get_unhandled_request_num = "nope"
+ assert e._get_scheduler_unhandled_request_num() == 0
+ e.scheduler.get_unhandled_request_num = lambda: -3
+ assert e._get_scheduler_unhandled_request_num() == 0
+ # check_health
+ e.worker_healthy_live_signal.value[0] = 0
+ ok, _ = e.check_health()
+ assert ok
+ e.worker_healthy_live_signal.value = np.array([time.time() - 60], dtype=np.float64)
+ ok, msg = e.check_health(time_interval_threashold=30)
+ assert not ok and "Not Healthy" in msg
+ # worker_processes_ready
+ e.cfg.worker_num_per_node = 2
+ e.worker_ready_signal.value = np.array([1, 0], dtype=np.int32)
+ assert not e._worker_processes_ready()
+ e.worker_ready_signal.value = np.array([1, 1], dtype=np.int32)
+ assert e._worker_processes_ready()
+
+ def test_control_api(self, monkeypatch):
+ e = _eng(monkeypatch)
+ e.is_paused = True
+ assert e._control_is_paused(_ns(request_id="r")) == {"is_paused": True}
+ e._control_resume(_ns(request_id="r"))
+ assert not e.is_paused
+ # update_weights requires pause
+ with pytest.raises(Exception, match="Pause"):
+ e._control_update_weights(_ns(request_id="r"))
+ e.is_paused = True
+ called = []
+ e._call_worker = lambda cr, t: called.append(cr.request_id)
+ e._control_update_weights(_ns(request_id="r"))
+ assert called == ["r"]
+ # run_control_method unknown
+ sent = []
+ e.send_response_server = _ns(send_response=lambda rid, r: sent.append(rid))
+ e.run_control_method(_ns(request_id="r1", method="x", params={}, get_method=lambda: "x"))
+ assert sent[-1] == "r1"
+ # v1 pause
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", True)
+ e.cfg.scheduler_config.name = "local"
+ e.is_paused = False
+ e.engine_worker_queue = _ns(exist_tasks=lambda: False, put_tasks=_noop, clear_data=_noop)
+ e.resource_manager.log_status = _noop
+ e.resource_manager.preempted_all = lambda: []
+ e.resource_manager.cache_manager = _ns(reset=_noop)
+ e.scheduler.get_inflight_requests = lambda: []
+ e._control_pause(_ns(request_id="p"))
+ assert e.is_paused
+
+ # inflight requests aborted during pause
+ e.is_paused = False
+ inflight = [_ns(request_id="inf1")]
+ e.scheduler.get_inflight_requests = lambda: inflight
+ e._control_pause(_ns(request_id="p1c"))
+ assert e.is_paused
+ # non-local scheduler raises
+ e.is_paused = False
+ e.cfg.scheduler_config.name = "dp"
+ with pytest.raises(Exception, match="local scheduler"):
+ e._control_pause(_ns(request_id="p1d"))
+ # non-v1 pause raises
+ e.is_paused = False
+ e.cfg.scheduler_config.name = "local"
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", False)
+ with pytest.raises(Exception, match="pause only supported"):
+ e._control_pause(_ns(request_id="p2"))
+ # run_control_method exception path — handler raises
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", True)
+ e.is_paused = True
+ e._call_worker = lambda cr, t: (_ for _ in ()).throw(RuntimeError("fail"))
+ e.run_control_method(
+ _ns(request_id="ex", method="update_weights", params={}, get_method=lambda: "update_weights")
)
- eng = self._make_engine(cfg)
- self._setup_v1_engine(eng)
-
- task = Request(request_id="v1_d1", prompt_token_ids=[4], prompt_token_ids_len=1)
- task.task_type = RequestType.PREFILL
- task.trace_carrier = {}
- task.metrics.scheduler_recv_req_time = time.time()
-
- eng.scheduler = Mock(get_requests=Mock(return_value=[]), put_results=Mock())
- eng.engine_worker_queue = Mock(
- exist_tasks=Mock(return_value=False), put_tasks=Mock(), num_tasks=Mock(return_value=0)
+ assert sent[-1] == "ex"
+ # pause with running requests
+ e.is_paused = False
+ e.cfg.scheduler_config.name = "local"
+ et_cc = [0]
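+        # exist_tasks returns True on the first call only, so the pause path sees the queue drain after one wait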
+ e.engine_worker_queue = _ns(
+ exist_tasks=lambda: (et_cc.__setitem__(0, et_cc[0] + 1) or et_cc[0] <= 1), put_tasks=_noop
)
-
- eng.resource_manager = self._make_v1_decode_rm(eng, ([task], []))
-
- try:
- with (
- patch("fastdeploy.engine.common_engine.ThreadPoolExecutor", self._make_dummy_executor(eng)),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._schedule_request_to_worker_v1()
- finally:
- eng.running = False
-
- eng.engine_worker_queue.put_tasks.assert_called_once()
- self._detach_finalizer(eng)
-
- def test_schedule_request_to_worker_v1_error_task_none_skips_send(self):
- cfg = self._make_cfg(
- splitwise_role="decode",
- num_gpu_blocks_override=4,
- router="0.0.0.0:30000",
+ e.resource_manager.preempted_all = lambda: [_ns(request_id="rr1")]
+ e.resource_manager.get_real_bsz = _noop
+ e.resource_manager.real_bsz = 1
+ e.resource_manager.wait_worker_inflight_requests_finish = _noop
+ e.resource_manager.cache_manager = _ns(reset=_noop)
+ e.scheduler.get_inflight_requests = lambda: []
+ e._control_pause(_ns(request_id="p_run"))
+ assert e.is_paused
+ # run_control_method success path (known method completes normally)
+ e.run_control_method(_ns(request_id="sp", method="is_paused", params={}, get_method=lambda: "is_paused"))
+ assert sent[-1] == "sp"
+ # pause timeout — exist_tasks always True (L1315-1319)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.time.sleep", _noop)
+ e.is_paused = False
+ e.resource_manager.log_status = _noop
+ e.engine_worker_queue = _ns(exist_tasks=lambda: True)
+ with pytest.raises(Exception, match="timeout"):
+ e._control_pause(_ns(request_id="p_to"))
+
+ def test_misc_utils(self, monkeypatch):
+ e = _eng(monkeypatch)
+ # decode_token: text disabled
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_RETURN_TEXT", False)
+ d, t = e._decode_token([1, 2], "r", is_end=False)
+ assert d == "" and t == [1, 2]
+ # text enabled
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_RETURN_TEXT", True)
+ e.data_processor = _ns(ids2tokens=lambda t, r: ("hi", [1, 2, 3], None), decode_status={"r1": [0, 2]})
+ d, _ = e._decode_token([1, 2, 3], "r1", is_end=False)
+ assert d == "hi"
+ e.data_processor.decode_status["r2"] = [0, 1]
+ e.data_processor.ids2tokens = lambda t, r: ("end", [10], None)
+ e._decode_token([10], "r2", is_end=True)
+ assert "r2" not in e.data_processor.decode_status
+ # clear_data
+ e.send_response_server = _ns(req_dict={})
+ e.recv_request_server = _ns(req_dict={})
+ e.cache_task_queue = _ns(clear_transfer_task=_noop)
+ assert e.clear_data()
+ e.token_processor.clear_data = lambda: (_ for _ in ()).throw(RuntimeError("x"))
+ assert not e.clear_data()
+ # check_and_free_block_tables (L1702)
+ freed = []
+ e.resource_manager.check_and_free_block_tables = lambda: freed.append(1)
+ e.check_and_free_block_tables()
+ assert freed
+ # _setting_environ_variables
+ e2 = _eng(monkeypatch)
+ r = e2._setting_environ_variables()
+ assert "FLAGS_use_append_attn=1" in r
+ e2.cfg.scheduler_config.splitwise_role = "prefill"
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", False)
+ assert "FLAGS_use_pd_disaggregation=1" in e2._setting_environ_variables()
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", True)
+ assert "FLAGS_use_pd_disaggregation_per_chunk=1" in e2._setting_environ_variables()
+ # enable_mm → FLAGS_max_partition_size (L1941)
+ e2.cfg.model_config.enable_mm = True
+ assert "FLAGS_max_partition_size=1024" in e2._setting_environ_variables()
+ # _send_error_response
+ e3 = _eng(monkeypatch)
+ sent = []
+ e3.send_response_server = _ns(send_response=lambda rid, r: sent.append(rid))
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False)
+ e3._send_error_response("r1", "err", 503)
+ assert sent[-1] == "r1"
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", True)
+ e3._send_error_response("r2", "err")
+ assert sent[-1] is None
+
+ def test_call_worker(self, monkeypatch):
+ """Cover _call_worker + _wait_all_control_responses (L1398-1434)."""
+ import asyncio
+
+ e = _eng(monkeypatch)
+ # set up mock ctrl queues that return a ControlResponse-like object
+ resp = _ns(request_id="cw1", error_code=200, error_message="", result={"ok": True})
+ payload_msg = _ns(payload=resp)
+
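+        # the mock queue's async get() returns a message whose .payload carries the ControlResponse-like object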
+ async def _get(timeout=0):
+ return payload_msg
+
+ q = _ns(get=_get, name="q0")
+ e._ctrl_worker_output_queues = [q]
+ e.engine_worker_queue = _ns(put_tasks=_noop)
+ cr = _ns(request_id="cw1")
+ result = e._call_worker(cr, timeout=10)
+ assert result == [{"ok": True}]
+
+ # timeout path
+ async def _get_slow(timeout=0):
+ await asyncio.sleep(999)
+
+ q2 = _ns(get=_get_slow, name="q1")
+ e2 = _eng(monkeypatch)
+ e2._ctrl_worker_output_queues = [q2]
+ e2.engine_worker_queue = _ns(put_tasks=_noop)
+ with pytest.raises(Exception, match="Timeouted"):
+ e2._call_worker(_ns(request_id="cw2"), timeout=0.01)
+
+ # error_code != 200
+ resp_err = _ns(request_id="cw3", error_code=500, error_message="bad", result=None)
+ payload_err = _ns(payload=resp_err)
+
+ async def _get_err(timeout=0):
+ return payload_err
+
+ q3 = _ns(get=_get_err, name="q2")
+ e3 = _eng(monkeypatch)
+ e3._ctrl_worker_output_queues = [q3]
+ e3.engine_worker_queue = _ns(put_tasks=_noop)
+ with pytest.raises(Exception, match="Call Worker error"):
+ e3._call_worker(_ns(request_id="cw3"), timeout=10)
+
+ def test_chunk_size(self, monkeypatch):
+ e = _eng(monkeypatch)
+ e.cfg.cache_config.enable_chunked_prefill = False
+ reqs = [_ns(prompt_token_ids_len=100)]
+ e.update_requests_chunk_size(reqs)
+ assert not hasattr(reqs[0], "prefill_chunk_info")
+ e.cfg.cache_config.enable_chunked_prefill = True
+ e.cfg.cache_config.block_size = 16
+ e.cfg.scheduler_config.max_num_batched_tokens = 128
+ e.partial_chunked_tokens = [0, 128]
+ ci = {}
+ req = _ns(prompt_token_ids_len=64, set=lambda k, v: ci.update({k: v}))
+ e.update_requests_chunk_size([req])
+ assert sum(ci["prefill_chunk_info"]) == 64
+ # multiple with remainder
+ e.cfg.scheduler_config.max_num_batched_tokens = 256
+ e.cfg.max_num_partial_prefills = 2
+ e.partial_chunked_tokens = [0, 256, 32]
+ cs = [{}, {}]
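+        # i=i binds the loop index at definition time so each setter writes into its own dict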
+ rs = [_ns(prompt_token_ids_len=100, set=lambda k, v, i=i: cs[i].update({k: v})) for i in range(2)]
+ e.update_requests_chunk_size(rs)
+ for c in cs:
+ assert sum(c["prefill_chunk_info"]) == 100
+
+
+class TestInsertTasks:
+ def test_insert_basic(self, monkeypatch):
+ _ptr(monkeypatch)
+ e = _eng(monkeypatch)
+ e.cfg.scheduler_config.splitwise_role = "mixed"
+ e.cfg.cache_config.enable_chunked_prefill = False
+ e.resource_manager.allocate_resources_for_new_tasks = lambda t: t
+ e.resource_manager.real_bsz = 1
+ put = []
+ e.engine_worker_queue = _ns(put_tasks=lambda t: put.append(t))
+        assert e.insert_tasks(_task())  # a bare task (non-list) gets wrapped into a list
+        assert e.insert_tasks([_task()])  # happy path with a list of tasks
+ # exceeds batch
+ e.resource_manager.stop_flags = np.array([True, False, False], dtype=bool)
+ e.insert_tasks([_task(f"r{i}") for i in range(3)])
+ assert len(put[-1][0]) == 1
+ # allocation fails
+ e2 = _eng(monkeypatch)
+ e2.resource_manager.allocate_resources_for_new_tasks = lambda t: []
+ from fastdeploy.engine.common_engine import EngineError
+
+ with pytest.raises(EngineError):
+ e2.insert_tasks([_task()])
+
+ def test_insert_splitwise(self, monkeypatch):
+ _ptr(monkeypatch)
+ # prefill role
+ e = _eng(monkeypatch)
+ e.cfg.scheduler_config.splitwise_role = "prefill"
+ e.cfg.cache_config.enable_chunked_prefill = False
+ e.resource_manager.allocate_resources_for_new_tasks = lambda t: t
+ e.resource_manager.real_bsz = 1
+ e.split_connector = _ns(check_decode_allocated=lambda t: (True, None), send_cache_info_to_messager=_noop)
+ e.engine_worker_queue = _ns(put_tasks=_noop)
+ assert e.insert_tasks([_task(disagg=_ns(foo=1))])
+ # prefill fail path
+ from fastdeploy.engine.common_engine import EngineError as EE
+
+ e1b = _eng(monkeypatch)
+ e1b.cfg.scheduler_config.splitwise_role = "prefill"
+ e1b.cfg.cache_config.enable_chunked_prefill = False
+ e1b.resource_manager.allocate_resources_for_new_tasks = lambda t: t
+ e1b.resource_manager.real_bsz = 1
+ e1b.split_connector = _ns(
+ check_decode_allocated=lambda t: (False, "no blocks"), send_cache_info_to_messager=_noop
)
- eng = self._make_engine(cfg)
- self._setup_v1_engine(eng)
-
- task = Request(request_id="v1_e0", prompt_token_ids=[1], prompt_token_ids_len=1)
- task.task_type = RequestType.PREFILL
- task.trace_carrier = {}
- task.metrics.scheduler_recv_req_time = time.time()
-
- eng.scheduler = Mock(get_requests=Mock(return_value=[]), put_results=Mock())
- eng.engine_worker_queue = Mock(
- exist_tasks=Mock(return_value=False), put_tasks=Mock(), num_tasks=Mock(return_value=0)
+ e1b.engine_worker_queue = _ns(put_tasks=_noop)
+ e1b.scheduler.put_results = _noop
+ with pytest.raises(EE):
+ e1b.insert_tasks([_task(disagg=_ns(foo=1))])
+ # decode role
+ e2 = _eng(monkeypatch)
+ e2.cfg.scheduler_config.splitwise_role = "decode"
+ e2.cfg.cache_config.enable_chunked_prefill = False
+ e2.resource_manager.allocate_resources_for_new_tasks = lambda t: t
+ e2.resource_manager.real_bsz = 1
+ e2.split_connector = _ns(send_cache_info_to_prefill=_noop)
+ e2.engine_worker_queue = _ns(put_tasks=_noop)
+ assert e2.insert_tasks([_task(disagg=_ns(x=1))])
+ # preempted + carrier
+ e3 = _eng(monkeypatch)
+ e3.cfg.scheduler_config.splitwise_role = "mixed"
+ e3.cfg.cache_config.enable_chunked_prefill = False
+ e3.resource_manager.allocate_resources_for_new_tasks = lambda t: t
+ e3.resource_manager.real_bsz = 1
+ e3.engine_worker_queue = _ns(put_tasks=_noop)
+ assert e3.insert_tasks([_task(preempted=True, carrier={"traceparent": "00-abc"})])
+ # enable_mm path → update_mm_requests_chunk_size (L532)
+ e4 = _eng(monkeypatch)
+ e4.cfg.scheduler_config.splitwise_role = "mixed"
+ e4.cfg.model_config.enable_mm = True
+ e4.cfg.cache_config.enable_chunked_prefill = False
+ e4.resource_manager.allocate_resources_for_new_tasks = lambda t: t
+ e4.resource_manager.real_bsz = 1
+ e4.engine_worker_queue = _ns(put_tasks=_noop)
+ e4.update_mm_requests_chunk_size = _noop
+ assert e4.insert_tasks([_task()])
+
+ def test_prefilled_requests(self, monkeypatch):
+ def _setup(e):
+ e.cfg.speculative_config = _ns(method="none")
+ e.cfg.scheduler_config.splitwise_role = "decode"
+ e.resource_manager.real_bsz = 1
+ e.resource_manager._recycle_block_tables = _noop
+
+ met = _ns(decode_recv_req_time=0, decode_preallocate_req_time=0, decode_inference_start_time=0)
+ # happy path
+ e = _eng(monkeypatch)
+ _setup(e)
+ e.resource_manager.req_dict = {"r1": 0}
+ e.resource_manager.tasks_list = [_ns(prompt_token_ids=[0], num_cached_tokens=0, metrics=met)]
+ e.resource_manager.stop_flags = np.array([False], dtype=bool)
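+        # one occupied slot: request "r1" maps to index 0 and is not finished yet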
+ e.token_processor = _ns(tokens_counter={}, clear_data=_noop, number_of_tasks=0, number_of_input_tokens=0)
+ put = []
+ e.engine_worker_queue = _ns(put_tasks=lambda t: put.append(1))
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False)
+ ro = _ns(
+ request_id="r1",
+ outputs=_ns(token_ids=[42], draft_token_ids=None),
+ error_code=200,
+ error_msg=None,
+ num_cached_tokens=5,
+ metrics=met,
)
- eng._send_error_response = Mock()
-
- eng.resource_manager = self._make_v1_decode_rm(eng, ([task], [("rid_none", None)]))
-
- with (
- patch("fastdeploy.engine.common_engine.ThreadPoolExecutor", self._make_dummy_executor(eng)),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._schedule_request_to_worker_v1()
-
- eng.engine_worker_queue.put_tasks.assert_called_once()
- eng._send_error_response.assert_not_called()
- self._detach_finalizer(eng)
-
- def test_schedule_request_to_worker_v1_threadpool_shutdown_breaks(self):
- eng = self._make_mixed_engine()
- self._setup_v1_engine(eng)
-
- eng.engine_worker_queue = Mock(exist_tasks=Mock(return_value=False))
-
- eng.resource_manager = self._make_v1_decode_rm(eng, ([], []))
-
- class DummyExecutor:
- def __init__(self, max_workers=None):
- pass
-
- def submit(self, fn):
- raise RuntimeError("cannot schedule new futures after shutdown")
-
- with (
- patch("fastdeploy.engine.common_engine.ThreadPoolExecutor", DummyExecutor),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._schedule_request_to_worker_v1()
-
- self._detach_finalizer(eng)
-
- def test_schedule_request_to_worker_v1_prefill_continuous_cache_success(self):
- cfg = self._make_cfg(
- splitwise_role="prefill",
- num_gpu_blocks_override=4,
- router="0.0.0.0:30000",
- kv_cache_ratio=1,
+ assert e._insert_prefilled_requests([ro])
+ assert len(put) == 1
+ # error path
+ e2 = _eng(monkeypatch)
+ _setup(e2)
+ e2.resource_manager.req_dict = {"re": 0}
+ e2.resource_manager.tasks_list = [_ns(prompt_token_ids=[0], num_cached_tokens=0, metrics=met)]
+ e2.resource_manager.stop_flags = np.array([False], dtype=bool)
+ e2.token_processor = _ns(tokens_counter={"re": 1}, clear_data=_noop, number_of_tasks=0)
+ e2.scheduler.put_results = _noop
+ e2.engine_worker_queue = _ns(put_tasks=_noop)
+ ro_e = _ns(
+ request_id="re",
+ outputs=_ns(token_ids=[1], draft_token_ids=None),
+ error_code=500,
+ error_msg="fail",
+ num_cached_tokens=0,
+ metrics=met,
)
- eng = self._make_engine(cfg)
- self._setup_v1_engine(eng)
-
- task = Request(request_id="pc_ok", prompt_token_ids=[1], prompt_token_ids_len=1)
- task.idx = 0
- task.metrics.scheduler_recv_req_time = time.time()
-
- eng.scheduler = Mock(get_requests=Mock(return_value=[task]), put_results=Mock())
-
- eng.resource_manager = self._make_v1_prefill_continuous_rm(eng, waiting_async_result=False)
-
- calls = {"n": 0}
-
- def get_finished_add_cache_task_req():
- if calls["n"] == 0:
- calls["n"] += 1
- return ["pc_ok"]
- return []
-
- eng.engine_worker_queue = Mock(
- exist_tasks=Mock(return_value=False),
- get_finished_add_cache_task_req=Mock(side_effect=get_finished_add_cache_task_req),
+ e2._insert_prefilled_requests([ro_e])
+ # adapter first-token-is-EOS
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", True)
+ e3 = _eng(monkeypatch)
+ _setup(e3)
+ e3.resource_manager.req_dict = {"re2": 0}
+ e3.resource_manager.tasks_list = [_ns(prompt_token_ids=[0], num_cached_tokens=0, metrics=met)]
+ e3.resource_manager.stop_flags = np.array([False], dtype=bool)
+ e3.resource_manager._recycle_block_tables = _noop
+ e3.token_processor = _ns(
+ tokens_counter={"re2": 1}, clear_data=_noop, number_of_tasks=0, number_of_input_tokens=0
)
-
- eng.split_connector = Mock(
- send_splitwise_tasks=Mock(),
- check_decode_allocated=Mock(return_value=(True, "")),
- send_cache_info_to_messager=Mock(),
+ e3.engine_worker_queue = _ns(put_tasks=_noop)
+ ro_eos = _ns(
+ request_id="re2",
+ outputs=_ns(token_ids=[], draft_token_ids=None),
+ error_code=200,
+ error_msg=None,
+ num_cached_tokens=0,
+ metrics=met,
)
-
- with (
- patch("fastdeploy.engine.common_engine.envs.PREFILL_CONTINUOUS_REQUEST_DECODE_RESOURCES", True),
- patch("fastdeploy.engine.common_engine.ThreadPoolExecutor", self._make_dummy_executor(eng)),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._schedule_request_to_worker_v1()
-
- eng.split_connector.send_splitwise_tasks.assert_called()
- eng.split_connector.send_cache_info_to_messager.assert_called_once()
- eng.resource_manager.add_request_in_p.assert_called_once()
- eng.scheduler.put_results.assert_not_called()
- self._detach_finalizer(eng)
-
- def test_schedule_request_to_worker_v1_prefill_continuous_wait_async_none(self):
- cfg = self._make_cfg(
- splitwise_role="prefill",
- num_gpu_blocks_override=4,
- router="0.0.0.0:30000",
- kv_cache_ratio=1,
+ e3._insert_prefilled_requests([ro_eos])
+ assert e3.resource_manager.stop_flags[0]
+ assert e2.resource_manager.stop_flags[0]
+ # MTP speculative decode path (L566)
+ e4 = _eng(monkeypatch)
+ _setup(e4)
+ e4.cfg.speculative_config = _ns(method="mtp")
+ e4.resource_manager.req_dict = {"rm": 0}
+ e4.resource_manager.tasks_list = [_ns(prompt_token_ids=[0], num_cached_tokens=0, metrics=met)]
+ e4.resource_manager.stop_flags = np.array([False], dtype=bool)
+ e4.token_processor = _ns(tokens_counter={}, clear_data=_noop, number_of_tasks=0, number_of_input_tokens=0)
+ e4.engine_worker_queue = _ns(put_tasks=_noop)
+ ro_mtp = _ns(
+ request_id="rm",
+ outputs=_ns(token_ids=[7], draft_token_ids=[10, 11]),
+ error_code=200,
+ error_msg=None,
+ num_cached_tokens=3,
+ metrics=met,
)
- eng = self._make_engine(cfg)
- self._setup_v1_engine(eng)
-
- task = Request(request_id="pc_fail", prompt_token_ids=[1], prompt_token_ids_len=1)
- task.idx = 0
- task.error_code = 501
- task.error_message = "prefill bad"
- task.metrics.scheduler_recv_req_time = time.time()
-
- eng.scheduler = Mock(get_requests=Mock(return_value=[task]), put_results=Mock())
-
- eng.resource_manager = self._make_v1_prefill_continuous_rm(eng, waiting_async_result=None)
-
- calls = {"n": 0}
-
- def get_finished_add_cache_task_req():
- if calls["n"] == 0:
- calls["n"] += 1
- return ["pc_fail"]
+ e4._insert_prefilled_requests([ro_mtp])
+ assert e4.resource_manager.tasks_list[0].draft_token_ids == [10, 11]
+
+
+class TestSchedule:
+ def test_v0_scheduling(self, monkeypatch):
+ _ptr(monkeypatch)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.main_process_metrics.num_requests_waiting.dec", _noop)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.main_process_metrics.num_requests_running.inc", _noop)
+ e = _eng(monkeypatch)
+ e.cfg.scheduler_config.splitwise_role = "mixed"
+ e.resource_manager.available_batch = lambda: 1
+ e.resource_manager.available_block_num = lambda: 100
+ e.resource_manager.abort_req_ids_set = set()
+ e.resource_manager.allocate_resources_for_new_tasks = lambda t: t
+ e.resource_manager.real_bsz = 1
+ e.split_connector = _ns(current_request_ids=[], has_splitwise_tasks=lambda: False)
+ e.engine_worker_queue = _ns(exist_tasks=lambda: False, put_tasks=_noop, num_cache_infos=lambda: 0)
+ cc = [0]
+
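+        # hand out a single task on the first call, then stop the engine so the scheduling loop exits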
+ def get_reqs(**kw):
+ cc[0] += 1
+ if cc[0] == 1:
+ return [_task()]
+ e.running = False
return []
- eng.engine_worker_queue = Mock(
- exist_tasks=Mock(return_value=False),
- get_finished_add_cache_task_req=Mock(side_effect=get_finished_add_cache_task_req),
+ e.scheduler.get_requests = get_reqs
+ e._schedule_request_to_worker()
+ assert cc[0] >= 1
+ # no-batch skips
+ e2 = _eng(monkeypatch)
+ sc = [0]
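+        # available_batch always returns 0; after three calls it clears running so the wait loop exits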
+ e2.resource_manager.available_batch = lambda: (
+ (sc.__setitem__(0, sc[0] + 1) or 0) if sc[0] < 3 else (setattr(e2, "running", False) or 0)
)
-
- eng.split_connector = Mock(
- send_splitwise_tasks=Mock(),
- check_decode_allocated=Mock(return_value=(True, "")),
- send_cache_info_to_messager=Mock(),
- )
-
- with (
- patch("fastdeploy.engine.common_engine.envs.PREFILL_CONTINUOUS_REQUEST_DECODE_RESOURCES", True),
- patch("fastdeploy.engine.common_engine.ThreadPoolExecutor", self._make_dummy_executor(eng)),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._schedule_request_to_worker_v1()
-
- eng.scheduler.put_results.assert_called_once()
- eng.resource_manager.pre_recycle_resource.assert_called_once_with("pc_fail")
- eng.resource_manager.add_request_in_p.assert_not_called()
- self._detach_finalizer(eng)
-
- def test_start_zmq_service_ipc_servers(self):
- eng = self._make_mixed_engine()
-
- created = {"threads": 0}
-
- with (
- patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False),
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", False),
- patch("fastdeploy.engine.common_engine.ZmqIpcServer", self._make_zmq_server_cls()),
- patch("fastdeploy.engine.common_engine.threading.Thread", self._make_zmq_thread_cls(created)),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng.start_zmq_service(api_server_pid=4321)
-
- self.assertEqual(created["threads"], 3)
- self.assertEqual(eng.recv_request_server.kwargs["name"], 4321)
- self._detach_finalizer(eng)
-
- def test_start_zmq_service_internal_adapter_tcp(self):
- eng = self._make_mixed_engine()
-
- with (
- patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", True),
- patch("fastdeploy.engine.common_engine.ZmqTcpServer", self._make_zmq_server_cls()),
- patch("fastdeploy.engine.common_engine.InternalAdapter", Mock()),
- patch("fastdeploy.engine.common_engine.threading.Thread", self._make_zmq_thread_cls()),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng.start_zmq_service(api_server_pid=5555)
-
- self.assertIsNotNone(eng.internal_adapter)
- self._detach_finalizer(eng)
-
- def test_start_zmq_service_none(self):
- eng = self._make_mixed_engine()
- eng.start_zmq_service(api_server_pid=None)
- self._detach_finalizer(eng)
-
- def test_insert_zmq_task_to_scheduler_abort_request(self):
- eng = self._make_mixed_engine()
- eng.running = True
- eng.is_paused = False
- eng.guided_decoding_checker = None
-
- class DummyRM:
- def __init__(self):
- self.abort_req_ids_set = set()
- self.waiting_abort_req_id_set = set()
- self.real_bsz = 1
- self.requests = {"rid": Mock()}
-
- def add_abort_req_ids(self, req_id):
- self.waiting_abort_req_id_set.add(req_id)
-
- def _prepare_preempt_task(self, req):
- return Request(request_id="rid", prompt_token_ids=[1], prompt_token_ids_len=1)
-
- eng.resource_manager = DummyRM()
- eng.scheduler = Mock(_recycle=Mock())
- eng.engine_worker_queue = Mock()
-
- eng.recv_request_server = self._make_dummy_recv(
- eng,
- payload={"request_id": "rid", "status": RequestStatus.ABORT.value},
- )
-
- with (
- patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", True),
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", False),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._insert_zmq_task_to_scheduler()
-
- # Verify abort request was handled correctly - added to waiting_abort_req_id_set
- self.assertIn("rid", eng.resource_manager.waiting_abort_req_id_set)
- self._detach_finalizer(eng)
-
- def test_insert_zmq_task_to_scheduler_paused_sends_error(self):
- eng = self._make_mixed_engine()
- eng.running = True
- eng.is_paused = True
- eng.guided_decoding_checker = None
- eng.resource_manager = Mock(abort_req_ids_set=set(), requests={})
- eng.scheduler = Mock()
- eng.engine_worker_queue = Mock()
- eng._send_error_response = Mock()
-
- eng.recv_request_server = self._make_dummy_recv(
- eng,
- payload={
- "request_id": "req1",
- "prompt_token_ids": [1],
- "prompt_token_ids_len": 1,
- "temperature": 1.0,
- },
- )
-
- with (
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", False),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._insert_zmq_task_to_scheduler()
-
- eng._send_error_response.assert_called_once()
- self._detach_finalizer(eng)
-
- def test_insert_zmq_task_to_scheduler_context_terminated(self):
- eng = self._make_mixed_engine()
- eng.running = True
-
- eng.recv_request_server = self._make_dummy_recv(eng, error=RuntimeError("Context was terminated"))
-
- with (
- patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False),
- patch("fastdeploy.engine.common_engine.ZmqIpcServer", self._make_zmq_server_cls()),
- patch.object(eng, "llm_logger") as mock_logger,
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._insert_zmq_task_to_scheduler()
-
- mock_logger.info.assert_called()
- self._detach_finalizer(eng)
-
- def test_insert_zmq_task_to_scheduler_error_reinit(self):
- eng = self._make_mixed_engine()
- eng.running = True
-
- eng.recv_request_server = self._make_dummy_recv(eng, error=RuntimeError("boom"))
-
- with (
- patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False),
- patch("fastdeploy.engine.common_engine.ZmqIpcServer", self._make_zmq_server_cls()),
- patch.object(eng, "llm_logger") as mock_logger,
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._insert_zmq_task_to_scheduler()
-
- mock_logger.error.assert_called()
- self._detach_finalizer(eng)
-
- def test_decode_process_splitwise_requests_single_cycle(self):
- cfg = self._make_cfg(
- splitwise_role="decode",
- num_gpu_blocks_override=4,
- router="0.0.0.0:30000",
- )
- eng = self._make_engine(cfg)
- eng.running = True
- eng.enable_decode_cache_task = False
- eng.cfg.splitwise_version = "v1"
- eng.scheduler = Mock(has_request=Mock(return_value=True), put_results=Mock())
- eng._insert_prefilled_requests = Mock()
-
- class DummyRM:
- def is_resource_sufficient(self, prompt_len):
- return True
-
- eng.resource_manager = DummyRM()
- eng.insert_tasks = Mock()
-
- task = Request(request_id="r0", prompt_token_ids=[1], prompt_token_ids_len=1)
- output = RequestOutput(
- request_id="r1",
- outputs=Mock(token_ids=[1], decode_type=1, tool_calls=None),
- metrics=Mock(),
- finished=False,
+ e2._schedule_request_to_worker()
+ # exist_prefill_signal
+ e3 = _eng(monkeypatch)
+ e3.cfg.scheduler_config.splitwise_role = "mixed"
+ e3.resource_manager.available_batch = lambda: 1
+ e3.engine_worker_queue = _ns(exist_tasks=lambda: False, num_cache_infos=lambda: 0)
+ e3.split_connector = _ns(current_request_ids=[], has_splitwise_tasks=lambda: True)
+ e3.exist_prefill_task_signal = _Sig(np.array([1], dtype=np.int32))
+ pc = [0]
+
+ def avail3():
+ pc[0] += 1
+ if pc[0] >= 3:
+ e3.running = False
+ return 1
+
+ e3.resource_manager.available_batch = avail3
+ e3._schedule_request_to_worker()
+ # exist_tasks true → sleep+continue, then proceed
+ e4 = _eng(monkeypatch)
+ e4.cfg.scheduler_config.splitwise_role = "mixed"
+ et4 = [0]
+ e4.resource_manager.available_batch = lambda: 1
+ e4.resource_manager.available_block_num = lambda: 100
+ e4.resource_manager.abort_req_ids_set = set()
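+        # the et4 counter makes exist_tasks True exactly once before it turns False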
+ e4.engine_worker_queue = _ns(
+ exist_tasks=lambda: (et4.__setitem__(0, et4[0] + 1) or et4[0] <= 1),
+ put_tasks=_noop,
+ num_cache_infos=lambda: 0,
)
+ e4.split_connector = _ns(current_request_ids=[], has_splitwise_tasks=lambda: False)
+ vc4 = [0]
- class DummyQueue:
- def disaggregate_queue_empty(self):
- return False
-
- def get_disaggregated_tasks(self):
- eng.running = False
- return [
- (None, [task]),
- (None, [output]),
- ]
-
- eng.engine_worker_queue = DummyQueue()
-
- class DummyThread:
- def __init__(self, target=None, daemon=None):
- self.target = target
- self.daemon = daemon
-
- def start(self):
- try:
- self.target()
- finally:
- eng.running = False
-
- with (
- patch("fastdeploy.engine.common_engine.threading.Thread", DummyThread),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", False),
- ):
- eng._decode_process_splitwise_requests()
-
- eng.insert_tasks.assert_called_once()
- eng._insert_prefilled_requests.assert_called_once()
- self._detach_finalizer(eng)
-
- def test_zmq_send_generated_tokens_single_batch(self):
- eng = self._make_mixed_engine()
- eng.running = True
- eng.send_response_server = Mock()
-
- self._make_scheduler_with_output(eng, [1, 2], 1, True)
-
- with (
- patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False),
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", False),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._zmq_send_generated_tokens()
-
- eng.send_response_server.send_response.assert_called()
- self._detach_finalizer(eng)
-
- def test_zmq_send_generated_tokens_non_internal_adapter_empty_and_other(self):
- eng = self._make_mixed_engine()
- eng.running = True
- eng.send_response_server = Mock()
- eng._decode_token = Mock(return_value=("", []))
-
- self._make_scheduler_with_output(eng, [1], 0, True, include_raw=True)
-
- with (
- patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False),
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", False),
- ):
- eng._zmq_send_generated_tokens()
-
- eng.send_response_server.send_response.assert_called_once()
- self._detach_finalizer(eng)
-
- def test_zmq_send_generated_tokens_logs_exception(self):
- eng = self._make_mixed_engine()
- eng.running = True
- eng.send_response_server = Mock()
-
- def get_results():
- eng.running = False
- raise RuntimeError("boom")
-
- eng.scheduler = Mock(get_results=get_results)
-
- try:
- with patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False):
- eng._zmq_send_generated_tokens()
- finally:
- eng.running = False
-
- self._detach_finalizer(eng)
-
- def test_zmq_send_generated_tokens_internal_adapter_decode(self):
- eng = self._make_mixed_engine()
- eng.running = True
- eng.send_response_server = Mock()
-
- class DummyProcessor:
- def __init__(self):
- self.decode_status = {"rid": (0, 2)}
-
- def ids2tokens(self, token_ids, req_id):
- return "hi", [1, 2], None
-
- eng.data_processor = DummyProcessor()
-
- self._make_scheduler_with_output(eng, [1, 2], 0, True, fmt="list")
-
- with (
- patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", True),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._zmq_send_generated_tokens()
-
- eng.send_response_server.send_response.assert_called_once()
- self._detach_finalizer(eng)
-
- def test_zmq_send_generated_tokens_internal_adapter_decode_type_one(self):
- eng = self._make_mixed_engine()
- eng.running = True
- eng.send_response_server = Mock()
-
- self._make_scheduler_with_output(eng, [3, 4], 1, True, fmt="list")
-
- with (
- patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", True),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._zmq_send_generated_tokens()
-
- eng.send_response_server.send_response.assert_called_once()
- self._detach_finalizer(eng)
-
- def test_zmq_send_generated_tokens_internal_adapter_warns_on_empty(self):
- eng = self._make_mixed_engine()
- eng.running = True
- eng.send_response_server = Mock()
-
- self._make_scheduler_with_output(eng, [], 1, False, fmt="list")
-
- with (
- patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", True),
- patch.object(eng, "llm_logger") as mock_logger,
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._zmq_send_generated_tokens()
-
- mock_logger.warning.assert_called()
- self._detach_finalizer(eng)
-
- def test_zmq_send_generated_tokens_empty_results(self):
- eng = self._make_mixed_engine()
- eng.running = True
- eng.scheduler = Mock()
-
- def get_results():
- eng.running = False
+ def gr4(**kw):
+ vc4[0] += 1
+ if vc4[0] >= 1:
+ e4.running = False
return []
- eng.scheduler.get_results = get_results
-
- with patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None):
- eng._zmq_send_generated_tokens()
- self._detach_finalizer(eng)
-
- def test_zmq_send_generated_tokens_decode_type_zero(self):
- eng = self._make_mixed_engine()
- eng.running = True
- eng.send_response_server = Mock()
-
- self._make_scheduler_with_output(eng, [1, 2], 0, True)
- eng._decode_token = Mock(return_value=("hi", [1, 2]))
-
- with (
- patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False),
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", False),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._zmq_send_generated_tokens()
-
- eng.send_response_server.send_response.assert_called_once()
- self._detach_finalizer(eng)
-
- def test_zmq_send_generated_tokens_warns_on_empty(self):
- eng = self._make_mixed_engine()
- eng.running = True
- eng.send_response_server = Mock()
-
- self._make_scheduler_with_output(eng, [], 1, False)
-
- with (
- patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False),
- patch.object(eng, "llm_logger") as mock_logger,
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._zmq_send_generated_tokens()
-
- mock_logger.warning.assert_called()
- self._detach_finalizer(eng)
-
- def test_wait_for_control_responses_success(self):
- eng = self._make_mixed_engine()
-
- eng._ctrl_output_queues = {
- "ctrl_w2e_rank0_6778": self._make_ctrl_queue(
- "q0", Mock(request_id="req", error_code=200, result={"ok": True})
- ),
- "ctrl_w2e_rank1_6778": self._make_ctrl_queue(
- "q1", Mock(request_id="req", error_code=200, result={"ok": True})
- ),
- }
-
- results = asyncio.run(eng._wait_for_control_responses("req", timeout=1))
- self.assertEqual(results, [{"ok": True}, {"ok": True}])
- self._detach_finalizer(eng)
-
- def test_wait_for_control_responses_filters_executors(self):
- eng = self._make_mixed_engine()
-
- eng._ctrl_output_queues = {
- "ctrl_w2e_rank0_6778": self._make_ctrl_queue(
- "worker", Mock(request_id="req", error_code=200, result={"worker": True})
- ),
- "ctrl_c2e_rank0_6779": self._make_ctrl_queue(
- "cache", Mock(request_id="req", error_code=200, result={"cache": True})
- ),
- }
-
- worker_results = asyncio.run(eng._wait_for_control_responses("req", timeout=1, executors=["worker"]))
- cache_results = asyncio.run(eng._wait_for_control_responses("req", timeout=1, executors=["cache_transfer"]))
-
- self.assertEqual(worker_results, [{"worker": True}])
- self.assertEqual(cache_results, [{"cache": True}])
- self._detach_finalizer(eng)
-
- def test_wait_for_control_responses_ignores_mismatch(self):
- eng = self._make_mixed_engine()
-
- class DummyQueue:
- def __init__(self, name, payloads):
- self.name = name
- self.payloads = list(payloads)
-
- async def get(self, timeout=None):
- return Mock(payload=self.payloads.pop(0))
-
- eng._ctrl_output_queues = {
- "ctrl_w2e_rank0_6778": DummyQueue(
- "q0",
- [
- Mock(request_id="old", error_code=200, result={"ok": False}),
- Mock(request_id="req", error_code=200, result={"ok": "from-q0"}),
- ],
- ),
- "ctrl_w2e_rank1_6778": self._make_ctrl_queue(
- "q1", Mock(request_id="req", error_code=200, result={"ok": True})
- ),
- }
-
- results = asyncio.run(eng._wait_for_control_responses("req", timeout=1))
- self.assertEqual(results, [{"ok": "from-q0"}, {"ok": True}])
- self.assertEqual(
- eng._ctrl_response_mailboxes["ctrl_w2e_rank0_6778"]["old"].result,
- {"ok": False},
+ e4.scheduler.get_requests = gr4
+ e4._schedule_request_to_worker()
+ # decode role → skip
+ e5 = _eng(monkeypatch)
+ e5.cfg.scheduler_config.splitwise_role = "decode"
+ e5.resource_manager.available_batch = lambda: 1
+ e5.resource_manager.available_block_num = lambda: 100
+ e5.resource_manager.abort_req_ids_set = set()
+ e5.engine_worker_queue = _ns(exist_tasks=lambda: False, num_cache_infos=lambda: 0)
+ e5.split_connector = _ns(current_request_ids=[], has_splitwise_tasks=lambda: False)
+ dc5 = [0]
+
+ def gr5(**kw):
+ dc5[0] += 1
+ if dc5[0] >= 2:
+ e5.running = False
+ return [_task()]
+
+ e5.scheduler.get_requests = gr5
+ e5._schedule_request_to_worker()
+ # num_cache_infos > 0
+ e6 = _eng(monkeypatch)
+ e6.cfg.scheduler_config.splitwise_role = "mixed"
+ nc6 = [0]
+ e6.resource_manager.available_batch = lambda: 1
+ e6.engine_worker_queue = _ns(
+ exist_tasks=lambda: False,
+ num_cache_infos=lambda: (1 if nc6[0] == 0 else 0) or (nc6.__setitem__(0, nc6[0] + 1) or 0),
)
- self._detach_finalizer(eng)
-
- def test_wait_for_control_responses_error_paths(self):
- eng = self._make_mixed_engine()
-
- eng._ctrl_output_queues = {
- "ctrl_w2e_rank0_6778": self._make_ctrl_queue("q0", Exception("boom"), payload_wrapped=False)
- }
-
- with self.assertRaises(Exception):
- asyncio.run(eng._wait_for_control_responses("req", timeout=1))
- self._detach_finalizer(eng)
-
- def test_wait_for_control_responses_none_message(self):
- eng = self._make_mixed_engine()
-
- eng._ctrl_output_queues = {"ctrl_w2e_rank0_6778": self._make_ctrl_queue("q0", None, payload_wrapped=False)}
-
- with self.assertRaises(Exception):
- asyncio.run(eng._wait_for_control_responses("req", timeout=1))
- self._detach_finalizer(eng)
-
- def test_wait_for_control_responses_error_code(self):
- eng = self._make_mixed_engine()
+ e6.split_connector = _ns(current_request_ids=[], has_splitwise_tasks=lambda: False)
+ nc_sc6 = [0]
+
+ def avail6():
+ nc_sc6[0] += 1
+ if nc_sc6[0] >= 3:
+ e6.running = False
+ return 1
+
+ e6.resource_manager.available_batch = avail6
+ e6._schedule_request_to_worker()
+ # current_request_ids non-empty
+ e7 = _eng(monkeypatch)
+ e7.cfg.scheduler_config.splitwise_role = "mixed"
+ e7.resource_manager.available_batch = lambda: 1
+ e7.engine_worker_queue = _ns(exist_tasks=lambda: False, num_cache_infos=lambda: 0)
+ cr_ids = [["rid1"]]
+ e7.split_connector = _ns(current_request_ids=cr_ids[0], has_splitwise_tasks=lambda: False)
+ rc7 = [0]
+
+ def avail7():
+ rc7[0] += 1
+ if rc7[0] >= 2:
+ cr_ids[0].clear()
+ if rc7[0] >= 4:
+ e7.running = False
+ return 1
+
+ e7.resource_manager.available_batch = avail7
+ e7._schedule_request_to_worker()
+ # non-mixed splitwise → send tasks (L787-789)
+ e8 = _eng(monkeypatch)
+ e8.cfg.scheduler_config.splitwise_role = "prefill"
+ e8.resource_manager.available_batch = lambda: 1
+ e8.resource_manager.available_block_num = lambda: 100
+ e8.resource_manager.abort_req_ids_set = set()
+ e8.resource_manager.allocate_resources_for_new_tasks = lambda t: t
+ e8.resource_manager.real_bsz = 1
+ e8.split_connector = _ns(current_request_ids=[], has_splitwise_tasks=lambda: False, send_splitwise_tasks=_noop)
+ e8.engine_worker_queue = _ns(exist_tasks=lambda: False, put_tasks=_noop, num_cache_infos=lambda: 0)
+ cc8 = [0]
+
+ def gr8(**kw):
+ cc8[0] += 1
+ if cc8[0] == 1:
+ return [_task()]
+ e8.running = False
+ return []
- eng._ctrl_output_queues = {
- "ctrl_w2e_rank0_6778": self._make_ctrl_queue(
- "q0", ControlResponse(request_id="req", error_code=500, error_message="bad")
- )
- }
-
- with self.assertRaises(Exception):
- asyncio.run(eng._wait_for_control_responses("req", timeout=1))
- self._detach_finalizer(eng)
-
- def test_wait_for_control_responses_timeout(self):
- eng = self._make_mixed_engine()
- eng._ctrl_output_queues = {"ctrl_w2e_rank0_6778": self._make_ctrl_queue("q0", None, payload_wrapped=False)}
-
- with patch("fastdeploy.engine.common_engine.asyncio.wait_for", side_effect=asyncio.TimeoutError):
- with self.assertRaises(Exception):
- asyncio.run(eng._wait_for_control_responses("req", timeout=1))
- self._detach_finalizer(eng)
-
- def test_wait_for_control_responses_without_matching_queues(self):
- eng = self._make_mixed_engine()
- eng._ctrl_output_queues = {"ctrl_w2e_rank0_6778": self._make_ctrl_queue("q0", None, payload_wrapped=False)}
-
- result = asyncio.run(eng._wait_for_control_responses("req", timeout=1, executors=["cache_transfer"]))
- self.assertIsNone(result)
- self._detach_finalizer(eng)
-
- def test_insert_tasks_prefill_error_and_success(self):
- cfg = self._make_cfg(
- splitwise_role="prefill",
- num_gpu_blocks_override=4,
- router="0.0.0.0:30000",
+ e8.scheduler.get_requests = gr8
+ e8._schedule_request_to_worker()
+        # insert_tasks fails → continue (L795)
+ e9 = _eng(monkeypatch)
+ e9.cfg.scheduler_config.splitwise_role = "mixed"
+ e9.resource_manager.available_batch = lambda: 1
+ e9.resource_manager.available_block_num = lambda: 100
+ e9.resource_manager.abort_req_ids_set = set()
+ e9.split_connector = _ns(current_request_ids=[], has_splitwise_tasks=lambda: False)
+ e9.engine_worker_queue = _ns(exist_tasks=lambda: False, put_tasks=_noop, num_cache_infos=lambda: 0)
+ cc9 = [0]
+
+ def gr9(**kw):
+ cc9[0] += 1
+ if cc9[0] >= 2:
+ e9.running = False
+ return [_task()]
+
+ e9.scheduler.get_requests = gr9
+ e9.insert_tasks = lambda *a, **kw: False
+ e9._schedule_request_to_worker()
+
+ def test_v1_scheduling(self, monkeypatch):
+ _ptr(monkeypatch)
+        # mixed role, happy path, with scheduler_unhandled_request_num
+ e = _eng(monkeypatch)
+ e.cfg.scheduler_config.splitwise_role = "mixed"
+ e.resource_manager.waiting = []
+ e.resource_manager.get_real_bsz = _noop
+ e.resource_manager.real_bsz = 1
+ e.resource_manager.scheduler_unhandled_request_num = 0
+ e.engine_worker_queue = _ns(exist_tasks=lambda: False, put_tasks=_noop)
+ e.scheduler.get_requests = lambda **kw: [_task()]
+ cc = [0]
+
+ def sched():
+ cc[0] += 1
+ if cc[0] > 1:
+ e.running = False
+ return [], []
+ return [_task()], []
+
+ e.resource_manager.schedule = sched
+ e._schedule_request_to_worker_v1()
+ # prefill with tasks
+ e2 = _eng(monkeypatch)
+ e2.cfg.scheduler_config.splitwise_role = "prefill"
+ e2.resource_manager.waiting = []
+ e2.resource_manager.get_real_bsz = _noop
+ e2.resource_manager.real_bsz = 1
+ e2.split_connector = _ns(check_decode_allocated=lambda t: (True, None), send_cache_info_to_messager=_noop)
+ e2.engine_worker_queue = _ns(exist_tasks=lambda: False, put_tasks=_noop)
+ cc2 = [0]
+
+ def sched2():
+ cc2[0] += 1
+ if cc2[0] == 1:
+ return [_task()], []
+ e2.running = False
+ return [], []
+
+ e2.resource_manager.schedule = sched2
+ e2._schedule_request_to_worker_v1()
+ # decode preempted
+ from fastdeploy.engine.request import Request, RequestType
+
+ e3 = _eng(monkeypatch)
+ e3.cfg.scheduler_config.splitwise_role = "decode"
+ e3.resource_manager.waiting = []
+ e3.resource_manager.get_real_bsz = _noop
+ e3.resource_manager.real_bsz = 1
+ e3.engine_worker_queue = _ns(exist_tasks=lambda: False, put_tasks=_noop)
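+        # build a preempted Request via __new__, setting only the fields this test needs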
+ pr = Request.__new__(Request)
+ pr.request_id = "rp"
+ pr.task_type = RequestType.PREEMPTED
+ pr.has_been_preempted_before = True
+ pr.trace_carrier = None
+ pr.user = "test"
+ pr.metrics = _ns(
+ scheduler_recv_req_time=time.time(),
+ inference_start_time=0,
+ decode_inference_start_time=0,
+ add_req_to_resource_manager_time=0,
)
- eng = self._make_engine(cfg)
-
- eng.resource_manager = self._make_insert_tasks_rm(n=2)
- eng.scheduler = Mock()
- eng.engine_worker_queue = Mock()
- eng.split_connector = Mock()
- eng.split_connector.send_cache_info_to_messager = Mock()
- eng.split_connector.check_decode_allocated = Mock(
- side_effect=[(False, "no"), (True, "")],
+ pr_calls = []
+ e3.scheduler.put_results = lambda r: pr_calls.append(r)
+ cc3 = [0]
+
+ def sched3():
+ cc3[0] += 1
+ if cc3[0] == 1:
+ return [pr], []
+ e3.running = False
+ return [], []
+
+ e3.resource_manager.schedule = sched3
+ e3._schedule_request_to_worker_v1()
+ assert len(pr_calls) >= 1
+ # PREFILL task trace spans
+ e4 = _eng(monkeypatch)
+ e4.cfg.scheduler_config.splitwise_role = "mixed"
+ e4.resource_manager.waiting = []
+ e4.resource_manager.get_real_bsz = _noop
+ e4.resource_manager.real_bsz = 1
+ e4.engine_worker_queue = _ns(exist_tasks=lambda: False, put_tasks=_noop)
+ pf = Request.__new__(Request)
+ pf.request_id = "pf_1"
+ pf.task_type = RequestType.PREFILL
+ pf.has_been_preempted_before = False
+ pf.trace_carrier = {"traceparent": "00-abc"}
+ pf.user = "test"
+ pf.metrics = _ns(scheduler_recv_req_time=time.time(), inference_start_time=0)
+ cc4 = [0]
+
+ def sched4():
+ cc4[0] += 1
+ if cc4[0] == 1:
+ return [pf], []
+ e4.running = False
+ return [], []
+
+ e4.resource_manager.schedule = sched4
+ e4._schedule_request_to_worker_v1()
+ # PREFILL preempted (rescheduled)
+ e5 = _eng(monkeypatch)
+ e5.cfg.scheduler_config.splitwise_role = "mixed"
+ e5.resource_manager.waiting = []
+ e5.resource_manager.get_real_bsz = _noop
+ e5.resource_manager.real_bsz = 1
+ e5.engine_worker_queue = _ns(exist_tasks=lambda: False, put_tasks=_noop)
+ pf2 = Request.__new__(Request)
+ pf2.request_id = "pf_2"
+ pf2.task_type = RequestType.PREFILL
+ pf2.has_been_preempted_before = True
+ pf2.trace_carrier = None
+ pf2.user = "test"
+ pf2.metrics = _ns(scheduler_recv_req_time=time.time(), inference_start_time=0)
+ cc5 = [0]
+
+ def sched5():
+ cc5[0] += 1
+ if cc5[0] == 1:
+ return [pf2], []
+ e5.running = False
+ return [], []
+
+ e5.resource_manager.schedule = sched5
+ e5._schedule_request_to_worker_v1()
+ # v1 exist_tasks true → sleep+continue (L984-985)
+ e6 = _eng(monkeypatch)
+ e6.cfg.scheduler_config.splitwise_role = "mixed"
+ e6.resource_manager.waiting = []
+ e6.resource_manager.get_real_bsz = _noop
+ e6.resource_manager.real_bsz = 1
+ et6 = [0]
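+        # exist_tasks is True only on the first call, so the loop takes the sleep+continue branch once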
+ e6.engine_worker_queue = _ns(exist_tasks=lambda: (et6.__setitem__(0, et6[0] + 1) or et6[0] <= 1))
+ cc6 = [0]
+
+ def sched6():
+ cc6[0] += 1
+ e6.running = False
+ return [], []
+
+ e6.resource_manager.schedule = sched6
+ e6._schedule_request_to_worker_v1()
+
+
+class TestZmqAndSplitwise:
+ def test_zmq_start(self, monkeypatch):
+ e = _eng(monkeypatch)
+ e.start_zmq_service(api_server_pid=None)
+ assert not hasattr(e, "recv_request_server")
+ # IPC
+ monkeypatch.setattr(
+ "fastdeploy.engine.common_engine.ZmqIpcServer", lambda **kw: _ns(recv_result_handle=_noop, close=_noop)
)
- eng.token_processor = Mock(number_of_tasks=0, number_of_input_tokens=0)
- eng.update_requests_chunk_size = Mock()
-
- tasks = [
- Request(request_id="p0", prompt_token_ids=[1], prompt_token_ids_len=1),
- Request(request_id="p1", prompt_token_ids=[1], prompt_token_ids_len=1),
- ]
- for task in tasks:
- task.metrics.scheduler_recv_req_time = time.time()
-
- eng.insert_tasks(tasks)
-
- eng.scheduler.put_results.assert_called_once()
- eng.engine_worker_queue.put_tasks.assert_called_once()
- self._detach_finalizer(eng)
-
- def test_insert_tasks_decode_disaggregate_sets_flags(self):
- cfg = self._make_cfg(
- splitwise_role="decode",
- num_gpu_blocks_override=4,
- router="0.0.0.0:30000",
+ monkeypatch.setattr(
+ "fastdeploy.engine.common_engine.ZmqTcpServer", lambda **kw: _ns(recv_result_handle=_noop, close=_noop)
)
- eng = self._make_engine(cfg)
-
- eng.resource_manager = self._make_insert_tasks_rm()
- eng.engine_worker_queue = Mock()
- eng.split_connector = Mock(send_cache_info_to_prefill=Mock())
- eng.token_processor = Mock(number_of_tasks=0, number_of_input_tokens=0)
-
- task = Request(request_id="d1", prompt_token_ids=[1], prompt_token_ids_len=1, disaggregate_info={})
- eng.insert_tasks([task])
-
- eng.split_connector.send_cache_info_to_prefill.assert_called_once()
- self._detach_finalizer(eng)
-
- def test_insert_tasks_mm_updates_chunk_size(self):
- cfg = self._make_cfg(splitwise_role="mixed", num_gpu_blocks_override=4)
- cfg.model_config.enable_mm = True
- eng = self._make_engine(cfg)
-
- eng.resource_manager = self._make_insert_tasks_rm()
- eng.engine_worker_queue = Mock()
- eng.token_processor = Mock(number_of_tasks=0, number_of_input_tokens=0)
- eng.update_mm_requests_chunk_size = Mock()
-
- task = Request(request_id="mm", prompt_token_ids=[1], prompt_token_ids_len=1)
- task.metrics.scheduler_recv_req_time = time.time()
- eng.insert_tasks([task])
-
- eng.update_mm_requests_chunk_size.assert_called_once()
- self._detach_finalizer(eng)
-
- def test_insert_tasks_sets_prefill_flag(self):
- eng = self._make_mixed_engine()
-
- eng.resource_manager = self._make_insert_tasks_rm()
- eng.engine_worker_queue = Mock()
- eng.token_processor = Mock(number_of_tasks=0, number_of_input_tokens=0)
- eng.update_requests_chunk_size = Mock()
-
- task = Request(
- request_id="prefill",
- prompt_token_ids=[1],
- prompt_token_ids_len=1,
- disaggregate_info={},
+ monkeypatch.setattr("fastdeploy.engine.common_engine.time.sleep", _noop)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False)
+ e2 = _eng(monkeypatch)
+ e2.running = False
+ e2.start_zmq_service(api_server_pid=1234)
+ assert hasattr(e2, "recv_request_server")
+ # adapter
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", True)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.InternalAdapter", lambda **kw: _ns())
+ e3 = _eng(monkeypatch)
+ e3.running = False
+ e3.cfg.parallel_config.local_data_parallel_id = 0
+ e3.start_zmq_service(api_server_pid=5678)
+ assert hasattr(e3, "internal_adapter")
+
+ def test_zmq_requests(self, monkeypatch):
+ _ptr(monkeypatch)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.ENABLE_V1_DATA_PROCESSOR", False)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", False)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.main_process_metrics.requests_number.inc", _noop)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.main_process_metrics.num_requests_waiting.inc", _noop)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.Request.from_dict", lambda d: _task(d["request_id"]))
+ monkeypatch.setattr("fastdeploy.engine.common_engine.ControlRequest.is_control_request", lambda d: False)
+ # normal request
+ e = _eng(monkeypatch)
+ e.send_response_server = _ns(send_response=_noop)
+ e.guided_decoding_checker = None
+ _zmq_recv(e, [(None, {"request_id": "z1", "status": None})])
+ # abort
+ from fastdeploy.engine.request import RequestStatus
+
+ e2 = _eng(monkeypatch)
+ e2.send_response_server = _ns(send_response=_noop)
+ e2.resource_manager.abort_req_ids_set = set()
+ _zmq_recv(e2, [(None, {"request_id": "a1", "status": RequestStatus.ABORT.value})])
+ assert "a1" in e2.resource_manager.abort_req_ids_set
+        # paused engine drops the request
+ e3 = _eng(monkeypatch)
+ e3.send_response_server = _ns(send_response=_noop)
+ e3.is_paused = True
+ e3.guided_decoding_checker = None
+ dropped = []
+ e3._send_error_response = lambda *a: dropped.append(1)
+ _zmq_recv(e3, [(None, {"request_id": "p1", "status": None})])
+ assert len(dropped) >= 1
+ # v1 abort — req IN resource_manager.requests
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", True)
+ e4 = _eng(monkeypatch)
+ e4.resource_manager.abort_req_ids_set = set()
+ e4.resource_manager.requests = {"v1a": _ns(request_id="v1a")}
+ e4.resource_manager._prepare_preempt_task = lambda r: _ns(request_id=r.request_id)
+ e4.resource_manager.real_bsz = 1
+ e4.engine_worker_queue = _ns(put_tasks=_noop)
+ _zmq_recv(e4, [(None, {"request_id": "v1a", "status": RequestStatus.ABORT.value})])
+ assert "v1a" in e4.resource_manager.abort_req_ids_set
+ # v1 abort — req NOT in requests (recycle)
+ e4b = _eng(monkeypatch)
+ e4b.resource_manager.abort_req_ids_set = set()
+ e4b.resource_manager.requests = {}
+ e4b.scheduler._recycle = _noop
+ _zmq_recv(e4b, [(None, {"request_id": "v1b", "status": RequestStatus.ABORT.value})])
+ assert "v1b" not in e4b.resource_manager.abort_req_ids_set # removed after recycle
+ # non-Context-terminated error → reconnect
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False)
+ e5 = _eng(monkeypatch)
+ e5.api_server_pid = "test"
+ rc = [0]
+
+ def _r5(block):
+ rc[0] += 1
+ if rc[0] == 1:
+ return ("socket error", None)
+ e5.running = False
+ return ("Context was terminated", None)
+
+ e5.recv_request_server = _ns(receive_json_once=_r5)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.ZmqIpcServer", lambda **kw: e5.recv_request_server)
+ e5._insert_zmq_task_to_scheduler()
+ # control request via zmq
+ monkeypatch.setattr("fastdeploy.engine.common_engine.ControlRequest.is_control_request", lambda d: True)
+ monkeypatch.setattr(
+ "fastdeploy.engine.common_engine.ControlRequest.from_dict",
+ lambda d: _ns(request_id="c1", get_method=lambda: "x"),
)
- task.metrics.scheduler_recv_req_time = time.time()
- eng.insert_tasks([task])
-
- eng.update_requests_chunk_size.assert_not_called()
- self._detach_finalizer(eng)
-
- def test_update_requests_chunk_size_empty_inputs(self):
- eng = self._make_mixed_engine()
- eng.cfg.cache_config.enable_chunked_prefill = True
- eng.update_requests_chunk_size([])
- self._detach_finalizer(eng)
-
- def test_update_mm_requests_chunk_size_handles_none_images(self):
- eng = self._make_mixed_engine()
- eng.cfg.cache_config.enable_chunked_prefill = True
- eng.partial_chunked_tokens = [0, 16]
- eng.data_processor = type("DP", (), {"image_patch_id": 9})()
-
- inputs = {
- "input_ids": np.array([9, 1, 2, 3], dtype="int64"),
- "token_type_ids": np.array([0, 0, 0, 0], dtype="int64"),
- "image_type_ids": np.array([1], dtype="int32"),
- "grid_thw": np.array([[2, 1, 1]], dtype="int64"),
- "images": None,
- "position_ids": np.array([0, 1, 2, 3], dtype="int64"),
- }
- req = Request(request_id="mm1", multimodal_inputs=inputs)
-
- with patch.dict("sys.modules", {"fastdeploy.model_executor.ops.gpu": self._make_mm_stub_module()}):
- eng.update_mm_requests_chunk_size([req])
-
- chunk_info = req.get("prefill_chunk_info")
- self.assertEqual(len(chunk_info), 1)
- self.assertIsNone(chunk_info[0]["images"])
- self._detach_finalizer(eng)
-
- def test_update_mm_requests_chunk_size_expands_grid(self):
- eng = self._make_mixed_engine()
- eng.cfg.cache_config.enable_chunked_prefill = True
- eng.partial_chunked_tokens = [0, 16]
- eng.data_processor = type("DP", (), {"image_patch_id": 9})()
-
- inputs = {
- "input_ids": np.array([9, 1, 2, 3], dtype="int64"),
- "token_type_ids": np.array([0, 0, 0, 0], dtype="int64"),
- "image_type_ids": np.array([1, 1], dtype="int32"),
- "grid_thw": np.array([[2, 1, 1]], dtype="int64"),
- "images": np.ones((2,), dtype="uint8"),
- "position_ids": np.array([0, 1, 2, 3], dtype="int64"),
- }
- req = Request(request_id="mm3", multimodal_inputs=inputs)
-
- with patch.dict("sys.modules", {"fastdeploy.model_executor.ops.gpu": self._make_mm_stub_module()}):
- eng.update_mm_requests_chunk_size([req])
-
- self.assertTrue(req.get("prefill_chunk_info"))
- self._detach_finalizer(eng)
-
- def test_update_mm_requests_chunk_size_skips_when_disabled(self):
- eng = self._make_mixed_engine()
- eng.cfg.cache_config.enable_chunked_prefill = False
- req = Request(request_id="mm2", multimodal_inputs={"images": None})
-
- eng.update_mm_requests_chunk_size([req])
- self._detach_finalizer(eng)
-
- def test_insert_tasks_single_request_with_trace_carrier(self):
- eng = self._make_mixed_engine()
-
- eng.resource_manager = self._make_insert_tasks_rm()
- eng.engine_worker_queue = Mock()
- eng.token_processor = Mock(number_of_tasks=0, number_of_input_tokens=0)
- eng.update_requests_chunk_size = Mock()
-
- task = Request(
- request_id="trace",
- prompt_token_ids=[1],
- prompt_token_ids_len=1,
- trace_carrier={"trace_id": "1"},
+ e6 = _eng(monkeypatch)
+ e6.send_response_server = _ns(send_response=_noop)
+ _zmq_recv(e6, [(None, {"request_id": "c1"})])
+ # from_dict exception
+ monkeypatch.setattr("fastdeploy.engine.common_engine.ControlRequest.is_control_request", lambda d: False)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.ENABLE_V1_DATA_PROCESSOR", False)
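+        # the generator .throw trick lets the lambda raise, so Request.from_dict fails and the error branch runs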
+ monkeypatch.setattr(
+ "fastdeploy.engine.common_engine.Request.from_dict", lambda d: (_ for _ in ()).throw(ValueError("bad"))
)
- task.metrics.scheduler_recv_req_time = time.time()
- eng.insert_tasks(task)
-
- eng.update_requests_chunk_size.assert_called_once()
- self._detach_finalizer(eng)
-
- def test_exit_sub_services_cleanup_paths(self):
- """Cover lines 1312-1340, 1350-1354 in _exit_sub_services."""
- cfg = self._make_cfg(splitwise_role="mixed")
-
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", self._make_simple_dummy_q_cls()):
- eng = EngineService(cfg, start_queue=False, use_async_llm=True)
-
- # attach stubs used by cleanup
- eng.worker_ready_signal = self._Sig(0)
- eng.loaded_model_signal = self._Sig(0)
- eng.exist_task_signal = self._Sig(0)
- eng.exist_swapped_task_signal = self._Sig(0)
- eng.worker_healthy_live_signal = self._Sig(0)
- eng.cache_ready_signal = self._Sig(0)
- eng.swap_space_ready_signal = self._Sig(0)
- eng.exist_prefill_task_signal = self._Sig(0)
- eng.model_weights_status_signal = self._Sig(0)
- eng.prefix_tree_status_signal = self._Sig(0)
- eng.kv_cache_status_signal = self._Sig(0)
- eng.send_response_server = Mock()
- eng.recv_request_server = Mock()
- eng.recv_control_cmd_server = Mock()
-
- # ensure cache manager control flags exist before first call
- eng.resource_manager.cache_manager.shm_cache_task_flag_broadcast = Mock(clear=lambda: None)
- eng.resource_manager.cache_manager.cache_ready_signal = Mock(clear=lambda: None)
- eng.cache_manager_processes = []
-
- # worker_proc kill raises -> cover 1312-1313
- eng.worker_proc = MagicMock(pid=1001)
- with patch("fastdeploy.engine.common_engine.os.getpgid", side_effect=RuntimeError("boom")):
- eng._exit_sub_services()
-
- # Prepare cache manager processes to hit both normal and exception branch
- class DummyCacheMgr:
- def __init__(self, pid, raise_on_kill=False):
- self.pid = pid
- self.raise_on_kill = raise_on_kill
-
- eng.cache_manager_processes = [DummyCacheMgr(2001, False), DummyCacheMgr(2002, True)]
- eng.resource_manager.cache_manager.shm_cache_task_flag_broadcast = Mock(clear=lambda: None)
- eng.resource_manager.cache_manager.cache_ready_signal = Mock(clear=lambda: None)
-
- def fake_getpgid(pid):
- return pid
-
- def fake_killpg(pid, sig):
- if pid == 2002:
- raise RuntimeError("kill fail")
-
- # cache_task_queue with cleanup
- eng.cache_task_queue = Mock()
- eng.cache_task_queue.cleanup = Mock()
-
- eng.dp_processed = [Mock(pid=3001, join=lambda: None)]
- eng.dp_engine_worker_queue_server = [Mock(cleanup=lambda: None)]
-
- with (
- patch("fastdeploy.engine.common_engine.os.getpgid", side_effect=fake_getpgid),
- patch("fastdeploy.engine.common_engine.os.killpg", side_effect=fake_killpg),
- ):
- eng._exit_sub_services()
-
- # Now cover manager.shutdown warning path (no cleanup attribute)
- class DummyMgr:
- def __init__(self):
- self.manager = Mock(shutdown=Mock(side_effect=RuntimeError("shutdown fail")))
-
- eng.cache_task_queue = DummyMgr()
- eng._exit_sub_services()
- self._detach_finalizer(eng)
-
- def test_start_worker_service_cmd_build(self):
- """Cover 1517, 1526, 1568, 1592, 1595 by building the worker command with mocks."""
- with patch("fastdeploy.config.get_host_ip", return_value="127.0.0.1"):
- cfg = self._make_cfg(
- splitwise_role="mixed", num_gpu_blocks_override=4, ips=["127.0.0.1", "127.0.0.2"], data_parallel_size=2
- )
- # Make model multi-modal so env var branch already covered above; here not required
- cfg.structured_outputs_config.logits_processors = ["A", "B"]
-
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", self._make_simple_dummy_q_cls()):
- eng = EngineService(cfg, start_queue=False, use_async_llm=True)
- eng.data_processor = self._stub_processor()
-
- captured = {"cmd": None}
-
- class DummyProc:
- def __init__(self):
- self.stdout = None
-
- def poll(self):
- return None
-
- def fake_popen(cmd, stdout, shell, preexec_fn):
- captured["cmd"] = cmd
- return DummyProc()
-
- with patch("fastdeploy.engine.common_engine.subprocess.Popen", side_effect=fake_popen):
- with patch("fastdeploy.engine.common_engine.llm_logger"):
- p = eng._start_worker_service()
-
- self.assertIsNotNone(p)
- self.assertIsInstance(captured["cmd"], str)
- # logits processors added (1568)
- self.assertIn("--logits-processors A B", captured["cmd"]) # type: ignore
- # num_gpu_blocks_override added (1592)
- self.assertIn("--num_gpu_blocks_override 4", captured["cmd"]) # type: ignore
- # ips/nnodes added when nnode > 1 (1595)
- self.assertIn("--nnodes 2", captured["cmd"]) # type: ignore
- self._detach_finalizer(eng)
-
- def test_check_health_unhealthy(self):
- """Cover line 1628: unhealthy worker."""
- cfg = self._make_cfg(splitwise_role="mixed")
-
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", self._make_simple_dummy_q_cls()):
- eng = EngineService(cfg, start_queue=False, use_async_llm=True)
-
- # set worker live time far past threshold
- eng.worker_healthy_live_signal = self._Sig(int(time.time()) - 1000)
- ok, msg = eng.check_health(time_interval_threashold=1)
- self.assertFalse(ok)
- self.assertIn("Not Healthy".lower(), msg.lower())
- self._detach_finalizer(eng)
-
- def test_launch_components_expert_parallel(self):
- """Cover 1635-1638, 1660-1676, 1684-1703 in launch_components()."""
- # For prefill + local scheduler the core code now requires a router
- # and ENABLE_V1_KVCACHE_SCHEDULER=0 when using the default IPC protocol.
- with patch("fastdeploy.engine.args_utils.envs.ENABLE_V1_KVCACHE_SCHEDULER", 0):
- cfg = self._make_cfg(
- splitwise_role="prefill",
- # enable expert parallel and dp > 1 to go into the branch
- data_parallel_size=2,
- enable_expert_parallel=True,
- router="0.0.0.0:30000",
- )
-
- # Provide EngineWorkerQueue stub for ctor
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", self._make_full_dummy_q_cls()):
- eng = EngineService(cfg, start_queue=True, use_async_llm=True)
-
- # Init signals to create launched_expert_service_signal
- with patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_MULTI_API_SERVER", False):
- eng.ipc_signal_suffix = cfg.parallel_config.engine_worker_queue_port[0]
- eng._init_worker_signals()
-
- # Don't create real queues/processes
- with (
- patch("fastdeploy.engine.common_engine.EngineWorkerQueue") as FakeQ,
- patch("fastdeploy.engine.common_engine.multiprocessing.Process") as FakeP,
- ):
- # Fake queue instances with cleanup
- FakeQ.return_value = Mock(cleanup=lambda: None)
-
- # When starting process, immediately mark the signal as 1 to break waiting loop
- def start_side_effect(*args, **kwargs):
- # set value for dp id 1
- eng.launched_expert_service_signal.value[1] = 1
-
- proc_instance = Mock(start=start_side_effect)
- FakeP.return_value = proc_instance
-
- # Avoid scheduler doing real work
- eng.scheduler.start = lambda *a, **k: None
- with patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None):
- eng.launch_components()
-
- # Verify expert service branch executed
- self.assertTrue(hasattr(eng, "dp_processed"))
- self.assertGreaterEqual(len(eng.dp_processed), 1)
- self._detach_finalizer(eng)
-
- def test_check_worker_initialize_status_progress(self):
- """Cover 1710-1762 by simulating stdout and ready signals."""
- cfg = self._make_cfg(splitwise_role="mixed")
-
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", self._make_full_dummy_q_cls()):
- eng = EngineService(cfg, start_queue=False, use_async_llm=True)
-
- # Fake worker process stdout content that matches regexes
- lines = [
- b"Loading checkpoint shards: 1\n",
- b"Start load layer 5\n",
- ]
-
- class DummyProc:
- def __init__(self, it):
- self._it = iter(it)
-
- @property
- def stdout(self):
- return self._it
-
- def poll(self):
- return None
-
- eng.worker_proc = DummyProc(lines)
- eng.worker_init_status = {}
- eng.cfg.model_config.num_hidden_layers = 8
-
- # worker_ready_signal makes _worker_processes_ready() return True
- eng.worker_ready_signal = self._Sig(1)
-
- # Replace tqdm and sleep for fast execution
- with patch("fastdeploy.engine.common_engine.tqdm", lambda *a, **k: self._DummyPbar()):
- with patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None):
- ok = eng.check_worker_initialize_status()
- self.assertTrue(ok)
- self._detach_finalizer(eng)
-
- def test_worker_processes_ready_false(self):
- """Cover line 1382 returning False."""
- cfg = self._make_cfg()
-
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", self._make_full_dummy_q_cls()):
- eng = EngineService(cfg, start_queue=False, use_async_llm=True)
-
- eng.worker_ready_signal = self._Sig(0)
- self.assertFalse(eng._worker_processes_ready())
- self._detach_finalizer(eng)
-
- def test_init_worker_signals_profile_iluvatar(self):
- """Cover line 1434 by forcing iluvatar custom device and do_profile=True."""
- # do_profile=True when num_gpu_blocks_override is None
- cfg = self._make_cfg(num_gpu_blocks_override=None)
-
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", self._make_full_dummy_q_cls()):
- eng = EngineService(cfg, start_queue=False, use_async_llm=True)
- eng.ipc_signal_suffix = cfg.parallel_config.engine_worker_queue_port[0]
- with patch("fastdeploy.engine.common_engine.paddle.is_compiled_with_custom_device", return_value=True):
- eng._init_worker_signals()
- # signal should exist
- self.assertTrue(hasattr(eng, "get_profile_block_num_signal"))
- self._detach_finalizer(eng)
-
- def test_launch_components_dp_mode(self):
- """Cover 1648-1652 branch for DP scheduler mode."""
- # When ENABLE_V1_KVCACHE_SCHEDULER=1 the IPC cache-transfer protocol
- # is no longer supported; force it to 0 here to avoid the
- # NotImplementedError raised in EngineArgs.__post_init__ so we can
- # still exercise the DP branch of launch_components.
- with patch("fastdeploy.engine.args_utils.envs.ENABLE_V1_KVCACHE_SCHEDULER", 0):
- cfg = self._make_cfg(
- splitwise_role="prefill",
- data_parallel_size=2,
- scheduler_name="dp",
- )
-
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", self._make_full_dummy_q_cls()):
- eng = EngineService(cfg, start_queue=False, use_async_llm=True)
- # Patch scheduler.start so it doesn't do heavy work
- eng.scheduler.start = Mock()
- eng.launch_components()
- eng.scheduler.start.assert_called()
- self._detach_finalizer(eng)
-
- def test_insert_tasks_raises_when_no_resources(self):
- """Cover insert_tasks resource exhaustion error branch."""
- cfg = self._make_cfg(splitwise_role="mixed")
-
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", self._make_full_dummy_q_cls()):
- eng = EngineService(cfg, start_queue=False, use_async_llm=False)
-
- eng.resource_manager.stop_flags = np.zeros_like(eng.resource_manager.stop_flags)
-
- token_ids = paddle.to_tensor([1, 2, 3], dtype="int64")
- request = Request(
- request_id="req1",
- prompt_token_ids=token_ids.numpy().tolist(),
- prompt_token_ids_len=3,
+ e7 = _eng(monkeypatch)
+ e7.send_response_server = _ns(send_response=_noop)
+ e7.guided_decoding_checker = None
+ _zmq_recv(e7, [(None, {"request_id": "bad1", "status": None})])
+ # guided_decoding_checker rejects
+ monkeypatch.setattr(
+ "fastdeploy.engine.common_engine.Request.from_dict",
+ lambda d: _ns(request_id=d["request_id"], metrics=_ns(scheduler_recv_req_time=0)),
)
- with self.assertRaises(EngineError) as ctx:
- eng.insert_tasks([request])
- self.assertIn("request id", str(ctx.exception))
- self._detach_finalizer(eng)
-
- def test_get_scheduler_unhandled_request_num(self):
- """Cover _get_scheduler_unhandled_request_num normal/fallback paths."""
- eng = EngineService.__new__(EngineService)
- eng.llm_logger = Mock()
-
- # Scheduler does not provide API -> fallback 0
- eng.scheduler = object()
- self.assertEqual(eng._get_scheduler_unhandled_request_num(), 0)
-
- # Positive value -> return int value
- eng.scheduler = type("SchedOK", (), {"get_unhandled_request_num": lambda self: "3"})()
- self.assertEqual(eng._get_scheduler_unhandled_request_num(), 3)
-
- # Negative value -> clamp to 0
- eng.scheduler = type("SchedNeg", (), {"get_unhandled_request_num": lambda self: -5})()
- self.assertEqual(eng._get_scheduler_unhandled_request_num(), 0)
-
- # Exception -> debug log + fallback 0
- eng.scheduler = type(
- "SchedErr", (), {"get_unhandled_request_num": lambda self: (_ for _ in ()).throw(RuntimeError("boom"))}
- )()
- self.assertEqual(eng._get_scheduler_unhandled_request_num(), 0)
- eng.llm_logger.debug.assert_called()
-
- def test_insert_zmq_task_trace_carrier_handling(self):
- """Cover lines 1164-1167: trace_carrier handling in _insert_zmq_task_to_scheduler."""
- cfg = self._make_cfg(splitwise_role="mixed")
-
- class DummyQ:
- def __init__(self, *a, **k):
- self.available_prefill_instances = type("X", (), {"put": lambda *_: None})()
-
- def get_server_port(self):
- return 0
-
- def cleanup(self):
- pass
-
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", DummyQ):
- eng = EngineService(cfg, start_queue=False, use_async_llm=False)
- eng.running = True
-
- # Mock data with trace_carrier to trigger lines 1164-1167
- test_request_id = "test_req_123"
- trace_carrier_data = {"trace_id": "abc123", "span_id": "def456"}
- mock_data_with_trace = {
- "request_id": test_request_id,
- "trace_carrier": trace_carrier_data,
- "status": None,
- "user": "test_user",
- }
-
- class DummyRecv:
- def __init__(self, data):
- self.data = data
- self.call_count = 0
-
- def receive_json_once(self, block):
- self.call_count += 1
- if self.call_count == 1:
- return None, self.data
- else:
- eng.running = False
- return None, None
-
- def receive_pyobj_once(self, block):
- return self.receive_json_once(block)
-
- def close(self):
- pass
-
- eng.recv_request_server = DummyRecv(mock_data_with_trace)
-
- # Mock tracing.trace_set_proc_propagate_context to verify it's called
- with patch("fastdeploy.engine.common_engine.tracing.trace_set_proc_propagate_context") as mock_trace_set:
- with patch.object(eng, "llm_logger"):
- with patch("fastdeploy.engine.common_engine.Request") as MockRequest:
- mock_request = Mock()
- mock_request.metrics.scheduler_recv_req_time = 0
- MockRequest.from_dict.return_value = mock_request
-
- with (
- patch("fastdeploy.engine.common_engine.trace_print"),
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", False),
- ):
- eng._insert_zmq_task_to_scheduler()
-
- # Verify trace_set_proc_propagate_context was called with correct args (lines 1165-1167)
- mock_trace_set.assert_called_once()
- call_args = mock_trace_set.call_args
- # request_id should be "test" (first part after split on "_") and trace_carrier
- self.assertEqual(call_args[0][0], "test")
- self.assertEqual(call_args[0][1], trace_carrier_data)
-
- # Reset and test without trace_carrier - should not call trace_set_proc_propagate_context
- eng.running = True
- mock_data_without_trace = {
- "request_id": "test_req_456",
- "status": None,
- "user": "test_user",
- }
- eng.recv_request_server = DummyRecv(mock_data_without_trace)
-
- with patch("fastdeploy.engine.common_engine.tracing.trace_set_proc_propagate_context") as mock_trace_set:
- with patch.object(eng, "llm_logger"):
- with patch("fastdeploy.engine.common_engine.Request") as MockRequest:
- mock_request = Mock()
- mock_request.metrics.scheduler_recv_req_time = 0
- MockRequest.from_dict.return_value = mock_request
-
- with (
- patch("fastdeploy.engine.common_engine.trace_print"),
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", False),
- ):
- eng._insert_zmq_task_to_scheduler()
-
- # Verify trace_set_proc_propagate_context was NOT called when no trace_carrier
- mock_trace_set.assert_not_called()
-
- if hasattr(eng, "_finalizer"):
- try:
- eng._finalizer.detach()
- except Exception:
- pass
-
- def test_start_zmq_service_internal_adapter(self):
- """Cover lines 1107, 1110: start_zmq_service with FD_ENABLE_INTERNAL_ADAPTER=1."""
- cfg = self._make_cfg(splitwise_role="mixed")
-
- class DummyQ:
- def __init__(self, *a, **k):
- pass
-
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", DummyQ):
- eng = EngineService(cfg, start_queue=False, use_async_llm=True)
-
- # Mock the necessary components
- eng.api_server_pid = 12345
-
- mock_tcp_server = Mock()
- mock_tcp_server.recv_result_handle = Mock()
-
- with (
- patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", 1),
- patch("fastdeploy.engine.common_engine.envs.FD_ZMQ_RECV_REQUEST_SERVER_PORT", "6666"),
- patch("fastdeploy.engine.common_engine.envs.FD_ZMQ_SEND_RESPONSE_SERVER_PORT", "6667"),
- patch("fastdeploy.engine.common_engine.ZmqTcpServer", return_value=mock_tcp_server),
- patch("fastdeploy.engine.common_engine.InternalAdapter"),
- patch("fastdeploy.engine.common_engine.threading.Thread") as mock_thread,
- patch("fastdeploy.engine.common_engine.time.sleep"),
- ):
- eng.start_zmq_service(12345)
-
- # Verify thread was created for recv_result_handle (lines 1107-1110)
- self.assertTrue(mock_thread.called)
- # Check that thread was started
- for call in mock_thread.call_args_list:
- if "target" in call[1]:
- thread_instance = mock_thread.return_value
- thread_instance.start.assert_called()
-
- if hasattr(eng, "_finalizer"):
- try:
- eng._finalizer.detach()
- except Exception:
- pass
-
- def test_start_zmq_service_batch_mode(self):
- """Cover line 1115: start_zmq_service with ZMQ_SEND_BATCH_DATA=1."""
- cfg = self._make_cfg(splitwise_role="mixed")
-
- class DummyQ:
- def __init__(self, *a, **k):
- pass
-
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", DummyQ):
- eng = EngineService(cfg, start_queue=False, use_async_llm=True)
-
- eng.api_server_pid = 12345
-
- mock_ipc_server = Mock()
-
- with (
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", 1),
- patch("fastdeploy.engine.common_engine.ZmqIpcServer", return_value=mock_ipc_server) as mock_server,
- patch("fastdeploy.engine.common_engine.time.sleep"),
- ):
- eng.start_zmq_service(12345)
-
- # Verify ZmqIpcServer was called with PUSH mode (line 1115)
- import zmq
-
- calls = mock_server.call_args_list
- push_mode_found = False
- for call in calls:
- # call[0] is positional args, call[1] is keyword args
- # The actual code uses: ZmqIpcServer(name=api_server_pid, mode=zmq.PUSH)
- # So mode is passed as a keyword argument
- if call[1].get("mode") == zmq.PUSH:
- push_mode_found = True
- break
- self.assertTrue(push_mode_found, "PUSH mode should be used when ZMQ_SEND_BATCH_DATA=1")
-
- if hasattr(eng, "_finalizer"):
- try:
- eng._finalizer.detach()
- except Exception:
- pass
-
- def test_insert_zmq_abort_request_paused(self):
- """Cover abort request handling: abort bypasses is_paused check and routes to add_abort_req_ids (v1)."""
- cfg = self._make_cfg(splitwise_role="mixed")
-
- class DummyQ:
- def __init__(self, *a, **k):
- pass
-
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", DummyQ):
- eng = EngineService(cfg, start_queue=False, use_async_llm=False)
- eng.running = True
- eng.is_paused = True # Engine is paused, but abort requests bypass this check
-
- abort_data = {
- "request_id": "abort_test_req",
- "status": 5, # RequestStatus.ABORT.value
- }
-
- class DummyRecv:
- def __init__(self):
- self.call_count = 0
-
- def receive_json_once(self, block):
- self.call_count += 1
- if self.call_count == 1:
- return None, abort_data
- else:
- eng.running = False
- return None, None
-
- def receive_pyobj_once(self, block):
- return self.receive_json_once(block)
-
- def close(self):
- pass
-
- eng.recv_request_server = DummyRecv()
-
- # Setup resource_manager with abort_req_ids_set
- eng.resource_manager.abort_req_ids_set = set()
- eng.resource_manager.add_abort_req_ids = Mock()
-
- with (
- patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", 1),
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", False),
- patch.object(eng, "llm_logger") as mock_logger,
- patch("fastdeploy.engine.common_engine.RequestStatus") as mock_status,
- ):
- mock_status.ABORT.value = 5
- eng._insert_zmq_task_to_scheduler()
-
- # Verify abort request was logged
- info_calls = [str(call) for call in mock_logger.info.call_args_list]
- abort_logged = any("abort" in call.lower() for call in info_calls)
- self.assertTrue(abort_logged, "Should log 'Receive abort request'")
-
- # Verify add_abort_req_ids was called (v1 scheduler path)
- eng.resource_manager.add_abort_req_ids.assert_called_once_with("abort_test_req")
-
- if hasattr(eng, "_finalizer"):
- try:
- eng._finalizer.detach()
- except Exception:
- pass
-
- def test_insert_zmq_abort_request_in_requests(self):
- """Cover abort request handling: when ENABLE_V1_KVCACHE_SCHEDULER=1, add_abort_req_ids is called."""
- cfg = self._make_cfg(splitwise_role="mixed")
-
- class DummyQ:
- def __init__(self, *a, **k):
- pass
-
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", DummyQ):
- eng = EngineService(cfg, start_queue=False, use_async_llm=False)
- eng.running = True
- eng.is_paused = False
-
- abort_data = {
- "request_id": "abort_in_requests",
- "status": 5, # RequestStatus.ABORT.value
- }
-
- class DummyRecv:
- def __init__(self):
- self.call_count = 0
-
- def receive_json_once(self, block):
- self.call_count += 1
- if self.call_count == 1:
- return None, abort_data
- else:
- eng.running = False
- return None, None
-
- def receive_pyobj_once(self, block):
- return self.receive_json_once(block)
-
- def close(self):
- pass
-
- eng.recv_request_server = DummyRecv()
- eng.resource_manager.abort_req_ids_set = set()
-
- # Mock add_abort_req_ids on resource_manager
- eng.resource_manager.add_abort_req_ids = Mock()
-
- with (
- patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", 1),
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", False),
- patch.object(eng, "llm_logger"),
- patch("fastdeploy.engine.common_engine.RequestStatus") as mock_status,
- ):
- mock_status.ABORT.value = 5
- eng._insert_zmq_task_to_scheduler()
-
- # Verify add_abort_req_ids was called with the correct req_id (v1 scheduler path)
- eng.resource_manager.add_abort_req_ids.assert_called_once_with("abort_in_requests")
-
- if hasattr(eng, "_finalizer"):
- try:
- eng._finalizer.detach()
- except Exception:
- pass
-
- def test_run_control_method_with_batch_data(self):
- """Cover lines 1283, 1284, 1290, 1291, 1297, 1298: run_control_method with ZMQ_SEND_BATCH_DATA=1."""
- cfg = self._make_cfg(splitwise_role="mixed")
-
- class DummyQ:
- def __init__(self, *a, **k):
- pass
-
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", DummyQ):
- eng = EngineService(cfg, start_queue=False, use_async_llm=True)
-
- # Mock send_response_server
- eng.send_response_server = Mock()
- eng.send_response_server.send_response = Mock()
-
- control_req = Mock()
- control_req.get_method.return_value = "is_paused" # Use existing method
- control_req.request_id = "control_test_123"
-
- with (
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", 1),
- patch.object(eng, "llm_logger"),
- patch.object(eng, "_control_is_paused") as mock_handler,
- ):
- mock_handler.return_value = {"is_paused": False}
- eng.run_control_method(control_req)
-
- # Verify send_response was called with 2D array (line 1291)
- eng.send_response_server.send_response.assert_called_once()
- call_args = eng.send_response_server.send_response.call_args
- data = call_args[0][1]
- # Should be [[response]] format for batch mode
- self.assertIsInstance(data, list)
- self.assertIsInstance(data[0], list)
-
- if hasattr(eng, "_finalizer"):
- try:
- eng._finalizer.detach()
- except Exception:
- pass
-
- def test_run_control_method_unknown_with_batch_data(self):
- """Cover lines 1283-1284: unknown control method with ZMQ_SEND_BATCH_DATA=1."""
- cfg = self._make_cfg(splitwise_role="mixed")
-
- class DummyQ:
- def __init__(self, *a, **k):
- pass
-
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", DummyQ):
- eng = EngineService(cfg, start_queue=False, use_async_llm=True)
-
- eng.send_response_server = Mock()
- eng.send_response_server.send_response = Mock()
-
- control_req = Mock()
- control_req.get_method.return_value = "unknown_method"
- control_req.request_id = "control_unknown"
-
- with (
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", 1),
- patch.object(eng, "llm_logger"),
- ):
- eng.run_control_method(control_req)
-
- # Verify send_response was called with error response (lines 1283-1284)
- eng.send_response_server.send_response.assert_called_once()
- call_args = eng.send_response_server.send_response.call_args
- data = call_args[0][1]
- # Should be [[error_response]] format
- self.assertIsInstance(data, list)
- self.assertIsInstance(data[0], list)
-
- if hasattr(eng, "_finalizer"):
- try:
- eng._finalizer.detach()
- except Exception:
- pass
-
- def test_send_error_response_with_batch_data(self):
- """Cover lines 1467, 1468: _send_error_response with ZMQ_SEND_BATCH_DATA=1."""
- cfg = self._make_cfg(splitwise_role="mixed")
-
- class DummyQ:
- def __init__(self, *a, **k):
- pass
-
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", DummyQ):
- eng = EngineService(cfg, start_queue=False, use_async_llm=True)
-
- eng.send_response_server = Mock()
- eng.send_response_server.send_response = Mock()
-
- with (
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", 1),
- patch.object(eng, "llm_logger"),
- ):
- eng._send_error_response("test_req_id", "Test error message", 500)
-
- # Verify send_response was called with 2D array format (lines 1467-1468)
- eng.send_response_server.send_response.assert_called_once()
- call_args = eng.send_response_server.send_response.call_args
- data = call_args[0][1]
- # Should be [[error_result]] format
- self.assertIsInstance(data, list)
- self.assertIsInstance(data[0], list)
-
- if hasattr(eng, "_finalizer"):
- try:
- eng._finalizer.detach()
- except Exception:
- pass
-
- def test_zmq_send_generated_tokens_batch_mode(self):
- """Cover lines 1530, 1557-1563: _zmq_send_generated_tokens with ZMQ_SEND_BATCH_DATA=1."""
- cfg = self._make_cfg(splitwise_role="mixed")
-
- class DummyQ:
- def __init__(self, *a, **k):
- pass
-
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", DummyQ):
- eng = EngineService(cfg, start_queue=False, use_async_llm=False)
-
- # Initialize request_worker_map for batch mode routing
- import threading as _threading
-
- eng.request_worker_map = {}
- eng.request_worker_map_lock = _threading.Lock()
-
- # Setup scheduler to return results
- mock_output = Mock()
- mock_output.outputs = Mock()
- mock_output.outputs.token_ids = [1, 2, 3]
- mock_output.outputs.decode_type = 1 # Not decode_type 0
- mock_output.finished = False
- mock_output.request_id = "test_req"
-
- eng.scheduler = Mock()
- eng.scheduler.get_results.return_value = {"test_req": [mock_output]}
-
- eng.send_response_server = Mock()
- eng.send_response_server.send_response = Mock()
-
- # Make the loop run only once
- call_count = [0]
-
- def get_results_side_effect():
- call_count[0] += 1
- if call_count[0] == 1:
- return {"test_req": [mock_output]}
- else:
- eng.running = False
- return {}
-
- eng.scheduler.get_results.side_effect = get_results_side_effect
-
- with (
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", 1),
- patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", 0),
- patch.object(eng, "llm_logger"),
- ):
- eng.running = True
- eng._zmq_send_generated_tokens()
-
- # Verify send_response was called with batch_data (lines 1557-1563)
- eng.send_response_server.send_response.assert_called_once()
- call_args = eng.send_response_server.send_response.call_args
- # First arg should be None, second should be batch_data (list of lists)
- self.assertIsNone(call_args[0][0])
- batch_data = call_args[0][1]
- self.assertIsInstance(batch_data, list)
-
- if hasattr(eng, "_finalizer"):
- try:
- eng._finalizer.detach()
- except Exception:
- pass
-
- def test_run_control_method_exception_with_batch_data(self):
- """Cover lines 1297-1298: run_control_method exception handling with ZMQ_SEND_BATCH_DATA=1."""
- cfg = self._make_cfg(splitwise_role="mixed")
-
- class DummyQ:
- def __init__(self, *a, **k):
- pass
-
- with patch("fastdeploy.engine.common_engine.EngineWorkerQueue", DummyQ):
- eng = EngineService(cfg, start_queue=False, use_async_llm=True)
-
- eng.send_response_server = Mock()
- eng.send_response_server.send_response = Mock()
-
- control_req = Mock()
- control_req.get_method.return_value = "is_paused" # Use existing method
- control_req.request_id = "control_exception"
-
- with (
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", 1),
- patch.object(eng, "llm_logger"),
- patch.object(eng, "_control_is_paused", side_effect=RuntimeError("Test exception")),
- ):
- eng.run_control_method(control_req)
-
- # Verify send_response was called with error response (lines 1297-1298)
- eng.send_response_server.send_response.assert_called_once()
- call_args = eng.send_response_server.send_response.call_args
- data = call_args[0][1]
- # Should be [[error_response]] format
- self.assertIsInstance(data, list)
- self.assertIsInstance(data[0], list)
-
- if hasattr(eng, "_finalizer"):
- try:
- eng._finalizer.detach()
- except Exception:
- pass
-
- # -----------------------------------------------------------------------
- # New tests targeting uncovered violation lines
- # -----------------------------------------------------------------------
-
- def test_insert_zmq_task_control_request_with_worker_pid(self):
- """Lines 1183-1189: control request when ZMQ_SEND_BATCH_DATA=True maps worker_pid and calls run_control_method."""
- eng = self._make_mixed_engine()
- eng.running = True
- eng.is_paused = False
- eng.guided_decoding_checker = None
- eng.resource_manager = Mock(abort_req_ids_set=set(), requests={})
- eng.scheduler = Mock()
- eng.engine_worker_queue = Mock()
- eng.run_control_method = Mock()
-
- import threading as _threading
-
- eng.request_worker_map = {}
- eng.request_worker_map_lock = _threading.Lock()
-
- ctrl_data = {
- "request_id": "ctrl-batch",
- "method": "is_paused",
- "args": {},
- "zmq_worker_pid": 9999,
- }
-
- class DummyRecv:
- def __init__(self):
- self.calls = 0
-
- def receive_json_once(self, block):
- self.calls += 1
- if self.calls == 1:
- return None, ctrl_data
- eng.running = False
- return None, None
-
- def receive_pyobj_once(self, block):
- return self.receive_json_once(block)
-
- def close(self):
- pass
-
- eng.recv_request_server = DummyRecv()
-
- with (
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", True),
- patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._insert_zmq_task_to_scheduler()
-
- # worker_pid should be stored in request_worker_map for the control request
- self.assertIn("ctrl-batch", eng.request_worker_map)
- self.assertEqual(eng.request_worker_map["ctrl-batch"], 9999)
- eng.run_control_method.assert_called_once()
- self._detach_finalizer(eng)
-
- def test_insert_zmq_task_control_request_exception_with_worker_pid(self):
- """Lines 1188-1189: exception during control request processing is caught and logged."""
- eng = self._make_mixed_engine()
- eng.running = True
- eng.is_paused = False
- eng.guided_decoding_checker = None
- eng.resource_manager = Mock(abort_req_ids_set=set(), requests={})
- eng.scheduler = Mock()
- eng.engine_worker_queue = Mock()
- eng.run_control_method = Mock(side_effect=RuntimeError("ctrl boom"))
-
- import threading as _threading
-
- eng.request_worker_map = {}
- eng.request_worker_map_lock = _threading.Lock()
-
- ctrl_data = {
- "request_id": "ctrl-err",
- "method": "is_paused",
- "args": {},
- "zmq_worker_pid": 1111,
- }
-
- class DummyRecv:
- def __init__(self):
- self.calls = 0
-
- def receive_json_once(self, block):
- self.calls += 1
- if self.calls == 1:
- return None, ctrl_data
- eng.running = False
- return None, None
-
- def receive_pyobj_once(self, block):
- return self.receive_json_once(block)
-
- def close(self):
- pass
-
- eng.recv_request_server = DummyRecv()
-
- with (
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", True),
- patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False),
- patch.object(eng, "llm_logger") as mock_logger,
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._insert_zmq_task_to_scheduler()
-
- mock_logger.error.assert_called()
- self._detach_finalizer(eng)
-
- def test_insert_zmq_task_normal_request_with_worker_pid(self):
- """Lines 1204-1207: normal request stores worker_pid in request_worker_map; abort request handled."""
- eng = self._make_mixed_engine()
- eng.running = True
- eng.is_paused = False
- eng.guided_decoding_checker = None
- eng.resource_manager = Mock(abort_req_ids_set=set(), requests={})
- eng.scheduler = Mock()
- eng.engine_worker_queue = Mock()
-
- import threading as _threading
-
- eng.request_worker_map = {}
- eng.request_worker_map_lock = _threading.Lock()
-
- normal_data = {
- "request_id": "normal-batch",
- "prompt_token_ids": [1, 2],
- "prompt_token_ids_len": 2,
- "temperature": 1.0,
- "zmq_worker_pid": 7777,
- }
-
- class DummyRecv:
- def __init__(self):
- self.calls = 0
-
- def receive_json_once(self, block):
- self.calls += 1
- if self.calls == 1:
- return None, normal_data
- eng.running = False
- return None, None
-
- def receive_pyobj_once(self, block):
- return self.receive_json_once(block)
-
- def close(self):
- pass
-
- eng.recv_request_server = DummyRecv()
- eng.scheduler.put_requests.return_value = [("normal-batch", None)]
-
- class DummyMetrics:
- def __init__(self):
- self.requests_number = Mock(inc=Mock())
- self.num_requests_waiting = Mock(inc=Mock())
-
- with (
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", True),
- patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False),
- patch("fastdeploy.engine.common_engine.main_process_metrics", DummyMetrics()),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._insert_zmq_task_to_scheduler()
-
- # worker_pid for normal request should be stored
- self.assertIn("normal-batch", eng.request_worker_map)
- self.assertEqual(eng.request_worker_map["normal-batch"], 7777)
- self._detach_finalizer(eng)
-
- def test_insert_zmq_task_abort_request_with_worker_pid(self):
- """Lines 1206-1207: abort request with worker_pid stores mapping then continues."""
- eng = self._make_mixed_engine()
- eng.running = True
- eng.is_paused = False
- eng.guided_decoding_checker = None
-
- import threading as _threading
-
- eng.request_worker_map = {}
- eng.request_worker_map_lock = _threading.Lock()
-
- eng.resource_manager = Mock(abort_req_ids_set=set(), requests={})
- eng.resource_manager.add_abort_req_ids = Mock()
- eng.scheduler = Mock()
- eng.engine_worker_queue = Mock()
-
- abort_data = {
- "request_id": "abort-worker",
- "status": RequestStatus.ABORT.value,
- "zmq_worker_pid": 4444,
- }
-
- class DummyRecv:
- def __init__(self):
- self.calls = 0
-
- def receive_json_once(self, block):
- self.calls += 1
- if self.calls == 1:
- return None, abort_data
- eng.running = False
- return None, None
-
- def receive_pyobj_once(self, block):
- return self.receive_json_once(block)
-
- def close(self):
- pass
-
- eng.recv_request_server = DummyRecv()
-
- with (
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", True),
- patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False),
- patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", True),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- ):
- eng._insert_zmq_task_to_scheduler()
-
- # worker_pid stored for abort request
- self.assertIn("abort-worker", eng.request_worker_map)
- self.assertEqual(eng.request_worker_map["abort-worker"], 4444)
- eng.resource_manager.add_abort_req_ids.assert_called_once_with("abort-worker")
- self._detach_finalizer(eng)
-
- def test_run_control_method_logging_with_request_worker_map(self):
- """Lines 1299-1300: run_control_method logs start when ZMQ_SEND_BATCH_DATA=True with request_worker_map."""
- eng = self._make_mixed_engine()
- eng.send_response_server = Mock()
- eng._pause_cond = threading.Condition()
-
- import threading as _threading
-
- eng.request_worker_map = {"ctrl-log": 5555}
- eng.request_worker_map_lock = _threading.Lock()
-
- ctrl_req = ControlRequest(request_id="ctrl-log", method="is_paused")
- eng.is_paused = False
-
- with (
- patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", True),
- patch.object(eng, "llm_logger") as mock_logger,
- ):
- eng.run_control_method(ctrl_req)
-
- # Lines 1299-1300: try block start + info logging
- info_msgs = [str(c) for c in mock_logger.info.call_args_list]
- self.assertTrue(any("Start to run control method" in m for m in info_msgs))
- # worker_pid should be popped from the map
- self.assertNotIn("ctrl-log", eng.request_worker_map)
- self._detach_finalizer(eng)
-
- def test_decode_token_return_text_non_empty_delta_is_end_deletes_status(self):
- """Lines 1510-1511: _decode_token with non-empty delta and is_end=True deletes decode_status entry."""
- eng = self._make_mixed_engine()
-
- class DummyProcessor:
- def __init__(self):
- self.decode_status = {"tok-req": (1, 3)}
-
- def ids2tokens(self, token_ids, req_id):
- return "hello", [10, 20, 30], None
-
- eng.data_processor = DummyProcessor()
-
- with patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_RETURN_TEXT", True):
- delta, ids = eng._decode_token([10, 20, 30], "tok-req", is_end=True)
-
- self.assertEqual(delta, "hello")
- # decode_status key should be deleted (line 1511)
- self.assertNotIn("tok-req", eng.data_processor.decode_status)
- self._detach_finalizer(eng)
-
- def test_decode_process_splitwise_requests_empty_queue_returns_early(self):
- """Lines 1613-1614: _fetch_requests returns early when disaggregate_queue_empty() is True."""
- cfg = self._make_cfg(
- splitwise_role="decode",
- num_gpu_blocks_override=4,
- router="0.0.0.0:30000",
+ e8 = _eng(monkeypatch)
+ e8.send_response_server = _ns(send_response=_noop)
+ e8.guided_decoding_checker = _ns(schema_format=lambda r: (r, "schema err"))
+ errs8 = []
+ e8._send_error_response = lambda *a: errs8.append(a)
+ _zmq_recv(e8, [(None, {"request_id": "g1", "status": None})])
+ assert len(errs8) >= 1
+ # adapter + decode early return (L1125)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", True)
+ e9 = _eng(monkeypatch)
+ e9.cfg.scheduler_config.splitwise_role = "decode"
+ e9._insert_zmq_task_to_scheduler() # returns immediately
+ # pyobj_once path (L1133) — enable_mm triggers receive_pyobj_once
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.ControlRequest.is_control_request", lambda d: False)
+ monkeypatch.setattr(
+ "fastdeploy.engine.common_engine.Request.from_dict",
+ lambda d: _ns(request_id=d["request_id"], metrics=_ns(scheduler_recv_req_time=0)),
)
- eng = self._make_engine(cfg)
- eng.running = True
- eng.enable_decode_cache_task = False
- eng.cfg.splitwise_version = "v1"
- eng.scheduler = Mock(has_request=Mock(return_value=True), put_results=Mock())
- eng._insert_prefilled_requests = Mock()
- eng.insert_tasks = Mock()
-
- class DummyRM:
- def is_resource_sufficient(self, prompt_len):
- return True
-
- eng.resource_manager = DummyRM()
-
- empty_queue_call_count = [0]
-
- class DummyQueueAlwaysEmpty:
- def disaggregate_queue_empty(self):
- empty_queue_call_count[0] += 1
- # Return empty on first call then stop the engine
- eng.running = False
- return True
-
- def get_disaggregated_tasks(self):
- return []
-
- eng.engine_worker_queue = DummyQueueAlwaysEmpty()
+ e10 = _eng(monkeypatch)
+ e10.cfg.model_config.enable_mm = True
+ e10.send_response_server = _ns(send_response=_noop)
+ e10.guided_decoding_checker = None
+ idx10 = [0]
+
+ def recv10(block):
+ idx10[0] += 1
+ if idx10[0] == 1:
+ return None, {"request_id": "mm1", "status": None}
+ e10.running = False
+ return "Context was terminated", None
+
+ e10.recv_request_server = _ns(receive_pyobj_once=recv10)
+ e10._insert_zmq_task_to_scheduler()
+
+ def test_send_tokens(self, monkeypatch):
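+        """Cover _zmq_send_generated_tokens: non-adapter, finished-with-empty-tokens, and internal-adapter paths."""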
+ from fastdeploy.engine.request import CompletionOutput, RequestOutput
+
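+        # _ro: build a bare RequestOutput via __new__ (skipping __init__) that wraps a single CompletionOutput.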
+ def _ro(rid, tids, finished=False, dt=1):
+ co = CompletionOutput.__new__(CompletionOutput)
+ co.token_ids, co.decode_type, co.text = tids, dt, ""
+ ro = RequestOutput.__new__(RequestOutput)
+ ro.request_id, ro.outputs, ro.finished = rid, co, finished
+ return ro
+
+ # non-adapter
+ e = _eng(monkeypatch)
+ e.data_processor = _ns(ids2tokens=lambda t, r: ("x", [1], None), decode_status={"r1": [0, 1]})
+ sent = []
+ e.send_response_server = _ns(send_response=lambda rid, r: sent.append(rid))
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_RETURN_TEXT", False)
+ cc = [0]
+
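+        # gr: return one batch of results on the first call, then stop the engine loop.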
+ def gr():
+ cc[0] += 1
+ if cc[0] == 1:
+ return {"r1": [_ro("r1", [1])]}
+ e.running = False
+ return {}
+
+ e.scheduler.get_results = gr
+ e._zmq_send_generated_tokens()
+ assert len(sent) >= 1
+ # finished empty
+ e2 = _eng(monkeypatch)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False)
+ s2 = []
+ e2.send_response_server = _ns(send_response=lambda rid, r: s2.append(rid))
+ cc2 = [0]
+
+ def gr2():
+ cc2[0] += 1
+ if cc2[0] == 1:
+ return {"r2": [_ro("r2", [], finished=True)]}
+ e2.running = False
+ return {}
+
+ e2.scheduler.get_results = gr2
+ e2._zmq_send_generated_tokens()
+ assert len(s2) >= 1
+ # adapter
+ e3 = _eng(monkeypatch)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", True)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_RETURN_TEXT", True)
+ e3.data_processor = _ns(ids2tokens=lambda t, r: ("t", [42], None), decode_status={"a1": [0, 1]})
+ s3 = []
+ e3.send_response_server = _ns(send_response=lambda rid, r: s3.append(rid))
+ cc3 = [0]
+
+ def gr3():
+ cc3[0] += 1
+ if cc3[0] == 1:
+ return [[_ro("a1", [42], dt=0)]]
+ e3.running = False
+ return []
- class DummyThread:
- def __init__(self, target=None, daemon=None):
- self.target = target
+ e3.scheduler.get_results = gr3
+ e3._zmq_send_generated_tokens()
+ assert len(s3) >= 1
+
+ # adapter: decode_type!=0 (L1492), finished+empty (L1498), non-RequestOutput (L1504)
+ e6 = _eng(monkeypatch)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", True)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_RETURN_TEXT", False)
+ s6 = []
+ e6.send_response_server = _ns(send_response=lambda rid, r: s6.append(r))
+ cc6 = [0]
+
+ def gr6():
+ cc6[0] += 1
+ if cc6[0] == 1:
+ return [[_ro("a6", [9], dt=1), _ro("a6f", [], finished=True, dt=1), "raw"]]
+ e6.running = False
+ return []
- def start(self):
- try:
- self.target()
- finally:
- eng.running = False
+ e6.scheduler.get_results = gr6
+ e6._zmq_send_generated_tokens()
+ assert len(s6) >= 1
+
+ def test_splitwise_decode(self, monkeypatch):
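+        """Cover _decode_process_splitwise_requests: v0 fetch plus v1 output, allocation-failure, error-code and allocation-success paths."""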
+ from fastdeploy.engine.request import CompletionOutput, Request, RequestOutput
+
+ _ptr(monkeypatch)
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False)
+
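+        # _de: decode-role engine with stubbed resource manager, split connector and worker queue.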
+ def _de(v1=False):
+ monkeypatch.setattr("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", v1)
+ e = _eng(monkeypatch)
+ e.cfg.scheduler_config.splitwise_role = "decode"
+ e.cfg.splitwise_version = "v1" if v1 else "v0"
+ e.enable_decode_cache_task = False
+ e.resource_manager.is_resource_sufficient = lambda n: True
+ e.resource_manager.allocate_resources_for_new_tasks = lambda t: t
+ e.resource_manager.real_bsz = 1
+ e.resource_manager.preallocate_resource_in_d = lambda t: True
+ e.resource_manager.pre_recycle_resource = _noop
+ e.resource_manager.add_prefilled_request = _noop
+ e.split_connector = _ns(send_cache_info_to_prefill=_noop)
+ e.token_processor.tokens_counter = {}
+ e.engine_worker_queue = _ns(
+ disaggregate_queue_empty=lambda: True, get_disaggregated_tasks=lambda: [], put_tasks=_noop
+ )
+ return e
+
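+        # _rr: bare Request as received from a prefill instance (10 prompt tokens, no error).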
+ def _rr(rid="x"):
+ r = Request.__new__(Request)
+ r.request_id, r.prompt_token_ids_len = rid, 10
+ r.metrics = _ns(decode_recv_req_time=0, decode_preallocate_req_time=0)
+ r.error_msg = ""
+ return r
+
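+        # _rro: bare RequestOutput carrying one first token and an error code (200 = success).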
+ def _rro(rid="y", ec=200):
+ co = CompletionOutput.__new__(CompletionOutput)
+ co.token_ids = [42]
+ ro = RequestOutput.__new__(RequestOutput)
+ ro.request_id, ro.outputs, ro.finished = rid, co, False
+ ro.error_code, ro.error_msg = ec, "" if ec == 200 else "fail"
+ ro.metrics = _ns(decode_recv_first_token_time=0)
+ return ro
+
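+        # _run: feed one batch of disaggregated tasks through _decode_process_splitwise_requests, then stop the loop.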
+ def _run(eng, items):
+ cc = [0]
+
+ def qe():
+ return cc[0] > 0
+
+ def gt():
+ cc[0] += 1
+ eng.running = False
+ return items
- with (
- patch("fastdeploy.engine.common_engine.threading.Thread", DummyThread),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
- patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", False),
- ):
+ eng.engine_worker_queue.disaggregate_queue_empty = qe
+ eng.engine_worker_queue.get_disaggregated_tasks = gt
eng._decode_process_splitwise_requests()
+ time.sleep(0.05) # let daemon thread finish
+
+ # v0: fetch
+ e = _de(v1=False)
+ ins = []
+ e.insert_tasks = lambda t, **kw: ins.append(t)
+ _run(e, [(0, [_rr("rf")])])
+ assert len(ins) >= 1
+ # v1: fetch outputs
+ e2 = _de(v1=True)
+ e2.scheduler.has_request = lambda rid: True
+ added = []
+ e2.resource_manager.add_prefilled_request = lambda r: added.append(r)
+ _run(e2, [(0, [_rro("ro")])])
+ assert len(added) >= 1
+ # v1: alloc fail
+ e3 = _de(v1=True)
+ e3.resource_manager.preallocate_resource_in_d = lambda t: False
+ e3.split_connector.send_cache_info_to_prefill = _noop
+ rr3 = _rr("rf2")
+ _run(e3, [(0, [rr3])])
+ assert rr3.error_msg == "Not enough resources"
+ # v1: error code
+ e4 = _de(v1=True)
+ e4.scheduler.has_request = lambda rid: True
+ recycled = []
+ e4.resource_manager.pre_recycle_resource = lambda rid: recycled.append(rid)
+ _run(e4, [(0, [_rro("re", ec=500)])])
+ assert "re" in recycled
+ # v1: alloc success (Request + preallocate=True) → L1581-1586
+ e5 = _de(v1=True)
+ sent5 = []
+ e5.split_connector.send_cache_info_to_prefill = lambda t: sent5.append(t)
+ _run(e5, [(0, [_rr("rs")])])
+ assert len(sent5) >= 1
- # Queue was seen as empty so get_disaggregated_tasks should not be called
- self.assertEqual(empty_queue_call_count[0], 1)
- eng.insert_tasks.assert_not_called()
- self._detach_finalizer(eng)
-
- def test_register_to_router_inner_function_runs(self):
- """_register inner function body executes (timeout and sleep_seconds set)."""
- from fastdeploy.engine.register_manager import RegisterManager
-
- eng = self._make_mixed_engine()
- eng.cfg.router_config.router = "http://fake-router"
- eng.cfg.router_config.api_server_host = "127.0.0.1"
- eng.cfg.router_config.api_server_port = 19999
- eng.cfg.register_info = {"name": "test-server"}
-
- reg_mgr = RegisterManager(
- cfg=eng.cfg,
- engine_worker_queue=MagicMock(),
- get_is_paused=lambda: False,
- )
-
- captured_target = [None]
-
- class _CapturingThread:
- def __init__(self, target=None, daemon=None):
- captured_target[0] = target
- self.target = target
- self.daemon = daemon
-
- def start(self):
- pass # don't auto-start
-
- with patch("fastdeploy.engine.register_manager.threading.Thread", _CapturingThread):
- reg_mgr._register_to_router()
-
- # Verify the inner _register function was captured
- self.assertIsNotNone(captured_target[0])
-
- # Now invoke the inner _register function directly.
- # Mock out check_service_health to return False so it doesn't hang,
- # and time.sleep to raise StopIteration to break the while True loop.
- call_count = [0]
-
- def _fake_sleep(s):
- call_count[0] += 1
- if call_count[0] >= 2:
- raise StopIteration("stop")
-
- with (
- patch("fastdeploy.engine.register_manager.check_service_health", return_value=False),
- patch("fastdeploy.engine.register_manager.time.sleep", _fake_sleep),
- ):
- try:
- captured_target[0]()
- except StopIteration:
- pass
-
- # At least one sleep call was made, confirming the inner function executed
- self.assertGreaterEqual(call_count[0], 1)
- self._detach_finalizer(eng)
-
- # ── _control_abort_requests / _wait_abort_complete ───────────────
-
- def _make_abort_engine(self, splitwise_role="mixed"):
- """Create an engine wired up for abort tests."""
- extra = {}
- if splitwise_role != "mixed":
- extra["router"] = "0.0.0.0:9000"
- cfg = self._make_cfg(splitwise_role=splitwise_role, num_gpu_blocks_override=4, **extra)
- eng = self._make_engine(cfg)
- eng.llm_logger = MagicMock()
-
- # data_processor with eos token
- eng.data_processor = MagicMock()
- eng.data_processor.eos_token_ids = [2]
-
- # resource_manager with requests dict and abort sets
- eng.resource_manager = MagicMock()
- eng.resource_manager.requests = {}
- eng.resource_manager.waiting_abort_req_id_set = set()
- eng.resource_manager.to_be_aborted_req_id_set = set()
- eng.resource_manager.get_reqs_in_aborting = lambda: (
- eng.resource_manager.waiting_abort_req_id_set | eng.resource_manager.to_be_aborted_req_id_set
- )
-
- # scheduler with requests dict and put_results
- eng.scheduler = MagicMock()
- eng.scheduler.requests = {}
- eng.scheduler.put_results = MagicMock()
-
- return eng
-
- def _make_fake_request(self, output_token_ids=None):
- """Create a fake request object for abort tests."""
- req = MagicMock()
- req.output_token_ids = output_token_ids or [10, 20, 30]
- req.metrics = MagicMock()
- req.metrics.arrival_time = 1000.0
- req.metrics.inference_start_time = 1000.1
- req.metrics.engine_recv_first_token_time = 1000.2
- return req
-
- def test_control_abort_requests_not_v1_raises(self):
- """abort_requests raises when ENABLE_V1_KVCACHE_SCHEDULER is off."""
- eng = self._make_abort_engine()
- control_req = ControlRequest("ctrl-1", "abort_requests", {"abort_all": True, "req_ids": []})
- with patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", 0):
- with self.assertRaises(Exception) as ctx:
- eng._control_abort_requests(control_req)
- self.assertIn("only supported", str(ctx.exception))
- self._detach_finalizer(eng)
-
- def test_control_abort_requests_abort_all(self):
- """abort_all=True aborts all requests in resource_manager + scheduler."""
- eng = self._make_abort_engine()
- eng.resource_manager.requests = {"req-1_0": self._make_fake_request([10, 20])}
- eng.scheduler.requests = {"req-2_0": MagicMock(raw=self._make_fake_request([30]))}
-
- control_req = ControlRequest("ctrl-1", "abort_requests", {"abort_all": True, "req_ids": []})
-
- def clear_abort_sets(req_id):
- # Simulate immediate abort completion
- eng.resource_manager.waiting_abort_req_id_set.discard(req_id)
-
- eng.resource_manager.add_abort_req_ids = MagicMock(side_effect=clear_abort_sets)
-
- with patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", 1):
- result = eng._control_abort_requests(control_req)
-
- self.assertEqual(len(result["aborted"]), 2)
- self.assertEqual(result["not_found"], [])
- ids = {a["request_id"] for a in result["aborted"]}
- self.assertEqual(ids, {"req-1_0", "req-2_0"})
- # put_results should have been called (not prefill)
- eng.scheduler.put_results.assert_called_once()
- self._detach_finalizer(eng)
-
- def test_control_abort_requests_by_req_ids_with_suffix_match(self):
- """req_ids match both exact and _0 suffix."""
- eng = self._make_abort_engine()
- eng.resource_manager.requests = {
- "req-A_0": self._make_fake_request([1, 2, 3]),
- "req-B": self._make_fake_request([4, 5]),
- }
-
- control_req = ControlRequest(
- "ctrl-1",
- "abort_requests",
- {
- "abort_all": False,
- "req_ids": ["req-A", "req-B", "req-C"],
- },
- )
-
- def clear_abort_sets(req_id):
- eng.resource_manager.waiting_abort_req_id_set.discard(req_id)
-
- eng.resource_manager.add_abort_req_ids = MagicMock(side_effect=clear_abort_sets)
-
- with patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", 1):
- result = eng._control_abort_requests(control_req)
-
- aborted_ids = {a["request_id"] for a in result["aborted"]}
- self.assertIn("req-A_0", aborted_ids) # matched via _0 suffix
- self.assertIn("req-B", aborted_ids) # exact match
- self.assertEqual(result["not_found"], ["req-C"])
- self._detach_finalizer(eng)
-
- def test_control_abort_requests_no_match(self):
- """No requests found returns empty aborted and all in not_found."""
- eng = self._make_abort_engine()
- control_req = ControlRequest(
- "ctrl-1",
- "abort_requests",
- {
- "abort_all": False,
- "req_ids": ["nonexistent"],
- },
- )
- with patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", 1):
- result = eng._control_abort_requests(control_req)
-
- self.assertEqual(result["aborted"], [])
- self.assertEqual(result["not_found"], ["nonexistent"])
- self._detach_finalizer(eng)
-
- def test_control_abort_requests_prefill_skips_wait_and_put(self):
- """Prefill role skips _wait_abort_complete and put_results."""
- eng = self._make_abort_engine(splitwise_role="prefill")
- eng.resource_manager.requests = {"req-1_0": self._make_fake_request()}
-
- control_req = ControlRequest("ctrl-1", "abort_requests", {"abort_all": True, "req_ids": []})
- eng.resource_manager.add_abort_req_ids = MagicMock()
-
- with patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", 1):
- result = eng._control_abort_requests(control_req)
-
- self.assertEqual(len(result["aborted"]), 1)
- eng.scheduler.put_results.assert_not_called()
- self._detach_finalizer(eng)
-
- def test_control_abort_requests_output_token_count(self):
- """output_token_count reflects partial_token_ids length."""
- eng = self._make_abort_engine()
- eng.resource_manager.requests = {"req-1_0": self._make_fake_request([10, 20, 30, 40, 50])}
-
- control_req = ControlRequest("ctrl-1", "abort_requests", {"abort_all": True, "req_ids": []})
-
- def clear_abort_sets(req_id):
- eng.resource_manager.waiting_abort_req_id_set.discard(req_id)
-
- eng.resource_manager.add_abort_req_ids = MagicMock(side_effect=clear_abort_sets)
-
- with patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_KVCACHE_SCHEDULER", 1):
- result = eng._control_abort_requests(control_req)
-
- self.assertEqual(result["aborted"][0]["output_token_count"], 5)
- self._detach_finalizer(eng)
-
- def test_wait_abort_complete_immediate(self):
- """_wait_abort_complete returns immediately when all requests already cleaned."""
- eng = self._make_abort_engine()
- # Empty abort sets → remaining is empty → returns immediately
- eng._wait_abort_complete(["req-1_0"])
- self._detach_finalizer(eng)
-
- def test_wait_abort_complete_progress(self):
- """_wait_abort_complete exits when background thread cleans up."""
- eng = self._make_abort_engine()
- eng.resource_manager.waiting_abort_req_id_set = {"req-1_0"}
- # Add the request to requests dict so it won't be filtered out
- eng.resource_manager.requests = {"req-1_0": self._make_fake_request()}
-
- call_count = [0]
-
- def fake_sleep(s):
- call_count[0] += 1
- # Simulate background thread cleaning up after first sleep
- eng.resource_manager.waiting_abort_req_id_set.discard("req-1_0")
-
- with patch("fastdeploy.engine.common_engine.time.sleep", fake_sleep):
- eng._wait_abort_complete(["req-1_0"])
-
- self.assertGreaterEqual(call_count[0], 1)
- self._detach_finalizer(eng)
-
- def test_wait_abort_complete_force_cleanup_stuck_in_to_be_aborted(self):
- """Stall timeout triggers force cleanup for requests in to_be_aborted_req_id_set."""
- eng = self._make_abort_engine()
- eng.resource_manager.to_be_aborted_req_id_set = {"req-1_0"}
- # Add the request to requests dict so it won't be filtered out
- eng.resource_manager.requests = {"req-1_0": self._make_fake_request()}
-
- def mock_recycle(req_id):
- eng.resource_manager.to_be_aborted_req_id_set.discard(req_id)
-
- eng.resource_manager.recycle_abort_task = MagicMock(side_effect=mock_recycle)
-
- # Make time.time() advance past stall_timeout
- time_values = [100.0, 100.0, 102.0, 102.0, 102.0]
- time_idx = [0]
-
- def fake_time():
- idx = min(time_idx[0], len(time_values) - 1)
- time_idx[0] += 1
- return time_values[idx]
-
- with (
- patch("fastdeploy.engine.common_engine.time.time", fake_time),
- patch("fastdeploy.engine.common_engine.time.sleep", lambda s: None),
- ):
- eng._wait_abort_complete(["req-1_0"], stall_timeout=1)
-
- eng.resource_manager.recycle_abort_task.assert_called_with("req-1_0")
- self._detach_finalizer(eng)
-
-
-class TestWorkerTracebackFunctions(unittest.TestCase):
- """测试 _read_latest_worker_traceback 和 _format_worker_launch_failure_message 函数"""
-
- def test_read_latest_worker_traceback_finds_traceback(self):
- """测试能够正确读取 workerlog 文件中的 traceback"""
- with tempfile.TemporaryDirectory() as temp_dir:
- worker_log = os.path.join(temp_dir, "workerlog.0")
- with open(worker_log, "w", encoding="utf-8") as fp:
- fp.write(
- "Some normal log output\n"
- "Traceback (most recent call last):\n"
- ' File "worker_process.py", line 1, in \n'
- " run_worker_proc()\n"
- "ValueError: The total number of blocks cannot be less than zero.\n"
- )
-
- result = _read_latest_worker_traceback(temp_dir)
- self.assertIsNotNone(result)
- self.assertIn("Traceback (most recent call last):", result)
- self.assertIn("ValueError:", result)
-
- def test_read_latest_worker_traceback_returns_none_when_no_traceback(self):
- """测试当没有 traceback 时返回 None"""
- with tempfile.TemporaryDirectory() as temp_dir:
- worker_log = os.path.join(temp_dir, "workerlog.0")
- with open(worker_log, "w", encoding="utf-8") as fp:
- fp.write("Normal log output without any errors\n")
-
- result = _read_latest_worker_traceback(temp_dir)
- self.assertIsNone(result)
-
- def test_read_latest_worker_traceback_returns_none_when_no_files(self):
- """测试当没有 workerlog 文件时返回 None"""
- with tempfile.TemporaryDirectory() as temp_dir:
- result = _read_latest_worker_traceback(temp_dir)
- self.assertIsNone(result)
-
- def test_read_latest_worker_traceback_returns_none_for_nonexistent_dir(self):
- """测试当目录不存在时返回 None"""
- result = _read_latest_worker_traceback("/nonexistent/path")
- self.assertIsNone(result)
-
- def test_read_latest_worker_traceback_picks_latest_file(self):
- """测试当有多个 workerlog 文件时选择最新的"""
- with tempfile.TemporaryDirectory() as temp_dir:
-            # Create the older file
- old_log = os.path.join(temp_dir, "workerlog.0")
- with open(old_log, "w", encoding="utf-8") as fp:
- fp.write("Traceback (most recent call last):\nOldError: old error\n")
-
-            # Wait briefly so the timestamps differ
- time.sleep(0.01)
-
-            # Create the newer file
- new_log = os.path.join(temp_dir, "workerlog.1")
- with open(new_log, "w", encoding="utf-8") as fp:
- fp.write("Traceback (most recent call last):\nNewError: new error\n")
-
- result = _read_latest_worker_traceback(temp_dir)
- self.assertIsNotNone(result)
- self.assertIn("NewError", result)
-
- def test_format_worker_launch_failure_message_with_traceback(self):
- """测试带有 traceback 的错误消息格式化"""
- with tempfile.TemporaryDirectory() as temp_dir:
- worker_log = os.path.join(temp_dir, "workerlog.0")
- with open(worker_log, "w", encoding="utf-8") as fp:
- fp.write("Traceback (most recent call last):\n" "ValueError: Test error message\n")
-
- result = _format_worker_launch_failure_message(temp_dir)
- self.assertIn("Failed to launch worker processes", result)
- self.assertIn("workerlog.*", result)
- self.assertIn("Traceback (most recent call last):", result)
- self.assertIn("ValueError: Test error message", result)
-
- def test_format_worker_launch_failure_message_without_traceback(self):
- """测试没有 traceback 时的错误消息格式化"""
- with tempfile.TemporaryDirectory() as temp_dir:
- result = _format_worker_launch_failure_message(temp_dir)
- self.assertIn("Failed to launch worker processes", result)
- self.assertIn("workerlog.*", result)
- self.assertNotIn("Traceback", result)
+if __name__ == "__main__":
+ pytest.main([__file__, "-v"])