From 91aa36c7f585b5c58cf31a59ef8a88fa885b4aa1 Mon Sep 17 00:00:00 2001 From: sufubao Date: Wed, 6 May 2026 14:22:24 +0800 Subject: [PATCH 1/4] feat(multimodal): bound ViT peak memory with token-budget admission + processor max_pixels clamp Introduces three layers of OOM protection for dynamic-resolution multimodal models (Qwen2.5/3/3.5-VL, qwen3-omni, tarsier2), analogous to the LLM side's batch_max_tokens + max_req_total_len pair: - --visual_batch_max_tokens: per-step ViT admission budget measured in image output tokens (post spatial_merge). pull_batch_with_budget in the ViT scheduler stops adding images once cumulative token_num would exceed the budget. The first image is always admitted to avoid deadlock. - --visual_image_max_tokens: per-image hard cap. When multimodal is enabled, defaults to visual_batch_max_tokens, which in turn defaults to batch_max_tokens. Asserts visual_image_max_tokens <= visual_batch_max_tokens at startup. - clamp_processor_max_pixels: at tokenizer/ViT load time, tightens the HF image processor's max_pixels so smart_resize auto-clamps any oversized image to fit the per-image token budget. Runtime token_num > budget becomes structurally impossible for Qwen-VL-family models. Treats processor.max_pixels=None (e.g. Qwen3.5-VL AutoProcessor) as "looser than any budget" and always applies the computed allowed_max_pixels. enforce_image_token_budget remains as defense-in-depth in the HTTP managers, firing for InternVL and any non-processor-clamped path. Also fixes a heartbeat regression in the ViT infer worker: rank 0 used to block indefinitely on the infer queue while non-zero ranks waited on a gloo broadcast with a 30-minute timeout, crashing the workers during idle periods. Rank 0 now polls with a 60s timeout and broadcasts an empty batch as a heartbeat so all ranks stay in sync. Tests: batching, budget validation, processor clamp, and budget enforcement (20+ tests, all pass). 
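For reference, the clamp arithmetic reduces to a few lines. The sketch below is illustrative only: patch_size=14 and merge_size=2 are the Qwen2.5-VL-style values used in the unit tests, and the 4096-token budget is an example, not a default.

    # Each output token after spatial_merge covers a (patch_size * merge_size)^2
    # pixel square, so a per-image token budget maps directly to a pixel cap.
    patch_size, merge_size = 14, 2                      # illustrative Qwen2.5-VL values
    visual_image_max_tokens = 4096                      # example per-image budget (post-merge tokens)
    unit = patch_size * merge_size                      # 28
    allowed_max_pixels = visual_image_max_tokens * unit * unit   # 4096 * 784 = 3,211,264
    # smart_resize then keeps the resized h*w <= allowed_max_pixels, so for a single image
    # token_num = (h // unit) * (w // unit) <= visual_image_max_tokens.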
--- docs/EN/source/cookbook/qwen35_deployment.rst | 9 ++ docs/EN/source/tutorial/api_server_args.rst | 29 ++++ lightllm/models/qwen2_5_vl/qwen2_5_visual.py | 6 +- lightllm/models/qwen2_vl/qwen2_visual.py | 6 +- lightllm/models/qwen2_vl/vision_process.py | 32 +++++ .../qwen3_omni_visual.py | 6 +- lightllm/models/qwen3_vl/qwen3_visual.py | 6 +- lightllm/models/tarsier2/tarsier2_visual.py | 6 +- lightllm/server/api_cli.py | 29 ++++ lightllm/server/api_start.py | 29 ++++ lightllm/server/core/objs/start_args_type.py | 2 + lightllm/server/httpserver/manager.py | 10 +- .../httpserver_for_pd_master/manager.py | 7 +- lightllm/server/tokenizer.py | 13 ++ lightllm/server/visualserver/manager.py | 1 + .../visualserver/model_infer/batching.py | 64 +++++++++ .../visualserver/model_infer/model_rpc.py | 56 +++++--- .../visualserver/visual_only_manager.py | 1 + lightllm/utils/multimodal_utils.py | 15 ++ .../test_clamp_processor_max_pixels.py | 131 ++++++++++++++++++ unit_tests/server/visualserver/__init__.py | 0 .../server/visualserver/test_batching.py | 113 +++++++++++++++ unit_tests/utils/test_image_token_budget.py | 47 +++++++ 23 files changed, 586 insertions(+), 32 deletions(-) create mode 100644 lightllm/server/visualserver/model_infer/batching.py create mode 100644 unit_tests/models/qwen2_vl/test_clamp_processor_max_pixels.py create mode 100644 unit_tests/server/visualserver/__init__.py create mode 100644 unit_tests/server/visualserver/test_batching.py create mode 100644 unit_tests/utils/test_image_token_budget.py diff --git a/docs/EN/source/cookbook/qwen35_deployment.rst b/docs/EN/source/cookbook/qwen35_deployment.rst index 36e1288eed..df7e316269 100644 --- a/docs/EN/source/cookbook/qwen35_deployment.rst +++ b/docs/EN/source/cookbook/qwen35_deployment.rst @@ -233,3 +233,12 @@ Hardware Requirements - ``--tp 8`` required to fit model weights across GPUs - Reduce ``--max_req_total_len`` or ``--graph_max_batch_size`` if encountering OOM errors - Use ``--data_type fp8_e4m3`` for FP8 KV quantization to further reduce memory pressure +- For multimodal OOMs driven by dynamic-resolution images or video, cap the + per-step ViT workload with ``--visual_batch_max_tokens`` (e.g. ``16384``). + This bounds peak ViT memory the same way ``--batch_max_tokens`` bounds the + LLM prefill. Setting this alone also derives a default single-image cap + (``--visual_image_max_tokens`` is implicitly set to the same value), which + rejects any single image that couldn't fit in one batch — closing the "first + image always admitted" deadlock-avoidance hole. Override + ``--visual_image_max_tokens`` separately only if you need a stricter single- + image limit. diff --git a/docs/EN/source/tutorial/api_server_args.rst b/docs/EN/source/tutorial/api_server_args.rst index fef767d636..896ab84fba 100644 --- a/docs/EN/source/tutorial/api_server_args.rst +++ b/docs/EN/source/tutorial/api_server_args.rst @@ -272,6 +272,35 @@ Multimodal Parameters Number of images processed in each inference batch, default is ``1`` +.. option:: --visual_batch_max_tokens + + Per-step ViT admission budget, measured in image output tokens (post + spatial_merge). The multimodal analogue of ``--batch_max_tokens``: the + ViT scheduler stops adding images to the current batch once their + cumulative ``token_num`` would exceed this value. Useful for bounding + peak ViT memory on dynamic-resolution models (Qwen2.5/3/3.5-VL, etc.) + where one 4K image or long video can contain more patches than many + small images combined. 
One image is always admitted per step to avoid + deadlock when a single request is larger than the budget. Default is + ``None`` (disabled; only ``--visual_infer_batch_size`` applies). + +.. option:: --visual_image_max_tokens + + Per-image hard cap, measured in image output tokens (post spatial_merge). + The multimodal analogue of ``--max_req_total_len``: a single image whose + ``token_num`` exceeds this value is rejected with a ``ValueError`` before + reaching the ViT. Pairs with ``--visual_batch_max_tokens`` to close the + "first image always admitted" hole — without this cap, one 4K image can + still OOM the ViT on its own. + + If not specified, defaults to ``--visual_batch_max_tokens`` (single image + must fit in one batch; this is the implicit precondition of the first-image- + always-admitted rule). Set explicitly to a smaller value only if your ViT + cannot handle a batch-sized image alone. Must satisfy + ``visual_image_max_tokens <= visual_batch_max_tokens``. + Default is ``None`` (disabled when ``--visual_batch_max_tokens`` is also + unset). + .. option:: --visual_gpu_ids List of GPU IDs to use, e.g., 0 1 2 diff --git a/lightllm/models/qwen2_5_vl/qwen2_5_visual.py b/lightllm/models/qwen2_5_vl/qwen2_5_visual.py index 7156a5ce23..daef2ccc73 100644 --- a/lightllm/models/qwen2_5_vl/qwen2_5_visual.py +++ b/lightllm/models/qwen2_5_vl/qwen2_5_visual.py @@ -8,7 +8,8 @@ from io import BytesIO import torch.nn as nn from transformers.activations import ACT2FN -from lightllm.models.qwen2_vl.vision_process import resize_image, Qwen2VLImageProcessor +from lightllm.models.qwen2_vl.vision_process import resize_image, Qwen2VLImageProcessor, clamp_processor_max_pixels +from lightllm.utils.envs_utils import get_env_start_args from safetensors import safe_open from lightllm.server.multimodal_params import ImageItem from lightllm.models.qwen2_vl.qwen2_visual import PatchEmbed, VisionRotaryEmbedding @@ -208,6 +209,9 @@ def __init__( with open(processor_config_path, "r") as f: processor_config_dict = json.load(f) self.processor = Qwen2VLImageProcessor(**processor_config_dict) + clamp_processor_max_pixels( + self.processor, get_env_start_args().visual_image_max_tokens, processor_name="qwen2_5_vl-vit" + ) self._init_datatype() diff --git a/lightllm/models/qwen2_vl/qwen2_visual.py b/lightllm/models/qwen2_vl/qwen2_visual.py index 6076756043..0e248c938f 100644 --- a/lightllm/models/qwen2_vl/qwen2_visual.py +++ b/lightllm/models/qwen2_vl/qwen2_visual.py @@ -33,7 +33,8 @@ from safetensors import safe_open from lightllm.server.multimodal_params import ImageItem from lightllm.server.visualserver import get_vit_attn_backend -from lightllm.models.qwen2_vl.vision_process import resize_image, Qwen2VLImageProcessor +from lightllm.models.qwen2_vl.vision_process import resize_image, Qwen2VLImageProcessor, clamp_processor_max_pixels +from lightllm.utils.envs_utils import get_env_start_args from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager from lightllm.models.qwen2_vl.triton_kernel.rotary_pos_emb import apply_rotary_pos_emb_triton @@ -244,6 +245,9 @@ def load_model(self, weight_dir): with open(processor_config_path, "r") as f: processor_config_dict = json.load(f) self.processor = Qwen2VLImageProcessor(**processor_config_dict) + clamp_processor_max_pixels( + self.processor, get_env_start_args().visual_image_max_tokens, processor_name="qwen2_vl-vit" + ) bin_weight_files = [file_ for file_ in os.listdir(weight_dir) if file_.endswith(".bin")] if bin_weight_files: diff --git 
a/lightllm/models/qwen2_vl/vision_process.py b/lightllm/models/qwen2_vl/vision_process.py index bc313fe467..77626a4730 100644 --- a/lightllm/models/qwen2_vl/vision_process.py +++ b/lightllm/models/qwen2_vl/vision_process.py @@ -27,6 +27,38 @@ logger = init_logger(__name__) +def clamp_processor_max_pixels(processor, visual_image_max_tokens, processor_name: str = "") -> None: + """Clamp a Qwen-VL style image processor's ``max_pixels`` so that even a + max-sized image produces ``token_num <= visual_image_max_tokens``. + + Reuses the processor's built-in ``smart_resize`` + ``max_pixels`` mechanism — + just tightens ``max_pixels`` so the existing resize path fits the server-wide + per-image token budget. After the clamp, ``get_image_token_length`` cannot + return a value above the budget, so request-level rejection becomes a + defensive no-op in practice. + + No-op when ``visual_image_max_tokens`` is None or the processor already + enforces a tighter bound. + """ + if visual_image_max_tokens is None: + return + unit = processor.patch_size * processor.merge_size + allowed_max_pixels = visual_image_max_tokens * unit * unit + if allowed_max_pixels < unit * unit: + raise ValueError( + f"visual_image_max_tokens={visual_image_max_tokens} is too small; " + f"need at least 1 patch's worth (={unit * unit} pixels) for {processor_name or 'processor'}." + ) + current_max_pixels = getattr(processor, "max_pixels", None) + if current_max_pixels is None or allowed_max_pixels < current_max_pixels: + logger.info( + f"{processor_name or 'processor'}: clamping max_pixels " + f"{current_max_pixels} -> {allowed_max_pixels} " + f"(visual_image_max_tokens={visual_image_max_tokens}, unit={unit})" + ) + processor.max_pixels = allowed_max_pixels + + IMAGE_FACTOR = 28 MIN_PIXELS = 4 * 28 * 28 MAX_PIXELS = 16384 * 28 * 28 diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_visual.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_visual.py index 0276724749..a09a5654ce 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_visual.py +++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_visual.py @@ -27,7 +27,8 @@ from lightllm.server.multimodal_params import ImageItem from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data -from lightllm.models.qwen2_vl.vision_process import resize_image, Qwen2VLImageProcessor +from lightllm.models.qwen2_vl.vision_process import resize_image, Qwen2VLImageProcessor, clamp_processor_max_pixels +from lightllm.utils.envs_utils import get_env_start_args from lightllm.models.qwen2_vl.qwen2_visual import VisionRotaryEmbedding, VisionFlashAttention @@ -225,6 +226,9 @@ def load_model(self, weight_dir): with open(processor_config_path, "r") as f: processor_config_dict = json.load(f) self.processor = Qwen2VLImageProcessor(**processor_config_dict) + clamp_processor_max_pixels( + self.processor, get_env_start_args().visual_image_max_tokens, processor_name="qwen3_omni-vit" + ) bin_weight_files = [file_ for file_ in os.listdir(weight_dir) if file_.endswith(".bin")] if bin_weight_files: diff --git a/lightllm/models/qwen3_vl/qwen3_visual.py b/lightllm/models/qwen3_vl/qwen3_visual.py index bed8898115..f876fa2b7d 100644 --- a/lightllm/models/qwen3_vl/qwen3_visual.py +++ b/lightllm/models/qwen3_vl/qwen3_visual.py @@ -27,7 +27,8 @@ from lightllm.server.multimodal_params import ImageItem from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data -from lightllm.models.qwen2_vl.vision_process import resize_image, Qwen2VLImageProcessor +from 
lightllm.models.qwen2_vl.vision_process import resize_image, Qwen2VLImageProcessor, clamp_processor_max_pixels +from lightllm.utils.envs_utils import get_env_start_args from lightllm.models.qwen2_vl.qwen2_visual import VisionRotaryEmbedding, VisionFlashAttention from lightllm.utils.log_utils import init_logger @@ -220,6 +221,9 @@ def load_model(self, weight_dir): with open(processor_config_path, "r") as f: processor_config_dict = json.load(f) self.processor = Qwen2VLImageProcessor(**processor_config_dict) + clamp_processor_max_pixels( + self.processor, get_env_start_args().visual_image_max_tokens, processor_name="qwen3_vl-vit" + ) bin_weight_files = [file_ for file_ in os.listdir(weight_dir) if file_.endswith(".bin")] if bin_weight_files: diff --git a/lightllm/models/tarsier2/tarsier2_visual.py b/lightllm/models/tarsier2/tarsier2_visual.py index 9deaf08575..90f3b7a6ae 100644 --- a/lightllm/models/tarsier2/tarsier2_visual.py +++ b/lightllm/models/tarsier2/tarsier2_visual.py @@ -16,7 +16,8 @@ from lightllm.models.qwen2_vl.qwen2_visual import Qwen2VisionTransformerPretrainedModel from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data from lightllm.server.multimodal_params import ImageItem -from lightllm.models.qwen2_vl.vision_process import Qwen2VLImageProcessor, resize_image +from lightllm.models.qwen2_vl.vision_process import Qwen2VLImageProcessor, resize_image, clamp_processor_max_pixels +from lightllm.utils.envs_utils import get_env_start_args def add_split_tokens(image_features, image_newline_embed, image_new_embed): @@ -221,6 +222,9 @@ def load_model(self, weight_dir): with open(processor_config_path, "r") as f: processor_config_dict = json.load(f) self.processor = Qwen2VLImageProcessor(**processor_config_dict) + clamp_processor_max_pixels( + self.processor, get_env_start_args().visual_image_max_tokens, processor_name="tarsier2-vit" + ) bin_weight_files = [file_ for file_ in os.listdir(weight_dir) if file_.endswith(".bin")] if bin_weight_files: diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py index 70b638715e..71176939dd 100644 --- a/lightllm/server/api_cli.py +++ b/lightllm/server/api_cli.py @@ -472,6 +472,35 @@ def make_argument_parser() -> argparse.ArgumentParser: parser.add_argument( "--visual_infer_batch_size", type=int, default=None, help="number of images to process in each inference batch" ) + parser.add_argument( + "--visual_batch_max_tokens", + type=int, + default=None, + help=""" + Per-step ViT admission budget measured in image output tokens (post + spatial_merge). When set, the ViT scheduler stops adding images to the + current batch once their cumulative token_num would exceed this value. + Acts as the multimodal analogue of --batch_max_tokens and caps peak ViT + memory/compute for dynamic-resolution models (Qwen2.5/3/3.5-VL, etc.). + One image is always admitted per step to avoid deadlock when a single + request is larger than the budget. Defaults to None (disabled; only the + image-count cap applies). + """, + ) + parser.add_argument( + "--visual_image_max_tokens", + type=int, + default=None, + help=""" + Per-image hard cap, measured in image output tokens (post spatial_merge). + The multimodal analogue of --max_req_total_len: a single image whose + token_num exceeds this value is rejected with a ValueError instead of + being forwarded to the ViT. Pairs with --visual_batch_max_tokens to + close the "first image always admitted" hole — without this cap, one + 4K image or long-aspect-ratio image can still OOM the ViT by itself. 
+ Defaults to None (disabled; any size is accepted). + """, + ) parser.add_argument( "--visual_send_batch_size", type=int, diff --git a/lightllm/server/api_start.py b/lightllm/server/api_start.py index 2acf8d3626..c88be4dd3e 100644 --- a/lightllm/server/api_start.py +++ b/lightllm/server/api_start.py @@ -272,6 +272,24 @@ def normal_or_p_d_start(args): args.cpu_cache_token_page_size = args.linear_att_hash_page_size * args.linear_att_page_block_num logger.info(f"set cpu_cache_token_page_size to {args.cpu_cache_token_page_size} for linear hybrid att model") + # 多模态预算默认值: + # - visual_batch_max_tokens 默认等于 batch_max_tokens(LLM 和 ViT 共用预算口径) + # - visual_image_max_tokens 默认等于 visual_batch_max_tokens(单图必须能塞进一个批次, + # "首图必放行"规则的隐含前提) + # 用户显式指定其中任意一个会覆盖默认值。 + if args.enable_multimodal: + if args.visual_batch_max_tokens is None: + args.visual_batch_max_tokens = args.batch_max_tokens + logger.info(f"visual_batch_max_tokens auto-derived from batch_max_tokens = {args.batch_max_tokens}") + if args.visual_image_max_tokens is None: + args.visual_image_max_tokens = args.visual_batch_max_tokens + if args.visual_image_max_tokens > args.visual_batch_max_tokens: + raise ValueError( + f"visual_image_max_tokens ({args.visual_image_max_tokens}) must be " + f"<= visual_batch_max_tokens ({args.visual_batch_max_tokens}); otherwise " + f"a single 'valid' image can always exceed the batch budget alone." + ) + # help to manage data stored on Ceph if "s3://" in args.model_dir: from lightllm.utils.petrel_helper import s3_model_prepare @@ -563,6 +581,17 @@ def visual_only_start(args): args.visual_gpu_ids = list(range(args.visual_dp * args.visual_tp)) if args.visual_infer_batch_size is None: args.visual_infer_batch_size = args.visual_dp + if args.visual_image_max_tokens is None and args.visual_batch_max_tokens is not None: + args.visual_image_max_tokens = args.visual_batch_max_tokens + if ( + args.visual_image_max_tokens is not None + and args.visual_batch_max_tokens is not None + and args.visual_image_max_tokens > args.visual_batch_max_tokens + ): + raise ValueError( + f"visual_image_max_tokens ({args.visual_image_max_tokens}) must be " + f"<= visual_batch_max_tokens ({args.visual_batch_max_tokens})" + ) if args.data_type is None: from lightllm.utils.config_utils import get_dtype diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py index bcc4b3798a..216fe4307f 100644 --- a/lightllm/server/core/objs/start_args_type.py +++ b/lightllm/server/core/objs/start_args_type.py @@ -107,6 +107,8 @@ class StartArgs: push_interval: int = field(default=10) visual_node_id: int = field(default=None) visual_infer_batch_size: int = field(default=None) + visual_batch_max_tokens: Optional[int] = field(default=None) + visual_image_max_tokens: Optional[int] = field(default=None) visual_send_batch_size: int = field(default=1) visual_gpu_ids: List[int] = field(default_factory=lambda: [0]) visual_tp: int = field(default=1) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 892e202e2d..1acb6001a3 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -35,6 +35,7 @@ from lightllm.utils.config_utils import get_vocab_size from lightllm.utils.envs_utils import get_unique_server_name from lightllm.utils.error_utils import NixlPrefillNodeStopGenToken +from lightllm.utils.multimodal_utils import enforce_image_token_budget from rpyc.utils.classic import obtain logger = init_logger(__name__) @@ -179,11 +180,12 @@ 
async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, # 只有 P 和 NORMAL 节点需要真的管理多模态资源 if self.pd_mode.is_P_or_NORMAL(): items, md5sums, tokens_nums, datas = [], [], [], [] - for img in multimodal_params.images: + for img_index, img in enumerate(multimodal_params.images): self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params) data = img.read() # must after init_imageitem_extral_params token_num = self.tokenizer.get_image_token_length(img) + enforce_image_token_budget(token_num, self.args.visual_image_max_tokens, image_index=img_index) md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params))) md5sums.append(md5sum) img.md5 = md5sum @@ -236,10 +238,12 @@ def tokens(self, prompt, multimodal_params, samping_params: SamplingParams, kwar img_count = 0 audio_tokens = 0 audio_count = 0 - for img in multimodal_params.images: + for img_index, img in enumerate(multimodal_params.images): img_count += 1 self.tokenizer.init_imageitem_extral_params(img, multimodal_params, samping_params) - image_tokens += self.tokenizer.get_image_token_length(img) + token_num = self.tokenizer.get_image_token_length(img) + enforce_image_token_budget(token_num, self.args.visual_image_max_tokens, image_index=img_index) + image_tokens += token_num for audio in multimodal_params.audios: audio_count += 1 self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, samping_params) diff --git a/lightllm/server/httpserver_for_pd_master/manager.py b/lightllm/server/httpserver_for_pd_master/manager.py index d6a1a58b05..cd0d952657 100644 --- a/lightllm/server/httpserver_for_pd_master/manager.py +++ b/lightllm/server/httpserver_for_pd_master/manager.py @@ -21,6 +21,7 @@ from lightllm.server.httpserver.manager import AsyncQueue from lightllm.utils.error_utils import ServerBusyError from lightllm.utils.envs_utils import get_pd_split_max_new_tokens +from lightllm.utils.multimodal_utils import enforce_image_token_budget from .pd_selector import create_selector logger = init_logger(__name__) @@ -73,10 +74,12 @@ def tokens(self, prompt, multimodal_params, samping_params: SamplingParams, kwar img_count = 0 audio_tokens = 0 audio_count = 0 - for img in multimodal_params.images: + for img_index, img in enumerate(multimodal_params.images): img_count += 1 self.tokenizer.init_imageitem_extral_params(img, multimodal_params, samping_params) - image_tokens += self.tokenizer.get_image_token_length(img) + token_num = self.tokenizer.get_image_token_length(img) + enforce_image_token_budget(token_num, self.args.visual_image_max_tokens, image_index=img_index) + image_tokens += token_num for audio in multimodal_params.audios: audio_count += 1 self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, samping_params) diff --git a/lightllm/server/tokenizer.py b/lightllm/server/tokenizer.py index 25726b2578..db7d0dbae6 100644 --- a/lightllm/server/tokenizer.py +++ b/lightllm/server/tokenizer.py @@ -89,10 +89,19 @@ def get_tokenizer( logger.info("Using DeepSeek-V3.2 tokenizer mode with Python-based chat template encoding.") return DeepSeekV32Tokenizer(hf_tokenizer) + # Qwen-VL family shares a max_pixels clamp helper to keep get_image_token_length + # in sync with visual_image_max_tokens budget. No-op for non-Qwen-VL tokenizers. 
+ from ..models.qwen2_vl.vision_process import clamp_processor_max_pixels + from lightllm.utils.envs_utils import get_env_start_args + + _start_args = get_env_start_args() + _img_max_tokens = getattr(_start_args, "visual_image_max_tokens", None) + if model_cfg["architectures"][0] == "TarsierForConditionalGeneration": from ..models.qwen2_vl.vision_process import Qwen2VLImageProcessor image_processor = Qwen2VLImageProcessor.from_pretrained(tokenizer_name) + clamp_processor_max_pixels(image_processor, _img_max_tokens, processor_name="tarsier2-tokenizer") tokenizer = Tarsier2Tokenizer(tokenizer=tokenizer, image_processor=image_processor, model_cfg=model_cfg) elif model_type == "llava" or model_type == "internlmxcomposer2": tokenizer = LlavaTokenizer(tokenizer, model_cfg) @@ -102,6 +111,7 @@ def get_tokenizer( from transformers import AutoProcessor processor = AutoProcessor.from_pretrained(tokenizer_name) + clamp_processor_max_pixels(processor.image_processor, _img_max_tokens, processor_name=f"{model_type}-tokenizer") tokenizer = QWen2VLTokenizer( tokenizer=tokenizer, image_processor=processor.image_processor, model_cfg=model_cfg ) @@ -109,6 +119,7 @@ def get_tokenizer( from transformers import AutoProcessor processor = AutoProcessor.from_pretrained(tokenizer_name) + clamp_processor_max_pixels(processor.image_processor, _img_max_tokens, processor_name=f"{model_type}-tokenizer") tokenizer = QWen3VLTokenizer( tokenizer=tokenizer, image_processor=processor.image_processor, model_cfg=model_cfg ) @@ -117,6 +128,7 @@ def get_tokenizer( from ..models.qwen3_5.model import QWen3_5Tokenizer processor = AutoProcessor.from_pretrained(tokenizer_name) + clamp_processor_max_pixels(processor.image_processor, _img_max_tokens, processor_name=f"{model_type}-tokenizer") tokenizer = QWen3_5Tokenizer( tokenizer=tokenizer, image_processor=processor.image_processor, model_cfg=model_cfg ) @@ -125,6 +137,7 @@ def get_tokenizer( model_cfg = model_cfg["thinker_config"] processor = AutoProcessor.from_pretrained(tokenizer_name) + clamp_processor_max_pixels(processor.image_processor, _img_max_tokens, processor_name="qwen3-omni-tokenizer") tokenizer = QWen3OmniTokenizer(tokenizer, processor=processor, model_cfg=model_cfg) elif model_type == "internvl_chat": tokenizer = InternvlTokenizer(tokenizer, model_cfg, weight_dir=tokenizer_name) diff --git a/lightllm/server/visualserver/manager.py b/lightllm/server/visualserver/manager.py index a165be78f2..60bfafc62d 100644 --- a/lightllm/server/visualserver/manager.py +++ b/lightllm/server/visualserver/manager.py @@ -85,6 +85,7 @@ async def wait_to_model_ready(self): "quant_type": self.args.vit_quant_type, "quant_cfg": self.args.vit_quant_cfg, "max_batch_size": min(self.infer_batch_size // self.vit_dp, 1), + "visual_batch_max_tokens": self.args.visual_batch_max_tokens, "vit_attn_backend": self.vit_attn_backend, } init_model_ret.append(self.model_rpcs[dp_rank_id][tp_rank_id].init_model(kvargs)) diff --git a/lightllm/server/visualserver/model_infer/batching.py b/lightllm/server/visualserver/model_infer/batching.py new file mode 100644 index 0000000000..62a5f7358c --- /dev/null +++ b/lightllm/server/visualserver/model_infer/batching.py @@ -0,0 +1,64 @@ +import queue +import threading +from typing import List, Optional + + +def pull_batch_with_budget( + infer_queue: "queue.Queue", + semaphore: threading.Semaphore, + max_num: int, + max_tokens: Optional[int], + timeout: Optional[float] = None, +) -> List: + """Pull up to ``max_num`` image items from ``infer_queue`` while keeping the + 
cumulative ``item.token_num`` at or below ``max_tokens``. + + Rank-0-only admission logic for the ViT scheduler. The first item is always + admitted even when it alone exceeds ``max_tokens`` — this avoids a deadlock + when a single request is larger than the per-step budget. Each subsequent + item is pulled, inspected, and either kept or returned to the queue. + + ``semaphore`` counts share with the caller (see ``_init_taskes``); callers + acquire before every get and release on over-pull so the permit count stays + consistent with queue contents. + + When ``timeout`` is not None, the first acquire/get is bounded so rank 0 + can emit a heartbeat broadcast instead of blocking indefinitely on the + gloo broadcast (avoids the 30-minute NCCL-style timeout on idle workers). + An empty list is returned on timeout. + """ + tasks: List = [] + + if timeout is not None: + if not semaphore.acquire(timeout=timeout): + return tasks + try: + first = infer_queue.get(timeout=timeout) + except queue.Empty: + semaphore.release() + return tasks + else: + semaphore.acquire() + first = infer_queue.get(block=True) + tasks.append(first) + total_tokens = first.token_num or 0 + + while len(tasks) < max_num: + try: + task = infer_queue.get(block=False) + except queue.Empty: + break + if not semaphore.acquire(blocking=False): + infer_queue.put(task) + break + + next_tokens = task.token_num or 0 + if max_tokens is not None and total_tokens + next_tokens > max_tokens: + infer_queue.put(task) + semaphore.release() + break + + tasks.append(task) + total_tokens += next_tokens + + return tasks diff --git a/lightllm/server/visualserver/model_infer/model_rpc.py b/lightllm/server/visualserver/model_infer/model_rpc.py index 55f4704a31..b856da5845 100644 --- a/lightllm/server/visualserver/model_infer/model_rpc.py +++ b/lightllm/server/visualserver/model_infer/model_rpc.py @@ -26,6 +26,7 @@ from lightllm.server.embed_cache.embed_cache_client import CpuEmbedCacheClient from lightllm.server.visualserver import set_vit_att_backend from lightllm.server.embed_cache.afs_utils import SepEmbedHandler +from lightllm.server.visualserver.model_infer.batching import pull_batch_with_budget from lightllm.utils.log_utils import init_logger @@ -53,6 +54,7 @@ def exposed_init_model(self, kvargs): weight_dir = kvargs["weight_dir"] self.infer_max_batch_size = kvargs["max_batch_size"] + self.visual_batch_max_tokens = kvargs.get("visual_batch_max_tokens", None) self.device_id = kvargs["device_id"] self.vit_tp = kvargs["vit_tp"] self.dp_rank_id = kvargs["dp_rank_id"] @@ -186,33 +188,37 @@ def _init_taskes(self): def _forward(self, images: List[ImageItem]): return self.model.encode(images) - def _get_image_items_from_infer_queue(self, max_num: int, force_same: bool = False) -> List[ImageItem]: + def _get_image_items_from_infer_queue( + self, max_num: int, force_same: bool = False, timeout: float = None + ) -> List[ImageItem]: """ 从队列中批量获取任务,直到达到 max_num 或队列为空。 - """ - tasks = [] - # 至少获取一个任务,阻塞 - self.sempare.acquire() - task = self.infer_queue.get(block=True) - tasks.append(task) - if not force_same: - # 尝试继续获取更多任务,直到达到 max_num - while len(tasks) < max_num: - try: - self.sempare.acquire() - task = self.infer_queue.get(block=False) - tasks.append(task) - except queue.Empty: - self.sempare.release() - break - else: + timeout 仅对首个任务的阻塞等待生效;超时返回空列表。rank 0 使用 + timeout 作为心跳,避免其他 rank 在 gloo broadcast 上长时间无响应 + 而触发 30 分钟超时崩溃。 + + On rank 0 the cumulative ``img.token_num`` is additionally capped by + ``visual_batch_max_tokens`` so a dynamic-resolution 
image (or batch of + them) cannot blow the ViT's memory budget. The non-rank-0 ``force_same`` + path follows rank 0's already-decided count via the gloo broadcast. + """ + if force_same: + tasks = [] + self.sempare.acquire() + tasks.append(self.infer_queue.get(block=True)) while len(tasks) < max_num: self.sempare.acquire() - task = self.infer_queue.get(block=True) - tasks.append(task) + tasks.append(self.infer_queue.get(block=True)) + return tasks - return tasks + return pull_batch_with_budget( + infer_queue=self.infer_queue, + semaphore=self.sempare, + max_num=max_num, + max_tokens=self.visual_batch_max_tokens, + timeout=timeout, + ) def _get_image_items_from_store_queue(self, max_num: int) -> List[ImageItem]: """ @@ -240,12 +246,18 @@ def _infer_worker(self): while True: try: # 从队列获取任务, 阻塞等待 + # rank 0 用带超时的 get, 空闲时也会广播 [0] 当作心跳, + # 避免其他 rank 在 gloo broadcast 上长时间无响应而触发 30 分钟超时崩溃。 if self.tp_rank_id == 0: - images = self._get_image_items_from_infer_queue(max_num=self.infer_max_batch_size) + images = self._get_image_items_from_infer_queue(max_num=self.infer_max_batch_size, timeout=60.0) dist.broadcast_object_list([len(images)], src=0, group=self.gloo_group) + if len(images) == 0: + continue else: ans = [None] dist.broadcast_object_list(ans, src=0, group=self.gloo_group) + if ans[0] == 0: + continue images = self._get_image_items_from_infer_queue(max_num=ans[0], force_same=True) for image in images: diff --git a/lightllm/server/visualserver/visual_only_manager.py b/lightllm/server/visualserver/visual_only_manager.py index 27275c1e8c..c6ca15b775 100644 --- a/lightllm/server/visualserver/visual_only_manager.py +++ b/lightllm/server/visualserver/visual_only_manager.py @@ -117,6 +117,7 @@ async def wait_to_model_ready(self): "quant_type": self.args.vit_quant_type, "quant_cfg": self.args.vit_quant_cfg, "max_batch_size": min(self.infer_batch_size // self.vit_dp, 1), + "visual_batch_max_tokens": self.args.visual_batch_max_tokens, "vit_attn_backend": self.vit_attn_backend, } init_model_ret.append(self.model_rpcs[dp_rank_id][tp_rank_id].init_model(kvargs)) diff --git a/lightllm/utils/multimodal_utils.py b/lightllm/utils/multimodal_utils.py index 4b49ea8891..b459d9ce35 100644 --- a/lightllm/utils/multimodal_utils.py +++ b/lightllm/utils/multimodal_utils.py @@ -4,6 +4,7 @@ import httpx from PIL import Image from io import BytesIO +from typing import Optional from fastapi import Request from functools import lru_cache from lightllm.utils.log_utils import init_logger @@ -11,6 +12,20 @@ logger = init_logger(__name__) +def enforce_image_token_budget(token_num: int, max_tokens: Optional[int], image_index: int = 0) -> None: + """Reject a request when a single image's ``token_num`` exceeds the server + budget. Pairs with the per-step ``--visual_batch_max_tokens`` admission cap: + this guards the batch against one oversized request, since a single image + is always admitted (the "first image always runs" deadlock-avoidance rule). + """ + if max_tokens is not None and token_num > max_tokens: + raise ValueError( + f"image[{image_index}] token_num={token_num} exceeds " + f"visual_image_max_tokens={max_tokens}; reduce image resolution, " + f"image_max_patch_num (InternVL-family), or preprocessor_config.json::max_pixels (Qwen-VL)" + ) + + def _httpx_async_client_proxy_kwargs(proxy) -> dict: """ httpx 0.28+ 使用 AsyncClient(proxy=...);更早版本使用 proxies=... 
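For reference, a minimal usage sketch of the enforce_image_token_budget helper added above; the token counts are made up for illustration (real callers pass tokenizer.get_image_token_length(img)):

    from lightllm.utils.multimodal_utils import enforce_image_token_budget

    enforce_image_token_budget(token_num=3000, max_tokens=4096)                 # within budget: returns None
    enforce_image_token_budget(token_num=9000, max_tokens=None)                 # budget disabled: always passes
    enforce_image_token_budget(token_num=9000, max_tokens=4096, image_index=2)  # raises ValueError mentioning image[2]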
diff --git a/unit_tests/models/qwen2_vl/test_clamp_processor_max_pixels.py b/unit_tests/models/qwen2_vl/test_clamp_processor_max_pixels.py new file mode 100644 index 0000000000..590c694a13 --- /dev/null +++ b/unit_tests/models/qwen2_vl/test_clamp_processor_max_pixels.py @@ -0,0 +1,131 @@ +import importlib.util +import os +import unittest + +# Load the helper directly from its file so we do not trigger heavy imports in +# lightllm.models.* (torch, triton kernels, etc.) just to test a pure function. +_PATH = os.path.normpath( + os.path.join( + os.path.dirname(__file__), + "..", + "..", + "..", + "lightllm", + "models", + "qwen2_vl", + "vision_process.py", + ) +) + + +def _load_helper(): + import sys + import types + + # Stub out heavy imports that vision_process.py pulls at module load. + # Only the pure helper is under test; nothing below depends on these stubs. + for name in ("torch", "numpy", "PIL", "PIL.Image"): + if name not in sys.modules: + sys.modules[name] = types.ModuleType(name) + + if "torchvision" not in sys.modules: + tv = types.ModuleType("torchvision") + tv_t = types.ModuleType("torchvision.transforms") + tv_tv2 = types.ModuleType("torchvision.transforms.v2") + tv_tf = types.ModuleType("torchvision.transforms.v2.functional") + sys.modules["torchvision"] = tv + sys.modules["torchvision.transforms"] = tv_t + sys.modules["torchvision.transforms.v2"] = tv_tv2 + sys.modules["torchvision.transforms.v2.functional"] = tv_tf + + # The file imports transformers pieces; stub them. + if "transformers" not in sys.modules: + sys.modules["transformers"] = types.ModuleType("transformers") + for sub in ( + "transformers.image_utils", + "transformers.image_processing_utils_fast", + "transformers.image_transforms", + ): + if sub not in sys.modules: + sys.modules[sub] = types.ModuleType(sub) + + spec = importlib.util.spec_from_file_location("_vp_under_test", _PATH) + mod = importlib.util.module_from_spec(spec) + try: + spec.loader.exec_module(mod) + except Exception: + # If stubs aren't enough to import the whole file, fall back to + # reading the function source and exec'ing it directly. + with open(_PATH, "r") as f: + src = f.read() + start = src.index("def clamp_processor_max_pixels") + # Find the end — the next "def " at column 0. + tail = src[start:] + next_def = tail.find("\ndef ", 1) + fn_src = tail[:next_def] if next_def != -1 else tail + ns = {} + # Substitute logger with a noop. + import logging + + ns["logger"] = logging.getLogger("clamp_test") + exec("from typing import Optional\n" + fn_src, ns) + return ns["clamp_processor_max_pixels"] + return mod.clamp_processor_max_pixels + + +clamp_processor_max_pixels = _load_helper() + + +class _FakeProcessor: + def __init__(self, patch_size, merge_size, max_pixels): + self.patch_size = patch_size + self.merge_size = merge_size + self.max_pixels = max_pixels + + +class TestClampProcessorMaxPixels(unittest.TestCase): + def test_none_budget_is_noop(self): + p = _FakeProcessor(patch_size=14, merge_size=2, max_pixels=16384 * 28 * 28) + clamp_processor_max_pixels(p, None) + self.assertEqual(p.max_pixels, 16384 * 28 * 28) + + def test_budget_looser_than_processor_is_noop(self): + # Processor's max_pixels already gives 16384 tokens. Budget is 32768. Keep smaller. + p = _FakeProcessor(patch_size=14, merge_size=2, max_pixels=16384 * 28 * 28) + clamp_processor_max_pixels(p, visual_image_max_tokens=32768) + self.assertEqual(p.max_pixels, 16384 * 28 * 28) + + def test_budget_tighter_clamps(self): + # patch=14, merge=2 -> unit=28, unit^2=784. 
Budget 4096 tokens -> 4096*784 pixels. + p = _FakeProcessor(patch_size=14, merge_size=2, max_pixels=16384 * 28 * 28) + clamp_processor_max_pixels(p, visual_image_max_tokens=4096) + self.assertEqual(p.max_pixels, 4096 * 28 * 28) + + def test_budget_equal_to_original_is_noop(self): + # Original gives exactly 16384 tokens. Budget 16384 -> same value. + p = _FakeProcessor(patch_size=14, merge_size=2, max_pixels=16384 * 28 * 28) + clamp_processor_max_pixels(p, visual_image_max_tokens=16384) + self.assertEqual(p.max_pixels, 16384 * 28 * 28) + + def test_budget_zero_raises(self): + p = _FakeProcessor(patch_size=14, merge_size=2, max_pixels=16384 * 28 * 28) + with self.assertRaises(ValueError): + clamp_processor_max_pixels(p, visual_image_max_tokens=0) + + def test_different_patch_merge(self): + # patch=16, merge=1 -> unit=16, unit^2=256. Budget 1000 tokens -> 256000 pixels. + p = _FakeProcessor(patch_size=16, merge_size=1, max_pixels=10_000_000) + clamp_processor_max_pixels(p, visual_image_max_tokens=1000) + self.assertEqual(p.max_pixels, 1000 * 16 * 16) + + def test_processor_max_pixels_none_is_clamped(self): + # HF Qwen3.5-VL's processor exposes max_pixels=None (no intrinsic upper + # bound); the clamp must treat that as "looser than any budget" and + # always apply our allowed_max_pixels instead of crashing on int 600 > 400 -> stop and put 300 back. + self.assertEqual([g.token_num for g in got], [100, 200]) + self.assertEqual(q.qsize(), 2) + remaining = [q.get_nowait().token_num for _ in range(q.qsize())] + self.assertIn(300, remaining) + self.assertIn(400, remaining) + + def test_first_image_always_admitted_even_if_over_budget(self): + q, sem = _setup([10_000, 5]) + got = pull_batch_with_budget(q, sem, max_num=10, max_tokens=100) + self.assertEqual([g.token_num for g in got], [10_000]) + self.assertEqual(q.qsize(), 1) + + def test_single_item_queue(self): + q, sem = _setup([42]) + got = pull_batch_with_budget(q, sem, max_num=5, max_tokens=1000) + self.assertEqual([g.token_num for g in got], [42]) + self.assertEqual(q.qsize(), 0) + + def test_budget_at_exact_boundary_admits(self): + q, sem = _setup([100, 200, 300]) + got = pull_batch_with_budget(q, sem, max_num=10, max_tokens=300) + # 100 + 200 = 300 == budget -> admit; +300 -> 600 > 300 -> stop. + self.assertEqual([g.token_num for g in got], [100, 200]) + + def test_none_token_num_treated_as_zero(self): + q = queue.Queue() + q.put(_FakeImg(100)) + q.put(_FakeImg(None)) + q.put(_FakeImg(50)) + sem = threading.Semaphore(3) + got = pull_batch_with_budget(q, sem, max_num=10, max_tokens=100) + # 100 (admitted first), 0 (None) -> 100 admitted, +50 -> 150 > 100 -> stop. + self.assertEqual([g.token_num for g in got], [100, None]) + self.assertEqual(q.qsize(), 1) + + def test_max_num_respected_under_budget(self): + q, sem = _setup([10, 10, 10, 10, 10]) + got = pull_batch_with_budget(q, sem, max_num=3, max_tokens=10_000) + self.assertEqual(len(got), 3) + self.assertEqual(q.qsize(), 2) + + def test_semaphore_permits_match_returned_items(self): + # After the pull, permits consumed must equal len(returned) so the outer + # backpressure accounting in _store_worker releases the right count. 
+ q, sem = _setup([100, 200, 300, 400, 500]) + permits_before = sem._value + got = pull_batch_with_budget(q, sem, max_num=10, max_tokens=400) + permits_after = sem._value + self.assertEqual(permits_before - permits_after, len(got)) + + def test_semaphore_permits_match_on_queue_empty(self): + q, sem = _setup([100, 200]) + permits_before = sem._value + got = pull_batch_with_budget(q, sem, max_num=10, max_tokens=None) + permits_after = sem._value + self.assertEqual(permits_before - permits_after, len(got)) + self.assertEqual(len(got), 2) + + +if __name__ == "__main__": + unittest.main() diff --git a/unit_tests/utils/test_image_token_budget.py b/unit_tests/utils/test_image_token_budget.py new file mode 100644 index 0000000000..8155216263 --- /dev/null +++ b/unit_tests/utils/test_image_token_budget.py @@ -0,0 +1,47 @@ +import importlib.util +import os +import unittest + +# Load the helper directly to avoid triggering heavy package imports (torch, +# atomics, etc.) that the full lightllm package pulls in. +_UTILS_PATH = os.path.normpath( + os.path.join( + os.path.dirname(__file__), + "..", + "..", + "lightllm", + "utils", + "multimodal_utils.py", + ) +) +_spec = importlib.util.spec_from_file_location("_mm_utils_under_test", _UTILS_PATH) +_module = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_module) +enforce_image_token_budget = _module.enforce_image_token_budget + + +class TestEnforceImageTokenBudget(unittest.TestCase): + def test_none_budget_allows_anything(self): + enforce_image_token_budget(token_num=10_000_000, max_tokens=None) + + def test_under_budget_ok(self): + enforce_image_token_budget(token_num=1000, max_tokens=1024) + + def test_at_budget_ok(self): + enforce_image_token_budget(token_num=1024, max_tokens=1024) + + def test_over_budget_raises(self): + with self.assertRaises(ValueError) as cm: + enforce_image_token_budget(token_num=2048, max_tokens=1024, image_index=3) + msg = str(cm.exception) + self.assertIn("image[3]", msg) + self.assertIn("2048", msg) + self.assertIn("1024", msg) + + def test_zero_budget_rejects_any_positive_tokens(self): + with self.assertRaises(ValueError): + enforce_image_token_budget(token_num=1, max_tokens=0) + + +if __name__ == "__main__": + unittest.main() From ad6046645d1bc04f1f4b6c225e5c1f3ab6496e02 Mon Sep 17 00:00:00 2001 From: sufubao Date: Wed, 6 May 2026 14:55:22 +0800 Subject: [PATCH 2/4] fix(multimodal): rank-divergent queue reorder + Qwen3-VL longest_edge clamp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two correctness fixes for the visual token-budget admission landed earlier in this PR. 1. TP rank divergence on budget skip. ``pull_batch_with_budget`` is rank-0- only; non-zero ranks pop ``len(images)`` items from their own identical FIFO queues to follow rank 0's decision. The old ``infer_queue.put(task)`` on a budget/sem-permit reject appended the rejected item to the *tail* of rank 0's queue, so subsequent calls saw a different FIFO order than rank N's queues — different images encoded on different ranks, corrupting TP visual inference. Add a ``_put_front`` helper that re-inserts at the head under ``infer_queue.mutex`` and use it on both reject paths. Regression tests added for the budget-skip and sem-skip orderings. 2. Qwen3-VL / Qwen3.5-VL tokenizer used pre-clamp budget. HF's Qwen3-VL image processor stores the per-image limit in ``processor.size["longest_edge"]``, and ``QWen3VLTokenizer.__init__`` reads that key into ``self.max_pixel``. 
``clamp_processor_max_pixels`` only wrote ``processor.max_pixels``, so the tokenizer's ``get_image_token_length`` kept using the original (huge) limit. With the default ``visual_image_max_tokens = batch_max_tokens``, large images were wrongly rejected by ``enforce_image_token_budget`` even though the ViT processor would have resized them in-budget. Clamp both ``processor.max_pixels`` and ``processor.size["longest_edge"]`` (when present) so every reader sees the same tightened bound. Tests: 21/21 pass for the two affected files. --- lightllm/models/qwen2_vl/vision_process.py | 37 +++++++++++++++---- .../visualserver/model_infer/batching.py | 26 +++++++++++-- .../test_clamp_processor_max_pixels.py | 31 ++++++++++++++++ .../server/visualserver/test_batching.py | 26 +++++++++++++ 4 files changed, 110 insertions(+), 10 deletions(-) diff --git a/lightllm/models/qwen2_vl/vision_process.py b/lightllm/models/qwen2_vl/vision_process.py index 77626a4730..c7504326df 100644 --- a/lightllm/models/qwen2_vl/vision_process.py +++ b/lightllm/models/qwen2_vl/vision_process.py @@ -28,15 +28,22 @@ def clamp_processor_max_pixels(processor, visual_image_max_tokens, processor_name: str = "") -> None: - """Clamp a Qwen-VL style image processor's ``max_pixels`` so that even a + """Clamp a Qwen-VL style image processor's max-pixel limit so that even a max-sized image produces ``token_num <= visual_image_max_tokens``. - Reuses the processor's built-in ``smart_resize`` + ``max_pixels`` mechanism — - just tightens ``max_pixels`` so the existing resize path fits the server-wide + Reuses the processor's built-in ``smart_resize`` mechanism — just tightens + the per-pixel budget so the existing resize path fits the server-wide per-image token budget. After the clamp, ``get_image_token_length`` cannot return a value above the budget, so request-level rejection becomes a defensive no-op in practice. + Different Qwen-VL generations expose the limit on different attributes: + Qwen2-VL / Qwen2.5-VL / lightllm's own ``Qwen2VLImageProcessor`` use + ``processor.max_pixels``, while HF's Qwen3-VL / Qwen3.5-VL processors store + it in ``processor.size["longest_edge"]``. Both attributes are clamped when + present so any reader (HF runtime, tokenizer ``__init__``) sees the + tightened bound. + No-op when ``visual_image_max_tokens`` is None or the processor already enforces a tighter bound. """ @@ -49,14 +56,30 @@ def clamp_processor_max_pixels(processor, visual_image_max_tokens, processor_nam f"visual_image_max_tokens={visual_image_max_tokens} is too small; " f"need at least 1 patch's worth (={unit * unit} pixels) for {processor_name or 'processor'}." ) + + # Track originals so the log line shows the pre-clamp values; some + # processors only expose one of the two schemas, so each branch is gated + # on its own attribute presence. 
current_max_pixels = getattr(processor, "max_pixels", None) + size = getattr(processor, "size", None) + has_longest_edge = isinstance(size, dict) and "longest_edge" in size + current_longest_edge = size.get("longest_edge") if has_longest_edge else None + + clamped = False if current_max_pixels is None or allowed_max_pixels < current_max_pixels: + processor.max_pixels = allowed_max_pixels + clamped = True + if has_longest_edge and (current_longest_edge is None or allowed_max_pixels < current_longest_edge): + size["longest_edge"] = allowed_max_pixels + clamped = True + + if clamped: logger.info( - f"{processor_name or 'processor'}: clamping max_pixels " - f"{current_max_pixels} -> {allowed_max_pixels} " - f"(visual_image_max_tokens={visual_image_max_tokens}, unit={unit})" + f"{processor_name or 'processor'}: clamping max_pixels/longest_edge to " + f"{allowed_max_pixels} (was max_pixels={current_max_pixels}, " + f"longest_edge={current_longest_edge}; " + f"visual_image_max_tokens={visual_image_max_tokens}, unit={unit})" ) - processor.max_pixels = allowed_max_pixels IMAGE_FACTOR = 28 diff --git a/lightllm/server/visualserver/model_infer/batching.py b/lightllm/server/visualserver/model_infer/batching.py index 62a5f7358c..fa98097415 100644 --- a/lightllm/server/visualserver/model_infer/batching.py +++ b/lightllm/server/visualserver/model_infer/batching.py @@ -3,6 +3,24 @@ from typing import List, Optional +def _put_front(infer_queue: "queue.Queue", item) -> None: + """Push ``item`` back to the front of ``infer_queue``. + + ``queue.Queue.put`` appends to the tail, which would reorder pending items + relative to other consumers. The ViT scheduler runs rank-0-only admission + on a queue that every TP rank holds an identical copy of, and rank N + later pops ``len(images)`` items in FIFO order to follow rank 0's + decision. If a rejected item moved to the tail of rank 0's queue, the + queues across ranks would diverge and the next batch would encode + different images on different ranks. Re-inserting at the head preserves + FIFO order on rank 0 and keeps all ranks in sync. + """ + with infer_queue.mutex: + infer_queue.queue.appendleft(item) + infer_queue.unfinished_tasks += 1 + infer_queue.not_empty.notify() + + def pull_batch_with_budget( infer_queue: "queue.Queue", semaphore: threading.Semaphore, @@ -16,7 +34,9 @@ def pull_batch_with_budget( Rank-0-only admission logic for the ViT scheduler. The first item is always admitted even when it alone exceeds ``max_tokens`` — this avoids a deadlock when a single request is larger than the per-step budget. Each subsequent - item is pulled, inspected, and either kept or returned to the queue. + item is pulled, inspected, and either kept or pushed back to the front of + the queue so non-rank-0 workers' FIFO pops stay aligned with rank 0's + admitted set. 
``semaphore`` counts share with the caller (see ``_init_taskes``); callers acquire before every get and release on over-pull so the permit count stays @@ -49,12 +69,12 @@ def pull_batch_with_budget( except queue.Empty: break if not semaphore.acquire(blocking=False): - infer_queue.put(task) + _put_front(infer_queue, task) break next_tokens = task.token_num or 0 if max_tokens is not None and total_tokens + next_tokens > max_tokens: - infer_queue.put(task) + _put_front(infer_queue, task) semaphore.release() break diff --git a/unit_tests/models/qwen2_vl/test_clamp_processor_max_pixels.py b/unit_tests/models/qwen2_vl/test_clamp_processor_max_pixels.py index 590c694a13..30f00ef16b 100644 --- a/unit_tests/models/qwen2_vl/test_clamp_processor_max_pixels.py +++ b/unit_tests/models/qwen2_vl/test_clamp_processor_max_pixels.py @@ -126,6 +126,37 @@ def test_processor_max_pixels_none_is_clamped(self): clamp_processor_max_pixels(p, visual_image_max_tokens=4096) self.assertEqual(p.max_pixels, 4096 * 28 * 28) + def test_size_longest_edge_is_clamped(self): + # HF Qwen3-VL / Qwen3.5-VL processors store the per-image limit in + # processor.size["longest_edge"]; QWen3VLTokenizer.__init__ reads + # that key into self.max_pixel for budget accounting. The clamp + # must update both attributes so the tokenizer's get_image_token_length + # matches what the ViT will actually produce after smart_resize. + p = _FakeProcessor(patch_size=14, merge_size=2, max_pixels=None) + p.size = {"shortest_edge": 4 * 28 * 28, "longest_edge": 16384 * 28 * 28} + clamp_processor_max_pixels(p, visual_image_max_tokens=4096) + self.assertEqual(p.max_pixels, 4096 * 28 * 28) + self.assertEqual(p.size["longest_edge"], 4096 * 28 * 28) + # shortest_edge is unrelated to the cap; must not be touched. + self.assertEqual(p.size["shortest_edge"], 4 * 28 * 28) + + def test_size_longest_edge_already_tighter_is_noop(self): + # If the processor's longest_edge is already below our budget, leave + # it alone — same semantics as the existing max_pixels branch. + p = _FakeProcessor(patch_size=14, merge_size=2, max_pixels=None) + p.size = {"longest_edge": 1024 * 28 * 28} + clamp_processor_max_pixels(p, visual_image_max_tokens=4096) + self.assertEqual(p.size["longest_edge"], 1024 * 28 * 28) + + def test_size_without_longest_edge_is_ignored(self): + # Some processors expose `size` as a dict keyed by height/width. The + # clamp must not invent a longest_edge key in that case. + p = _FakeProcessor(patch_size=14, merge_size=2, max_pixels=16384 * 28 * 28) + p.size = {"height": 224, "width": 224} + clamp_processor_max_pixels(p, visual_image_max_tokens=4096) + self.assertEqual(p.max_pixels, 4096 * 28 * 28) + self.assertNotIn("longest_edge", p.size) + if __name__ == "__main__": unittest.main() diff --git a/unit_tests/server/visualserver/test_batching.py b/unit_tests/server/visualserver/test_batching.py index 8053efe65a..066bc19c73 100644 --- a/unit_tests/server/visualserver/test_batching.py +++ b/unit_tests/server/visualserver/test_batching.py @@ -108,6 +108,32 @@ def test_semaphore_permits_match_on_queue_empty(self): self.assertEqual(permits_before - permits_after, len(got)) self.assertEqual(len(got), 2) + def test_rejected_item_returns_to_front_preserves_fifo(self): + # TP-correctness regression: after rank 0's budget admission, the + # residual queue must equal the original FIFO order with the admitted + # prefix removed. 
Other TP ranks pop ``len(returned)`` from their own + # identical queues, so any reorder on rank 0 makes ranks encode + # different images on the next step. + q, sem = _setup([100, 500, 100]) + got = pull_batch_with_budget(q, sem, max_num=10, max_tokens=200) + # 100 admitted, 500 over-budget -> rejected, loop breaks. + self.assertEqual([g.token_num for g in got], [100]) + remaining_in_order = [q.get_nowait().token_num for _ in range(q.qsize())] + self.assertEqual(remaining_in_order, [500, 100]) + + def test_rejected_on_sem_exhaustion_returns_to_front(self): + # Semaphore-skip path mirrors the budget-skip path: the popped item + # must end up at the front of the queue, not the tail. + q = queue.Queue() + for tn in [100, 200, 300]: + q.put(_FakeImg(tn)) + # Two permits => first acquire (200) succeeds, second (300) fails. + sem = threading.Semaphore(2) + got = pull_batch_with_budget(q, sem, max_num=10, max_tokens=10_000) + self.assertEqual([g.token_num for g in got], [100, 200]) + remaining_in_order = [q.get_nowait().token_num for _ in range(q.qsize())] + self.assertEqual(remaining_in_order, [300]) + if __name__ == "__main__": unittest.main() From ec55e40bd5bc4f8f2bd76b46c3caa858dfb60160 Mon Sep 17 00:00:00 2001 From: sufubao Date: Wed, 6 May 2026 16:15:57 +0800 Subject: [PATCH 3/4] fix(multimodal): documented opt-out for visual budget + queue invariant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-ups from a second review pass. 1. Visual budget defaults are auto-derived (not None) when --enable_multimodal is on, but the CLI help and docs still claimed "Default: None (disabled)". This was a user-visible behavior mismatch and existing deployments had no documented way to keep the old unlimited behavior. - Accept ``0`` as an explicit opt-out sentinel for both --visual_batch_max_tokens and --visual_image_max_tokens; api_start.py converts the 0 back to None internally so all downstream consumers (pull_batch_with_budget, clamp_processor_max_pixels) treat it as disabled. - Rewrite the CLI help and docs (api_server_args.rst, qwen35_deployment.rst) to describe the actual default (auto-derived from --batch_max_tokens) and point to the 0 opt-out for backward compatibility. - Loosen the visual_image_max_tokens <= visual_batch_max_tokens assertion to skip when either is None (the disabled case). 2. ``_put_front`` was bumping ``infer_queue.unfinished_tasks`` after manually re-inserting an item that ``Queue.put`` had already counted. ``Queue.get`` does not decrement that counter (only ``task_done()`` does), so the bump would corrupt the queue invariant and make any future ``Queue.join()`` / ``task_done()`` usage hang. Today ``visualserver`` does not call ``join``/``task_done``, so this is latent — but the contract was wrong. - Drop the ``unfinished_tasks += 1`` line; keep the lock and the ``not_empty.notify()`` so blocked consumers still wake up. - Add a regression test that put → get → reject → task_done loop and calls ``Queue.join()``; ``join`` would hang under the old code. Tests: 22/22 pass on the affected files. 
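A stdlib-only illustration of the invariant behind point 2 (not part of the patch): ``unfinished_tasks`` is incremented by ``put()`` and decremented only by ``task_done()``, never by ``get()``, so a head re-insert helper must not bump the counter a second time.

    import queue

    q = queue.Queue()
    q.put("img")        # unfinished_tasks == 1
    item = q.get()      # get() does NOT decrement unfinished_tasks
    # A head re-insert that also did `unfinished_tasks += 1` would leave the
    # counter at 2 after the single matching task_done() below, so q.join()
    # would block forever. Re-inserting the raw item and notifying not_empty
    # is enough; the original put() already accounted for it.
    q.task_done()       # unfinished_tasks back to 0
    q.join()            # returns only because the count was not double-bumped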
--- docs/EN/source/cookbook/qwen35_deployment.rst | 20 +++++++------ docs/EN/source/tutorial/api_server_args.rst | 28 ++++++++++------- lightllm/server/api_cli.py | 23 ++++++++++---- lightllm/server/api_start.py | 30 ++++++++++++++----- .../visualserver/model_infer/batching.py | 7 ++++- .../server/visualserver/test_batching.py | 23 ++++++++++++++ 6 files changed, 97 insertions(+), 34 deletions(-) diff --git a/docs/EN/source/cookbook/qwen35_deployment.rst b/docs/EN/source/cookbook/qwen35_deployment.rst index df7e316269..b73b8e3bb7 100644 --- a/docs/EN/source/cookbook/qwen35_deployment.rst +++ b/docs/EN/source/cookbook/qwen35_deployment.rst @@ -233,12 +233,14 @@ Hardware Requirements - ``--tp 8`` required to fit model weights across GPUs - Reduce ``--max_req_total_len`` or ``--graph_max_batch_size`` if encountering OOM errors - Use ``--data_type fp8_e4m3`` for FP8 KV quantization to further reduce memory pressure -- For multimodal OOMs driven by dynamic-resolution images or video, cap the - per-step ViT workload with ``--visual_batch_max_tokens`` (e.g. ``16384``). - This bounds peak ViT memory the same way ``--batch_max_tokens`` bounds the - LLM prefill. Setting this alone also derives a default single-image cap - (``--visual_image_max_tokens`` is implicitly set to the same value), which - rejects any single image that couldn't fit in one batch — closing the "first - image always admitted" deadlock-avoidance hole. Override - ``--visual_image_max_tokens`` separately only if you need a stricter single- - image limit. +- Multimodal deployments get ViT OOM protection by default: when + ``--enable_multimodal`` is on, ``--visual_batch_max_tokens`` is auto-derived + from ``--batch_max_tokens`` and ``--visual_image_max_tokens`` is auto- + derived from that, bounding peak ViT memory the same way + ``--batch_max_tokens`` bounds the LLM prefill. To tighten the per-step + budget further, pass an explicit value (e.g. + ``--visual_batch_max_tokens 16384``); to override the per-image cap + separately, set ``--visual_image_max_tokens`` (must be + ``<= --visual_batch_max_tokens``). +- To opt out of the budget (restore pre-PR behavior), pass + ``--visual_batch_max_tokens 0`` and/or ``--visual_image_max_tokens 0``. diff --git a/docs/EN/source/tutorial/api_server_args.rst b/docs/EN/source/tutorial/api_server_args.rst index 896ab84fba..813e345488 100644 --- a/docs/EN/source/tutorial/api_server_args.rst +++ b/docs/EN/source/tutorial/api_server_args.rst @@ -281,8 +281,13 @@ Multimodal Parameters peak ViT memory on dynamic-resolution models (Qwen2.5/3/3.5-VL, etc.) where one 4K image or long video can contain more patches than many small images combined. One image is always admitted per step to avoid - deadlock when a single request is larger than the budget. Default is - ``None`` (disabled; only ``--visual_infer_batch_size`` applies). + deadlock when a single request is larger than the budget. + + **Default behavior with** ``--enable_multimodal``: auto-derived from + ``--batch_max_tokens`` so multimodal deployments get OOM protection + without explicit opt-in. Pass an explicit positive integer to override. + Pass ``0`` to opt out and restore the pre-budget behavior (only + ``--visual_infer_batch_size`` applies). .. option:: --visual_image_max_tokens @@ -291,15 +296,16 @@ Multimodal Parameters ``token_num`` exceeds this value is rejected with a ``ValueError`` before reaching the ViT. 
Pairs with ``--visual_batch_max_tokens`` to close the "first image always admitted" hole — without this cap, one 4K image can - still OOM the ViT on its own. - - If not specified, defaults to ``--visual_batch_max_tokens`` (single image - must fit in one batch; this is the implicit precondition of the first-image- - always-admitted rule). Set explicitly to a smaller value only if your ViT - cannot handle a batch-sized image alone. Must satisfy - ``visual_image_max_tokens <= visual_batch_max_tokens``. - Default is ``None`` (disabled when ``--visual_batch_max_tokens`` is also - unset). + still OOM the ViT on its own. Also drives the processor ``max_pixels`` + clamp so Qwen-VL-family processors auto-resize oversized images to fit. + + **Default behavior with** ``--enable_multimodal``: auto-derived from + ``--visual_batch_max_tokens`` (single image must fit in one batch; this + is the implicit precondition of the first-image-always-admitted rule). + Set explicitly to a smaller positive integer only if your ViT cannot + handle a batch-sized image alone; must satisfy + ``visual_image_max_tokens <= visual_batch_max_tokens``. Pass ``0`` to opt + out and accept any image size. .. option:: --visual_gpu_ids diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py index 71176939dd..aa49308991 100644 --- a/lightllm/server/api_cli.py +++ b/lightllm/server/api_cli.py @@ -478,13 +478,18 @@ def make_argument_parser() -> argparse.ArgumentParser: default=None, help=""" Per-step ViT admission budget measured in image output tokens (post - spatial_merge). When set, the ViT scheduler stops adding images to the - current batch once their cumulative token_num would exceed this value. - Acts as the multimodal analogue of --batch_max_tokens and caps peak ViT + spatial_merge). The ViT scheduler stops adding images to the current + batch once their cumulative token_num would exceed this value. Acts as + the multimodal analogue of --batch_max_tokens and caps peak ViT memory/compute for dynamic-resolution models (Qwen2.5/3/3.5-VL, etc.). One image is always admitted per step to avoid deadlock when a single - request is larger than the budget. Defaults to None (disabled; only the - image-count cap applies). + request is larger than the budget. + + Default behavior when --enable_multimodal is on: auto-derived from + --batch_max_tokens so multimodal deployments get OOM protection without + explicit opt-in. Pass an explicit positive integer to override; pass 0 + to opt out entirely and restore the pre-budget behavior (only the + image-count cap --visual_infer_batch_size applies). """, ) parser.add_argument( @@ -498,7 +503,13 @@ def make_argument_parser() -> argparse.ArgumentParser: being forwarded to the ViT. Pairs with --visual_batch_max_tokens to close the "first image always admitted" hole — without this cap, one 4K image or long-aspect-ratio image can still OOM the ViT by itself. - Defaults to None (disabled; any size is accepted). + Also drives the processor max_pixels clamp so Qwen-VL-family processors + auto-resize oversized images to fit. + + Default behavior when --enable_multimodal is on: auto-derived from + --visual_batch_max_tokens (single image must fit in one batch). Pass an + explicit smaller positive integer for a stricter cap; pass 0 to opt + out and accept any image size. 
""", ) parser.add_argument( diff --git a/lightllm/server/api_start.py b/lightllm/server/api_start.py index c88be4dd3e..e82861ed33 100644 --- a/lightllm/server/api_start.py +++ b/lightllm/server/api_start.py @@ -272,18 +272,34 @@ def normal_or_p_d_start(args): args.cpu_cache_token_page_size = args.linear_att_hash_page_size * args.linear_att_page_block_num logger.info(f"set cpu_cache_token_page_size to {args.cpu_cache_token_page_size} for linear hybrid att model") - # 多模态预算默认值: - # - visual_batch_max_tokens 默认等于 batch_max_tokens(LLM 和 ViT 共用预算口径) - # - visual_image_max_tokens 默认等于 visual_batch_max_tokens(单图必须能塞进一个批次, - # "首图必放行"规则的隐含前提) - # 用户显式指定其中任意一个会覆盖默认值。 + # 多模态预算默认值(safety-on-by-default for multimodal deployments): + # - 不传:visual_batch_max_tokens 默认等于 batch_max_tokens(LLM 和 ViT 共用预算口径); + # visual_image_max_tokens 默认等于 visual_batch_max_tokens(单图必须能塞进一个批次, + # "首图必放行"规则的隐含前提)。 + # - 传 0:显式关闭对应预算,恢复 PR 之前的"不限"行为(向后兼容用)。 + # - 传正整数:作为显式预算使用。 if args.enable_multimodal: if args.visual_batch_max_tokens is None: args.visual_batch_max_tokens = args.batch_max_tokens - logger.info(f"visual_batch_max_tokens auto-derived from batch_max_tokens = {args.batch_max_tokens}") + logger.info( + f"visual_batch_max_tokens auto-derived from batch_max_tokens = {args.batch_max_tokens} " + f"(pass --visual_batch_max_tokens 0 to opt out)" + ) + elif args.visual_batch_max_tokens == 0: + logger.info("visual_batch_max_tokens explicitly disabled (=0); per-step ViT token budget off") + args.visual_batch_max_tokens = None + if args.visual_image_max_tokens is None: args.visual_image_max_tokens = args.visual_batch_max_tokens - if args.visual_image_max_tokens > args.visual_batch_max_tokens: + elif args.visual_image_max_tokens == 0: + logger.info("visual_image_max_tokens explicitly disabled (=0); per-image hard cap off") + args.visual_image_max_tokens = None + + if ( + args.visual_image_max_tokens is not None + and args.visual_batch_max_tokens is not None + and args.visual_image_max_tokens > args.visual_batch_max_tokens + ): raise ValueError( f"visual_image_max_tokens ({args.visual_image_max_tokens}) must be " f"<= visual_batch_max_tokens ({args.visual_batch_max_tokens}); otherwise " diff --git a/lightllm/server/visualserver/model_infer/batching.py b/lightllm/server/visualserver/model_infer/batching.py index fa98097415..17dec51d98 100644 --- a/lightllm/server/visualserver/model_infer/batching.py +++ b/lightllm/server/visualserver/model_infer/batching.py @@ -14,10 +14,15 @@ def _put_front(infer_queue: "queue.Queue", item) -> None: queues across ranks would diverge and the next batch would encode different images on different ranks. Re-inserting at the head preserves FIFO order on rank 0 and keeps all ranks in sync. + + Note: ``Queue.get`` does *not* decrement ``unfinished_tasks`` — only + ``task_done()`` does. The original ``Queue.put`` already counted this + item, so we must NOT bump the counter again on re-insert; doing so would + desync ``Queue.join()``/``task_done()`` accounting (a latent footgun if + any future caller starts using them on this queue). 
""" with infer_queue.mutex: infer_queue.queue.appendleft(item) - infer_queue.unfinished_tasks += 1 infer_queue.not_empty.notify() diff --git a/unit_tests/server/visualserver/test_batching.py b/unit_tests/server/visualserver/test_batching.py index 066bc19c73..9b0a1ed430 100644 --- a/unit_tests/server/visualserver/test_batching.py +++ b/unit_tests/server/visualserver/test_batching.py @@ -134,6 +134,29 @@ def test_rejected_on_sem_exhaustion_returns_to_front(self): remaining_in_order = [q.get_nowait().token_num for _ in range(q.qsize())] self.assertEqual(remaining_in_order, [300]) + def test_unfinished_tasks_stays_consistent_through_reject(self): + # Queue.put bumps unfinished_tasks; Queue.get does NOT decrement it + # (only task_done() does). The reject path re-inserts at the front + # and must not bump the counter again — otherwise Queue.join() would + # hang forever even after every consumed item is task_done()'d. + q, sem = _setup([100, 500, 100]) + # 3 items put in => unfinished_tasks == 3 + self.assertEqual(q.unfinished_tasks, 3) + got = pull_batch_with_budget(q, sem, max_num=10, max_tokens=200) + self.assertEqual([g.token_num for g in got], [100]) + # One item consumed (returned to caller, awaiting task_done), two + # still pending in the queue. Counter should still match the number + # of logical outstanding tasks: 3. + self.assertEqual(q.unfinished_tasks, 3) + # task_done for the returned item, then drain the rest with task_done. + q.task_done() + self.assertEqual(q.qsize(), 2) + for _ in range(q.qsize()): + q.get_nowait() + q.task_done() + # join() must return promptly; if the counter were corrupted it would hang. + q.join() + if __name__ == "__main__": unittest.main() From 581eda76ba6968be3185c5089e9582d0f48b8937 Mon Sep 17 00:00:00 2001 From: sufubao Date: Wed, 6 May 2026 17:44:26 +0800 Subject: [PATCH 4/4] refactor(multimodal): drop --visual_image_max_tokens, fold into --visual_batch_max_tokens MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The per-image cap was always defaulted to ``visual_batch_max_tokens`` and asserted ``<=`` it, because the "first image always admitted" deadlock-avoidance rule implies a single image must fit in one batch. Two knobs for the same value was just surface-area without a real degree of freedom. Collapse to one knob: - Remove ``--visual_image_max_tokens`` from CLI, ``StartArgs``, ``api_start.py``, and the inter-block validation that enforced the redundant ``<=`` invariant. - All clamp call sites (qwen2_vl / qwen2_5_vl / qwen3_vl / qwen3_omni / tarsier2 visual loaders, and ``server/tokenizer.py``) now read ``visual_batch_max_tokens``. - ``enforce_image_token_budget`` (httpserver + httpserver_for_pd_master) and the error message in ``multimodal_utils.py`` now reference ``visual_batch_max_tokens``. - Rename the helper's parameter from ``visual_image_max_tokens`` to ``max_image_tokens`` since it no longer mirrors a CLI flag — it's just the numeric budget the caller computes from ``visual_batch_max_tokens``. - Update CLI help and docs (``api_server_args.rst``, ``qwen35_deployment.rst``) to describe the single-knob design. - Update unit tests accordingly. Tests: 22/22 pass on the affected files. 
--- docs/EN/source/cookbook/qwen35_deployment.rst | 15 +++---- docs/EN/source/tutorial/api_server_args.rst | 24 +++--------- lightllm/models/qwen2_5_vl/qwen2_5_visual.py | 2 +- lightllm/models/qwen2_vl/qwen2_visual.py | 2 +- lightllm/models/qwen2_vl/vision_process.py | 22 +++++------ .../qwen3_omni_visual.py | 2 +- lightllm/models/qwen3_vl/qwen3_visual.py | 2 +- lightllm/models/tarsier2/tarsier2_visual.py | 2 +- lightllm/server/api_cli.py | 26 +++---------- lightllm/server/api_start.py | 39 +++---------------- lightllm/server/core/objs/start_args_type.py | 1 - lightllm/server/httpserver/manager.py | 4 +- .../httpserver_for_pd_master/manager.py | 2 +- lightllm/server/tokenizer.py | 4 +- lightllm/utils/multimodal_utils.py | 2 +- .../test_clamp_processor_max_pixels.py | 18 ++++----- 16 files changed, 53 insertions(+), 114 deletions(-) diff --git a/docs/EN/source/cookbook/qwen35_deployment.rst b/docs/EN/source/cookbook/qwen35_deployment.rst index b73b8e3bb7..818ce162dc 100644 --- a/docs/EN/source/cookbook/qwen35_deployment.rst +++ b/docs/EN/source/cookbook/qwen35_deployment.rst @@ -235,12 +235,9 @@ Hardware Requirements - Use ``--data_type fp8_e4m3`` for FP8 KV quantization to further reduce memory pressure - Multimodal deployments get ViT OOM protection by default: when ``--enable_multimodal`` is on, ``--visual_batch_max_tokens`` is auto-derived - from ``--batch_max_tokens`` and ``--visual_image_max_tokens`` is auto- - derived from that, bounding peak ViT memory the same way - ``--batch_max_tokens`` bounds the LLM prefill. To tighten the per-step - budget further, pass an explicit value (e.g. - ``--visual_batch_max_tokens 16384``); to override the per-image cap - separately, set ``--visual_image_max_tokens`` (must be - ``<= --visual_batch_max_tokens``). -- To opt out of the budget (restore pre-PR behavior), pass - ``--visual_batch_max_tokens 0`` and/or ``--visual_image_max_tokens 0``. + from ``--batch_max_tokens``. The same value caps both per-step batch + output and per-image output (oversized images are auto-resized by the + Qwen-VL ``max_pixels`` clamp; anything still over budget is rejected + before reaching the ViT). To tighten the budget further, pass an explicit + value (e.g. ``--visual_batch_max_tokens 16384``); to opt out and restore + pre-PR behavior, pass ``--visual_batch_max_tokens 0``. diff --git a/docs/EN/source/tutorial/api_server_args.rst b/docs/EN/source/tutorial/api_server_args.rst index 813e345488..b806d06739 100644 --- a/docs/EN/source/tutorial/api_server_args.rst +++ b/docs/EN/source/tutorial/api_server_args.rst @@ -281,7 +281,11 @@ Multimodal Parameters peak ViT memory on dynamic-resolution models (Qwen2.5/3/3.5-VL, etc.) where one 4K image or long video can contain more patches than many small images combined. One image is always admitted per step to avoid - deadlock when a single request is larger than the budget. + deadlock when a single request is larger than the budget — to make that + safe, the same value also drives the per-image budget: oversized images + are auto-resized by the Qwen-VL processor ``max_pixels`` clamp, and any + image that still exceeds the budget is rejected with a ``ValueError`` + before reaching the ViT. **Default behavior with** ``--enable_multimodal``: auto-derived from ``--batch_max_tokens`` so multimodal deployments get OOM protection @@ -289,24 +293,6 @@ Multimodal Parameters Pass ``0`` to opt out and restore the pre-budget behavior (only ``--visual_infer_batch_size`` applies). -.. 
option:: --visual_image_max_tokens - - Per-image hard cap, measured in image output tokens (post spatial_merge). - The multimodal analogue of ``--max_req_total_len``: a single image whose - ``token_num`` exceeds this value is rejected with a ``ValueError`` before - reaching the ViT. Pairs with ``--visual_batch_max_tokens`` to close the - "first image always admitted" hole — without this cap, one 4K image can - still OOM the ViT on its own. Also drives the processor ``max_pixels`` - clamp so Qwen-VL-family processors auto-resize oversized images to fit. - - **Default behavior with** ``--enable_multimodal``: auto-derived from - ``--visual_batch_max_tokens`` (single image must fit in one batch; this - is the implicit precondition of the first-image-always-admitted rule). - Set explicitly to a smaller positive integer only if your ViT cannot - handle a batch-sized image alone; must satisfy - ``visual_image_max_tokens <= visual_batch_max_tokens``. Pass ``0`` to opt - out and accept any image size. - .. option:: --visual_gpu_ids List of GPU IDs to use, e.g., 0 1 2 diff --git a/lightllm/models/qwen2_5_vl/qwen2_5_visual.py b/lightllm/models/qwen2_5_vl/qwen2_5_visual.py index daef2ccc73..3e6272810f 100644 --- a/lightllm/models/qwen2_5_vl/qwen2_5_visual.py +++ b/lightllm/models/qwen2_5_vl/qwen2_5_visual.py @@ -210,7 +210,7 @@ def __init__( processor_config_dict = json.load(f) self.processor = Qwen2VLImageProcessor(**processor_config_dict) clamp_processor_max_pixels( - self.processor, get_env_start_args().visual_image_max_tokens, processor_name="qwen2_5_vl-vit" + self.processor, get_env_start_args().visual_batch_max_tokens, processor_name="qwen2_5_vl-vit" ) self._init_datatype() diff --git a/lightllm/models/qwen2_vl/qwen2_visual.py b/lightllm/models/qwen2_vl/qwen2_visual.py index 0e248c938f..fea900f904 100644 --- a/lightllm/models/qwen2_vl/qwen2_visual.py +++ b/lightllm/models/qwen2_vl/qwen2_visual.py @@ -246,7 +246,7 @@ def load_model(self, weight_dir): processor_config_dict = json.load(f) self.processor = Qwen2VLImageProcessor(**processor_config_dict) clamp_processor_max_pixels( - self.processor, get_env_start_args().visual_image_max_tokens, processor_name="qwen2_vl-vit" + self.processor, get_env_start_args().visual_batch_max_tokens, processor_name="qwen2_vl-vit" ) bin_weight_files = [file_ for file_ in os.listdir(weight_dir) if file_.endswith(".bin")] diff --git a/lightllm/models/qwen2_vl/vision_process.py b/lightllm/models/qwen2_vl/vision_process.py index c7504326df..12302b1b1a 100644 --- a/lightllm/models/qwen2_vl/vision_process.py +++ b/lightllm/models/qwen2_vl/vision_process.py @@ -27,15 +27,15 @@ logger = init_logger(__name__) -def clamp_processor_max_pixels(processor, visual_image_max_tokens, processor_name: str = "") -> None: +def clamp_processor_max_pixels(processor, max_image_tokens, processor_name: str = "") -> None: """Clamp a Qwen-VL style image processor's max-pixel limit so that even a - max-sized image produces ``token_num <= visual_image_max_tokens``. + max-sized image produces ``token_num <= max_image_tokens``. Reuses the processor's built-in ``smart_resize`` mechanism — just tightens the per-pixel budget so the existing resize path fits the server-wide - per-image token budget. After the clamp, ``get_image_token_length`` cannot - return a value above the budget, so request-level rejection becomes a - defensive no-op in practice. + per-image token budget (``--visual_batch_max_tokens``). 
After the clamp, + ``get_image_token_length`` cannot return a value above the budget, so + request-level rejection becomes a defensive no-op in practice. Different Qwen-VL generations expose the limit on different attributes: Qwen2-VL / Qwen2.5-VL / lightllm's own ``Qwen2VLImageProcessor`` use @@ -44,16 +44,16 @@ def clamp_processor_max_pixels(processor, visual_image_max_tokens, processor_nam present so any reader (HF runtime, tokenizer ``__init__``) sees the tightened bound. - No-op when ``visual_image_max_tokens`` is None or the processor already - enforces a tighter bound. + No-op when ``max_image_tokens`` is None or the processor already enforces a + tighter bound. """ - if visual_image_max_tokens is None: + if max_image_tokens is None: return unit = processor.patch_size * processor.merge_size - allowed_max_pixels = visual_image_max_tokens * unit * unit + allowed_max_pixels = max_image_tokens * unit * unit if allowed_max_pixels < unit * unit: raise ValueError( - f"visual_image_max_tokens={visual_image_max_tokens} is too small; " + f"max_image_tokens={max_image_tokens} is too small; " f"need at least 1 patch's worth (={unit * unit} pixels) for {processor_name or 'processor'}." ) @@ -78,7 +78,7 @@ def clamp_processor_max_pixels(processor, visual_image_max_tokens, processor_nam f"{processor_name or 'processor'}: clamping max_pixels/longest_edge to " f"{allowed_max_pixels} (was max_pixels={current_max_pixels}, " f"longest_edge={current_longest_edge}; " - f"visual_image_max_tokens={visual_image_max_tokens}, unit={unit})" + f"max_image_tokens={max_image_tokens}, unit={unit})" ) diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_visual.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_visual.py index a09a5654ce..381d989b6e 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_visual.py +++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_visual.py @@ -227,7 +227,7 @@ def load_model(self, weight_dir): processor_config_dict = json.load(f) self.processor = Qwen2VLImageProcessor(**processor_config_dict) clamp_processor_max_pixels( - self.processor, get_env_start_args().visual_image_max_tokens, processor_name="qwen3_omni-vit" + self.processor, get_env_start_args().visual_batch_max_tokens, processor_name="qwen3_omni-vit" ) bin_weight_files = [file_ for file_ in os.listdir(weight_dir) if file_.endswith(".bin")] diff --git a/lightllm/models/qwen3_vl/qwen3_visual.py b/lightllm/models/qwen3_vl/qwen3_visual.py index f876fa2b7d..3523fd92c3 100644 --- a/lightllm/models/qwen3_vl/qwen3_visual.py +++ b/lightllm/models/qwen3_vl/qwen3_visual.py @@ -222,7 +222,7 @@ def load_model(self, weight_dir): processor_config_dict = json.load(f) self.processor = Qwen2VLImageProcessor(**processor_config_dict) clamp_processor_max_pixels( - self.processor, get_env_start_args().visual_image_max_tokens, processor_name="qwen3_vl-vit" + self.processor, get_env_start_args().visual_batch_max_tokens, processor_name="qwen3_vl-vit" ) bin_weight_files = [file_ for file_ in os.listdir(weight_dir) if file_.endswith(".bin")] diff --git a/lightllm/models/tarsier2/tarsier2_visual.py b/lightllm/models/tarsier2/tarsier2_visual.py index 90f3b7a6ae..3c5afb1f81 100644 --- a/lightllm/models/tarsier2/tarsier2_visual.py +++ b/lightllm/models/tarsier2/tarsier2_visual.py @@ -223,7 +223,7 @@ def load_model(self, weight_dir): processor_config_dict = json.load(f) self.processor = Qwen2VLImageProcessor(**processor_config_dict) clamp_processor_max_pixels( - self.processor, get_env_start_args().visual_image_max_tokens, 
processor_name="tarsier2-vit" + self.processor, get_env_start_args().visual_batch_max_tokens, processor_name="tarsier2-vit" ) bin_weight_files = [file_ for file_ in os.listdir(weight_dir) if file_.endswith(".bin")] diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py index aa49308991..3cbc8be69f 100644 --- a/lightllm/server/api_cli.py +++ b/lightllm/server/api_cli.py @@ -483,7 +483,11 @@ def make_argument_parser() -> argparse.ArgumentParser: the multimodal analogue of --batch_max_tokens and caps peak ViT memory/compute for dynamic-resolution models (Qwen2.5/3/3.5-VL, etc.). One image is always admitted per step to avoid deadlock when a single - request is larger than the budget. + request is larger than the budget — to make that safe, this value + also drives the per-image budget: oversized images are auto-resized + by the Qwen-VL processor max_pixels clamp, and any image that still + exceeds the budget is rejected with a ValueError before reaching the + ViT. Default behavior when --enable_multimodal is on: auto-derived from --batch_max_tokens so multimodal deployments get OOM protection without @@ -492,26 +496,6 @@ def make_argument_parser() -> argparse.ArgumentParser: image-count cap --visual_infer_batch_size applies). """, ) - parser.add_argument( - "--visual_image_max_tokens", - type=int, - default=None, - help=""" - Per-image hard cap, measured in image output tokens (post spatial_merge). - The multimodal analogue of --max_req_total_len: a single image whose - token_num exceeds this value is rejected with a ValueError instead of - being forwarded to the ViT. Pairs with --visual_batch_max_tokens to - close the "first image always admitted" hole — without this cap, one - 4K image or long-aspect-ratio image can still OOM the ViT by itself. - Also drives the processor max_pixels clamp so Qwen-VL-family processors - auto-resize oversized images to fit. - - Default behavior when --enable_multimodal is on: auto-derived from - --visual_batch_max_tokens (single image must fit in one batch). Pass an - explicit smaller positive integer for a stricter cap; pass 0 to opt - out and accept any image size. 
- """, - ) parser.add_argument( "--visual_send_batch_size", type=int, diff --git a/lightllm/server/api_start.py b/lightllm/server/api_start.py index e82861ed33..dce45c11fc 100644 --- a/lightllm/server/api_start.py +++ b/lightllm/server/api_start.py @@ -273,11 +273,12 @@ def normal_or_p_d_start(args): logger.info(f"set cpu_cache_token_page_size to {args.cpu_cache_token_page_size} for linear hybrid att model") # 多模态预算默认值(safety-on-by-default for multimodal deployments): - # - 不传:visual_batch_max_tokens 默认等于 batch_max_tokens(LLM 和 ViT 共用预算口径); - # visual_image_max_tokens 默认等于 visual_batch_max_tokens(单图必须能塞进一个批次, - # "首图必放行"规则的隐含前提)。 - # - 传 0:显式关闭对应预算,恢复 PR 之前的"不限"行为(向后兼容用)。 + # - 不传:visual_batch_max_tokens 默认等于 batch_max_tokens(LLM 和 ViT 共用预算口径)。 + # - 传 0:显式关闭,恢复 PR 之前的"不限"行为(向后兼容用)。 # - 传正整数:作为显式预算使用。 + # 同一个值同时充当 per-step batch budget、per-image hard cap 和 processor max_pixels + # clamp 的依据 —— "首图必放行" 规则要求单图必须能塞进一个批次,所以 batch budget 和 + # 单图上限本来就是同一个数。 if args.enable_multimodal: if args.visual_batch_max_tokens is None: args.visual_batch_max_tokens = args.batch_max_tokens @@ -286,26 +287,9 @@ def normal_or_p_d_start(args): f"(pass --visual_batch_max_tokens 0 to opt out)" ) elif args.visual_batch_max_tokens == 0: - logger.info("visual_batch_max_tokens explicitly disabled (=0); per-step ViT token budget off") + logger.info("visual_batch_max_tokens explicitly disabled (=0); ViT token budget off") args.visual_batch_max_tokens = None - if args.visual_image_max_tokens is None: - args.visual_image_max_tokens = args.visual_batch_max_tokens - elif args.visual_image_max_tokens == 0: - logger.info("visual_image_max_tokens explicitly disabled (=0); per-image hard cap off") - args.visual_image_max_tokens = None - - if ( - args.visual_image_max_tokens is not None - and args.visual_batch_max_tokens is not None - and args.visual_image_max_tokens > args.visual_batch_max_tokens - ): - raise ValueError( - f"visual_image_max_tokens ({args.visual_image_max_tokens}) must be " - f"<= visual_batch_max_tokens ({args.visual_batch_max_tokens}); otherwise " - f"a single 'valid' image can always exceed the batch budget alone." 
- ) - # help to manage data stored on Ceph if "s3://" in args.model_dir: from lightllm.utils.petrel_helper import s3_model_prepare @@ -597,17 +581,6 @@ def visual_only_start(args): args.visual_gpu_ids = list(range(args.visual_dp * args.visual_tp)) if args.visual_infer_batch_size is None: args.visual_infer_batch_size = args.visual_dp - if args.visual_image_max_tokens is None and args.visual_batch_max_tokens is not None: - args.visual_image_max_tokens = args.visual_batch_max_tokens - if ( - args.visual_image_max_tokens is not None - and args.visual_batch_max_tokens is not None - and args.visual_image_max_tokens > args.visual_batch_max_tokens - ): - raise ValueError( - f"visual_image_max_tokens ({args.visual_image_max_tokens}) must be " - f"<= visual_batch_max_tokens ({args.visual_batch_max_tokens})" - ) if args.data_type is None: from lightllm.utils.config_utils import get_dtype diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py index 216fe4307f..0e17b78d87 100644 --- a/lightllm/server/core/objs/start_args_type.py +++ b/lightllm/server/core/objs/start_args_type.py @@ -108,7 +108,6 @@ class StartArgs: visual_node_id: int = field(default=None) visual_infer_batch_size: int = field(default=None) visual_batch_max_tokens: Optional[int] = field(default=None) - visual_image_max_tokens: Optional[int] = field(default=None) visual_send_batch_size: int = field(default=1) visual_gpu_ids: List[int] = field(default_factory=lambda: [0]) visual_tp: int = field(default=1) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 1acb6001a3..f7bc4b78b1 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -185,7 +185,7 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, data = img.read() # must after init_imageitem_extral_params token_num = self.tokenizer.get_image_token_length(img) - enforce_image_token_budget(token_num, self.args.visual_image_max_tokens, image_index=img_index) + enforce_image_token_budget(token_num, self.args.visual_batch_max_tokens, image_index=img_index) md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params))) md5sums.append(md5sum) img.md5 = md5sum @@ -242,7 +242,7 @@ def tokens(self, prompt, multimodal_params, samping_params: SamplingParams, kwar img_count += 1 self.tokenizer.init_imageitem_extral_params(img, multimodal_params, samping_params) token_num = self.tokenizer.get_image_token_length(img) - enforce_image_token_budget(token_num, self.args.visual_image_max_tokens, image_index=img_index) + enforce_image_token_budget(token_num, self.args.visual_batch_max_tokens, image_index=img_index) image_tokens += token_num for audio in multimodal_params.audios: audio_count += 1 diff --git a/lightllm/server/httpserver_for_pd_master/manager.py b/lightllm/server/httpserver_for_pd_master/manager.py index cd0d952657..efde914548 100644 --- a/lightllm/server/httpserver_for_pd_master/manager.py +++ b/lightllm/server/httpserver_for_pd_master/manager.py @@ -78,7 +78,7 @@ def tokens(self, prompt, multimodal_params, samping_params: SamplingParams, kwar img_count += 1 self.tokenizer.init_imageitem_extral_params(img, multimodal_params, samping_params) token_num = self.tokenizer.get_image_token_length(img) - enforce_image_token_budget(token_num, self.args.visual_image_max_tokens, image_index=img_index) + enforce_image_token_budget(token_num, self.args.visual_batch_max_tokens, image_index=img_index) image_tokens += 
token_num for audio in multimodal_params.audios: audio_count += 1 diff --git a/lightllm/server/tokenizer.py b/lightllm/server/tokenizer.py index db7d0dbae6..247ca3e0d2 100644 --- a/lightllm/server/tokenizer.py +++ b/lightllm/server/tokenizer.py @@ -90,12 +90,12 @@ def get_tokenizer( return DeepSeekV32Tokenizer(hf_tokenizer) # Qwen-VL family shares a max_pixels clamp helper to keep get_image_token_length - # in sync with visual_image_max_tokens budget. No-op for non-Qwen-VL tokenizers. + # in sync with visual_batch_max_tokens budget. No-op for non-Qwen-VL tokenizers. from ..models.qwen2_vl.vision_process import clamp_processor_max_pixels from lightllm.utils.envs_utils import get_env_start_args _start_args = get_env_start_args() - _img_max_tokens = getattr(_start_args, "visual_image_max_tokens", None) + _img_max_tokens = getattr(_start_args, "visual_batch_max_tokens", None) if model_cfg["architectures"][0] == "TarsierForConditionalGeneration": from ..models.qwen2_vl.vision_process import Qwen2VLImageProcessor diff --git a/lightllm/utils/multimodal_utils.py b/lightllm/utils/multimodal_utils.py index b459d9ce35..f8151665b2 100644 --- a/lightllm/utils/multimodal_utils.py +++ b/lightllm/utils/multimodal_utils.py @@ -21,7 +21,7 @@ def enforce_image_token_budget(token_num: int, max_tokens: Optional[int], image_ if max_tokens is not None and token_num > max_tokens: raise ValueError( f"image[{image_index}] token_num={token_num} exceeds " - f"visual_image_max_tokens={max_tokens}; reduce image resolution, " + f"visual_batch_max_tokens={max_tokens}; reduce image resolution, " f"image_max_patch_num (InternVL-family), or preprocessor_config.json::max_pixels (Qwen-VL)" ) diff --git a/unit_tests/models/qwen2_vl/test_clamp_processor_max_pixels.py b/unit_tests/models/qwen2_vl/test_clamp_processor_max_pixels.py index 30f00ef16b..d561e8bb56 100644 --- a/unit_tests/models/qwen2_vl/test_clamp_processor_max_pixels.py +++ b/unit_tests/models/qwen2_vl/test_clamp_processor_max_pixels.py @@ -92,30 +92,30 @@ def test_none_budget_is_noop(self): def test_budget_looser_than_processor_is_noop(self): # Processor's max_pixels already gives 16384 tokens. Budget is 32768. Keep smaller. p = _FakeProcessor(patch_size=14, merge_size=2, max_pixels=16384 * 28 * 28) - clamp_processor_max_pixels(p, visual_image_max_tokens=32768) + clamp_processor_max_pixels(p, max_image_tokens=32768) self.assertEqual(p.max_pixels, 16384 * 28 * 28) def test_budget_tighter_clamps(self): # patch=14, merge=2 -> unit=28, unit^2=784. Budget 4096 tokens -> 4096*784 pixels. p = _FakeProcessor(patch_size=14, merge_size=2, max_pixels=16384 * 28 * 28) - clamp_processor_max_pixels(p, visual_image_max_tokens=4096) + clamp_processor_max_pixels(p, max_image_tokens=4096) self.assertEqual(p.max_pixels, 4096 * 28 * 28) def test_budget_equal_to_original_is_noop(self): # Original gives exactly 16384 tokens. Budget 16384 -> same value. p = _FakeProcessor(patch_size=14, merge_size=2, max_pixels=16384 * 28 * 28) - clamp_processor_max_pixels(p, visual_image_max_tokens=16384) + clamp_processor_max_pixels(p, max_image_tokens=16384) self.assertEqual(p.max_pixels, 16384 * 28 * 28) def test_budget_zero_raises(self): p = _FakeProcessor(patch_size=14, merge_size=2, max_pixels=16384 * 28 * 28) with self.assertRaises(ValueError): - clamp_processor_max_pixels(p, visual_image_max_tokens=0) + clamp_processor_max_pixels(p, max_image_tokens=0) def test_different_patch_merge(self): # patch=16, merge=1 -> unit=16, unit^2=256. Budget 1000 tokens -> 256000 pixels. 
p = _FakeProcessor(patch_size=16, merge_size=1, max_pixels=10_000_000) - clamp_processor_max_pixels(p, visual_image_max_tokens=1000) + clamp_processor_max_pixels(p, max_image_tokens=1000) self.assertEqual(p.max_pixels, 1000 * 16 * 16) def test_processor_max_pixels_none_is_clamped(self): @@ -123,7 +123,7 @@ def test_processor_max_pixels_none_is_clamped(self): # bound); the clamp must treat that as "looser than any budget" and # always apply our allowed_max_pixels instead of crashing on int