From 807345f985386d0cc631f6972d55bc961386d047 Mon Sep 17 00:00:00 2001 From: vincentzed <207368749+vincentzed@users.noreply.github.com> Date: Sat, 2 Aug 2025 11:07:14 -0400 Subject: [PATCH] feat(multimodal): add uuid-based caching optimization for media processing introduce uuid fields in multimodal data structures to enable efficient cache lookups and bypass expensive media fetching and processing when possible. this optimization reduces redundant operations by leveraging user-provided uuids for images, videos, and audio content. - add `uuid` field to chat utils param types - implement cache key extraction and url-only hashing in multimodal hasher - extend multimodal inputs with uuid fields for caching hints - enhance processor to handle uuid-only references and reconstruct inputs from cache - update media connector with cache-aware fetch methods to skip processing on cache hits this change improves performance for repeated media inputs by avoiding redundant downloads and decoding. rmv mypy add and discard update flags fix --- vllm/entrypoints/chat_utils.py | 287 +++++++++++++++++++++++++++++---- vllm/multimodal/hasher.py | 39 +++++ vllm/multimodal/inputs.py | 10 ++ vllm/multimodal/processing.py | 116 +++++++++++-- vllm/multimodal/utils.py | 1 + 5 files changed, 408 insertions(+), 45 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index a658d97cc8c5..da24cb334a49 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -125,8 +125,20 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): { "image_url": "https://example.com/image.jpg" } + + With UUID for caching optimization: + { + "image_url": "https://example.com/image.jpg", + "uuid": "abcde" + } + + UUID-only cache reference (empty content): + { + "uuid": "abcde" + } """ - image_url: Required[str] + image_url: str + uuid: str class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): @@ -136,19 +148,33 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): { "audio_url": "https://example.com/audio.mp3" } + + With UUID for caching optimization: + { + "audio_url": "https://example.com/audio.mp3", + "uuid": "abcde" + } """ - audio_url: Required[str] + audio_url: str + uuid: str class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): - """A simpler version of the param that only accepts a plain audio_url. + """A simpler version of the param that only accepts a plain video_url. Example: { "video_url": "https://example.com/video.mp4" } + + With UUID for caching optimization: + { + "video_url": "https://example.com/video.mp4", + "uuid": "abcde" + } """ - video_url: Required[str] + video_url: str + uuid: str class CustomThinkCompletionContentParam(TypedDict, total=False): @@ -572,12 +598,29 @@ def create_parser(self) -> "BaseMultiModalContentParser": class MultiModalItemTracker(BaseMultiModalItemTracker[object]): + def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer): + super().__init__(model_config, tokenizer) + # Track UUIDs separately for cache optimization + self._uuids_by_modality = defaultdict[str, list[str]](list) + + def add_uuid(self, modality: str, uuid: str) -> None: + """Add a UUID for the given modality.""" + self._uuids_by_modality[modality].append(uuid) def all_mm_data(self) -> Optional[MultiModalDataDict]: - if not self._items_by_modality: + if not self._items_by_modality and not self._uuids_by_modality: return None mm_inputs = {} items_by_modality = dict(self._items_by_modality) + + # Add UUID fields to mm_inputs for cache optimization + if "image" in self._uuids_by_modality: + mm_inputs["image_uuids"] = self._uuids_by_modality["image"] + if "audio" in self._uuids_by_modality: + mm_inputs["audio_uuids"] = self._uuids_by_modality["audio"] + if "video" in self._uuids_by_modality: + mm_inputs["video_uuids"] = self._uuids_by_modality["video"] + if "image" in items_by_modality and "image_embeds" in items_by_modality: raise ValueError(\ "Mixing raw image and embedding inputs is not allowed") @@ -601,15 +644,31 @@ def create_parser(self) -> "BaseMultiModalContentParser": class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]): + def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer): + super().__init__(model_config, tokenizer) + # Track UUIDs separately for cache optimization + self._uuids_by_modality = defaultdict[str, list[str]](list) + + def add_uuid(self, modality: str, uuid: str) -> None: + """Add a UUID for the given modality.""" + self._uuids_by_modality[modality].append(uuid) async def all_mm_data(self) -> Optional[MultiModalDataDict]: - if not self._items_by_modality: + if not self._items_by_modality and not self._uuids_by_modality: return None mm_inputs = {} items_by_modality = { modality: await asyncio.gather(*items) for modality, items in self._items_by_modality.items() } + + # Add UUID fields to mm_inputs for cache optimization + if "image" in self._uuids_by_modality: + mm_inputs["image_uuids"] = self._uuids_by_modality["image"] + if "audio" in self._uuids_by_modality: + mm_inputs["audio_uuids"] = self._uuids_by_modality["audio"] + if "video" in self._uuids_by_modality: + mm_inputs["video_uuids"] = self._uuids_by_modality["video"] if "image" in items_by_modality and "image_embeds" in items_by_modality: raise ValueError( @@ -656,7 +715,7 @@ def mm_placeholder_storage(self) -> dict[str, list]: return dict(self._placeholder_storage) @abstractmethod - def parse_image(self, image_url: str) -> None: + def parse_image(self, image_url: Union[str, dict[str, str]]) -> None: raise NotImplementedError @abstractmethod @@ -669,7 +728,7 @@ def parse_image_pil(self, image_pil: Image.Image) -> None: raise NotImplementedError @abstractmethod - def parse_audio(self, audio_url: str) -> None: + def parse_audio(self, audio_url: Union[str, dict[str, str]]) -> None: raise NotImplementedError @abstractmethod @@ -677,7 +736,7 @@ def parse_input_audio(self, input_audio: InputAudio) -> None: raise NotImplementedError @abstractmethod - def parse_video(self, video_url: str) -> None: + def parse_video(self, video_url: Union[str, dict[str, str]]) -> None: raise NotImplementedError @@ -693,8 +752,24 @@ def __init__(self, tracker: MultiModalItemTracker) -> None: allowed_local_media_path=tracker.allowed_local_media_path, ) - def parse_image(self, image_url: str) -> None: - image = self._connector.fetch_image(image_url) + def parse_image(self, image_url: Union[str, dict[str, str]]) -> None: + if isinstance(image_url, dict): + # Handle UUID for cache optimization + if "uuid" in image_url: + self._tracker.add_uuid("image", image_url["uuid"]) + # Skip fetching if only UUID is provided (cache-only reference) + if "url" not in image_url or not image_url["url"]: + placeholder = self._tracker.add("image", None) # Placeholder for UUID-only + self._add_placeholder("image", placeholder) + return + url = image_url.get("url", "") + if url: + image = self._connector.fetch_image(url) + else: + # UUID-only reference, use None as placeholder + image = None + else: + image = self._connector.fetch_image(image_url) placeholder = self._tracker.add("image", image) self._add_placeholder("image", placeholder) @@ -718,8 +793,24 @@ def parse_image_pil(self, image_pil: Image.Image) -> None: placeholder = self._tracker.add("image", image_pil) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str) -> None: - audio = self._connector.fetch_audio(audio_url) + def parse_audio(self, audio_url: Union[str, dict[str, str]]) -> None: + if isinstance(audio_url, dict): + # Handle UUID for cache optimization + if "uuid" in audio_url: + self._tracker.add_uuid("audio", audio_url["uuid"]) + # Skip fetching if only UUID is provided (cache-only reference) + if "url" not in audio_url or not audio_url["url"]: + placeholder = self._tracker.add("audio", None) # Placeholder for UUID-only + self._add_placeholder("audio", placeholder) + return + url = audio_url.get("url", "") + if url: + audio = self._connector.fetch_audio(url) + else: + # UUID-only reference, use None as placeholder + audio = None + else: + audio = self._connector.fetch_audio(audio_url) placeholder = self._tracker.add("audio", audio) self._add_placeholder("audio", placeholder) @@ -731,8 +822,24 @@ def parse_input_audio(self, input_audio: InputAudio) -> None: return self.parse_audio(audio_url) - def parse_video(self, video_url: str) -> None: - video = self._connector.fetch_video(video_url=video_url) + def parse_video(self, video_url: Union[str, dict[str, str]]) -> None: + if isinstance(video_url, dict): + # Handle UUID for cache optimization + if "uuid" in video_url: + self._tracker.add_uuid("video", video_url["uuid"]) + # Skip fetching if only UUID is provided (cache-only reference) + if "url" not in video_url or not video_url["url"]: + placeholder = self._tracker.add("video", None) # Placeholder for UUID-only + self._add_placeholder("video", placeholder) + return + url = video_url.get("url", "") + if url: + video = self._connector.fetch_video(video_url=url) + else: + # UUID-only reference, use None as placeholder + video = None + else: + video = self._connector.fetch_video(video_url=video_url) placeholder = self._tracker.add("video", video) self._add_placeholder("video", placeholder) @@ -749,8 +856,28 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: allowed_local_media_path=tracker.allowed_local_media_path ) - def parse_image(self, image_url: str) -> None: - image_coro = self._connector.fetch_image_async(image_url) + def parse_image(self, image_url: Union[str, dict[str, str]]) -> None: + if isinstance(image_url, dict): + # Handle UUID for cache optimization + if "uuid" in image_url: + self._tracker.add_uuid("image", image_url["uuid"]) + # Skip fetching if only UUID is provided (cache-only reference) + if "url" not in image_url or not image_url["url"]: + future: asyncio.Future[None] = asyncio.Future() + future.set_result(None) + placeholder = self._tracker.add("image", future) # Placeholder for UUID-only + self._add_placeholder("image", placeholder) + return + url = image_url.get("url", "") + if url: + image_coro = self._connector.fetch_image_async(url) + else: + # UUID-only reference, use None as placeholder + future: asyncio.Future[None] = asyncio.Future() + future.set_result(None) + image_coro = future + else: + image_coro = self._connector.fetch_image_async(image_url) placeholder = self._tracker.add("image", image_coro) self._add_placeholder("image", placeholder) @@ -781,8 +908,28 @@ def parse_image_pil(self, image_pil: Image.Image) -> None: placeholder = self._tracker.add("image", future) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str) -> None: - audio_coro = self._connector.fetch_audio_async(audio_url) + def parse_audio(self, audio_url: Union[str, dict[str, str]]) -> None: + if isinstance(audio_url, dict): + # Handle UUID for cache optimization + if "uuid" in audio_url: + self._tracker.add_uuid("audio", audio_url["uuid"]) + # Skip fetching if only UUID is provided (cache-only reference) + if "url" not in audio_url or not audio_url["url"]: + future: asyncio.Future[None] = asyncio.Future() + future.set_result(None) + placeholder = self._tracker.add("audio", future) # Placeholder for UUID-only + self._add_placeholder("audio", placeholder) + return + url = audio_url.get("url", "") + if url: + audio_coro = self._connector.fetch_audio_async(url) + else: + # UUID-only reference, use None as placeholder + future: asyncio.Future[None] = asyncio.Future() + future.set_result(None) + audio_coro = future + else: + audio_coro = self._connector.fetch_audio_async(audio_url) placeholder = self._tracker.add("audio", audio_coro) self._add_placeholder("audio", placeholder) @@ -794,8 +941,28 @@ def parse_input_audio(self, input_audio: InputAudio) -> None: return self.parse_audio(audio_url) - def parse_video(self, video_url: str) -> None: - video = self._connector.fetch_video_async(video_url=video_url) + def parse_video(self, video_url: Union[str, dict[str, str]]) -> None: + if isinstance(video_url, dict): + # Handle UUID for cache optimization + if "uuid" in video_url: + self._tracker.add_uuid("video", video_url["uuid"]) + # Skip fetching if only UUID is provided (cache-only reference) + if "url" not in video_url or not video_url["url"]: + future: asyncio.Future[None] = asyncio.Future() + future.set_result(None) + placeholder = self._tracker.add("video", future) # Placeholder for UUID-only + self._add_placeholder("video", placeholder) + return + url = video_url.get("url", "") + if url: + video = self._connector.fetch_video_async(video_url=url) + else: + # UUID-only reference, use None as placeholder + future: asyncio.Future[None] = asyncio.Future() + future.set_result(None) + video = future + else: + video = self._connector.fetch_video_async(video_url=video_url) placeholder = self._tracker.add("video", video) self._add_placeholder("video", placeholder) @@ -945,6 +1112,34 @@ def _get_full_multimodal_text_prompt(placeholder_storage: dict[str, list], ResponseInputImageParam).validate_python _ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage] + +def _parse_image_url_with_uuid(parsed_part: dict[str, Any]) -> dict[str, str]: + """Extract both URL and UUID from image_url part.""" + image_url_data = parsed_part.get("image_url", {}) + result = {"url": image_url_data.get("url", "")} + if "uuid" in parsed_part: + result["uuid"] = parsed_part["uuid"] + return result + + +def _parse_audio_url_with_uuid(parsed_part: dict[str, Any]) -> dict[str, str]: + """Extract both URL and UUID from audio_url part.""" + audio_url_data = parsed_part.get("audio_url", {}) + result = {"url": audio_url_data.get("url", "")} + if "uuid" in parsed_part: + result["uuid"] = parsed_part["uuid"] + return result + + +def _parse_video_url_with_uuid(parsed_part: dict[str, Any]) -> dict[str, str]: + """Extract both URL and UUID from video_url part.""" + video_url_data = parsed_part.get("video_url", {}) + result = {"url": video_url_data.get("url", "")} + if "uuid" in parsed_part: + result["uuid"] = parsed_part["uuid"] + return result + + # Define a mapping from part types to their corresponding parsing functions. MM_PARSER_MAP: dict[ str, @@ -959,18 +1154,18 @@ def _get_full_multimodal_text_prompt(placeholder_storage: dict[str, list], "input_image": lambda part: _ResponsesInputImageParser(part).get("image_url", None), "image_url": - lambda part: _ImageParser(part).get("image_url", {}).get("url", None), + lambda part: _parse_image_url_with_uuid(_ImageParser(part)), "image_embeds": lambda part: _ImageEmbedsParser(part).get("image_embeds", None), "image_pil": lambda part: _PILImageParser(part).get("image_pil", None), "audio_url": - lambda part: _AudioParser(part).get("audio_url", {}).get("url", None), + lambda part: _parse_audio_url_with_uuid(_AudioParser(part)), "input_audio": lambda part: _InputAudioParser(part).get("input_audio", None), "refusal": lambda part: _RefusalParser(part).get("refusal", None), "video_url": - lambda part: _VideoParser(part).get("video_url", {}).get("url", None), + lambda part: _parse_video_url_with_uuid(_VideoParser(part)), } @@ -1008,21 +1203,47 @@ def _parse_chat_message_content_mm_part( # Handle missing 'type' but provided direct URL fields. # 'type' is required field by pydantic if part_type is None: + # Handle UUID-only references first + if part.get("uuid") is not None and len(part) == 1: + # Pure UUID reference without URL - need to infer type from context + # Default to image_url for now, but this should ideally be specified + return "image_url", {"uuid": part["uuid"]} + if part.get("image_url") is not None: image_params = cast(CustomChatCompletionContentSimpleImageParam, part) - return "image_url", image_params.get("image_url", "") + result = {"url": image_params.get("image_url", "")} + if "uuid" in image_params: + result["uuid"] = image_params["uuid"] + return "image_url", result if part.get("audio_url") is not None: audio_params = cast(CustomChatCompletionContentSimpleAudioParam, part) - return "audio_url", audio_params.get("audio_url", "") + result = {"url": audio_params.get("audio_url", "")} + if "uuid" in audio_params: + result["uuid"] = audio_params["uuid"] + return "audio_url", result if part.get("input_audio") is not None: input_audio_params = cast(dict[str, str], part) return "input_audio", input_audio_params if part.get("video_url") is not None: video_params = cast(CustomChatCompletionContentSimpleVideoParam, part) - return "video_url", video_params.get("video_url", "") + result = {"url": video_params.get("video_url", "")} + if "uuid" in video_params: + result["uuid"] = video_params["uuid"] + return "video_url", result + # Handle UUID-only references for specific modalities + if "uuid" in part: + # Check for modality-specific UUID references + if "image_uuid" in part or part.get("type") == "image": + return "image_url", {"uuid": part["uuid"]} + elif "audio_uuid" in part or part.get("type") == "audio": + return "audio_url", {"uuid": part["uuid"]} + elif "video_uuid" in part or part.get("type") == "video": + return "video_url", {"uuid": part["uuid"]} + # Default to image for generic UUID + return "image_url", {"uuid": part["uuid"]} # Raise an error if no 'type' or direct URL is found. raise ValueError("Missing 'type' field in multimodal part.") @@ -1113,24 +1334,24 @@ def _parse_chat_message_content_part( mm_parser.parse_image_pil(image_content) modality = "image" elif part_type in ("image_url", "input_image"): - str_content = cast(str, content) - mm_parser.parse_image(str_content) + # content can now be str or dict[str, str] (with UUID) + mm_parser.parse_image(content) modality = "image" elif part_type == "image_embeds": content = cast(Union[str, dict[str, str]], content) mm_parser.parse_image_embeds(content) modality = "image" elif part_type == "audio_url": - str_content = cast(str, content) - mm_parser.parse_audio(str_content) + # content can now be str or dict[str, str] (with UUID) + mm_parser.parse_audio(content) modality = "audio" elif part_type == "input_audio": dict_content = cast(InputAudio, content) mm_parser.parse_input_audio(dict_content) modality = "audio" elif part_type == "video_url": - str_content = cast(str, content) - mm_parser.parse_video(str_content) + # content can now be str or dict[str, str] (with UUID) + mm_parser.parse_video(content) modality = "video" else: raise NotImplementedError(f"Unknown part type: {part_type}") diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index ac27bb66f7b5..5082f45bd67f 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -23,6 +23,45 @@ class MultiModalHasher: + @classmethod + def get_cache_keys_from_mm_data(cls, mm_data: Mapping[str, object]) -> list[str]: + """Extract cache keys from multimodal data, preferring UUIDs.""" + cache_keys = [] + + # Check for UUID fields first + for modality in ['image', 'video', 'audio']: + uuid_key = f"{modality}_uuids" + if uuid_key in mm_data: + uuids = mm_data[uuid_key] + if isinstance(uuids, list): + cache_keys.extend(uuids) + else: + cache_keys.append(uuids) + + # If no UUIDs found, fall back to URL-optimized hashing + if not cache_keys: + return [cls._hash_with_url_optimization(**mm_data)] + + return cache_keys + + @classmethod + def _hash_with_url_optimization(cls, **kwargs: object) -> str: + """ + Hash multimodal data with URL-only optimization when possible. + For URL-based inputs, hash only the URL instead of full content. + """ + optimized_kwargs: dict[str, object] = {} + + for key, value in kwargs.items(): + # For URL-based inputs, hash only the URL + if key.endswith('_url') and isinstance(value, str): + optimized_kwargs[key] = value + # For other inputs, use full content hashing + else: + optimized_kwargs[key] = value + + return cls.hash_kwargs(**optimized_kwargs) + @classmethod def serialize_item(cls, obj: object) -> Union[bytes, memoryview]: # Simple cases diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 18aae35c6fd4..37a0723d9d49 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -105,6 +105,16 @@ class MultiModalDataBuiltins(TypedDict, total=False): audio: ModalityData[AudioItem] """The input audio(s).""" + + # UUID fields for caching optimization + image_uuids: ModalityData[str] + """User-provided UUIDs for image cache optimization.""" + + video_uuids: ModalityData[str] + """User-provided UUIDs for video cache optimization.""" + + audio_uuids: ModalityData[str] + """User-provided UUIDs for audio cache optimization.""" MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]] diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 46240855d12a..c34b8912181d 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1021,6 +1021,28 @@ def get_item( value=self._cache.get(cache_key), ) + def get_by_key(self, cache_key: str) -> Optional[MultiModalKwargsItem]: + """ + Get a processed multi-modal item from the cache using a direct cache key. + + This method is used for UUID-based cache lookups where the cache key + is already known (e.g., when using UUIDs as direct cache keys). + + Args: + cache_key: The direct cache key to lookup + + Returns: + The cached item if found, None otherwise + """ + self._maybe_log_cache_stats() + + if self.debug_cache_hit_ratio_steps: + if cache_key in self._cache: + self.debug_cache_hits += 1 + self.debug_cache_total += 1 + + return self._cache.get(cache_key) + def put( self, model_id: str, @@ -1220,6 +1242,10 @@ def _to_mm_items( before passing them to [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data]. """ + # Handle UUID-only references (empty content with UUIDs) + if self._is_uuid_only_reference(mm_data): + return self._create_uuid_placeholder_items(mm_data) + mm_items = self.data_parser.parse_mm_data(mm_data) for modality, items in mm_items.items(): @@ -1227,6 +1253,19 @@ def _to_mm_items( return mm_items + def _is_uuid_only_reference(self, mm_data: MultiModalDataDict) -> bool: + """Check if this is a UUID-only cache reference with no actual content.""" + has_uuids = any(key.endswith('_uuids') for key in mm_data) + has_content = any(key in ['image', 'video', 'audio'] for key in mm_data) + return has_uuids and not has_content + + def _create_uuid_placeholder_items(self, mm_data: MultiModalDataDict) -> MultiModalDataItems: + """Create placeholder items for UUID-only references.""" + from .parse import MultiModalDataItems + # For UUID-only references, return empty items + # The actual processing will be handled by the UUID cache lookup + return MultiModalDataItems() + @abstractmethod def _get_mm_fields_config( self, @@ -1823,6 +1862,29 @@ def _maybe_apply_prompt_updates( return prompt_ids, prompt, mm_placeholders + def _has_uuid_fields(self, mm_data: MultiModalDataDict) -> bool: + """Check if multimodal data contains any UUID fields.""" + return any(key.endswith('_uuids') for key in mm_data) + + def _create_uuid_hash_dict( + self, + mm_data: MultiModalDataDict + ) -> Mapping[str, list[str]]: + """Create a hash dictionary from UUIDs in multimodal data.""" + hash_dict = {} + + for modality in ['image', 'video', 'audio']: + uuid_key = f"{modality}_uuids" + if uuid_key in mm_data: + uuids = mm_data[uuid_key] + if isinstance(uuids, list): + hash_dict[modality] = uuids + else: + hash_dict[modality] = [uuids] + + return hash_dict + + def apply( self, prompt: Union[str, list[int]], @@ -1844,23 +1906,53 @@ def apply( 3. Extract information about the placeholder tokens from the processed token IDs. """ + # NEW: UUID-based optimization - bypass expensive MediaConnector and content hashing + # but still do proper prompt processing for placeholder insertion + uuid_cache_kwargs = None + if self.cache is not None and return_mm_hashes and self._has_uuid_fields(mm_data): + cache_keys = MultiModalHasher.get_cache_keys_from_mm_data(mm_data) + if cache_keys: + cache_hits = [self.cache.get_by_key(key) for key in cache_keys] + if all(hit is not None for hit in cache_hits): + # Full cache hit - we can reconstruct mm_kwargs directly + try: + non_null_hits = cast(list[MultiModalKwargsItem], cache_hits) + uuid_cache_kwargs = MultiModalKwargs.from_items(non_null_hits) + except (ValueError, KeyError): + # Race condition: cache evicted between check and reconstruction + pass + mm_items = self._to_mm_items(mm_data) if tokenization_kwargs is None: tokenization_kwargs = {} - ( - prompt_ids, - mm_kwargs, - mm_hashes, - is_update_applied, - ) = self._cached_apply_hf_processor( - prompt, - mm_items, - hf_processor_mm_kwargs, - tokenization_kwargs=tokenization_kwargs, - return_mm_hashes=return_mm_hashes, - ) + # Use cached kwargs if available, otherwise process normally + if uuid_cache_kwargs is not None: + # UUID cache hit - skip HF processing but do prompt tokenization + if isinstance(prompt, str): + tokenizer = self.info.get_tokenizer() + prompt_ids = encode_tokens(tokenizer, prompt, add_special_tokens=False) + else: + prompt_ids = prompt + mm_kwargs = uuid_cache_kwargs + # Convert UUIDs to proper MultiModalHashDict format + mm_hashes = self._create_uuid_hash_dict(mm_data) if return_mm_hashes else None + is_update_applied = False # Will need to apply prompt updates + else: + # Normal processing path + ( + prompt_ids, + mm_kwargs, + mm_hashes, + is_update_applied, + ) = self._cached_apply_hf_processor( + prompt, + mm_items, + hf_processor_mm_kwargs, + tokenization_kwargs=tokenization_kwargs, + return_mm_hashes=return_mm_hashes, + ) # NOTE: tokenization_kwargs are not required to init processor prompt_ids, prompt, mm_placeholders = self._maybe_apply_prompt_updates( diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 8dfbc6503520..c468901ad239 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -287,6 +287,7 @@ def fetch_image_embedding( return image_embedding_io.load_base64("", data) + def encode_audio_base64( audio: np.ndarray, sampling_rate: float,