From 9bac84888273d119510fc396f533f034296555ab Mon Sep 17 00:00:00 2001
From: Michael Carroll
Date: Fri, 27 Mar 2026 15:47:39 -0400
Subject: [PATCH 1/2] Fill defaults from existing generation config if present.

---
 src/engine/ov_genai/llm.py | 44 ++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/src/engine/ov_genai/llm.py b/src/engine/ov_genai/llm.py
index d2d1686..43a9e6d 100755
--- a/src/engine/ov_genai/llm.py
+++ b/src/engine/ov_genai/llm.py
@@ -80,13 +80,21 @@ async def generate_text(self, gen_config: OVGenAI_GenConfig) -> AsyncIterator[Un
         Async non-streaming text generation.
         Yields in order: metrics (dict), new_text (str).
         """
-        generation_kwargs = GenerationConfig(
-            max_new_tokens=gen_config.max_tokens,
-            temperature=gen_config.temperature,
-            top_k=gen_config.top_k,
-            top_p=gen_config.top_p,
-            repetition_penalty=gen_config.repetition_penalty,
-        )
+        if isinstance(self.model, LLMPipeline):
+            generation_kwargs = self.model.get_generation_config()
+            generation_kwargs.max_new_tokens = gen_config.max_tokens
+            generation_kwargs.temperature = gen_config.temperature
+            generation_kwargs.top_k = gen_config.top_k
+            generation_kwargs.top_p = gen_config.top_p
+            generation_kwargs.repetition_penalty = gen_config.repetition_penalty
+        else:
+            generation_kwargs = GenerationConfig(
+                max_new_tokens=gen_config.max_tokens,
+                temperature=gen_config.temperature,
+                top_k=gen_config.top_k,
+                top_p=gen_config.top_p,
+                repetition_penalty=gen_config.repetition_penalty,
+            )
 
         # Add speculative decoding parameters (mutually exclusive per OpenVINO docs)
         import os
@@ -130,13 +138,21 @@ async def generate_stream(self, gen_config: OVGenAI_GenConfig) -> AsyncIterator[
         Async streaming text generation.
         Yields token chunks (str) as they arrive, then metrics (dict), then final new_text (str).
         """
-        generation_kwargs = GenerationConfig(
-            max_new_tokens=gen_config.max_tokens,
-            temperature=gen_config.temperature,
-            top_k=gen_config.top_k,
-            top_p=gen_config.top_p,
-            repetition_penalty=gen_config.repetition_penalty
-        )
+        if isinstance(self.model, LLMPipeline):
+            generation_kwargs = self.model.get_generation_config()
+            generation_kwargs.max_new_tokens = gen_config.max_tokens
+            generation_kwargs.temperature = gen_config.temperature
+            generation_kwargs.top_k = gen_config.top_k
+            generation_kwargs.top_p = gen_config.top_p
+            generation_kwargs.repetition_penalty = gen_config.repetition_penalty
+        else:
+            generation_kwargs = GenerationConfig(
+                max_new_tokens=gen_config.max_tokens,
+                temperature=gen_config.temperature,
+                top_k=gen_config.top_k,
+                top_p=gen_config.top_p,
+                repetition_penalty=gen_config.repetition_penalty,
+            )
 
         # Add speculative decoding parameters (mutually exclusive per OpenVINO docs)
         import os

From 47f1217cc833b49d3f06649023737192674736e6 Mon Sep 17 00:00:00 2001
From: SearchSavior
Date: Sat, 28 Mar 2026 17:26:16 -0400
Subject: [PATCH 2/2] - update vlm generation_kwargs to follow new pattern

---
 src/engine/ov_genai/vlm.py | 44 ++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/src/engine/ov_genai/vlm.py b/src/engine/ov_genai/vlm.py
index d4ec4d8..14f800e 100644
--- a/src/engine/ov_genai/vlm.py
+++ b/src/engine/ov_genai/vlm.py
@@ -141,13 +141,21 @@ async def generate_text(self, gen_config: OVGenAI_GenConfig) -> AsyncIterator[Un
         Yields in order: metrics (dict), new_text (str).
         """
         try:
-            generation_kwargs = GenerationConfig(
-                max_new_tokens=gen_config.max_tokens,
-                temperature=gen_config.temperature,
-                top_k=gen_config.top_k,
-                top_p=gen_config.top_p,
-                repetition_penalty=gen_config.repetition_penalty,
-            )
+            if isinstance(self.model_path, VLMPipeline):
+                generation_kwargs = self.model_path.get_generation_config()
+                generation_kwargs.max_new_tokens = gen_config.max_tokens
+                generation_kwargs.temperature = gen_config.temperature
+                generation_kwargs.top_k = gen_config.top_k
+                generation_kwargs.top_p = gen_config.top_p
+                generation_kwargs.repetition_penalty = gen_config.repetition_penalty
+            else:
+                generation_kwargs = GenerationConfig(
+                    max_new_tokens=gen_config.max_tokens,
+                    temperature=gen_config.temperature,
+                    top_k=gen_config.top_k,
+                    top_p=gen_config.top_p,
+                    repetition_penalty=gen_config.repetition_penalty,
+                )
 
             prompt, ov_images = self.prepare_inputs(gen_config.messages, gen_config.tools)
 
@@ -176,13 +184,21 @@ async def generate_stream(self,
         Async streaming generation for VLM.
         Yields token chunks (str) as they arrive, then metrics (dict).
         """
-        generation_kwargs = GenerationConfig(
-            max_new_tokens=gen_config.max_tokens,
-            temperature=gen_config.temperature,
-            top_k=gen_config.top_k,
-            top_p=gen_config.top_p,
-            repetition_penalty=gen_config.repetition_penalty,
-        )
+        if isinstance(self.model_path, VLMPipeline):
+            generation_kwargs = self.model_path.get_generation_config()
+            generation_kwargs.max_new_tokens = gen_config.max_tokens
+            generation_kwargs.temperature = gen_config.temperature
+            generation_kwargs.top_k = gen_config.top_k
+            generation_kwargs.top_p = gen_config.top_p
+            generation_kwargs.repetition_penalty = gen_config.repetition_penalty
+        else:
+            generation_kwargs = GenerationConfig(
+                max_new_tokens=gen_config.max_tokens,
+                temperature=gen_config.temperature,
+                top_k=gen_config.top_k,
+                top_p=gen_config.top_p,
+                repetition_penalty=gen_config.repetition_penalty,
+            )
 
         decoder_tokenizer = self.model_path.get_tokenizer()
         streamer = ChunkStreamer(decoder_tokenizer, gen_config)