From ddc4eafc6bcb84cfad5c6ca485592771e1f3f3aa Mon Sep 17 00:00:00 2001
From: Justin Chu
Date: Fri, 8 May 2026 00:52:12 +0000
Subject: [PATCH 1/3] feat: add ort-multimodal evaluator for fast multimodal model benchmarks

Add LMEvalORTMultimodalEvaluator ('ort-multimodal' backend) that uses
direct ORT InferenceSession for multimodal GenAI packages. This avoids
the overhead of GenAI's Generator API while supporting models with
heterogeneous KV cache head dimensions (e.g. Gemma4 with head_dim=256
for sliding attention and head_dim=512 for full attention).

The evaluator:
- Loads decoder and embedding ONNX models from genai_config.json
- Runs embedding model to convert input_ids -> inputs_embeds
- Runs decoder with per-layer empty KV cache buffers
- Returns full logits [batch, seq_len, vocab] in a single forward pass

Performance on Gemma4 E2B-IT MMLU Pro (CUDA EP):
- ort-multimodal: 142 req/s (full run, matches PyTorch HF accuracy)
- ortgenai: 9.5 req/s (19.5x slower than the PyTorch HF baseline)
- PyTorch HF: 185 req/s (baseline)

Also wire up 'ort-multimodal' in LMEvaluator.evaluate() for Olive
pipeline integration.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Justin Chu
---
 olive/evaluator/lmeval_ort.py      | 145 +++++++++++++++++++++++++++++
 olive/evaluator/olive_evaluator.py |   6 ++
 2 files changed, 151 insertions(+)

diff --git a/olive/evaluator/lmeval_ort.py b/olive/evaluator/lmeval_ort.py
index 50d1f1289..4a031e2e2 100644
--- a/olive/evaluator/lmeval_ort.py
+++ b/olive/evaluator/lmeval_ort.py
@@ -460,6 +460,151 @@ def initialize_buffers(self, batch_size: int, max_length: int):
         self._batch_size = batch_size
 
 
+@register_model("ort-multimodal")
+class LMEvalORTMultimodalEvaluator(LMEvalOnnxBase):
+    """Evaluate a multimodal ONNX model using direct ORT InferenceSession.
+
+    Designed for ORT GenAI multimodal packages (e.g. Gemma4) that have separate
+    decoder and embedding ONNX models. Uses direct session.run() instead of
+    GenAI's Generator API, avoiding the overhead of loading all sub-models
+    and creating Generator objects per call.
+
+    Supports models with heterogeneous KV cache head dimensions (e.g. Gemma4
+    with head_dim=256 for sliding attention and head_dim=512 for full attention),
+    which the standard 'ort' backend cannot handle.
+    """
+
+    def __init__(
+        self,
+        pretrained: str,
+        batch_size: int | str = 1,
+        max_length: int | None = None,
+        ep: str | None = None,
+        ep_options: dict | None = None,
+        **kwargs,
+    ):
+        """Initialize the evaluator.
+
+        :param pretrained: Path to the ORT GenAI model directory containing
+            genai_config.json, decoder/, embedding/, and tokenizer files.
+        :param batch_size: Batch size for evaluation.
+        :param max_length: Maximum sequence length. Defaults to the genai_config value, capped at 2048.
+        :param ep: Execution provider (e.g. 'CUDAExecutionProvider').
+        :param ep_options: Provider options dict applied to `ep`.
+        """
+        import onnxruntime as ort
+
+        super().__init__()
+
+        model_dir = Path(pretrained)
+
+        # Load genai_config to find model paths and metadata
+        with (model_dir / "genai_config.json").open() as f:
+            genai_config = json.load(f)
+
+        model_config = genai_config["model"]
+        decoder_config = model_config["decoder"]
+
+        # Resolve max_length
+        if max_length:
+            self._max_length = max_length
+        else:
+            self._max_length = min(
+                genai_config.get("search", {}).get("max_length", 2048),
+                2048,  # Cap at 2048 for eval efficiency
+            )
+
+        # EOS token handling (can be list or scalar)
+        eot = model_config["eos_token_id"]
+        self._eot_token_id = eot[0] if isinstance(eot, list) else eot
+
+        # Set up execution providers and their matching provider options
+        providers = [ep] if ep else []
+        provider_options = [ep_options or {}] if ep else []
+        providers.append("CPUExecutionProvider")
+        provider_options.append({})
+
+        # Load decoder session
+        decoder_path = str(model_dir / decoder_config["filename"])
+        logger.info("Loading decoder from %s", decoder_path)
+        self._decoder_sess = ort.InferenceSession(decoder_path, providers=providers, provider_options=provider_options)
+
+        # Detect per-layer KV cache shapes (supports heterogeneous head_dim)
+        self._kv_shapes = {}
+        for inp in self._decoder_sess.get_inputs():
+            if inp.name.startswith("past_key_values"):
+                self._kv_shapes[inp.name] = {
+                    "num_kv_heads": inp.shape[1],
+                    "head_dim": inp.shape[3],
+                }
+
+        # Load embedding session if available
+        self._embedding_sess = None
+        self._hidden_size = decoder_config["hidden_size"]
+        embedding_config = model_config.get("embedding")
+        if embedding_config:
+            emb_path = str(model_dir / embedding_config["filename"])
+            logger.info("Loading embedding from %s", emb_path)
+            self._embedding_sess = ort.InferenceSession(emb_path, providers=providers, provider_options=provider_options)
+
+        # Load tokenizer from model directory
+        self._tokenizer = AutoTokenizer.from_pretrained(str(model_dir))
+        self.batch_size = int(batch_size)
+
+    @property
+    def max_length(self) -> int:
+        return self._max_length
+
+    @property
+    def eot_token_id(self) -> int:
+        return self._eot_token_id
+
+    def tok_encode(self, string: str, **kwargs) -> list[int]:
+        return self._tokenizer.encode(string, add_special_tokens=False)
+
+    def prepare(self, requests: list[LogLikelihoodInputs]):
+        pass
+
+    def model_call(self, input_ids: torch.Tensor, cont_len: int = 0) -> torch.Tensor:
+        import numpy as np
+
+        batch_size, seq_len = input_ids.shape
+        ids_np = input_ids.cpu().numpy().astype(np.int64)
+
+        # Get embeddings if embedding model is available
+        if self._embedding_sess is not None:
+            emb_feed = {
+                "input_ids": ids_np,
+                "image_features": np.zeros((0, self._hidden_size), dtype=np.float16),
+                "audio_features": np.zeros((0, self._hidden_size), dtype=np.float16),
+            }
+            inputs_embeds = self._embedding_sess.run(None, emb_feed)[0]
+        else:
+            inputs_embeds = np.zeros((batch_size, seq_len, self._hidden_size), dtype=np.float16)
+
+        # Build decoder feed with per-layer KV cache shapes
+        dec_feed = {
+            "input_ids": ids_np,
+            "inputs_embeds": inputs_embeds,
+            "attention_mask": np.ones((batch_size, seq_len), dtype=np.int64),
+            "position_ids": np.broadcast_to(
+                np.arange(seq_len, dtype=np.int64).reshape(1, -1),
+                (batch_size, seq_len),
+            ).copy(),
+        }
+        for name, info in self._kv_shapes.items():
+            dec_feed[name] = np.zeros(
+                (batch_size, info["num_kv_heads"], 0, info["head_dim"]),
+                dtype=np.float16,
+            )
+
+        result = self._decoder_sess.run(["logits"], dec_feed)
+        return torch.from_numpy(result[0])
+
+    def complete(self):
+        pass
+
+
 @register_model("ortgenai")
 class LMEvalORTGenAIEvaluator(LMEvalOnnxBase):
     """Evaluate a model using ONNX Runtime GenAI."""
diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py
index 05933b8b6..537aa3b0d 100644
--- a/olive/evaluator/olive_evaluator.py
+++ b/olive/evaluator/olive_evaluator.py
@@ -1572,6 +1572,12 @@ def evaluate(
                 "ep_options": self.ep_options,
                 "device": device,
             }
+        elif self.model_class == "ort-multimodal":
+            init_args = {
+                "pretrained": str(Path(model.model_path).parent),
+                "ep": self.ep or execution_providers,
+                "ep_options": self.ep_options,
+            }
         else:
             raise ValueError(f"Unknown model class: {self.model_class}")
 

From ae69f173a9845bbb93087bd679aa5c909c436e80 Mon Sep 17 00:00:00 2001
From: Justin Chu
Date: Fri, 8 May 2026 01:22:53 +0000
Subject: [PATCH 2/3] perf: reuse Generator via rewind_to(0) in ortgenai evaluator

Cache the og.Generator object across model_call invocations and use
rewind_to(0) to reset state instead of creating a new Generator per
sample. Falls back to creating a new Generator if rewind_to is not
supported (e.g. older GenAI versions without the multimodal rewind fix).

Performance on Gemma4 E2B-IT MMLU Pro (CUDA EP):
- Per-call: 379ms -> 256ms (1.48x per model call)
- End-to-end limit=200: 160.6s -> 145.2s (1.11x overall)
- Estimated full run savings: ~12 minutes

Requires onnxruntime-genai >= 0.14.0 with microsoft/onnxruntime-genai#2141
for multimodal model support.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Justin Chu
---
 olive/evaluator/lmeval_ort.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/olive/evaluator/lmeval_ort.py b/olive/evaluator/lmeval_ort.py
index 4a031e2e2..2ac4e1d28 100644
--- a/olive/evaluator/lmeval_ort.py
+++ b/olive/evaluator/lmeval_ort.py
@@ -665,6 +665,7 @@ def __init__(
         self.device = device
 
         self._returns_full_logits = self._detect_full_logits()
+        self._cached_generator = None
 
     def _detect_full_logits(self) -> bool:
         """Check if the model returns logits for all input positions or only the last."""
@@ -691,10 +692,24 @@ def tok_encode(self, string: str, **kwargs) -> list[int]:
     def prepare(self, requests: list[LogLikelihoodInputs]):
         pass
 
-    def model_call(self, input_ids: torch.Tensor, cont_len: int = 0) -> torch.Tensor:
-        batch_size, seq_len = input_ids.shape
+    def _get_generator(self, batch_size: int) -> "og.Generator":
+        """Get a Generator, reusing via rewind_to(0) when possible."""
+        if self._cached_generator is not None:
+            try:
+                self._cached_generator.rewind_to(0)
+                return self._cached_generator
+            except Exception:
+                # rewind_to not supported for this model; fall back to a new Generator
+                self._cached_generator = None
+
         self.params.set_search_options(batch_size=batch_size)
         generator = og.Generator(self.model, self.params)
+        self._cached_generator = generator
+        return generator
+
+    def model_call(self, input_ids: torch.Tensor, cont_len: int = 0) -> torch.Tensor:
+        batch_size, seq_len = input_ids.shape
+        generator = self._get_generator(batch_size)
 
         if self._returns_full_logits:
             generator.append_tokens(input_ids.tolist())
@@ -724,7 +739,7 @@ def model_call(self, input_ids: torch.Tensor, cont_len: int = 0) -> torch.Tensor
         return torch.cat(all_logits, dim=1)  # [batch, n_logits, vocab]
 
     def complete(self):
-        pass
+        self._cached_generator = None
 
     def generate_until(self, requests, disable_tqdm: bool = False) -> list[str]:
         """Generate text until a stop sequence is reached.
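
Illustration for PATCH 1: the key move is that a GenAI-exported decoder can be
driven as a plain InferenceSession by feeding one zero-length KV cache buffer
per layer, with each buffer's num_kv_heads/head_dim read off the session inputs
rather than assumed globally. A minimal sketch for a text-only decoder export
(the model path, provider list, and toy token ids are illustrative; the
multimodal decoder in the patch additionally takes inputs_embeds produced by
the embedding model):

    import numpy as np
    import onnxruntime as ort

    # Single-pass prefill with empty per-layer KV caches.
    sess = ort.InferenceSession("decoder/model.onnx", providers=["CPUExecutionProvider"])

    input_ids = np.array([[2, 106, 1645]], dtype=np.int64)  # toy prompt tokens
    batch, seq = input_ids.shape
    feed = {
        "input_ids": input_ids,
        "attention_mask": np.ones((batch, seq), dtype=np.int64),
        "position_ids": np.arange(seq, dtype=np.int64)[None, :].repeat(batch, axis=0),
    }
    # Zero-length cache per past_key_values input; reading shapes per input is
    # what lets layers with different head_dim (e.g. 256 vs 512) coexist.
    for inp in sess.get_inputs():
        if inp.name.startswith("past_key_values"):
            feed[inp.name] = np.zeros((batch, inp.shape[1], 0, inp.shape[3]), dtype=np.float16)

    (logits,) = sess.run(["logits"], feed)  # [batch, seq_len, vocab] in one forward pass

Because the single run returns logits for every position, loglikelihood scoring
needs exactly one forward pass per request, which is where the throughput gain
over the per-call Generator path comes from.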
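
Illustration for PATCH 2: the reuse pattern in isolation. One og.Generator is
kept alive and rewound to position 0 between scoring calls; a fresh Generator
is created only where rewind_to is unavailable. A sketch against the same
onnxruntime-genai calls the patch uses (the model path and token sequences are
illustrative):

    import onnxruntime_genai as og

    model = og.Model("path/to/genai/model")  # illustrative path
    params = og.GeneratorParams(model)
    generator = og.Generator(model, params)

    def rewound(gen):
        """Return a generator positioned at token 0, reusing gen when possible."""
        try:
            gen.rewind_to(0)  # needs onnxruntime-genai >= 0.14.0 for multimodal models
            return gen
        except Exception:
            return og.Generator(model, params)  # older builds: pay the creation cost again

    for tokens in ([1, 2, 3], [4, 5, 6]):  # toy token sequences
        generator = rewound(generator)
        generator.append_tokens([tokens])
        last_logits = generator.get_logits()  # logits for the last appended position

The saving is the Generator construction cost (KV allocation, sub-model setup)
on every request, which is why the per-call speedup (1.48x) is larger than the
end-to-end one (1.11x).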
From 2009970093fb5e4b58d7193b239b5378961ea447 Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Fri, 8 May 2026 16:23:50 +0000 Subject: [PATCH 3/3] =?UTF-8?q?perf:=20use=20get=5Flogits()=20to=20avoid?= =?UTF-8?q?=20472MB=20GPU=E2=86=92CPU=20copy=20per=20call?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace get_output('logits') with incremental get_logits() in the ortgenai evaluator's model_call. get_output copies the FULL logits tensor (seq_len × vocab_size × 2 bytes, e.g. 472MB for 900 tokens with 262K vocab) from GPU to CPU each call, taking 410ms. get_logits() returns only the last position's logits (~1MB), taking 1.8ms. For loglikelihood scoring, we only need logits at the continuation token positions (typically 1-20 tokens). The new approach appends the context as a bulk prefill, then steps through continuation tokens one at a time using get_logits(), collecting only the needed positions. Performance on Gemma4 E2B-IT MMLU Pro (limit=50, CUDA EP): - Before: 46.9s (10.6 req/s) - After: 24.2s (20.7 req/s) - Speedup: 1.94x Also removes the _returns_full_logits detection since the evaluator now always uses the incremental path (which works for both model types). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Justin Chu --- olive/evaluator/lmeval_ort.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/olive/evaluator/lmeval_ort.py b/olive/evaluator/lmeval_ort.py index 2ac4e1d28..ad841a360 100644 --- a/olive/evaluator/lmeval_ort.py +++ b/olive/evaluator/lmeval_ort.py @@ -711,11 +711,6 @@ def model_call(self, input_ids: torch.Tensor, cont_len: int = 0) -> torch.Tensor batch_size, seq_len = input_ids.shape generator = self._get_generator(batch_size) - if self._returns_full_logits: - generator.append_tokens(input_ids.tolist()) - return torch.from_numpy(generator.get_output("logits")).to(self.device) - - # Model only returns logits for the last appended position. if batch_size > 1 and cont_len > 1: raise ValueError( "batch_size > 1 is not supported when the model returns single-position logits" @@ -723,15 +718,18 @@ def model_call(self, input_ids: torch.Tensor, cont_len: int = 0) -> torch.Tensor " batch elements. Use batch_size=1 instead." ) - # Bulk-append context tokens, then step through the last cont_len tokens - # one at a time to collect only the logits we actually need. + # Use incremental token appending with get_logits() to avoid copying + # the full logits tensor from GPU to CPU. get_output("logits") copies + # seq_len * vocab_size * 2 bytes (e.g. 472MB for 900 tokens with + # 262K vocab), while get_logits() copies only vocab_size * 4 bytes + # (~1MB) per position. n_logits = max(cont_len, 1) prefix_len = seq_len - n_logits generator.append_tokens(input_ids[:, : prefix_len + 1].tolist()) - all_logits = [torch.from_numpy(generator.get_output("logits")).to(self.device)] + all_logits = [torch.from_numpy(generator.get_logits()).to(self.device)] for i in range(prefix_len + 1, seq_len): generator.append_tokens(input_ids[:, i : i + 1].tolist()) - all_logits.append(torch.from_numpy(generator.get_output("logits")).to(self.device)) + all_logits.append(torch.from_numpy(generator.get_logits()).to(self.device)) # No need to pad to [batch, seq_len, vocab]. The slicing in _loglikelihood_tokens computes # ctx_len = inplen + (logits.shape[0] - padding_len_inp), which adjusts for the shorter
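
Illustration for PATCH 3: the prefill-then-step scoring loop in isolation,
pulling back only the last position's logits per step via get_logits(), so each
transfer is ~vocab_size floats instead of the full seq_len * vocab_size tensor.
A batch-size-1 sketch; `generator` is a rewound og.Generator as in the previous
note, and it assumes the lm-eval convention that token_ids already exclude the
final target token:

    def continuation_logit_rows(generator, token_ids, cont_len):
        """Collect one logits row per continuation token position."""
        seq_len = len(token_ids)
        n_logits = max(cont_len, 1)
        prefix_len = seq_len - n_logits
        # Bulk prefill through the first position whose next-token logits we
        # need, then step one token at a time.
        generator.append_tokens([token_ids[: prefix_len + 1]])
        rows = [generator.get_logits()]
        for i in range(prefix_len + 1, seq_len):
            generator.append_tokens([[token_ids[i]]])
            rows.append(generator.get_logits())
        return rows  # ~vocab_size floats per row, never seq_len * vocab_size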