From 7017132c32d8ea2caaaf9f65adf679b9e96135d7 Mon Sep 17 00:00:00 2001
From: Yuri Khrustalev
Date: Tue, 12 May 2026 17:06:11 -0400
Subject: [PATCH 1/4] Add chat-template hooks to LMEvalORTGenAIEvaluator

lm-eval's `simple_evaluate(..., apply_chat_template=True)` requires the
underlying LM class to implement `tokenizer_name` and `apply_chat_template`.
The HFLM backend has both; the ORT GenAI backend does not, so any attempt
to evaluate a chat-tuned ONNX model with chat-formatted prompts raises
`NotImplementedError: To use this model with chat templates, please
implement the 'tokenizer_name' property.`

This adds the two members with minimal surface area:

- `tokenizer_name` returns the model path (for lm-eval's chat-aware result
  caching), matching the HFLM convention of slash-replacement.
- `apply_chat_template` defers to the model's HF tokenizer via
  `AutoTokenizer.apply_chat_template`, mirroring HFLM's implementation.

The HF tokenizer is loaded once at `__init__` purely for chat-template
rendering; token-level encode/decode still goes through `og.Tokenizer` and
the runtime, so there is no change to generation behavior or any existing
code path.

Verified end-to-end on LFM2.5-350M (int4, k_quant_mixed) with MBPP: without
chat-template hooks the eval raised at task start; with them plus
`num_fewshot=0` and a chat-friendly stop list, pass@1 went from 0/500 to
67/500 (13.4%) -- the original zero was a prompt-format artifact (instruct
model + completion-style few-shot), not a conversion regression.
---
 olive/evaluator/lmeval_ort.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/olive/evaluator/lmeval_ort.py b/olive/evaluator/lmeval_ort.py
index c4a158533..f15b60c3a 100644
--- a/olive/evaluator/lmeval_ort.py
+++ b/olive/evaluator/lmeval_ort.py
@@ -498,6 +498,10 @@ def __init__(
             self.config.set_provider_option(ep, key, value)
         self.model = og.Model(self.config)
         self.tokenizer = og.Tokenizer(self.model)
+        # HF tokenizer kept solely to render `apply_chat_template`; generation
+        # still uses og.Tokenizer above.
+        self._pretrained = str(pretrained)
+        self._hf_tokenizer = AutoTokenizer.from_pretrained(self._pretrained)

         # consider adding auto batch sizes
         self.batch_size = int(batch_size)
@@ -521,6 +525,24 @@ def __init__(
         self.device = device
         self._returns_full_logits = self._detect_full_logits()

+    @property
+    def tokenizer_name(self) -> str:
+        """Identifier used by lm-eval for chat-template-aware caching."""
+        return self._pretrained.replace("/", "__")
+
+    def apply_chat_template(self, chat_history: list[dict], add_generation_prompt: bool = True) -> str:
+        """Render a chat history through the model's HF chat template.
+
+        Required by lm-eval when `apply_chat_template=True` is passed to
+        `simple_evaluate`; without it, lm-eval raises NotImplementedError.
+        """
+        return self._hf_tokenizer.apply_chat_template(
+            chat_history,
+            tokenize=False,
+            add_generation_prompt=add_generation_prompt,
+            continue_final_message=not add_generation_prompt,
+        )
+
     def _detect_full_logits(self) -> bool:
         """Check if the model returns logits for all input positions or only the last."""
         try:

From 8ae13c60f49adca301c6a5b1a7d696a0f74a32c3 Mon Sep 17 00:00:00 2001
From: Yuri Khrustalev
Date: Tue, 12 May 2026 18:16:08 -0400
Subject: [PATCH 2/4] Address review feedback: lazy HF tokenizer load,
 cross-platform cache key, tests

- Lazy-load the HF tokenizer on the first ``apply_chat_template`` call
  rather than at ``__init__``.
  Callers that never enable chat templating no longer need HF tokenizer
  files (``tokenizer_config.json`` etc.) in the model directory; eager
  loading would have regressed those workflows.

- ``tokenizer_name`` now replaces both POSIX and Windows path separators
  with ``__`` so the lm-eval cache identifier is stable across platforms.
  The previous implementation only handled forward slashes, leaving
  backslashes in the key on Windows because ``str(Path(...))`` preserves
  the native separator.

- Add unit tests for both behaviors:
  - ``tokenizer_name`` parametrized over POSIX, relative, and Windows-style
    paths to lock in the normalization contract.
  - ``apply_chat_template`` verified to (a) not load the HF tokenizer at
    construction, (b) load once on first call, and (c) reuse the cached
    tokenizer on subsequent calls. ``AutoTokenizer`` is patched so the
    tests run without any HF tokenizer files on disk.

All four new tests pass; ``test_olive_evaluator.py`` as a whole stays green
(85 passed). ``lintrunner`` reports no new warnings on the changed files.
---
 olive/evaluator/lmeval_ort.py          | 27 +++++++++++++++++++--------
 test/evaluator/test_olive_evaluator.py | 54 ++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 8 deletions(-)

diff --git a/olive/evaluator/lmeval_ort.py b/olive/evaluator/lmeval_ort.py
index f15b60c3a..5d70b0056 100644
--- a/olive/evaluator/lmeval_ort.py
+++ b/olive/evaluator/lmeval_ort.py
@@ -498,10 +498,11 @@ def __init__(
             self.config.set_provider_option(ep, key, value)
         self.model = og.Model(self.config)
         self.tokenizer = og.Tokenizer(self.model)
-        # HF tokenizer kept solely to render `apply_chat_template`; generation
-        # still uses og.Tokenizer above.
+        # HF tokenizer is loaded lazily by `apply_chat_template` on first use,
+        # so callers that don't enable chat templating do not need HF tokenizer
+        # files (tokenizer_config.json, etc.) in the model directory.
         self._pretrained = str(pretrained)
-        self._hf_tokenizer = AutoTokenizer.from_pretrained(self._pretrained)
+        self._hf_tokenizer: AutoTokenizer | None = None

         # consider adding auto batch sizes
         self.batch_size = int(batch_size)
@@ -527,15 +528,25 @@ def __init__(

     @property
     def tokenizer_name(self) -> str:
-        """Identifier used by lm-eval for chat-template-aware caching."""
-        return self._pretrained.replace("/", "__")
+        r"""Stable identifier used by lm-eval's chat-template-aware result cache.
+
+        Replace both POSIX and Windows path separators so the cache key does
+        not embed raw path separators (a Windows path like ``C:\models\foo``
+        and the POSIX form ``C:/models/foo`` produce the same identifier).
+        """
+        return self._pretrained.replace("\\", "__").replace("/", "__")

     def apply_chat_template(self, chat_history: list[dict], add_generation_prompt: bool = True) -> str:
-        """Render a chat history through the model's HF chat template.
+        """Render a chat history through the model's HuggingFace chat template.

-        Required by lm-eval when `apply_chat_template=True` is passed to
-        `simple_evaluate`; without it, lm-eval raises NotImplementedError.
+        Required by lm-eval when ``apply_chat_template=True`` is passed to
+        ``simple_evaluate``; without it, lm-eval raises ``NotImplementedError``
+        at task setup. The HF tokenizer is loaded on first call so callers that
+        never enable chat templating are not required to ship HF tokenizer
+        files alongside the ONNX model.
""" + if self._hf_tokenizer is None: + self._hf_tokenizer = AutoTokenizer.from_pretrained(self._pretrained) return self._hf_tokenizer.apply_chat_template( chat_history, tokenize=False, diff --git a/test/evaluator/test_olive_evaluator.py b/test/evaluator/test_olive_evaluator.py index e295d069a..bdd10fd65 100644 --- a/test/evaluator/test_olive_evaluator.py +++ b/test/evaluator/test_olive_evaluator.py @@ -510,3 +510,57 @@ def test_lm_evaluator_dispatches_to_requested_backend( evaluator.evaluate(model, metrics=[], device=Device.CPU, execution_providers=["CPUExecutionProvider"]) get_model_mock.assert_called_once_with(model_class) + + +class TestLMEvalORTGenAIChatTemplate: + """Cover the chat-template hooks added to LMEvalORTGenAIEvaluator. + + The hooks must work without instantiating the full evaluator (which requires + an ONNX model + onnxruntime-genai), so the tests skip ``__init__`` and + exercise the methods directly on a bare instance. + """ + + def _bare_instance(self, pretrained: str): + # pylint: disable=protected-access + from olive.evaluator.lmeval_ort import LMEvalORTGenAIEvaluator + + instance = LMEvalORTGenAIEvaluator.__new__(LMEvalORTGenAIEvaluator) + instance._pretrained = pretrained + instance._hf_tokenizer = None + return instance + + @pytest.mark.parametrize( + ("pretrained", "expected"), + [ + ("/models/lfm2-350m", "__models__lfm2-350m"), + ("relative/path/model", "relative__path__model"), + # Windows-style separators must normalize identically to their POSIX form + # so the cache key is stable across platforms. + ("C:\\models\\lfm2-350m", "C:__models__lfm2-350m"), + ], + ) + def test_tokenizer_name_normalizes_separators(self, pretrained, expected): + assert self._bare_instance(pretrained).tokenizer_name == expected + + @patch("olive.evaluator.lmeval_ort.AutoTokenizer") + def test_apply_chat_template_lazy_loads_hf_tokenizer(self, auto_tokenizer_mock): + chat_history = [{"role": "user", "content": "hello"}] + mock_tokenizer = MagicMock() + mock_tokenizer.apply_chat_template.return_value = "rendered prompt" + auto_tokenizer_mock.from_pretrained.return_value = mock_tokenizer + + instance = self._bare_instance("/models/lfm2") + + auto_tokenizer_mock.from_pretrained.assert_not_called() # not loaded at construction + assert instance.apply_chat_template(chat_history) == "rendered prompt" + auto_tokenizer_mock.from_pretrained.assert_called_once_with("/models/lfm2") + + # Subsequent calls reuse the cached tokenizer rather than reloading it. 
+        instance.apply_chat_template(chat_history, add_generation_prompt=False)
+        auto_tokenizer_mock.from_pretrained.assert_called_once()
+        mock_tokenizer.apply_chat_template.assert_called_with(
+            chat_history,
+            tokenize=False,
+            add_generation_prompt=False,
+            continue_final_message=True,
+        )

From e174f5993a2874891ccd52f637f3f24826272783 Mon Sep 17 00:00:00 2001
From: Yuri Khrustalev
Date: Tue, 12 May 2026 18:30:16 -0400
Subject: [PATCH 3/4] Trim comments and docstrings on chat-template hooks

---
 olive/evaluator/lmeval_ort.py          | 17 -----------------
 test/evaluator/test_olive_evaluator.py | 12 +-----------
 2 files changed, 1 insertion(+), 28 deletions(-)

diff --git a/olive/evaluator/lmeval_ort.py b/olive/evaluator/lmeval_ort.py
index 5d70b0056..95343f7c2 100644
--- a/olive/evaluator/lmeval_ort.py
+++ b/olive/evaluator/lmeval_ort.py
@@ -498,9 +498,6 @@ def __init__(
             self.config.set_provider_option(ep, key, value)
         self.model = og.Model(self.config)
         self.tokenizer = og.Tokenizer(self.model)
-        # HF tokenizer is loaded lazily by `apply_chat_template` on first use,
-        # so callers that don't enable chat templating do not need HF tokenizer
-        # files (tokenizer_config.json, etc.) in the model directory.
         self._pretrained = str(pretrained)
         self._hf_tokenizer: AutoTokenizer | None = None

@@ -528,23 +525,9 @@ def __init__(

     @property
     def tokenizer_name(self) -> str:
-        r"""Stable identifier used by lm-eval's chat-template-aware result cache.
-
-        Replace both POSIX and Windows path separators so the cache key does
-        not embed raw path separators (a Windows path like ``C:\models\foo``
-        and the POSIX form ``C:/models/foo`` produce the same identifier).
-        """
         return self._pretrained.replace("\\", "__").replace("/", "__")

     def apply_chat_template(self, chat_history: list[dict], add_generation_prompt: bool = True) -> str:
-        """Render a chat history through the model's HuggingFace chat template.
-
-        Required by lm-eval when ``apply_chat_template=True`` is passed to
-        ``simple_evaluate``; without it, lm-eval raises ``NotImplementedError``
-        at task setup. The HF tokenizer is loaded on first call so callers that
-        never enable chat templating are not required to ship HF tokenizer
-        files alongside the ONNX model.
-        """
         if self._hf_tokenizer is None:
             self._hf_tokenizer = AutoTokenizer.from_pretrained(self._pretrained)
         return self._hf_tokenizer.apply_chat_template(

diff --git a/test/evaluator/test_olive_evaluator.py b/test/evaluator/test_olive_evaluator.py
index bdd10fd65..43dc8ca93 100644
--- a/test/evaluator/test_olive_evaluator.py
+++ b/test/evaluator/test_olive_evaluator.py
@@ -513,13 +513,6 @@ def test_lm_evaluator_dispatches_to_requested_backend(


 class TestLMEvalORTGenAIChatTemplate:
-    """Cover the chat-template hooks added to LMEvalORTGenAIEvaluator.
-
-    The hooks must work without instantiating the full evaluator (which requires
-    an ONNX model + onnxruntime-genai), so the tests skip ``__init__`` and
-    exercise the methods directly on a bare instance.
-    """
-
     def _bare_instance(self, pretrained: str):
         # pylint: disable=protected-access
         from olive.evaluator.lmeval_ort import LMEvalORTGenAIEvaluator
@@ -534,8 +527,6 @@ def _bare_instance(self, pretrained: str):
         [
             ("/models/lfm2-350m", "__models__lfm2-350m"),
             ("relative/path/model", "relative__path__model"),
-            # Windows-style separators must normalize identically to their POSIX form
-            # so the cache key is stable across platforms.
("C:\\models\\lfm2-350m", "C:__models__lfm2-350m"), ], ) @@ -551,11 +542,10 @@ def test_apply_chat_template_lazy_loads_hf_tokenizer(self, auto_tokenizer_mock): instance = self._bare_instance("/models/lfm2") - auto_tokenizer_mock.from_pretrained.assert_not_called() # not loaded at construction + auto_tokenizer_mock.from_pretrained.assert_not_called() assert instance.apply_chat_template(chat_history) == "rendered prompt" auto_tokenizer_mock.from_pretrained.assert_called_once_with("/models/lfm2") - # Subsequent calls reuse the cached tokenizer rather than reloading it. instance.apply_chat_template(chat_history, add_generation_prompt=False) auto_tokenizer_mock.from_pretrained.assert_called_once() mock_tokenizer.apply_chat_template.assert_called_with( From 34ff372572b55655ca9d2aa75659de6516aba086 Mon Sep 17 00:00:00 2001 From: Yuri Khrustalev Date: Tue, 12 May 2026 18:33:53 -0400 Subject: [PATCH 4/4] Use object.__new__ in chat-template test helper to silence pylint E1120 --- test/evaluator/test_olive_evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/evaluator/test_olive_evaluator.py b/test/evaluator/test_olive_evaluator.py index 43dc8ca93..251ab619c 100644 --- a/test/evaluator/test_olive_evaluator.py +++ b/test/evaluator/test_olive_evaluator.py @@ -517,7 +517,7 @@ def _bare_instance(self, pretrained: str): # pylint: disable=protected-access from olive.evaluator.lmeval_ort import LMEvalORTGenAIEvaluator - instance = LMEvalORTGenAIEvaluator.__new__(LMEvalORTGenAIEvaluator) + instance = object.__new__(LMEvalORTGenAIEvaluator) instance._pretrained = pretrained instance._hf_tokenizer = None return instance