From 7017132c32d8ea2caaaf9f65adf679b9e96135d7 Mon Sep 17 00:00:00 2001
From: Yuri Khrustalev
Date: Tue, 12 May 2026 17:06:11 -0400
Subject: [PATCH 1/4] Add chat-template hooks to LMEvalORTGenAIEvaluator

lm-eval's `simple_evaluate(..., apply_chat_template=True)` requires the
underlying LM class to implement `tokenizer_name` and `apply_chat_template`.
The HFLM backend has both; the ORT GenAI backend does not, so any attempt
to evaluate a chat-tuned ONNX model with chat-formatted prompts raises
`NotImplementedError: To use this model with chat templates, please
implement the 'tokenizer_name' property.`

This adds the two members with minimal surface area:

- `tokenizer_name` returns the model path (for lm-eval's chat-aware result
  caching), matching the HFLM convention of slash-replacement.
- `apply_chat_template` defers to the model's HF tokenizer via
  `AutoTokenizer.apply_chat_template`, mirroring HFLM's implementation.

The HF tokenizer is loaded once at `__init__` purely for chat-template
rendering; token-level encode/decode still goes through `og.Tokenizer` and
the runtime, so there is no change to generation behavior or any existing
code path.

Verified end-to-end on LFM2.5-350M (int4, k_quant_mixed) with MBPP: without
chat-template hooks the eval raised at task start; with them plus
`num_fewshot=0` and a chat-friendly stop list, pass@1 went from 0/500 to
67/500 (13.4%) -- the original zero was a prompt-format artifact (instruct
model + completion-style few-shot), not a conversion regression.
---
 olive/evaluator/lmeval_ort.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/olive/evaluator/lmeval_ort.py b/olive/evaluator/lmeval_ort.py
index c4a158533..f15b60c3a 100644
--- a/olive/evaluator/lmeval_ort.py
+++ b/olive/evaluator/lmeval_ort.py
@@ -498,6 +498,10 @@ def __init__(
             self.config.set_provider_option(ep, key, value)
         self.model = og.Model(self.config)
         self.tokenizer = og.Tokenizer(self.model)
+        # HF tokenizer kept solely to render `apply_chat_template`; generation
+        # still uses og.Tokenizer above.
+        self._pretrained = str(pretrained)
+        self._hf_tokenizer = AutoTokenizer.from_pretrained(self._pretrained)

         # consider adding auto batch sizes
         self.batch_size = int(batch_size)
@@ -521,6 +525,24 @@ def __init__(
         self.device = device
         self._returns_full_logits = self._detect_full_logits()

+    @property
+    def tokenizer_name(self) -> str:
+        """Identifier used by lm-eval for chat-template-aware caching."""
+        return self._pretrained.replace("/", "__")
+
+    def apply_chat_template(self, chat_history: list[dict], add_generation_prompt: bool = True) -> str:
+        """Render a chat history through the model's HF chat template.
+
+        Required by lm-eval when `apply_chat_template=True` is passed to
+        `simple_evaluate`; without it, lm-eval raises NotImplementedError.
+        """
+        return self._hf_tokenizer.apply_chat_template(
+            chat_history,
+            tokenize=False,
+            add_generation_prompt=add_generation_prompt,
+            continue_final_message=not add_generation_prompt,
+        )
+
     def _detect_full_logits(self) -> bool:
         """Check if the model returns logits for all input positions or only the last."""
         try:

From 8ae13c60f49adca301c6a5b1a7d696a0f74a32c3 Mon Sep 17 00:00:00 2001
From: Yuri Khrustalev
Date: Tue, 12 May 2026 18:16:08 -0400
Subject: [PATCH 2/4] Address review feedback: lazy HF tokenizer load,
 cross-platform cache key, tests

- Lazy-load the HF tokenizer on the first ``apply_chat_template`` call
  rather than at ``__init__``.
  Callers that never enable chat templating no longer need HF tokenizer
  files (``tokenizer_config.json`` etc.) in the model directory; eager
  loading would have regressed those workflows.

- ``tokenizer_name`` now replaces both POSIX and Windows path separators
  with ``__`` so the lm-eval cache identifier is stable across platforms.
  The previous implementation only handled forward slashes, leaving
  backslashes in the key on Windows because ``str(Path(...))`` preserves
  the native separator.

- Add unit tests for both behaviors:
  - ``tokenizer_name`` parametrized over POSIX, relative, and Windows-style
    paths to lock in the normalization contract.
  - ``apply_chat_template`` verified to (a) not load the HF tokenizer at
    construction, (b) load once on first call, and (c) reuse the cached
    tokenizer on subsequent calls. ``AutoTokenizer`` is patched so the
    tests run without any HF tokenizer files on disk.

All four new tests pass; ``test_olive_evaluator.py`` as a whole stays green
(85 passed). ``lintrunner`` reports no new warnings on the changed files.
---
 olive/evaluator/lmeval_ort.py          | 27 +++++++++++++++++++--------
 test/evaluator/test_olive_evaluator.py | 54 ++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 8 deletions(-)

diff --git a/olive/evaluator/lmeval_ort.py b/olive/evaluator/lmeval_ort.py
index f15b60c3a..5d70b0056 100644
--- a/olive/evaluator/lmeval_ort.py
+++ b/olive/evaluator/lmeval_ort.py
@@ -498,10 +498,11 @@ def __init__(
             self.config.set_provider_option(ep, key, value)
         self.model = og.Model(self.config)
         self.tokenizer = og.Tokenizer(self.model)
-        # HF tokenizer kept solely to render `apply_chat_template`; generation
-        # still uses og.Tokenizer above.
+        # HF tokenizer is loaded lazily by `apply_chat_template` on first use,
+        # so callers that don't enable chat templating do not need HF tokenizer
+        # files (tokenizer_config.json, etc.) in the model directory.
         self._pretrained = str(pretrained)
-        self._hf_tokenizer = AutoTokenizer.from_pretrained(self._pretrained)
+        self._hf_tokenizer: AutoTokenizer | None = None

         # consider adding auto batch sizes
         self.batch_size = int(batch_size)
@@ -527,15 +528,25 @@ def __init__(

     @property
     def tokenizer_name(self) -> str:
-        """Identifier used by lm-eval for chat-template-aware caching."""
-        return self._pretrained.replace("/", "__")
+        r"""Stable identifier used by lm-eval's chat-template-aware result cache.
+
+        Replace both POSIX and Windows path separators so the cache key does
+        not embed raw path separators (a Windows path like ``C:\models\foo``
+        and the POSIX form ``C:/models/foo`` produce the same identifier).
+        """
+        return self._pretrained.replace("\\", "__").replace("/", "__")

     def apply_chat_template(self, chat_history: list[dict], add_generation_prompt: bool = True) -> str:
-        """Render a chat history through the model's HF chat template.
+        """Render a chat history through the model's HuggingFace chat template.

-        Required by lm-eval when `apply_chat_template=True` is passed to
-        `simple_evaluate`; without it, lm-eval raises NotImplementedError.
+        Required by lm-eval when ``apply_chat_template=True`` is passed to
+        ``simple_evaluate``; without it, lm-eval raises ``NotImplementedError``
+        at task setup. The HF tokenizer is loaded on first call so callers that
+        never enable chat templating are not required to ship HF tokenizer
+        files alongside the ONNX model.
""" + if self._hf_tokenizer is None: + self._hf_tokenizer = AutoTokenizer.from_pretrained(self._pretrained) return self._hf_tokenizer.apply_chat_template( chat_history, tokenize=False, diff --git a/test/evaluator/test_olive_evaluator.py b/test/evaluator/test_olive_evaluator.py index e295d069a..bdd10fd65 100644 --- a/test/evaluator/test_olive_evaluator.py +++ b/test/evaluator/test_olive_evaluator.py @@ -510,3 +510,57 @@ def test_lm_evaluator_dispatches_to_requested_backend( evaluator.evaluate(model, metrics=[], device=Device.CPU, execution_providers=["CPUExecutionProvider"]) get_model_mock.assert_called_once_with(model_class) + + +class TestLMEvalORTGenAIChatTemplate: + """Cover the chat-template hooks added to LMEvalORTGenAIEvaluator. + + The hooks must work without instantiating the full evaluator (which requires + an ONNX model + onnxruntime-genai), so the tests skip ``__init__`` and + exercise the methods directly on a bare instance. + """ + + def _bare_instance(self, pretrained: str): + # pylint: disable=protected-access + from olive.evaluator.lmeval_ort import LMEvalORTGenAIEvaluator + + instance = LMEvalORTGenAIEvaluator.__new__(LMEvalORTGenAIEvaluator) + instance._pretrained = pretrained + instance._hf_tokenizer = None + return instance + + @pytest.mark.parametrize( + ("pretrained", "expected"), + [ + ("/models/lfm2-350m", "__models__lfm2-350m"), + ("relative/path/model", "relative__path__model"), + # Windows-style separators must normalize identically to their POSIX form + # so the cache key is stable across platforms. + ("C:\\models\\lfm2-350m", "C:__models__lfm2-350m"), + ], + ) + def test_tokenizer_name_normalizes_separators(self, pretrained, expected): + assert self._bare_instance(pretrained).tokenizer_name == expected + + @patch("olive.evaluator.lmeval_ort.AutoTokenizer") + def test_apply_chat_template_lazy_loads_hf_tokenizer(self, auto_tokenizer_mock): + chat_history = [{"role": "user", "content": "hello"}] + mock_tokenizer = MagicMock() + mock_tokenizer.apply_chat_template.return_value = "rendered prompt" + auto_tokenizer_mock.from_pretrained.return_value = mock_tokenizer + + instance = self._bare_instance("/models/lfm2") + + auto_tokenizer_mock.from_pretrained.assert_not_called() # not loaded at construction + assert instance.apply_chat_template(chat_history) == "rendered prompt" + auto_tokenizer_mock.from_pretrained.assert_called_once_with("/models/lfm2") + + # Subsequent calls reuse the cached tokenizer rather than reloading it. 
+        instance.apply_chat_template(chat_history, add_generation_prompt=False)
+        auto_tokenizer_mock.from_pretrained.assert_called_once()
+        mock_tokenizer.apply_chat_template.assert_called_with(
+            chat_history,
+            tokenize=False,
+            add_generation_prompt=False,
+            continue_final_message=True,
+        )

From e174f5993a2874891ccd52f637f3f24826272783 Mon Sep 17 00:00:00 2001
From: Yuri Khrustalev
Date: Tue, 12 May 2026 18:30:16 -0400
Subject: [PATCH 3/4] Trim comments and docstrings on chat-template hooks

---
 olive/evaluator/lmeval_ort.py          | 17 -----------------
 test/evaluator/test_olive_evaluator.py | 12 +-----------
 2 files changed, 1 insertion(+), 28 deletions(-)

diff --git a/olive/evaluator/lmeval_ort.py b/olive/evaluator/lmeval_ort.py
index 5d70b0056..95343f7c2 100644
--- a/olive/evaluator/lmeval_ort.py
+++ b/olive/evaluator/lmeval_ort.py
@@ -498,9 +498,6 @@ def __init__(
             self.config.set_provider_option(ep, key, value)
         self.model = og.Model(self.config)
         self.tokenizer = og.Tokenizer(self.model)
-        # HF tokenizer is loaded lazily by `apply_chat_template` on first use,
-        # so callers that don't enable chat templating do not need HF tokenizer
-        # files (tokenizer_config.json, etc.) in the model directory.
         self._pretrained = str(pretrained)
         self._hf_tokenizer: AutoTokenizer | None = None

@@ -528,23 +525,9 @@ def __init__(

     @property
     def tokenizer_name(self) -> str:
-        r"""Stable identifier used by lm-eval's chat-template-aware result cache.
-
-        Replace both POSIX and Windows path separators so the cache key does
-        not embed raw path separators (a Windows path like ``C:\models\foo``
-        and the POSIX form ``C:/models/foo`` produce the same identifier).
-        """
         return self._pretrained.replace("\\", "__").replace("/", "__")

     def apply_chat_template(self, chat_history: list[dict], add_generation_prompt: bool = True) -> str:
-        """Render a chat history through the model's HuggingFace chat template.
-
-        Required by lm-eval when ``apply_chat_template=True`` is passed to
-        ``simple_evaluate``; without it, lm-eval raises ``NotImplementedError``
-        at task setup. The HF tokenizer is loaded on first call so callers that
-        never enable chat templating are not required to ship HF tokenizer
-        files alongside the ONNX model.
-        """
         if self._hf_tokenizer is None:
             self._hf_tokenizer = AutoTokenizer.from_pretrained(self._pretrained)
         return self._hf_tokenizer.apply_chat_template(

diff --git a/test/evaluator/test_olive_evaluator.py b/test/evaluator/test_olive_evaluator.py
index bdd10fd65..43dc8ca93 100644
--- a/test/evaluator/test_olive_evaluator.py
+++ b/test/evaluator/test_olive_evaluator.py
@@ -513,13 +513,6 @@ def test_lm_evaluator_dispatches_to_requested_backend(


 class TestLMEvalORTGenAIChatTemplate:
-    """Cover the chat-template hooks added to LMEvalORTGenAIEvaluator.
-
-    The hooks must work without instantiating the full evaluator (which requires
-    an ONNX model + onnxruntime-genai), so the tests skip ``__init__`` and
-    exercise the methods directly on a bare instance.
-    """
-
     def _bare_instance(self, pretrained: str):
         # pylint: disable=protected-access
         from olive.evaluator.lmeval_ort import LMEvalORTGenAIEvaluator
@@ -534,8 +527,6 @@ def _bare_instance(self, pretrained: str):
         [
             ("/models/lfm2-350m", "__models__lfm2-350m"),
             ("relative/path/model", "relative__path__model"),
-            # Windows-style separators must normalize identically to their POSIX form
-            # so the cache key is stable across platforms.
("C:\\models\\lfm2-350m", "C:__models__lfm2-350m"), ], ) @@ -551,11 +542,10 @@ def test_apply_chat_template_lazy_loads_hf_tokenizer(self, auto_tokenizer_mock): instance = self._bare_instance("/models/lfm2") - auto_tokenizer_mock.from_pretrained.assert_not_called() # not loaded at construction + auto_tokenizer_mock.from_pretrained.assert_not_called() assert instance.apply_chat_template(chat_history) == "rendered prompt" auto_tokenizer_mock.from_pretrained.assert_called_once_with("/models/lfm2") - # Subsequent calls reuse the cached tokenizer rather than reloading it. instance.apply_chat_template(chat_history, add_generation_prompt=False) auto_tokenizer_mock.from_pretrained.assert_called_once() mock_tokenizer.apply_chat_template.assert_called_with( From 34ff372572b55655ca9d2aa75659de6516aba086 Mon Sep 17 00:00:00 2001 From: Yuri Khrustalev Date: Tue, 12 May 2026 18:33:53 -0400 Subject: [PATCH 4/4] Use object.__new__ in chat-template test helper to silence pylint E1120 --- test/evaluator/test_olive_evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/evaluator/test_olive_evaluator.py b/test/evaluator/test_olive_evaluator.py index 43dc8ca93..251ab619c 100644 --- a/test/evaluator/test_olive_evaluator.py +++ b/test/evaluator/test_olive_evaluator.py @@ -517,7 +517,7 @@ def _bare_instance(self, pretrained: str): # pylint: disable=protected-access from olive.evaluator.lmeval_ort import LMEvalORTGenAIEvaluator - instance = LMEvalORTGenAIEvaluator.__new__(LMEvalORTGenAIEvaluator) + instance = object.__new__(LMEvalORTGenAIEvaluator) instance._pretrained = pretrained instance._hf_tokenizer = None return instance