diff --git a/CHANGELOG.md b/CHANGELOG.md
index be4ade61df..90d79abbfa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 - feat: Update llama.cpp to ggml-org/llama.cpp@d749821db
+- fix: model fails to load when chat template uses HuggingFace generation tags by @tobocop2 in #2226
 - docs: add contributing guide by @abetlen in #2229
 - chore: Migrate llama.cpp submodule URL to ggml-org/llama.cpp by @shalinib-ibm in #2034
 - fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 1024fb85b9..f24b89f3ef 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -24,6 +24,7 @@
 )
 
 import jinja2
+from jinja2.ext import Extension
 from jinja2.sandbox import ImmutableSandboxedEnvironment
 
 import numpy as np
@@ -192,6 +193,34 @@ def __call__(
 
 
 class Jinja2ChatFormatter(ChatFormatter):
+    class IgnoreGenerationTags(Extension):
+        """Pass-through for HuggingFace's ``{% generation %}`` chat-template tag.
+
+        Jinja2 has no built-in ``generation`` tag, so a template that uses
+        ``{% generation %}...{% endgeneration %}`` cannot be compiled unless an
+        extension claims the tag. This extension claims it and emits the
+        enclosed body unchanged, as if the tags were not there.
+        """
+
+        # Tag names this extension asks the Jinja2 parser to route to ``parse``.
+        tags = {"generation"}
+
+        def parse(self, parser: jinja2.parser.Parser):
+            """Parse a ``generation`` block and return its body nodes.
+
+            Args:
+                parser: The active Jinja2 parser, positioned on the ``generation``
+                    name token that triggered this extension.
+
+            Returns:
+                The parsed body nodes, which Jinja2 splices into the template in
+                place of the whole tag block.
+            """
+            # Step over the ``generation`` name token itself.
+            parser.stream.skip(1)
+            # Parse up to the closing tag; drop_needle also consumes ``endgeneration``.
+            return parser.parse_statements(("name:endgeneration",), drop_needle=True)
+
     def __init__(
         self,
         template: str,
@@ -213,6 +242,7 @@ def __init__(
             loader=jinja2.BaseLoader(),
             trim_blocks=True,
             lstrip_blocks=True,
+            extensions=[Jinja2ChatFormatter.IgnoreGenerationTags],
         ).from_string(self.template)
 
     @staticmethod