Skip to content

Commit bc9464c

Browse files
committed
fix(chat_format): parse Gemma 4 native tool-call tokens into tool_calls (#2227)
Adds @register_chat_completion_handler("gemma4") that: 1. Uses the GGUF-embedded Jinja2 chat template to render prompts (Gemma 4 GGUFs ship a correct one out of the box). 2. After generation, parses Gemma 4 native tool-call tokens <|tool_call>call:NAME{key:value,...}<tool_call|> into OpenAI-compatible tool_calls on the assistant message, and strips the optional <|channel>thought ... <channel|> block emitted when thinking mode is enabled. Argument-value grammar follows the spec at https://ai.google.dev/gemma/docs/core/prompt-formatting-gemma4 : strings via <|"|>...<|"|>, primitives (int/float/bool/null) bare, lists via [v1,v2,...]. The 5-character <|"|> delimiter means a literal double quote inside a string value never terminates it, so no escaping is needed. Mirrors the PEG-grammar fix the C++ side already shipped in ggml-org/llama.cpp#21326. Non-streaming responses get parsed tool calls; streaming responses pass chunks through unchanged for now (callers can buffer the chunks and re-parse with the module-private _parse_gemma4_native_tool_calls helper). Tests cover: issue repro, mixed primitives, list-of-strings, thought-block stripping, plain-text passthrough, multiple calls, surrounding plain text, and embedded quotes in string values. Closes #2227 🤖 Generated with [Claude Code](https://claude.com/claude-code)
1 parent 3bda091 commit bc9464c

2 files changed

Lines changed: 380 additions & 0 deletions

File tree

llama_cpp/llama_chat_format.py

Lines changed: 275 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import os
4+
import re
45
import sys
56
import json
67
import ctypes
@@ -4026,3 +4027,277 @@ def chatml_function_calling(
40264027
}
40274028

40284029
raise ValueError("Automatic streaming tool choice is not supported")
4030+
4031+
4032+
# ==========================================================================
4033+
# Gemma 4 native tool-call parsing + chat completion handler
4034+
# ==========================================================================
4035+
#
4036+
# Gemma 4 (released 2026-04-02) emits tool calls as native tokens of the form:
4037+
#
4038+
# <|tool_call>call:FUNCTION_NAME{key:value,key:value,...}<tool_call|>
4039+
#
4040+
# Argument values are encoded by type:
4041+
# - string : key:<|"|>value<|"|>
4042+
# - int : key:30
4043+
# - float : key:3.5
4044+
# - bool : key:true / key:false
4045+
# - null : key:null
4046+
# - list : key:[v1,v2,...] (element grammar is the same as above)
4047+
#
4048+
# An optional thinking-mode block precedes the tool call when thinking is on:
4049+
#
4050+
# <|channel>thought ... <channel|><|tool_call>call:...<tool_call|>
4051+
#
4052+
# Without a dedicated handler `create_chat_completion()` returns these tokens
4053+
# verbatim in `message.content` and `tool_calls` stays `None`, which silently
4054+
# breaks every OpenAI-compatible Gemma 4 client (issue #2227). The C++ server
4055+
# parses the same tokens via the PEG grammar added in
4056+
# https://github.com/ggml-org/llama.cpp/pull/21326 ; this is the Python port.
4057+
4058+
_GEMMA4_TOOL_CALL_RE = re.compile(
4059+
r"<\|tool_call>\s*call:(?P<name>[A-Za-z_][A-Za-z0-9_]*)\s*\{(?P<args>.*?)\}\s*<tool_call\|>",
4060+
re.DOTALL,
4061+
)
4062+
_GEMMA4_THOUGHT_RE = re.compile(r"<\|channel>\s*thought.*?<channel\|>", re.DOTALL)
4063+
_GEMMA4_STR_DELIM = '<|"|>'
4064+
4065+
4066+
def _gemma4_parse_value(s: str, pos: int) -> Tuple[Any, int]:
4067+
"""Parse a single Gemma 4 value starting at ``s[pos]``.
4068+
4069+
Returns ``(value, new_pos)`` where ``new_pos`` points just past the value.
4070+
"""
4071+
while pos < len(s) and s[pos].isspace():
4072+
pos += 1
4073+
# string literal: <|"|>...<|"|>
4074+
if s.startswith(_GEMMA4_STR_DELIM, pos):
4075+
start = pos + len(_GEMMA4_STR_DELIM)
4076+
end = s.find(_GEMMA4_STR_DELIM, start)
4077+
if end < 0:
4078+
return s[start:], len(s)
4079+
return s[start:end], end + len(_GEMMA4_STR_DELIM)
4080+
# list literal: [v1,v2,...]
4081+
if pos < len(s) and s[pos] == "[":
4082+
items: List[Any] = []
4083+
pos += 1
4084+
while pos < len(s):
4085+
while pos < len(s) and s[pos].isspace():
4086+
pos += 1
4087+
if pos < len(s) and s[pos] == "]":
4088+
return items, pos + 1
4089+
val, pos = _gemma4_parse_value(s, pos)
4090+
items.append(val)
4091+
while pos < len(s) and s[pos] in " \t,":
4092+
pos += 1
4093+
return items, pos
4094+
# primitive literal: read until separator
4095+
start = pos
4096+
while pos < len(s) and s[pos] not in ",}]":
4097+
pos += 1
4098+
raw = s[start:pos].strip()
4099+
if raw == "true":
4100+
return True, pos
4101+
if raw == "false":
4102+
return False, pos
4103+
if raw == "null":
4104+
return None, pos
4105+
try:
4106+
if "." in raw or "e" in raw.lower():
4107+
return float(raw), pos
4108+
return int(raw), pos
4109+
except ValueError:
4110+
return raw, pos
4111+
4112+
4113+
def _gemma4_parse_args(args_str: str) -> Dict[str, Any]:
4114+
"""Parse the inside of a Gemma 4 ``{...}`` arg block into ``{name: value}``."""
4115+
out: Dict[str, Any] = {}
4116+
pos = 0
4117+
while pos < len(args_str):
4118+
m = re.match(r"\s*([A-Za-z_][A-Za-z0-9_]*)\s*:", args_str[pos:])
4119+
if not m:
4120+
break
4121+
key = m.group(1)
4122+
pos += m.end()
4123+
val, pos = _gemma4_parse_value(args_str, pos)
4124+
out[key] = val
4125+
sep = re.match(r"\s*,\s*", args_str[pos:])
4126+
pos += sep.end() if sep else 0
4127+
return out
4128+
4129+
4130+
def _parse_gemma4_native_tool_calls(
    text: str,
) -> Tuple[Optional[str], Optional[List[Dict[str, Any]]]]:
    """Split a Gemma 4 completion into plain content and OpenAI-style tool calls.

    The thinking-mode block is removed first, then every complete
    ``<|tool_call>call:NAME{...}<tool_call|>`` token is converted into a
    ``{"id", "type", "function"}`` dict whose ``arguments`` field is a JSON
    string.

    Returns ``(content_remainder, tool_calls)``:

    * No tool call could be extracted: ``(text, None)``. ``text`` is the
      untouched input, so a thought block, if any, is *not* stripped here.
    * Otherwise: the text left after removing the thought block and all
      matched tool calls (stripped; ``None`` when nothing is left), plus the
      list of tool calls in order of appearance.
    """
    without_thought = _GEMMA4_THOUGHT_RE.sub("", text)
    # Cheap substring test first; only run the regex when the opener exists.
    if "<|tool_call>" in without_thought:
        matches = list(_GEMMA4_TOOL_CALL_RE.finditer(without_thought))
    else:
        matches = []
    if not matches:
        return text, None

    hex_alphabet = string.hexdigits.lower()[:16]
    tool_calls: List[Dict[str, Any]] = []
    for i, match in enumerate(matches):
        name = match.group("name")
        parsed_args = _gemma4_parse_args(match.group("args"))
        # Random suffix keeps ids distinct even for repeated (index, name) pairs
        # across different completions.
        suffix = "".join(random.choices(hex_alphabet, k=8))
        function_payload = {"name": name, "arguments": json.dumps(parsed_args)}
        tool_calls.append(
            {
                "id": f"call_{i}_{name}_{suffix}",
                "type": "function",
                "function": function_payload,
            }
        )

    leftover = _GEMMA4_TOOL_CALL_RE.sub("", without_thought).strip()
    if not leftover:
        return None, tool_calls
    return leftover, tool_calls
4158+
4159+
4160+
@register_chat_completion_handler("gemma4")
def gemma4_chat_completion(
    llama: llama.Llama,
    messages: List[llama_types.ChatCompletionRequestMessage],
    functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
    function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
    tools: Optional[List[llama_types.ChatCompletionTool]] = None,
    tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
    temperature: float = 0.2,
    top_p: float = 0.95,
    top_k: int = 40,
    min_p: float = 0.05,
    typical_p: float = 1.0,
    stream: bool = False,
    stop: Optional[Union[str, List[str]]] = None,
    seed: Optional[int] = None,
    response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
    max_tokens: Optional[int] = None,
    presence_penalty: float = 0.0,
    frequency_penalty: float = 0.0,
    repeat_penalty: float = 1.1,
    tfs_z: float = 1.0,
    mirostat_mode: int = 0,
    mirostat_tau: float = 5.0,
    mirostat_eta: float = 0.1,
    model: Optional[str] = None,
    logits_processor: Optional[llama.LogitsProcessorList] = None,
    grammar: Optional[llama.LlamaGrammar] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    logprobs: Optional[bool] = None,
    top_logprobs: Optional[int] = None,
    **kwargs,  # type: ignore
) -> Union[
    llama_types.CreateChatCompletionResponse,
    Iterator[llama_types.CreateChatCompletionStreamResponse],
]:
    """Chat completion handler for Gemma 4 (issue #2227).

    The prompt is rendered with the Jinja2 chat template stored in the model
    metadata under ``tokenizer.chat_template``. For non-streaming requests
    the generated text is run through ``_parse_gemma4_native_tool_calls``;
    when tool calls are found the assistant message carries them in
    ``tool_calls``, its ``content`` is ``None`` and ``finish_reason`` is
    ``"tool_calls"``. Streaming requests are converted chunk by chunk with
    no tool-call parsing.

    Raises:
        ValueError: if the model metadata has no ``tokenizer.chat_template``.
    """
    model_metadata = getattr(llama, "metadata", None) or {}
    template = model_metadata.get("tokenizer.chat_template")
    if not template:
        raise ValueError(
            "chat_format='gemma4' requires a GGUF model with an embedded "
            "tokenizer.chat_template (Gemma 4 GGUFs ship one by default)."
        )

    # -1 is treated as "no such token"; fall back to an empty string then.
    eos_id = llama.token_eos()
    bos_id = llama.token_bos()
    eos_token = "" if eos_id == -1 else llama._model.token_get_text(eos_id)
    bos_token = "" if bos_id == -1 else llama._model.token_get_text(bos_id)

    render = Jinja2ChatFormatter(
        template=template,
        eos_token=eos_token,
        bos_token=bos_token,
        add_generation_prompt=True,
    )
    rendered = render(
        messages=messages,
        functions=functions,
        function_call=function_call,
        tools=tools,
        tool_choice=tool_choice,
    )
    prompt_tokens = llama.tokenize(
        rendered.prompt.encode("utf-8"),
        add_bos=not rendered.added_special,
        special=True,
    )

    # Caller-supplied stop sequences first, then the ones from the formatter.
    if not stop:
        stop_sequences: List[str] = []
    elif isinstance(stop, str):
        stop_sequences = [stop]
    else:
        stop_sequences = list(stop)
    formatter_stop = rendered.stop
    if formatter_stop is not None:
        if isinstance(formatter_stop, list):
            stop_sequences.extend(formatter_stop)
        else:
            stop_sequences.append(formatter_stop)

    # A json_object response format replaces any grammar passed by the caller.
    wants_json_object = (
        response_format is not None and response_format.get("type") == "json_object"
    )
    if wants_json_object:
        grammar = _grammar_for_response_format(response_format, verbose=llama.verbose)

    completion_or_chunks = llama.create_completion(
        prompt=prompt_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        min_p=min_p,
        typical_p=typical_p,
        logprobs=top_logprobs if logprobs else None,
        stream=stream,
        stop=stop_sequences,
        seed=seed,
        max_tokens=max_tokens,
        presence_penalty=presence_penalty,
        frequency_penalty=frequency_penalty,
        repeat_penalty=repeat_penalty,
        tfs_z=tfs_z,
        mirostat_mode=mirostat_mode,
        mirostat_tau=mirostat_tau,
        mirostat_eta=mirostat_eta,
        model=model,
        logits_processor=logits_processor,
        stopping_criteria=rendered.stopping_criteria,
        grammar=grammar,
        logit_bias=logit_bias,
    )

    if stream:
        # Streaming: chunks are converted as-is, tool-call tokens included.
        return _convert_completion_to_chat(completion_or_chunks, stream=True)

    completion = cast(llama_types.CreateCompletionResponse, completion_or_chunks)
    first_choice = completion["choices"][0]
    content, tool_calls = _parse_gemma4_native_tool_calls(first_choice["text"])

    message: Dict[str, Any]
    if tool_calls:
        message = {"role": "assistant", "content": None, "tool_calls": tool_calls}
        finish_reason = "tool_calls"
    else:
        message = {"role": "assistant", "content": content}
        finish_reason = first_choice["finish_reason"]

    empty_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
    chat_choice = {
        "index": 0,
        "finish_reason": finish_reason,
        "logprobs": _convert_text_completion_logprobs_to_chat(first_choice["logprobs"]),
        "message": message,
    }
    chat_response: llama_types.CreateChatCompletionResponse = {
        "id": "chat" + completion["id"],
        "object": "chat.completion",
        "created": completion["created"],
        "model": completion["model"],
        "choices": [chat_choice],
        "usage": completion.get("usage", empty_usage),
    }
    return chat_response

tests/test_llama_chat_format.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,3 +92,108 @@ def test_hf_tokenizer_config_str_to_chat_formatter():
9292
)
9393

9494
assert chat_formatter_respoonse.prompt == ("<s>[INST] Hello, world! [/INST]</s>")
95+
"""Append-only test module for Gemma 4 native tool-call parsing.
96+
97+
These tests intentionally exercise only the pure-Python parser
98+
(`_parse_gemma4_native_tool_calls`) so they run without a real GGUF
99+
model and without llama.cpp linkage.
100+
"""
101+
102+
import json
103+
104+
import llama_cpp.llama_chat_format as llama_chat_format
105+
106+
107+
def _parse(text):
    """Run the Gemma 4 native tool-call parser on ``text``.

    Returns the parser's ``(content, tool_calls)`` tuple unchanged.
    """
    parser = llama_chat_format._parse_gemma4_native_tool_calls
    return parser(text)
109+
110+
111+
def test_gemma4_parse_string_args():
    """String arguments are unwrapped from their ``<|"|>`` delimiters."""
    args_block = 'content:<|"|>print("hello")<|"|>,file_path:<|"|>hello.py<|"|>'
    text = "<|tool_call>call:write_file{" + args_block + "}<tool_call|>"

    content, tool_calls = _parse(text)

    # Nothing but the tool call was present, so no content remains.
    assert content is None
    assert tool_calls is not None and len(tool_calls) == 1
    function = tool_calls[0]["function"]
    assert function["name"] == "write_file"
    expected_args = {"content": 'print("hello")', "file_path": "hello.py"}
    assert json.loads(function["arguments"]) == expected_args
127+
128+
129+
def test_gemma4_parse_primitive_args():
    """Bare int, float, bool and null values map to their JSON equivalents."""
    args_block = "timeout:30,temperature:0.5,background:false,note:null"
    text = "<|tool_call>call:do_thing{" + args_block + "}<tool_call|>"

    _, tool_calls = _parse(text)

    decoded = json.loads(tool_calls[0]["function"]["arguments"])
    expected = {
        "timeout": 30,
        "temperature": 0.5,
        "background": False,
        "note": None,
    }
    assert decoded == expected
142+
143+
144+
def test_gemma4_parse_list_of_strings():
    """A bracketed list of delimited strings becomes a JSON array of strings."""
    args_block = 'files:[<|"|>a.py<|"|>,<|"|>b.py<|"|>]'
    text = "<|tool_call>call:read_files{" + args_block + "}<tool_call|>"

    _, tool_calls = _parse(text)

    decoded = json.loads(tool_calls[0]["function"]["arguments"])
    assert decoded == {"files": ["a.py", "b.py"]}
154+
155+
156+
def test_gemma4_strips_thought_block():
    """A leading thinking-mode block does not prevent tool-call extraction."""
    thought = "<|channel>thought\nLet me call the function.\n<channel|>"
    call = "<|tool_call>call:f{x:1}<tool_call|>"

    _, tool_calls = _parse(thought + call)

    assert tool_calls
    assert json.loads(tool_calls[0]["function"]["arguments"]) == {"x": 1}
163+
164+
165+
def test_gemma4_plain_text_passthrough():
    """Text without any tool-call token is returned untouched."""
    reply = "Just a normal reply with no tool call."

    content, tool_calls = _parse(reply)

    assert content == reply
    assert tool_calls is None
170+
171+
172+
def test_gemma4_multiple_tool_calls():
    """Back-to-back tool calls are all extracted, in order, with distinct ids."""
    first_call = "<|tool_call>call:a{x:1}<tool_call|>"
    second_call = '<|tool_call>call:b{y:<|"|>two<|"|>}<tool_call|>'

    _, tool_calls = _parse(first_call + second_call)

    assert len(tool_calls) == 2
    first, second = tool_calls
    assert first["function"]["name"] == "a"
    assert second["function"]["name"] == "b"
    assert json.loads(second["function"]["arguments"]) == {"y": "two"}
    # Every call in one response needs its own id.
    assert first["id"] != second["id"]
184+
185+
186+
def test_gemma4_surrounding_plain_text():
    """Plain text around a tool call is returned, stripped, as the content."""
    preamble = "Sure, I will help.\n"
    call = "<|tool_call>call:f{x:1}<tool_call|>"

    content, tool_calls = _parse(preamble + call)

    assert tool_calls is not None
    assert content == "Sure, I will help."
191+
192+
193+
def test_gemma4_string_with_embedded_quotes():
    """A literal double quote inside a string value does not end the string."""
    # The string delimiter is the 5-character sequence <|"|>, so a lone "
    # inside the value is ordinary content.
    expected_message = 'hello, "world"!'
    text = '<|tool_call>call:say{msg:<|"|>' + expected_message + '<|"|>}<tool_call|>'

    _, tool_calls = _parse(text)

    decoded = json.loads(tool_calls[0]["function"]["arguments"])
    assert decoded == {"msg": expected_message}

0 commit comments

Comments
 (0)