|
1 | 1 | from __future__ import annotations |
2 | 2 |
|
3 | 3 | import os |
| 4 | +import re |
4 | 5 | import sys |
5 | 6 | import json |
6 | 7 | import ctypes |
@@ -4026,3 +4027,277 @@ def chatml_function_calling( |
4026 | 4027 | } |
4027 | 4028 |
|
4028 | 4029 | raise ValueError("Automatic streaming tool choice is not supported") |
| 4030 | + |
| 4031 | + |
| 4032 | +# ========================================================================== |
| 4033 | +# Gemma 4 native tool-call parsing + chat completion handler |
| 4034 | +# ========================================================================== |
| 4035 | +# |
| 4036 | +# Gemma 4 (released 2026-04-02) emits tool calls as native tokens of the form: |
| 4037 | +# |
| 4038 | +# <|tool_call>call:FUNCTION_NAME{key:value,key:value,...}<tool_call|> |
| 4039 | +# |
| 4040 | +# Argument values are encoded by type: |
| 4041 | +# - string : key:<|"|>value<|"|> |
| 4042 | +# - int : key:30 |
| 4043 | +# - float : key:3.5 |
| 4044 | +# - bool : key:true / key:false |
| 4045 | +# - null : key:null |
| 4046 | +# - list : key:[v1,v2,...] (element grammar is the same as above) |
| 4047 | +# |
| 4048 | +# An optional thinking-mode block precedes the tool call when thinking is on: |
| 4049 | +# |
| 4050 | +# <|channel>thought ... <channel|><|tool_call>call:...<tool_call|> |
| 4051 | +# |
| 4052 | +# Without a dedicated handler `create_chat_completion()` returns these tokens |
| 4053 | +# verbatim in `message.content` and `tool_calls` stays `None`, which silently |
| 4054 | +# breaks every OpenAI-compatible Gemma 4 client (issue #2227). The C++ server |
| 4055 | +# parses the same tokens via the PEG grammar added in |
| 4056 | +# https://github.com/ggml-org/llama.cpp/pull/21326 ; this is the Python port. |
| 4057 | + |
| 4058 | +_GEMMA4_TOOL_CALL_RE = re.compile( |
| 4059 | + r"<\|tool_call>\s*call:(?P<name>[A-Za-z_][A-Za-z0-9_]*)\s*\{(?P<args>.*?)\}\s*<tool_call\|>", |
| 4060 | + re.DOTALL, |
| 4061 | +) |
| 4062 | +_GEMMA4_THOUGHT_RE = re.compile(r"<\|channel>\s*thought.*?<channel\|>", re.DOTALL) |
| 4063 | +_GEMMA4_STR_DELIM = '<|"|>' |
| 4064 | + |
| 4065 | + |
| 4066 | +def _gemma4_parse_value(s: str, pos: int) -> Tuple[Any, int]: |
| 4067 | + """Parse a single Gemma 4 value starting at ``s[pos]``. |
| 4068 | +
|
| 4069 | + Returns ``(value, new_pos)`` where ``new_pos`` points just past the value. |
| 4070 | + """ |
| 4071 | + while pos < len(s) and s[pos].isspace(): |
| 4072 | + pos += 1 |
| 4073 | + # string literal: <|"|>...<|"|> |
| 4074 | + if s.startswith(_GEMMA4_STR_DELIM, pos): |
| 4075 | + start = pos + len(_GEMMA4_STR_DELIM) |
| 4076 | + end = s.find(_GEMMA4_STR_DELIM, start) |
| 4077 | + if end < 0: |
| 4078 | + return s[start:], len(s) |
| 4079 | + return s[start:end], end + len(_GEMMA4_STR_DELIM) |
| 4080 | + # list literal: [v1,v2,...] |
| 4081 | + if pos < len(s) and s[pos] == "[": |
| 4082 | + items: List[Any] = [] |
| 4083 | + pos += 1 |
| 4084 | + while pos < len(s): |
| 4085 | + while pos < len(s) and s[pos].isspace(): |
| 4086 | + pos += 1 |
| 4087 | + if pos < len(s) and s[pos] == "]": |
| 4088 | + return items, pos + 1 |
| 4089 | + val, pos = _gemma4_parse_value(s, pos) |
| 4090 | + items.append(val) |
| 4091 | + while pos < len(s) and s[pos] in " \t,": |
| 4092 | + pos += 1 |
| 4093 | + return items, pos |
| 4094 | + # primitive literal: read until separator |
| 4095 | + start = pos |
| 4096 | + while pos < len(s) and s[pos] not in ",}]": |
| 4097 | + pos += 1 |
| 4098 | + raw = s[start:pos].strip() |
| 4099 | + if raw == "true": |
| 4100 | + return True, pos |
| 4101 | + if raw == "false": |
| 4102 | + return False, pos |
| 4103 | + if raw == "null": |
| 4104 | + return None, pos |
| 4105 | + try: |
| 4106 | + if "." in raw or "e" in raw.lower(): |
| 4107 | + return float(raw), pos |
| 4108 | + return int(raw), pos |
| 4109 | + except ValueError: |
| 4110 | + return raw, pos |
| 4111 | + |
| 4112 | + |
# Compiled once at import time; both are applied with an explicit start
# offset so the argument string is never re-sliced while parsing.
_GEMMA4_ARG_KEY_RE = re.compile(r"\s*([A-Za-z_][A-Za-z0-9_]*)\s*:")
_GEMMA4_ARG_SEP_RE = re.compile(r"\s*,\s*")


def _gemma4_parse_args(args_str: str) -> Dict[str, Any]:
    """Parse the inside of a Gemma 4 ``{...}`` arg block into ``{name: value}``.

    The text is read as a sequence of ``key:value`` pairs separated by
    commas. Keys must be identifiers (letters, digits, underscores, not
    starting with a digit); values are decoded by ``_gemma4_parse_value``.
    Parsing stops at the first position where no ``key:`` can be matched, and
    whatever was collected up to that point is returned. A key that appears
    more than once keeps its last value.

    Args:
        args_str: The raw text found between the braces of a tool call.

    Returns:
        A dict mapping argument names to their decoded values.
    """
    out: Dict[str, Any] = {}
    pos = 0
    end = len(args_str)
    while pos < end:
        key_match = _GEMMA4_ARG_KEY_RE.match(args_str, pos)
        if key_match is None:
            # Not positioned on a "key:" -- treat the rest as unparseable.
            break
        val, pos = _gemma4_parse_value(args_str, key_match.end())
        out[key_match.group(1)] = val
        sep_match = _GEMMA4_ARG_SEP_RE.match(args_str, pos)
        if sep_match is not None:
            pos = sep_match.end()
    return out
| 4128 | + |
| 4129 | + |
def _parse_gemma4_native_tool_calls(
    text: str,
) -> Tuple[Optional[str], Optional[List[Dict[str, Any]]]]:
    """Split a Gemma 4 completion into leftover content and tool calls.

    Thinking-mode blocks are removed first, then every complete
    ``<|tool_call>call:NAME{...}<tool_call|>`` span is converted into an
    OpenAI-style tool-call dict with a generated id, ``type`` set to
    ``"function"`` and the arguments serialized as a JSON string.

    Returns:
        ``(content_remainder, tool_calls)``. If the completion holds no
        ``<|tool_call>`` token, or none of the spans is complete enough to
        match, the original ``text`` is returned untouched together with
        ``None``. Otherwise the remainder is the text left after removing the
        thought blocks and tool-call spans, stripped, or ``None`` if nothing
        is left.
    """
    without_thoughts = _GEMMA4_THOUGHT_RE.sub("", text)
    if "<|tool_call>" not in without_thoughts:
        return text, None

    parsed_calls: List[Dict[str, Any]] = []
    for index, match in enumerate(_GEMMA4_TOOL_CALL_RE.finditer(without_thoughts)):
        fn_name = match.group("name")
        fn_args = _gemma4_parse_args(match.group("args"))
        # Eight random lowercase hex characters keep ids distinct across calls.
        random_tag = "".join(random.choices(string.hexdigits.lower()[:16], k=8))
        call_entry = {
            "id": f"call_{index}_{fn_name}_{random_tag}",
            "type": "function",
            "function": {"name": fn_name, "arguments": json.dumps(fn_args)},
        }
        parsed_calls.append(call_entry)

    if len(parsed_calls) == 0:
        # The opening token was present but no span matched in full.
        return text, None

    leftover = _GEMMA4_TOOL_CALL_RE.sub("", without_thoughts).strip()
    if not leftover:
        return None, parsed_calls
    return leftover, parsed_calls
| 4158 | + |
| 4159 | + |
@register_chat_completion_handler("gemma4")
def gemma4_chat_completion(
    llama: llama.Llama,
    messages: List[llama_types.ChatCompletionRequestMessage],
    functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
    function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
    tools: Optional[List[llama_types.ChatCompletionTool]] = None,
    tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
    temperature: float = 0.2,
    top_p: float = 0.95,
    top_k: int = 40,
    min_p: float = 0.05,
    typical_p: float = 1.0,
    stream: bool = False,
    stop: Optional[Union[str, List[str]]] = None,
    seed: Optional[int] = None,
    response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
    max_tokens: Optional[int] = None,
    presence_penalty: float = 0.0,
    frequency_penalty: float = 0.0,
    repeat_penalty: float = 1.1,
    tfs_z: float = 1.0,
    mirostat_mode: int = 0,
    mirostat_tau: float = 5.0,
    mirostat_eta: float = 0.1,
    model: Optional[str] = None,
    logits_processor: Optional[llama.LogitsProcessorList] = None,
    grammar: Optional[llama.LlamaGrammar] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    logprobs: Optional[bool] = None,
    top_logprobs: Optional[int] = None,
    **kwargs,  # type: ignore
) -> Union[
    llama_types.CreateChatCompletionResponse,
    Iterator[llama_types.CreateChatCompletionStreamResponse],
]:
    """Chat completion handler registered under the name ``gemma4`` (issue #2227).

    The prompt is rendered with the ``tokenizer.chat_template`` found in the
    model metadata, tokenized, and passed to ``llama.create_completion``.
    For non-streaming requests the generated text is run through
    ``_parse_gemma4_native_tool_calls``; when tool calls are found they are
    placed on the assistant message as ``tool_calls``, ``content`` is set to
    ``None`` and ``finish_reason`` becomes ``"tool_calls"``.

    Streaming requests are converted to chat chunks without any tool-call
    parsing; callers that need tool calls from a stream have to buffer the
    chunks and parse the assembled text themselves.

    Raises:
        ValueError: If the model metadata carries no ``tokenizer.chat_template``.
    """
    model_metadata = getattr(llama, "metadata", None) or {}
    chat_template = model_metadata.get("tokenizer.chat_template")
    if not chat_template:
        raise ValueError(
            "chat_format='gemma4' requires a GGUF model with an embedded "
            "tokenizer.chat_template (Gemma 4 GGUFs ship one by default)."
        )

    # Resolve the special-token texts handed to the template; an id of -1
    # is mapped to an empty string.
    eos_token_id = llama.token_eos()
    bos_token_id = llama.token_bos()
    if eos_token_id != -1:
        eos_text = llama._model.token_get_text(eos_token_id)
    else:
        eos_text = ""
    if bos_token_id != -1:
        bos_text = llama._model.token_get_text(bos_token_id)
    else:
        bos_text = ""

    render_prompt = Jinja2ChatFormatter(
        template=chat_template,
        eos_token=eos_text,
        bos_token=bos_text,
        add_generation_prompt=True,
    )
    rendered = render_prompt(
        messages=messages,
        functions=functions,
        function_call=function_call,
        tools=tools,
        tool_choice=tool_choice,
    )
    prompt_tokens = llama.tokenize(
        rendered.prompt.encode("utf-8"),
        add_bos=not rendered.added_special,
        special=True,
    )

    # Caller-supplied stop sequences come first, followed by the formatter's.
    stop_sequences: List[str]
    if not stop:
        stop_sequences = []
    elif isinstance(stop, str):
        stop_sequences = [stop]
    else:
        stop_sequences = list(stop)
    formatter_stop = rendered.stop
    if formatter_stop is not None:
        if isinstance(formatter_stop, list):
            stop_sequences.extend(formatter_stop)
        else:
            stop_sequences.append(formatter_stop)

    # A JSON-object response format replaces any grammar the caller passed in.
    wants_json_object = (
        response_format is not None and response_format.get("type") == "json_object"
    )
    if wants_json_object:
        grammar = _grammar_for_response_format(response_format, verbose=llama.verbose)

    raw_result = llama.create_completion(
        prompt=prompt_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        min_p=min_p,
        typical_p=typical_p,
        logprobs=top_logprobs if logprobs else None,
        stream=stream,
        stop=stop_sequences,
        seed=seed,
        max_tokens=max_tokens,
        presence_penalty=presence_penalty,
        frequency_penalty=frequency_penalty,
        repeat_penalty=repeat_penalty,
        tfs_z=tfs_z,
        mirostat_mode=mirostat_mode,
        mirostat_tau=mirostat_tau,
        mirostat_eta=mirostat_eta,
        model=model,
        logits_processor=logits_processor,
        stopping_criteria=rendered.stopping_criteria,
        grammar=grammar,
        logit_bias=logit_bias,
    )

    if stream:
        # No tool-call parsing on the streaming path.
        return _convert_completion_to_chat(raw_result, stream=True)

    completion = cast(llama_types.CreateCompletionResponse, raw_result)
    first_choice = completion["choices"][0]
    content, tool_calls = _parse_gemma4_native_tool_calls(first_choice["text"])

    assistant_message: Dict[str, Any]
    if tool_calls:
        # Any text surrounding the tool calls is not forwarded.
        assistant_message = {
            "role": "assistant",
            "content": None,
            "tool_calls": tool_calls,
        }
        finish_reason = "tool_calls"
    else:
        assistant_message = {"role": "assistant", "content": content}
        finish_reason = first_choice["finish_reason"]

    empty_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
    chat_response: llama_types.CreateChatCompletionResponse = {
        "id": "chat" + completion["id"],
        "object": "chat.completion",
        "created": completion["created"],
        "model": completion["model"],
        "choices": [
            {
                "index": 0,
                "finish_reason": finish_reason,
                "logprobs": _convert_text_completion_logprobs_to_chat(
                    first_choice["logprobs"]
                ),
                "message": assistant_message,
            }
        ],
        "usage": completion.get("usage", empty_usage),
    }
    return chat_response
0 commit comments