Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 35 additions & 2 deletions common/chat-parser-xml-toolcall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -498,9 +498,42 @@ inline bool parse_xml_tool_calls(common_chat_msg_parser & builder, const struct
}
auto val_start = builder.pos();

// Test if arg_val is a partial JSON
// vLLM-style: only try to parse the value when there is content; an empty or whitespace-only remainder is treated as incomplete (avoids the "parse empty input" log).
// When the remainder does not look like the start of a JSON value, skip try_consume_json and fall through to the plain-text path (e.g. "explore").
bool looks_like_json = true;
{
const auto & inp = builder.input();
const size_t rem_len = (val_start < inp.size()) ? (inp.size() - val_start) : 0;
std::string_view rest_sv(inp.data() + val_start, rem_len);
if (rest_sv.empty() || all_space(rest_sv)) {
gen_partial_args([&](auto & rest, auto & needle) { arguments[key] = (form.trim_raw_argval ? string_strip(rest) : rest) + needle; });
throw common_chat_msg_partial_exception(
"Expected " + gbnf_format_literal(form.val_end) +
" after " + gbnf_format_literal(form.key_val_sep) +
(form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
);
}
// Only call try_consume_json when remainder looks like start of a JSON value (avoids SAX error-at-position-0 → "empty input" log).
// Otherwise fall through to plain-text path (e.g. subagent_type=explore).
size_t pos = 0;
while (pos < rest_sv.size() && std::isspace(static_cast<unsigned char>(rest_sv[pos]))) { ++pos; }
if (pos >= rest_sv.size()) {
looks_like_json = false;
} else {
std::string_view rest_trim = rest_sv.substr(pos);
char c = rest_trim[0];
looks_like_json = (c == '"' || c == '{' || c == '[' || (c >= '0' && c <= '9') || c == '-');
if (!looks_like_json) {
if (c == 't') looks_like_json = (rest_trim.size() <= 4 && std::string_view("true").substr(0, rest_trim.size()) == rest_trim);
else if (c == 'f') looks_like_json = (rest_trim.size() <= 5 && std::string_view("false").substr(0, rest_trim.size()) == rest_trim);
else if (c == 'n') looks_like_json = (rest_trim.size() <= 4 && std::string_view("null").substr(0, rest_trim.size()) == rest_trim);
}
}
}

// Test if arg_val is a partial JSON (only when remainder looks like JSON; else plain-text path below)
std::optional<common_json> value_json = std::nullopt;
if (!form.raw_argval || !*form.raw_argval) {
if ((!form.raw_argval || !*form.raw_argval) && looks_like_json) {
try { value_json = builder.try_consume_json(); }
catch (const std::runtime_error&) { builder.move_to(val_start); }
// TODO: Delete this when json_partial adds top-level support for null/true/false
Expand Down
32 changes: 19 additions & 13 deletions common/chat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2169,7 +2169,9 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp

static common_chat_params common_chat_params_init_glm_4_5(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
data.grammar_lazy = inputs.tools.is_array() && !inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
// vLLM-style for AUTO: no grammar/trigger during generation; tool calls are parsed from decoded text (common_chat_parse_glm_4_5).
// Only use grammar when tool_choice == REQUIRED (force tool call from first token).
data.grammar_lazy = false;

std::string prompt = apply(tmpl, inputs);

Expand Down Expand Up @@ -2228,18 +2230,22 @@ static common_chat_params common_chat_params_init_glm_4_5(const common_chat_temp
"<|observation|>"
});

// build grammar for tool call
static const xml_tool_call_format form {
/* form.scope_start = */ "",
/* form.tool_start = */ "\n<tool_call>",
/* form.tool_sep = */ "\n",
/* form.key_start = */ "<arg_key>",
/* form.key_val_sep = */ "</arg_key>\n<arg_value>",
/* form.val_end = */ "</arg_value>\n",
/* form.tool_end = */ "</tool_call>\n",
/* form.scope_end = */ "",
};
build_grammar_xml_tool_call(data, inputs.tools, form);
// Build grammar only for tool_choice == REQUIRED (force tool call from first token).
// For AUTO, generate freely and parse tool calls from decoded text (common_chat_parse_glm_4_5).
const bool has_tools = inputs.tools.is_array() && !inputs.tools.empty();
if (has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED) {
static const xml_tool_call_format form {
/* form.scope_start = */ "",
/* form.tool_start = */ "\n<tool_call>",
/* form.tool_sep = */ "\n",
/* form.key_start = */ "<arg_key>",
/* form.key_val_sep = */ "</arg_key>\n<arg_value>",
/* form.val_end = */ "</arg_value>\n",
/* form.tool_end = */ "</tool_call>\n",
/* form.scope_end = */ "",
};
build_grammar_xml_tool_call(data, inputs.tools, form);
}

data.prompt = prompt;
data.format = COMMON_CHAT_FORMAT_GLM_4_5;
Expand Down
5 changes: 5 additions & 0 deletions common/json-partial.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,11 @@ bool common_json_parse(
auto temptative_end = it + err_loc.position;
// LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str());

// Avoid parsing and logging "empty input" when error is at position 0 (e.g. streaming partial/invalid JSON)
if (temptative_end == it) {
return false;
}

auto input = std::string(it, temptative_end);
try {
out.json = json::parse(input);
Expand Down
4 changes: 1 addition & 3 deletions tests/test-chat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -349,9 +349,7 @@ static void test_templates(const struct common_chat_templates * tmpls, const std
assert_msg_equals(test_message, msg, ignore_whitespace_differences);
}

if (!test_message.tool_calls.empty()) {
GGML_ASSERT(!data.params.grammar.empty());
}
// GLM 4.5 with tool_choice=AUTO is parse-only (no grammar); other formats set a grammar when tools are present.
if (!data.params.grammar.empty()) {
auto grammar = build_grammar(data.params.grammar);
if (!grammar) {
Expand Down
24 changes: 22 additions & 2 deletions tools/server/server-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2747,7 +2747,17 @@ struct server_context_impl {

slot.i_batch = -1;

common_sampler_accept(slot.smpl.get(), id, true);
try {
common_sampler_accept(slot.smpl.get(), id, true);
} catch (const std::runtime_error & e) {
// Grammar constraint violation (e.g. "Unexpected empty grammar stack") - return 500 instead of aborting
SRV_ERR("slot %d: grammar error, releasing slot: %s\n", slot.id, e.what());
send_error(slot, std::string("Grammar constraint violation: ") + e.what(), ERROR_TYPE_SERVER);
slot.print_timings();
metrics.on_prediction(slot);
slot.release();
continue;
}

// here we have synchronized the llama_context (due to the sampling above), so we can do time measurement
const int64_t t_current = ggml_time_us();
Expand Down Expand Up @@ -2791,7 +2801,17 @@ struct server_context_impl {
const size_t n_draft = slot.drafted.size();

// the accepted tokens from the speculation
const auto ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted);
std::vector<llama_token> ids;
try {
ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted);
} catch (const std::runtime_error & e) {
SRV_ERR("slot %d: grammar error during speculative decoding, releasing slot: %s\n", slot.id, e.what());
send_error(slot, std::string("Grammar constraint violation: ") + e.what(), ERROR_TYPE_SERVER);
slot.print_timings();
metrics.on_prediction(slot);
slot.release();
continue;
}
slot.i_batch_dft.clear();
slot.drafted.clear();

Expand Down