From 666f8c78c95ffdd5743b26a1584c2e32ee85dcbd Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 02:26:18 +0100 Subject: [PATCH 1/7] feat(proto): add JSONSchema and ResponseFormat fields to PredictOptions Add two new fields to PredictOptions in the proto: - JSONSchema (field 53): raw JSON schema string for backends that support native structured output (e.g. vLLM guided decoding) - ResponseFormat (field 54): response format type string These fields allow backends like vLLM to receive structured output constraints natively instead of only through GBNF grammar conversion. Ref: #6857 Signed-off-by: eureka928 --- backend/backend.proto | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/backend.proto b/backend/backend.proto index 6312036b28cf..d2be8823b487 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -163,6 +163,8 @@ message PredictOptions { int32 Logprobs = 50; // Number of top logprobs to return (maps to OpenAI logprobs parameter) int32 TopLogprobs = 51; // Number of top logprobs to return per token (maps to OpenAI top_logprobs parameter) map Metadata = 52; // Generic per-request metadata (e.g., enable_thinking) + string JSONSchema = 53; // Raw JSON schema string for backends that support native structured output (e.g. vLLM guided decoding) + string ResponseFormat = 54; // Response format type: "json_object", "json_schema", or empty } // The response message containing the result From 1fd670c8333a59d016bf535132842ad7542b6351 Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 02:27:12 +0100 Subject: [PATCH 2/7] feat(backend): pass JSONSchema and ResponseFormat through gRPC Add JSONSchema field to ModelConfig to carry the raw JSON schema string alongside the GBNF Grammar. Pass both JSONSchema and ResponseFormat through gRPCPredictOpts to backends via the new proto fields. This allows backends like vLLM to receive the original JSON schema for native structured output support. 
Ref: #6857 Signed-off-by: eureka928 --- core/backend/options.go | 2 ++ core/config/model_config.go | 1 + 2 files changed, 3 insertions(+) diff --git a/core/backend/options.go b/core/backend/options.go index 3268c9287554..56369e4bd44f 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -253,6 +253,8 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions TensorSplit: c.TensorSplit, TailFreeSamplingZ: float32(*c.TFZ), TypicalP: float32(*c.TypicalP), + JSONSchema: c.JSONSchema, + ResponseFormat: c.ResponseFormat, } metadata := map[string]string{} diff --git a/core/config/model_config.go b/core/config/model_config.go index bcb6105ac04c..98e9b9a0224b 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -51,6 +51,7 @@ type ModelConfig struct { functionCallString, functionCallNameString string `yaml:"-" json:"-"` ResponseFormat string `yaml:"-" json:"-"` ResponseFormatMap map[string]interface{} `yaml:"-" json:"-"` + JSONSchema string `yaml:"-" json:"-"` FunctionsConfig functions.FunctionsConfig `yaml:"function,omitempty" json:"function,omitempty"` ReasoningConfig reasoning.Config `yaml:"reasoning,omitempty" json:"reasoning,omitempty"` From 0fa07d32f5af69fa15d215a7288541c544473a54 Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 02:27:56 +0100 Subject: [PATCH 3/7] feat(endpoints): extract raw JSON schema for structured output In chat and completion endpoints, when response_format is json_schema, extract the raw JSON schema and store it on config.JSONSchema alongside the GBNF grammar. Also set config.ResponseFormat to the format type. This allows backends that support native structured output (like vLLM) to use the JSON schema directly instead of the GBNF grammar. 
Ref: #6857 Signed-off-by: eureka928 --- core/http/endpoints/openai/chat.go | 9 +++++++++ core/http/endpoints/openai/completion.go | 16 +++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index 8f4a44a07469..4ec6118095af 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -430,7 +430,9 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator switch d.Type { case "json_object": input.Grammar = functions.JSONBNF + config.ResponseFormat = "json_object" case "json_schema": + config.ResponseFormat = "json_schema" d := schema.JsonSchemaRequest{} dat, err := json.Marshal(config.ResponseFormatMap) if err != nil { @@ -440,6 +442,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator if err != nil { return err } + + // Pass raw JSON schema to backends that support native structured output + schemaBytes, err := json.Marshal(d.JsonSchema.Schema) + if err == nil { + config.JSONSchema = string(schemaBytes) + } + fs := &functions.JSONFunctionStructure{ AnyOf: []functions.Item{d.JsonSchema.Schema}, } diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go index 25935120d44d..e57b9d73ce00 100644 --- a/core/http/endpoints/openai/completion.go +++ b/core/http/endpoints/openai/completion.go @@ -87,8 +87,22 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva d := schema.ChatCompletionResponseFormat{} dat, _ := json.Marshal(config.ResponseFormatMap) _ = json.Unmarshal(dat, &d) - if d.Type == "json_object" { + switch d.Type { + case "json_object": input.Grammar = functions.JSONBNF + config.ResponseFormat = "json_object" + case "json_schema": + config.ResponseFormat = "json_schema" + jsr := schema.JsonSchemaRequest{} + dat, err := json.Marshal(config.ResponseFormatMap) + if err == nil { + if err := json.Unmarshal(dat, 
&jsr); err == nil { + schemaBytes, err := json.Marshal(jsr.JsonSchema.Schema) + if err == nil { + config.JSONSchema = string(schemaBytes) + } + } + } } } From ea89ee88adaeff1d1dab32131c8e1cc98870d76f Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 02:28:27 +0100 Subject: [PATCH 4/7] feat(vllm): add structured output support via guided decoding Update the vLLM backend to support structured output: - Import GuidedDecodingParams from vllm.sampling_params - Handle JSONSchema: parse and pass as GuidedDecodingParams(json_schema=...) - Handle json_object response format: GuidedDecodingParams(json_object=True) - Fall back to Grammar (GBNF) via GuidedDecodingParams(grammar=...) - Remove phantom GuidedDecoding mapping (field doesn't exist in proto) - Fix missing 'import time' and 'import json' for load_video and schema parsing Priority: JSONSchema > json_object > Grammar (GBNF fallback) Ref: #6857 Signed-off-by: eureka928 --- backend/python/vllm/backend.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index 56698a54e5f5..99b79c4c44db 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -2,9 +2,11 @@ import asyncio from concurrent import futures import argparse +import json import signal import sys import os +import time from typing import List from PIL import Image @@ -14,7 +16,7 @@ import grpc from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.sampling_params import SamplingParams +from vllm.sampling_params import SamplingParams, GuidedDecodingParams from vllm.utils import random_uuid from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.multimodal.utils import fetch_image @@ -218,7 +220,6 @@ async def _predict(self, request, context, streaming=False): "SkipSpecialTokens": "skip_special_tokens", "SpacesBetweenSpecialTokens": 
"spaces_between_special_tokens", "TruncatePromptTokens": "truncate_prompt_tokens", - "GuidedDecoding": "guided_decoding", } sampling_params = SamplingParams(top_p=0.9, max_tokens=200) @@ -229,6 +230,22 @@ async def _predict(self, request, context, streaming=False): if value not in (None, 0, [], False): setattr(sampling_params, param_field, value) + # Handle structured output via guided decoding + guided_decoding = None + if hasattr(request, 'JSONSchema') and request.JSONSchema: + try: + schema = json.loads(request.JSONSchema) + guided_decoding = GuidedDecodingParams(json_schema=schema) + except json.JSONDecodeError as e: + print(f"Failed to parse JSONSchema: {e}", file=sys.stderr) + elif hasattr(request, 'ResponseFormat') and request.ResponseFormat == "json_object": + guided_decoding = GuidedDecodingParams(json_object=True) + elif hasattr(request, 'Grammar') and request.Grammar: + guided_decoding = GuidedDecodingParams(grammar=request.Grammar) + + if guided_decoding is not None: + sampling_params.guided_decoding = guided_decoding + # Extract image paths and process images prompt = request.Prompt From d65b35f92a2762e00586e6d04bb2f20d5c629f27 Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 02:36:37 +0100 Subject: [PATCH 5/7] fix: refine vLLM structured output implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Make GuidedDecodingParams import conditional (try/except) for backwards compatibility with older vLLM versions - Remove GBNF grammar fallback — vLLM expects EBNF, not GBNF, so passing LocalAI's GBNF grammar would produce confusing errors - Pass JSONSchema as string directly instead of parsing to dict (safer across vLLM versions) - Add GBNF grammar generation for json_schema in completion endpoint so non-vLLM backends (llama.cpp) also get grammar enforcement Ref: #6857 Signed-off-by: eureka928 --- backend/python/vllm/backend.py | 29 ++++++++++++------------ 
core/http/endpoints/openai/completion.go | 9 ++++++++ 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index 99b79c4c44db..294ebc003964 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -16,7 +16,11 @@ import grpc from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.sampling_params import SamplingParams, GuidedDecodingParams +from vllm.sampling_params import SamplingParams +try: + from vllm.sampling_params import GuidedDecodingParams +except ImportError: + GuidedDecodingParams = None from vllm.utils import random_uuid from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.multimodal.utils import fetch_image @@ -231,20 +235,15 @@ async def _predict(self, request, context, streaming=False): setattr(sampling_params, param_field, value) # Handle structured output via guided decoding - guided_decoding = None - if hasattr(request, 'JSONSchema') and request.JSONSchema: - try: - schema = json.loads(request.JSONSchema) - guided_decoding = GuidedDecodingParams(json_schema=schema) - except json.JSONDecodeError as e: - print(f"Failed to parse JSONSchema: {e}", file=sys.stderr) - elif hasattr(request, 'ResponseFormat') and request.ResponseFormat == "json_object": - guided_decoding = GuidedDecodingParams(json_object=True) - elif hasattr(request, 'Grammar') and request.Grammar: - guided_decoding = GuidedDecodingParams(grammar=request.Grammar) - - if guided_decoding is not None: - sampling_params.guided_decoding = guided_decoding + if GuidedDecodingParams is not None: + guided_decoding = None + if hasattr(request, 'JSONSchema') and request.JSONSchema: + guided_decoding = GuidedDecodingParams(json_schema=request.JSONSchema) + elif hasattr(request, 'ResponseFormat') and request.ResponseFormat == "json_object": + guided_decoding = GuidedDecodingParams(json_object=True) + + if guided_decoding is 
not None: + sampling_params.guided_decoding = guided_decoding # Extract image paths and process images prompt = request.Prompt diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go index e57b9d73ce00..14615358d6d0 100644 --- a/core/http/endpoints/openai/completion.go +++ b/core/http/endpoints/openai/completion.go @@ -101,6 +101,15 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva if err == nil { config.JSONSchema = string(schemaBytes) } + fs := &functions.JSONFunctionStructure{ + AnyOf: []functions.Item{jsr.JsonSchema.Schema}, + } + g, err := fs.Grammar(config.FunctionsConfig.GrammarOptions()...) + if err == nil { + input.Grammar = g + } else { + xlog.Error("Failed generating grammar", "error", err) + } } } } From 8511c50ce83ba46a7fefefae8410d7942005a2c1 Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 03:21:58 +0100 Subject: [PATCH 6/7] fix(vllm): support both vLLM API versions and add grammar passthrough MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Handle both StructuredOutputsParams (vLLM latest) and GuidedDecodingParams (vLLM <=0.8.x) with graceful fallback - Use the correct SamplingParams field name for each version (structured_outputs vs guided_decoding) - Use 'json' parameter (not 'json_schema') matching both APIs - Re-add grammar (GBNF/BNF) passthrough — both vLLM APIs accept a 'grammar' parameter handled by xgrammar which supports GBNF - Priority: JSONSchema > json_object > Grammar Ref: #6857 Signed-off-by: eureka928 --- backend/python/vllm/backend.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index 294ebc003964..f8aa8f62c485 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -17,10 +17,21 @@ from vllm.engine.arg_utils import AsyncEngineArgs from 
vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams + +# vLLM renamed GuidedDecodingParams to StructuredOutputsParams in newer versions. +# The corresponding SamplingParams field also changed from guided_decoding to structured_outputs. try: - from vllm.sampling_params import GuidedDecodingParams + from vllm.sampling_params import StructuredOutputsParams + _structured_output_cls = StructuredOutputsParams + _structured_output_field = "structured_outputs" except ImportError: - GuidedDecodingParams = None + try: + from vllm.sampling_params import GuidedDecodingParams + _structured_output_cls = GuidedDecodingParams + _structured_output_field = "guided_decoding" + except ImportError: + _structured_output_cls = None + _structured_output_field = None from vllm.utils import random_uuid from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.multimodal.utils import fetch_image @@ -234,16 +245,18 @@ async def _predict(self, request, context, streaming=False): if value not in (None, 0, [], False): setattr(sampling_params, param_field, value) - # Handle structured output via guided decoding - if GuidedDecodingParams is not None: - guided_decoding = None + # Handle structured output via guided decoding / structured outputs + if _structured_output_cls is not None: + constraint = None if hasattr(request, 'JSONSchema') and request.JSONSchema: - guided_decoding = GuidedDecodingParams(json_schema=request.JSONSchema) + constraint = _structured_output_cls(json=request.JSONSchema) elif hasattr(request, 'ResponseFormat') and request.ResponseFormat == "json_object": - guided_decoding = GuidedDecodingParams(json_object=True) + constraint = _structured_output_cls(json_object=True) + elif hasattr(request, 'Grammar') and request.Grammar: + constraint = _structured_output_cls(grammar=request.Grammar) - if guided_decoding is not None: - sampling_params.guided_decoding = guided_decoding + if constraint is not None: + 
setattr(sampling_params, _structured_output_field, constraint) # Extract image paths and process images prompt = request.Prompt From bb084540bb8614b22c9dc7ab0fbabedbd84b0622 Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 03:22:42 +0100 Subject: [PATCH 7/7] docs: update constrained grammars with vLLM structured output support Update the compatibility notice to include vLLM alongside llama.cpp. Add a vLLM-specific section with examples for all three supported methods: json_schema, json_object, and grammar (via xgrammar). Ref: #6857 Signed-off-by: eureka928 --- docs/content/features/constrained_grammars.md | 59 ++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/docs/content/features/constrained_grammars.md b/docs/content/features/constrained_grammars.md index 33d50c900ba5..3867efaa709d 100644 --- a/docs/content/features/constrained_grammars.md +++ b/docs/content/features/constrained_grammars.md @@ -10,7 +10,11 @@ url = "/features/constrained_grammars/" The `chat` endpoint supports the `grammar` parameter, which allows users to specify a grammar in Backus-Naur Form (BNF). This feature enables the Large Language Model (LLM) to generate outputs adhering to a user-defined schema, such as `JSON`, `YAML`, or any other format that can be defined using BNF. For more details about BNF, see [Backus-Naur Form on Wikipedia](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form). {{% notice note %}} -**Compatibility Notice:** This feature is only supported by models that use the [llama.cpp](https://github.com/ggerganov/llama.cpp) backend. For a complete list of compatible models, refer to the [Model Compatibility]({{%relref "reference/compatibility-table" %}}) page. For technical details, see the related pull requests: [PR #1773](https://github.com/ggerganov/llama.cpp/pull/1773) and [PR #1887](https://github.com/ggerganov/llama.cpp/pull/1887). 
+**Compatibility Notice:** Grammar and structured output support is available for the following backends: +- **llama.cpp** — supports the `grammar` parameter (GBNF syntax) and `response_format` with `json_schema`/`json_object` +- **vLLM** — supports the `grammar` parameter (via xgrammar), `response_format` with `json_schema` (native JSON schema enforcement), and `json_object` + +For a complete list of compatible models, refer to the [Model Compatibility]({{%relref "reference/compatibility-table" %}}) page. {{% /notice %}} ## Setup @@ -66,6 +70,59 @@ For more complex grammars, you can define multi-line BNF rules. The grammar pars - Character classes (`[a-z]`) - String literals (`"text"`) +## vLLM Backend + +The vLLM backend supports structured output via three methods: + +### JSON Schema (recommended) + +Use the OpenAI-compatible `response_format` parameter with `json_schema` to enforce a specific JSON structure: + +```bash +curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "my-vllm-model", + "messages": [{"role": "user", "content": "Generate a person object"}], + "response_format": { + "type": "json_schema", + "json_schema": { + "name": "person", + "schema": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"} + }, + "required": ["name", "age"] + } + } + } +}' +``` + +### JSON Object + +Force the model to output valid JSON (without a specific schema): + +```bash +curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "my-vllm-model", + "messages": [{"role": "user", "content": "Generate a person as JSON"}], + "response_format": {"type": "json_object"} +}' +``` + +### Grammar + +The `grammar` parameter also works with vLLM via xgrammar: + +```bash +curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "my-vllm-model", + "messages": [{"role": "user", "content": "Do you like 
apples?"}], + "grammar": "root ::= (\"yes\" | \"no\")" +}' +``` + ## Related Features - [OpenAI Functions]({{%relref "features/openai-functions" %}}) - Function calling with structured outputs