From 666f8c78c95ffdd5743b26a1584c2e32ee85dcbd Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 02:26:18 +0100 Subject: [PATCH 1/7] feat(proto): add JSONSchema and ResponseFormat fields to PredictOptions Add two new fields to PredictOptions in the proto: - JSONSchema (field 53): raw JSON schema string for backends that support native structured output (e.g. vLLM guided decoding) - ResponseFormat (field 54): response format type string These fields allow backends like vLLM to receive structured output constraints natively instead of only through GBNF grammar conversion. Ref: #6857 Signed-off-by: eureka928 --- backend/backend.proto | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/backend.proto b/backend/backend.proto index 6312036b28cf..d2be8823b487 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -163,6 +163,8 @@ message PredictOptions { int32 Logprobs = 50; // Number of top logprobs to return (maps to OpenAI logprobs parameter) int32 TopLogprobs = 51; // Number of top logprobs to return per token (maps to OpenAI top_logprobs parameter) map Metadata = 52; // Generic per-request metadata (e.g., enable_thinking) + string JSONSchema = 53; // Raw JSON schema string for backends that support native structured output (e.g. vLLM guided decoding) + string ResponseFormat = 54; // Response format type: "json_object", "json_schema", or empty } // The response message containing the result From 1fd670c8333a59d016bf535132842ad7542b6351 Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 02:27:12 +0100 Subject: [PATCH 2/7] feat(backend): pass JSONSchema and ResponseFormat through gRPC Add JSONSchema field to ModelConfig to carry the raw JSON schema string alongside the GBNF Grammar. Pass both JSONSchema and ResponseFormat through gRPCPredictOpts to backends via the new proto fields. This allows backends like vLLM to receive the original JSON schema for native structured output support. 
Ref: #6857 Signed-off-by: eureka928 --- core/backend/options.go | 2 ++ core/config/model_config.go | 1 + 2 files changed, 3 insertions(+) diff --git a/core/backend/options.go b/core/backend/options.go index 3268c9287554..56369e4bd44f 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -253,6 +253,8 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions TensorSplit: c.TensorSplit, TailFreeSamplingZ: float32(*c.TFZ), TypicalP: float32(*c.TypicalP), + JSONSchema: c.JSONSchema, + ResponseFormat: c.ResponseFormat, } metadata := map[string]string{} diff --git a/core/config/model_config.go b/core/config/model_config.go index bcb6105ac04c..98e9b9a0224b 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -51,6 +51,7 @@ type ModelConfig struct { functionCallString, functionCallNameString string `yaml:"-" json:"-"` ResponseFormat string `yaml:"-" json:"-"` ResponseFormatMap map[string]interface{} `yaml:"-" json:"-"` + JSONSchema string `yaml:"-" json:"-"` FunctionsConfig functions.FunctionsConfig `yaml:"function,omitempty" json:"function,omitempty"` ReasoningConfig reasoning.Config `yaml:"reasoning,omitempty" json:"reasoning,omitempty"` From 0fa07d32f5af69fa15d215a7288541c544473a54 Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 02:27:56 +0100 Subject: [PATCH 3/7] feat(endpoints): extract raw JSON schema for structured output In chat and completion endpoints, when response_format is json_schema, extract the raw JSON schema and store it on config.JSONSchema alongside the GBNF grammar. Also set config.ResponseFormat to the format type. This allows backends that support native structured output (like vLLM) to use the JSON schema directly instead of the GBNF grammar. 
Ref: #6857 Signed-off-by: eureka928 --- core/http/endpoints/openai/chat.go | 9 +++++++++ core/http/endpoints/openai/completion.go | 16 +++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index 8f4a44a07469..4ec6118095af 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -430,7 +430,9 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator switch d.Type { case "json_object": input.Grammar = functions.JSONBNF + config.ResponseFormat = "json_object" case "json_schema": + config.ResponseFormat = "json_schema" d := schema.JsonSchemaRequest{} dat, err := json.Marshal(config.ResponseFormatMap) if err != nil { @@ -440,6 +442,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator if err != nil { return err } + + // Pass raw JSON schema to backends that support native structured output + schemaBytes, err := json.Marshal(d.JsonSchema.Schema) + if err == nil { + config.JSONSchema = string(schemaBytes) + } + fs := &functions.JSONFunctionStructure{ AnyOf: []functions.Item{d.JsonSchema.Schema}, } diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go index 25935120d44d..e57b9d73ce00 100644 --- a/core/http/endpoints/openai/completion.go +++ b/core/http/endpoints/openai/completion.go @@ -87,8 +87,22 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva d := schema.ChatCompletionResponseFormat{} dat, _ := json.Marshal(config.ResponseFormatMap) _ = json.Unmarshal(dat, &d) - if d.Type == "json_object" { + switch d.Type { + case "json_object": input.Grammar = functions.JSONBNF + config.ResponseFormat = "json_object" + case "json_schema": + config.ResponseFormat = "json_schema" + jsr := schema.JsonSchemaRequest{} + dat, err := json.Marshal(config.ResponseFormatMap) + if err == nil { + if err := json.Unmarshal(dat, 
&jsr); err == nil { + schemaBytes, err := json.Marshal(jsr.JsonSchema.Schema) + if err == nil { + config.JSONSchema = string(schemaBytes) + } + } + } } } From ea89ee88adaeff1d1dab32131c8e1cc98870d76f Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 02:28:27 +0100 Subject: [PATCH 4/7] feat(vllm): add structured output support via guided decoding Update the vLLM backend to support structured output: - Import GuidedDecodingParams from vllm.sampling_params - Handle JSONSchema: parse and pass as GuidedDecodingParams(json_schema=...) - Handle json_object response format: GuidedDecodingParams(json_object=True) - Fall back to Grammar (GBNF) via GuidedDecodingParams(grammar=...) - Remove phantom GuidedDecoding mapping (field doesn't exist in proto) - Fix missing 'import time' and 'import json' for load_video and schema parsing Priority: JSONSchema > json_object > Grammar (GBNF fallback) Ref: #6857 Signed-off-by: eureka928 --- backend/python/vllm/backend.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index 56698a54e5f5..99b79c4c44db 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -2,9 +2,11 @@ import asyncio from concurrent import futures import argparse +import json import signal import sys import os +import time from typing import List from PIL import Image @@ -14,7 +16,7 @@ import grpc from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.sampling_params import SamplingParams +from vllm.sampling_params import SamplingParams, GuidedDecodingParams from vllm.utils import random_uuid from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.multimodal.utils import fetch_image @@ -218,7 +220,6 @@ async def _predict(self, request, context, streaming=False): "SkipSpecialTokens": "skip_special_tokens", "SpacesBetweenSpecialTokens": 
"spaces_between_special_tokens", "TruncatePromptTokens": "truncate_prompt_tokens", - "GuidedDecoding": "guided_decoding", } sampling_params = SamplingParams(top_p=0.9, max_tokens=200) @@ -229,6 +230,22 @@ async def _predict(self, request, context, streaming=False): if value not in (None, 0, [], False): setattr(sampling_params, param_field, value) + # Handle structured output via guided decoding + guided_decoding = None + if hasattr(request, 'JSONSchema') and request.JSONSchema: + try: + schema = json.loads(request.JSONSchema) + guided_decoding = GuidedDecodingParams(json_schema=schema) + except json.JSONDecodeError as e: + print(f"Failed to parse JSONSchema: {e}", file=sys.stderr) + elif hasattr(request, 'ResponseFormat') and request.ResponseFormat == "json_object": + guided_decoding = GuidedDecodingParams(json_object=True) + elif hasattr(request, 'Grammar') and request.Grammar: + guided_decoding = GuidedDecodingParams(grammar=request.Grammar) + + if guided_decoding is not None: + sampling_params.guided_decoding = guided_decoding + # Extract image paths and process images prompt = request.Prompt From d65b35f92a2762e00586e6d04bb2f20d5c629f27 Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 02:36:37 +0100 Subject: [PATCH 5/7] fix: refine vLLM structured output implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Make GuidedDecodingParams import conditional (try/except) for backwards compatibility with older vLLM versions - Remove GBNF grammar fallback — vLLM expects EBNF, not GBNF, so passing LocalAI's GBNF grammar would produce confusing errors - Pass JSONSchema as string directly instead of parsing to dict (safer across vLLM versions) - Add GBNF grammar generation for json_schema in completion endpoint so non-vLLM backends (llama.cpp) also get grammar enforcement Ref: #6857 Signed-off-by: eureka928 --- backend/python/vllm/backend.py | 29 ++++++++++++------------ 
core/http/endpoints/openai/completion.go | 9 ++++++++ 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index 99b79c4c44db..294ebc003964 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -16,7 +16,11 @@ import grpc from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.sampling_params import SamplingParams, GuidedDecodingParams +from vllm.sampling_params import SamplingParams +try: + from vllm.sampling_params import GuidedDecodingParams +except ImportError: + GuidedDecodingParams = None from vllm.utils import random_uuid from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.multimodal.utils import fetch_image @@ -231,20 +235,15 @@ async def _predict(self, request, context, streaming=False): setattr(sampling_params, param_field, value) # Handle structured output via guided decoding - guided_decoding = None - if hasattr(request, 'JSONSchema') and request.JSONSchema: - try: - schema = json.loads(request.JSONSchema) - guided_decoding = GuidedDecodingParams(json_schema=schema) - except json.JSONDecodeError as e: - print(f"Failed to parse JSONSchema: {e}", file=sys.stderr) - elif hasattr(request, 'ResponseFormat') and request.ResponseFormat == "json_object": - guided_decoding = GuidedDecodingParams(json_object=True) - elif hasattr(request, 'Grammar') and request.Grammar: - guided_decoding = GuidedDecodingParams(grammar=request.Grammar) - - if guided_decoding is not None: - sampling_params.guided_decoding = guided_decoding + if GuidedDecodingParams is not None: + guided_decoding = None + if hasattr(request, 'JSONSchema') and request.JSONSchema: + guided_decoding = GuidedDecodingParams(json_schema=request.JSONSchema) + elif hasattr(request, 'ResponseFormat') and request.ResponseFormat == "json_object": + guided_decoding = GuidedDecodingParams(json_object=True) + + if guided_decoding is 
not None: + sampling_params.guided_decoding = guided_decoding # Extract image paths and process images prompt = request.Prompt diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go index e57b9d73ce00..14615358d6d0 100644 --- a/core/http/endpoints/openai/completion.go +++ b/core/http/endpoints/openai/completion.go @@ -101,6 +101,15 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva if err == nil { config.JSONSchema = string(schemaBytes) } + fs := &functions.JSONFunctionStructure{ + AnyOf: []functions.Item{jsr.JsonSchema.Schema}, + } + g, err := fs.Grammar(config.FunctionsConfig.GrammarOptions()...) + if err == nil { + input.Grammar = g + } else { + xlog.Error("Failed generating grammar", "error", err) + } } } } From 8511c50ce83ba46a7fefefae8410d7942005a2c1 Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 03:21:58 +0100 Subject: [PATCH 6/7] fix(vllm): support both vLLM API versions and add grammar passthrough MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Handle both StructuredOutputsParams (vLLM latest) and GuidedDecodingParams (vLLM <=0.8.x) with graceful fallback - Use the correct SamplingParams field name for each version (structured_outputs vs guided_decoding) - Use 'json' parameter (not 'json_schema') matching both APIs - Re-add grammar (GBNF/BNF) passthrough — both vLLM APIs accept a 'grammar' parameter handled by xgrammar which supports GBNF - Priority: JSONSchema > json_object > Grammar Ref: #6857 Signed-off-by: eureka928 --- backend/python/vllm/backend.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index 294ebc003964..f8aa8f62c485 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -17,10 +17,21 @@ from vllm.engine.arg_utils import AsyncEngineArgs from 
vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams + +# vLLM renamed GuidedDecodingParams to StructuredOutputsParams in newer versions. +# The corresponding SamplingParams field also changed from guided_decoding to structured_outputs. try: - from vllm.sampling_params import GuidedDecodingParams + from vllm.sampling_params import StructuredOutputsParams + _structured_output_cls = StructuredOutputsParams + _structured_output_field = "structured_outputs" except ImportError: - GuidedDecodingParams = None + try: + from vllm.sampling_params import GuidedDecodingParams + _structured_output_cls = GuidedDecodingParams + _structured_output_field = "guided_decoding" + except ImportError: + _structured_output_cls = None + _structured_output_field = None from vllm.utils import random_uuid from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.multimodal.utils import fetch_image @@ -234,16 +245,18 @@ async def _predict(self, request, context, streaming=False): if value not in (None, 0, [], False): setattr(sampling_params, param_field, value) - # Handle structured output via guided decoding - if GuidedDecodingParams is not None: - guided_decoding = None + # Handle structured output via guided decoding / structured outputs + if _structured_output_cls is not None: + constraint = None if hasattr(request, 'JSONSchema') and request.JSONSchema: - guided_decoding = GuidedDecodingParams(json_schema=request.JSONSchema) + constraint = _structured_output_cls(json=request.JSONSchema) elif hasattr(request, 'ResponseFormat') and request.ResponseFormat == "json_object": - guided_decoding = GuidedDecodingParams(json_object=True) + constraint = _structured_output_cls(json_object=True) + elif hasattr(request, 'Grammar') and request.Grammar: + constraint = _structured_output_cls(grammar=request.Grammar) - if guided_decoding is not None: - sampling_params.guided_decoding = guided_decoding + if constraint is not None: + 
setattr(sampling_params, _structured_output_field, constraint) # Extract image paths and process images prompt = request.Prompt From bb084540bb8614b22c9dc7ab0fbabedbd84b0622 Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 03:22:42 +0100 Subject: [PATCH 7/7] docs: update constrained grammars with vLLM structured output support Update the compatibility notice to include vLLM alongside llama.cpp. Add a vLLM-specific section with examples for all three supported methods: json_schema, json_object, and grammar (via xgrammar). Ref: #6857 Signed-off-by: eureka928 --- docs/content/features/constrained_grammars.md | 59 ++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/docs/content/features/constrained_grammars.md b/docs/content/features/constrained_grammars.md index 33d50c900ba5..3867efaa709d 100644 --- a/docs/content/features/constrained_grammars.md +++ b/docs/content/features/constrained_grammars.md @@ -10,7 +10,11 @@ url = "/features/constrained_grammars/" The `chat` endpoint supports the `grammar` parameter, which allows users to specify a grammar in Backus-Naur Form (BNF). This feature enables the Large Language Model (LLM) to generate outputs adhering to a user-defined schema, such as `JSON`, `YAML`, or any other format that can be defined using BNF. For more details about BNF, see [Backus-Naur Form on Wikipedia](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form). {{% notice note %}} -**Compatibility Notice:** This feature is only supported by models that use the [llama.cpp](https://github.com/ggerganov/llama.cpp) backend. For a complete list of compatible models, refer to the [Model Compatibility]({{%relref "reference/compatibility-table" %}}) page. For technical details, see the related pull requests: [PR #1773](https://github.com/ggerganov/llama.cpp/pull/1773) and [PR #1887](https://github.com/ggerganov/llama.cpp/pull/1887). 
+**Compatibility Notice:** Grammar and structured output support is available for the following backends: +- **llama.cpp** — supports the `grammar` parameter (GBNF syntax) and `response_format` with `json_schema`/`json_object` +- **vLLM** — supports the `grammar` parameter (via xgrammar), `response_format` with `json_schema` (native JSON schema enforcement), and `json_object` + +For a complete list of compatible models, refer to the [Model Compatibility]({{%relref "reference/compatibility-table" %}}) page. {{% /notice %}} ## Setup @@ -66,6 +70,59 @@ For more complex grammars, you can define multi-line BNF rules. The grammar pars - Character classes (`[a-z]`) - String literals (`"text"`) +## vLLM Backend + +The vLLM backend supports structured output via three methods: + +### JSON Schema (recommended) + +Use the OpenAI-compatible `response_format` parameter with `json_schema` to enforce a specific JSON structure: + +```bash +curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "my-vllm-model", + "messages": [{"role": "user", "content": "Generate a person object"}], + "response_format": { + "type": "json_schema", + "json_schema": { + "name": "person", + "schema": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"} + }, + "required": ["name", "age"] + } + } + } +}' +``` + +### JSON Object + +Force the model to output valid JSON (without a specific schema): + +```bash +curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "my-vllm-model", + "messages": [{"role": "user", "content": "Generate a person as JSON"}], + "response_format": {"type": "json_object"} +}' +``` + +### Grammar + +The `grammar` parameter also works with vLLM via xgrammar: + +```bash +curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "my-vllm-model", + "messages": [{"role": "user", "content": "Do you like 
apples?"}], + "grammar": "root ::= (\"yes\" | \"no\")" +}' +``` + ## Related Features - [OpenAI Functions]({{%relref "features/openai-functions" %}}) - Function calling with structured outputs