Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions backend/backend.proto
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,8 @@ message PredictOptions {
int32 Logprobs = 50; // Number of top logprobs to return (maps to OpenAI logprobs parameter)
int32 TopLogprobs = 51; // Number of top logprobs to return per token (maps to OpenAI top_logprobs parameter)
map<string, string> Metadata = 52; // Generic per-request metadata (e.g., enable_thinking)
string JSONSchema = 53; // Raw JSON schema string for backends that support native structured output (e.g. vLLM guided decoding)
string ResponseFormat = 54; // Response format type: "json_object", "json_schema", or empty
}

// The response message containing the result
Expand Down
31 changes: 30 additions & 1 deletion backend/python/vllm/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
import asyncio
from concurrent import futures
import argparse
import json
import signal
import sys
import os
import time
from typing import List
from PIL import Image

Expand All @@ -15,6 +17,21 @@
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams

# Newer vLLM versions expose StructuredOutputsParams (set on SamplingParams via
# the "structured_outputs" field); older versions expose GuidedDecodingParams
# (set via "guided_decoding"). Resolve whichever one is importable, newest
# first. Both names stay None when neither class can be imported, which callers
# use as the "structured output unavailable" signal.
_structured_output_cls = None
_structured_output_field = None
try:
    from vllm.sampling_params import StructuredOutputsParams
except ImportError:
    try:
        from vllm.sampling_params import GuidedDecodingParams
    except ImportError:
        # Neither class exists in this vLLM build; keep the None defaults.
        pass
    else:
        _structured_output_cls = GuidedDecodingParams
        _structured_output_field = "guided_decoding"
else:
    _structured_output_cls = StructuredOutputsParams
    _structured_output_field = "structured_outputs"
from vllm.utils import random_uuid
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.multimodal.utils import fetch_image
Expand Down Expand Up @@ -218,7 +235,6 @@ async def _predict(self, request, context, streaming=False):
"SkipSpecialTokens": "skip_special_tokens",
"SpacesBetweenSpecialTokens": "spaces_between_special_tokens",
"TruncatePromptTokens": "truncate_prompt_tokens",
"GuidedDecoding": "guided_decoding",
}

sampling_params = SamplingParams(top_p=0.9, max_tokens=200)
Expand All @@ -229,6 +245,19 @@ async def _predict(self, request, context, streaming=False):
if value not in (None, 0, [], False):
setattr(sampling_params, param_field, value)

# Handle structured output via guided decoding / structured outputs
if _structured_output_cls is not None:
constraint = None
if hasattr(request, 'JSONSchema') and request.JSONSchema:
constraint = _structured_output_cls(json=request.JSONSchema)
elif hasattr(request, 'ResponseFormat') and request.ResponseFormat == "json_object":
constraint = _structured_output_cls(json_object=True)
elif hasattr(request, 'Grammar') and request.Grammar:
constraint = _structured_output_cls(grammar=request.Grammar)

if constraint is not None:
setattr(sampling_params, _structured_output_field, constraint)

# Extract image paths and process images
prompt = request.Prompt

Expand Down
2 changes: 2 additions & 0 deletions core/backend/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,8 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
TensorSplit: c.TensorSplit,
TailFreeSamplingZ: float32(*c.TFZ),
TypicalP: float32(*c.TypicalP),
JSONSchema: c.JSONSchema,
ResponseFormat: c.ResponseFormat,
}

metadata := map[string]string{}
Expand Down
1 change: 1 addition & 0 deletions core/config/model_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ type ModelConfig struct {
functionCallString, functionCallNameString string `yaml:"-" json:"-"`
ResponseFormat string `yaml:"-" json:"-"`
ResponseFormatMap map[string]interface{} `yaml:"-" json:"-"`
JSONSchema string `yaml:"-" json:"-"`

FunctionsConfig functions.FunctionsConfig `yaml:"function,omitempty" json:"function,omitempty"`
ReasoningConfig reasoning.Config `yaml:"reasoning,omitempty" json:"reasoning,omitempty"`
Expand Down
9 changes: 9 additions & 0 deletions core/http/endpoints/openai/chat.go
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,9 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
switch d.Type {
case "json_object":
input.Grammar = functions.JSONBNF
config.ResponseFormat = "json_object"
case "json_schema":
config.ResponseFormat = "json_schema"
d := schema.JsonSchemaRequest{}
dat, err := json.Marshal(config.ResponseFormatMap)
if err != nil {
Expand All @@ -440,6 +442,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
if err != nil {
return err
}

// Pass raw JSON schema to backends that support native structured output
schemaBytes, err := json.Marshal(d.JsonSchema.Schema)
if err == nil {
config.JSONSchema = string(schemaBytes)
}

fs := &functions.JSONFunctionStructure{
AnyOf: []functions.Item{d.JsonSchema.Schema},
}
Expand Down
25 changes: 24 additions & 1 deletion core/http/endpoints/openai/completion.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,31 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
d := schema.ChatCompletionResponseFormat{}
dat, _ := json.Marshal(config.ResponseFormatMap)
_ = json.Unmarshal(dat, &d)
if d.Type == "json_object" {
switch d.Type {
case "json_object":
input.Grammar = functions.JSONBNF
config.ResponseFormat = "json_object"
case "json_schema":
config.ResponseFormat = "json_schema"
jsr := schema.JsonSchemaRequest{}
dat, err := json.Marshal(config.ResponseFormatMap)
if err == nil {
if err := json.Unmarshal(dat, &jsr); err == nil {
schemaBytes, err := json.Marshal(jsr.JsonSchema.Schema)
if err == nil {
config.JSONSchema = string(schemaBytes)
}
fs := &functions.JSONFunctionStructure{
AnyOf: []functions.Item{jsr.JsonSchema.Schema},
}
g, err := fs.Grammar(config.FunctionsConfig.GrammarOptions()...)
if err == nil {
input.Grammar = g
} else {
xlog.Error("Failed generating grammar", "error", err)
}
}
}
}
}

Expand Down
59 changes: 58 additions & 1 deletion docs/content/features/constrained_grammars.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@ url = "/features/constrained_grammars/"
The `chat` endpoint supports the `grammar` parameter, which allows users to specify a grammar in Backus-Naur Form (BNF). This feature enables the Large Language Model (LLM) to generate outputs adhering to a user-defined schema, such as `JSON`, `YAML`, or any other format that can be defined using BNF. For more details about BNF, see [Backus-Naur Form on Wikipedia](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form).

{{% notice note %}}
**Compatibility Notice:** This feature is only supported by models that use the [llama.cpp](https://github.com/ggerganov/llama.cpp) backend. For a complete list of compatible models, refer to the [Model Compatibility]({{%relref "reference/compatibility-table" %}}) page. For technical details, see the related pull requests: [PR #1773](https://github.com/ggerganov/llama.cpp/pull/1773) and [PR #1887](https://github.com/ggerganov/llama.cpp/pull/1887).
**Compatibility Notice:** Grammar and structured output support is available for the following backends:
- **llama.cpp** — supports the `grammar` parameter (GBNF syntax) and `response_format` with `json_schema`/`json_object`
- **vLLM** — supports the `grammar` parameter (via xgrammar) and `response_format` with `json_schema` (native JSON schema enforcement) or `json_object`

For a complete list of compatible models, refer to the [Model Compatibility]({{%relref "reference/compatibility-table" %}}) page.
{{% /notice %}}

## Setup
Expand Down Expand Up @@ -66,6 +70,59 @@ For more complex grammars, you can define multi-line BNF rules. The grammar pars
- Character classes (`[a-z]`)
- String literals (`"text"`)

## vLLM Backend

The vLLM backend supports structured output via three methods:

### JSON Schema (recommended)

Use the OpenAI-compatible `response_format` parameter with `json_schema` to enforce a specific JSON structure:

```bash
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "my-vllm-model",
"messages": [{"role": "user", "content": "Generate a person object"}],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "person",
"schema": {
"type": "object",
"properties": {
"name": {"type": "string"},
"age": {"type": "integer"}
},
"required": ["name", "age"]
}
}
}
}'
```

### JSON Object

Force the model to output valid JSON (without a specific schema):

```bash
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "my-vllm-model",
"messages": [{"role": "user", "content": "Generate a person as JSON"}],
"response_format": {"type": "json_object"}
}'
```

### Grammar

The `grammar` parameter also works with vLLM via xgrammar:

```bash
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "my-vllm-model",
"messages": [{"role": "user", "content": "Do you like apples?"}],
"grammar": "root ::= (\"yes\" | \"no\")"
}'
```

## Related Features

- [OpenAI Functions]({{%relref "features/openai-functions" %}}) - Function calling with structured outputs
Expand Down
Loading