OpenGradient · Ridwannurudeen · Feb 24, 2026
diff --git a/examples/README.md b/examples/README.md
@@ -73,6 +73,18 @@ python examples/llm_chat_streaming.py
 - Demonstrates real-time token streaming
 - Returns chunks as they arrive from the model
 
+#### `llm_structured_output.py`
+Demonstrates structured LLM output via JSON schema enforcement.
+
+```bash
+python examples/llm_structured_output.py
+```
+
+**What it does:**
+- Defines a JSON schema for sentiment analysis output
+- Uses `response_format` to constrain the model's response to match the schema
+- Parses the schema-conforming JSON response and prints its fields
+
 #### `llm_tool_calling.py`
 Demonstrates LLM tool/function calling.
 

diff --git a/examples/llm_structured_output.py b/examples/llm_structured_output.py
@@ -0,0 +1,70 @@
+"""
+Sentiment analysis with schema-constrained (structured) LLM output.
+
+Sends a chat request whose ``response_format`` carries a JSON schema, then
+decodes the JSON reply and prints each field.
+
+Usage:
+    export OG_PRIVATE_KEY="your_private_key"
+    python examples/llm_structured_output.py
+"""
+
+import json
+import os
+
+import opengradient as og
+
+# Shape of the reply: all three properties are required and extra keys are disallowed.
+sentiment_schema = {
+    "type": "object",
+    "properties": {
+        "sentiment": {
+            "type": "string",
+            "enum": ["positive", "negative", "neutral"],
+        },
+        "confidence": {
+            "type": "number",
+            "description": "Confidence score between 0 and 1",
+        },
+        "reasoning": {
+            "type": "string",
+            "description": "Brief explanation for the classification",
+        },
+    },
+    "required": ["sentiment", "confidence", "reasoning"],
+    "additionalProperties": False,
+}
+
+# Envelope passed as ``response_format``; it names the schema and sets ``strict``.
+response_format = {
+    "type": "json_schema",
+    "json_schema": {"name": "sentiment_analysis", "strict": True, "schema": sentiment_schema},
+}
+
+# Conversation: a system instruction followed by the text to classify.
+messages = [
+    {
+        "role": "system",
+        "content": "You are a sentiment analysis assistant. Analyze the sentiment of the given text.",
+    },
+    {
+        "role": "user",
+        "content": "I absolutely love this new feature, it makes everything so much easier!",
+    },
+]
+
+client = og.Client(private_key=os.environ.get("OG_PRIVATE_KEY"))
+client.llm.ensure_opg_approval(opg_amount=2)
+
+result = client.llm.chat(
+    model=og.TEE_LLM.GPT_4O,
+    messages=messages,
+    max_tokens=200,
+    response_format=response_format,
+)
+
+# The schema-constrained reply arrives as a JSON string in the message content.
+output = json.loads(result.chat_output["content"])
+print(f"Sentiment:  {output['sentiment']}")
+print(f"Confidence: {output['confidence']}")
+print(f"Reasoning:  {output['reasoning']}")
diff --git a/src/opengradient/__init__.py b/src/opengradient/__init__.py
@@ -102,6 +102,7 @@
     InferenceResult,
     ModelOutput,
     ModelRepository,
+    ResponseFormat,
     SchedulerParams,
     TextGenerationOutput,
     TextGenerationStream,
@@ -164,6 +165,7 @@ def init(
     "SchedulerParams",
     "CandleType",
     "CandleOrder",
+    "ResponseFormat",
     "TextGenerationOutput",
     "TextGenerationStream",
     "x402SettlementMode",

diff --git a/src/opengradient/cli.py b/src/opengradient/cli.py
@@ -359,6 +359,43 @@ def infer(ctx, model_cid: str, inference_mode: str, input_data, input_file: Path
         click.echo(f"Error running inference: {str(e)}")
 
 
+def _parse_response_format(ctx, response_format: Optional[str], response_format_file: Optional[Path]) -> Optional[dict]:
+    """Parse --response-format / --response-format-file into a dict (or None).
+
+    Echoes an error and calls ``ctx.exit(1)`` if both options are given or the JSON is unreadable or not an object.
+    """
+    if response_format and response_format_file:
+        click.echo("Cannot specify both --response-format and --response-format-file")
+        ctx.exit(1)
+        return None
+
+    if response_format:
+        try:
+            parsed = json.loads(response_format)
+        except json.JSONDecodeError as e:
+            click.echo(f"Failed to parse --response-format JSON: {e}")
+            ctx.exit(1)
+            return None
+        not_object_error = "--response-format must be a JSON object"
+    elif response_format_file:
+        try:
+            with response_format_file.open("r") as f:
+                parsed = json.load(f)
+        except (OSError, ValueError) as e:
+            click.echo(f"Failed to load response format from file: {e}")
+            ctx.exit(1)
+            return None
+        not_object_error = "Response format file must contain a JSON object"
+    else:
+        return None
+    # Validate outside the try blocks: ctx.exit() raises click.exceptions.Exit, which ``except Exception`` would swallow.
+    if not isinstance(parsed, dict):
+        click.echo(not_object_error)
+        ctx.exit(1)
+        return None
+    return parsed
+
+
 @cli.command()
 @click.option(
     "--model",
@@ -378,6 +415,13 @@ def infer(ctx, model_cid: str, inference_mode: str, input_data, input_file: Path
     default="settle-batch",
     help="Settlement mode for x402 payments: settle (payment only), settle-batch (batched, default), settle-metadata (full data)",
 )
+@click.option("--response-format", type=str, default=None, help="Response format config as JSON for structured outputs")
+@click.option(
+    "--response-format-file",
+    type=click.Path(exists=True, path_type=Path),
+    default=None,
+    help="Path to JSON file containing response format configuration",
+)
 @click.pass_context
 def completion(
     ctx,
@@ -387,6 +431,8 @@ def completion(
     max_tokens: int,
     stop_sequence: List[str],
     temperature: float,
+    response_format: Optional[str],
+    response_format_file: Optional[Path],
 ):
     """
     Run completion inference on an LLM model via TEE.
@@ -404,13 +450,17 @@ def completion(
     try:
         click.echo(f'Running TEE LLM completion for model "{model_cid}"\n')
 
+        # Parse response format
+        parsed_response_format = _parse_response_format(ctx, response_format, response_format_file)
+
         completion_output = client.llm.completion(
             model=model_cid,
             prompt=prompt,
             max_tokens=max_tokens,
             stop_sequence=list(stop_sequence),
             temperature=temperature,
             x402_settlement_mode=x402SettlementModes[x402_settlement_mode],
+            response_format=parsed_response_format,
         )
 
         print_llm_completion_result(model_cid, completion_output.transaction_hash, completion_output.completion_output, is_vanilla=False)
@@ -472,6 +522,13 @@ def print_llm_completion_result(model_cid, tx_hash, llm_output, is_vanilla=True)
     help="Settlement mode for x402 payments: settle (payment only), settle-batch (batched, default), settle-metadata (full data)",
 )
 @click.option("--stream", is_flag=True, default=False, help="Stream the output from the LLM")
+@click.option("--response-format", type=str, default=None, help="Response format config as JSON for structured outputs")
+@click.option(
+    "--response-format-file",
+    type=click.Path(exists=True, path_type=Path),
+    default=None,
+    help="Path to JSON file containing response format configuration",
+)
 @click.pass_context
 def chat(
     ctx,
@@ -486,6 +543,8 @@ def chat(
     tool_choice: Optional[str],
     x402_settlement_mode: Optional[str],
     stream: bool,
+    response_format: Optional[str],
+    response_format_file: Optional[Path],
 ):
     """
     Run chat inference on an LLM model via TEE.
@@ -562,6 +621,9 @@ def chat(
         if not tools and not tools_file:
             parsed_tools = None
 
+        # Parse response format
+        parsed_response_format = _parse_response_format(ctx, response_format, response_format_file)
+
         result = client.llm.chat(
             model=model_cid,
             messages=messages,
@@ -572,6 +634,7 @@ def chat(
             tool_choice=tool_choice,
             x402_settlement_mode=x402SettlementModes[x402_settlement_mode],
             stream=stream,
+            response_format=parsed_response_format,
         )
 
         # Handle response based on streaming flag

diff --git a/src/opengradient/client/llm.py b/src/opengradient/client/llm.py
@@ -14,7 +14,7 @@
 from x402v2.mechanisms.evm.exact.register import register_exact_evm_client as register_exact_evm_clientv2
 from x402v2.mechanisms.evm.upto.register import register_upto_evm_client as register_upto_evm_clientv2
 
-from ..types import TEE_LLM, StreamChunk, TextGenerationOutput, TextGenerationStream, x402SettlementMode
+from ..types import TEE_LLM, ResponseFormat, StreamChunk, TextGenerationOutput, TextGenerationStream, x402SettlementMode
 from .exceptions import OpenGradientError
 from .opg_token import Permit2ApprovalResult, ensure_opg_approval
 
@@ -148,6 +148,7 @@ def completion(
         stop_sequence: Optional[List[str]] = None,
         temperature: float = 0.0,
         x402_settlement_mode: Optional[x402SettlementMode] = x402SettlementMode.SETTLE_BATCH,
+        response_format: Optional[Union[Dict, ResponseFormat]] = None,
     ) -> TextGenerationOutput:
         """
         Perform inference on an LLM model using completions via TEE.
@@ -163,6 +164,10 @@ def completion(
                 - SETTLE_BATCH: Aggregates multiple inferences into batch hashes (most cost-efficient).
                 - SETTLE_METADATA: Records full model info, complete input/output data, and all metadata.
                 Defaults to SETTLE_BATCH.
+            response_format (dict or ResponseFormat, optional): Constrain the output format.
+                Use ``{"type": "json_object"}`` for freeform JSON, or provide a full JSON schema
+                via ``ResponseFormat`` / a dict with ``type`` and ``json_schema`` keys.
+                See `OpenAI structured outputs <https://platform.openai.com/docs/guides/structured-outputs>`_.
 
         Returns:
             TextGenerationOutput: Generated text results including:
@@ -180,6 +185,7 @@ def completion(
             stop_sequence=stop_sequence,
             temperature=temperature,
             x402_settlement_mode=x402_settlement_mode,
+            response_format=response_format,
         )
 
     def _tee_llm_completion(
@@ -190,6 +196,7 @@ def _tee_llm_completion(
         stop_sequence: Optional[List[str]] = None,
         temperature: float = 0.0,
         x402_settlement_mode: Optional[x402SettlementMode] = x402SettlementMode.SETTLE_BATCH,
+        response_format: Optional[Union[Dict, ResponseFormat]] = None,
     ) -> TextGenerationOutput:
         """
         Route completion request to OpenGradient TEE LLM server with x402 payments.
@@ -212,6 +219,9 @@ async def make_request_v2():
             if stop_sequence:
                 payload["stop"] = stop_sequence
 
+            if response_format is not None:
+                payload["response_format"] = response_format.to_dict() if isinstance(response_format, ResponseFormat) else response_format
+
             try:
                 response = await self._request_client.post(
                     self._og_llm_server_url + "/v1/completions", json=payload, headers=headers, timeout=60
@@ -246,6 +256,7 @@ def chat(
         tool_choice: Optional[str] = None,
         x402_settlement_mode: Optional[x402SettlementMode] = x402SettlementMode.SETTLE_BATCH,
         stream: bool = False,
+        response_format: Optional[Union[Dict, ResponseFormat]] = None,
     ) -> Union[TextGenerationOutput, TextGenerationStream]:
         """
         Perform inference on an LLM model using chat via TEE.
@@ -264,6 +275,10 @@ def chat(
                 - SETTLE_METADATA: Records full model info, complete input/output data, and all metadata.
                 Defaults to SETTLE_BATCH.
             stream (bool, optional): Whether to stream the response. Default is False.
+            response_format (dict or ResponseFormat, optional): Constrain the output format.
+                Use ``{"type": "json_object"}`` for freeform JSON, or provide a full JSON schema
+                via ``ResponseFormat`` / a dict with ``type`` and ``json_schema`` keys.
+                See `OpenAI structured outputs <https://platform.openai.com/docs/guides/structured-outputs>`_.
 
         Returns:
             Union[TextGenerationOutput, TextGenerationStream]:
@@ -284,6 +299,7 @@ def chat(
                 tools=tools,
                 tool_choice=tool_choice,
                 x402_settlement_mode=x402_settlement_mode,
+                response_format=response_format,
             )
         else:
             # Non-streaming
@@ -296,6 +312,7 @@ def chat(
                 tools=tools,
                 tool_choice=tool_choice,
                 x402_settlement_mode=x402_settlement_mode,
+                response_format=response_format,
             )
 
     def _tee_llm_chat(
@@ -308,6 +325,7 @@ def _tee_llm_chat(
         tools: Optional[List[Dict]] = None,
         tool_choice: Optional[str] = None,
         x402_settlement_mode: x402SettlementMode = x402SettlementMode.SETTLE_BATCH,
+        response_format: Optional[Union[Dict, ResponseFormat]] = None,
     ) -> TextGenerationOutput:
         """
         Route chat request to OpenGradient TEE LLM server with x402 payments.
@@ -334,6 +352,9 @@ async def make_request_v2():
                 payload["tools"] = tools
                 payload["tool_choice"] = tool_choice or "auto"
 
+            if response_format is not None:
+                payload["response_format"] = response_format.to_dict() if isinstance(response_format, ResponseFormat) else response_format
+
             try:
                 endpoint = "/v1/chat/completions"
                 response = await self._request_client.post(
@@ -374,6 +395,7 @@ def _tee_llm_chat_stream_sync(
         tools: Optional[List[Dict]] = None,
         tool_choice: Optional[str] = None,
         x402_settlement_mode: x402SettlementMode = x402SettlementMode.SETTLE_BATCH,
+        response_format: Optional[Union[Dict, ResponseFormat]] = None,
     ):
         """
         Sync streaming using threading bridge - TRUE real-time streaming.
@@ -395,6 +417,7 @@ async def _stream():
                     tools=tools,
                     tool_choice=tool_choice,
                     x402_settlement_mode=x402_settlement_mode,
+                    response_format=response_format,
                 ):
                     queue.put(chunk)
             except Exception as e:
@@ -430,6 +453,7 @@ async def _tee_llm_chat_stream_async(
         tools: Optional[List[Dict]] = None,
         tool_choice: Optional[str] = None,
         x402_settlement_mode: x402SettlementMode = x402SettlementMode.SETTLE_BATCH,
+        response_format: Optional[Union[Dict, ResponseFormat]] = None,
     ):
         """
         Internal async streaming implementation for TEE LLM with x402 payments.
@@ -455,6 +479,8 @@ async def _tee_llm_chat_stream_async(
         if tools:
             payload["tools"] = tools
             payload["tool_choice"] = tool_choice or "auto"
+        if response_format is not None:
+            payload["response_format"] = response_format.to_dict() if isinstance(response_format, ResponseFormat) else response_format
 
         async def _parse_sse_response(response) -> AsyncGenerator[StreamChunk, None]:
             status_code = getattr(response, "status_code", None)