From 06d3d921f367fcd35c4ae008b7869fd2a8995279 Mon Sep 17 00:00:00 2001
From: Ridwannurudeen <nraheemat@gmail.com>
Date: Tue, 24 Feb 2026 16:16:07 +0100
Subject: [PATCH] feat: add structured LLM output support (response_format)

Add response_format parameter to chat() and completion() methods,
enabling JSON schema enforcement for predictable, machine-readable
LLM output. Follows the OpenAI structured outputs specification.

Changes:
- Add ResponseFormat dataclass to types.py
- Thread response_format through all LLM methods (public + internal)
- Add --response-format and --response-format-file CLI options
- Export ResponseFormat from opengradient package
- Add llm_structured_output.py example with sentiment analysis demo

Closes #155
---
 examples/README.md                | 12 ++++
 examples/llm_structured_output.py | 70 +++++++++++++++++++++++
 src/opengradient/__init__.py      |  2 +
 src/opengradient/cli.py           | 63 +++++++++++++++++++++
 src/opengradient/client/llm.py    | 28 +++++++++-
 src/opengradient/types.py         | 42 ++++++++++++++
 tests/client_test.py              | 93 +++++++++++++++++++++++++++++++
 7 files changed, 309 insertions(+), 1 deletion(-)
 create mode 100644 examples/llm_structured_output.py

diff --git a/examples/README.md b/examples/README.md
index acb97be..c48ba97 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -73,6 +73,18 @@ python examples/llm_chat_streaming.py
 - Demonstrates real-time token streaming
 - Returns chunks as they arrive from the model
 
+#### `llm_structured_output.py`
+Demonstrates structured LLM output via JSON schema enforcement.
+
+```bash
+python examples/llm_structured_output.py
+```
+
+**What it does:**
+- Defines a JSON schema for sentiment analysis output
+- Uses `response_format` to constrain the model's response to match the schema
+- Parses the guaranteed-structured JSON response
+
 #### `llm_tool_calling.py`
 Demonstrates LLM tool/function calling.
 
diff --git a/examples/llm_structured_output.py b/examples/llm_structured_output.py
new file mode 100644
index 0000000..c5e8c22
--- /dev/null
+++ b/examples/llm_structured_output.py
@@ -0,0 +1,70 @@
+"""
+Structured LLM output via JSON schema enforcement.
+
+Constrains the model to return a response matching a predefined JSON schema,
+ensuring predictable, machine-readable output.
+
+Usage:
+    export OG_PRIVATE_KEY="your_private_key"
+    python examples/llm_structured_output.py
+"""
+
+import json
+import os
+
+import opengradient as og
+
+client = og.Client(
+    private_key=os.environ.get("OG_PRIVATE_KEY"),  # .get() yields None when the variable is unset
+)
+client.llm.ensure_opg_approval(opg_amount=2)
+
+# response_format as a plain dict: a named JSON schema describing the sentiment result object
+response_format = {
+    "type": "json_schema",
+    "json_schema": {
+        "name": "sentiment_analysis",
+        "strict": True,
+        "schema": {
+            "type": "object",
+            "properties": {
+                "sentiment": {
+                    "type": "string",
+                    "enum": ["positive", "negative", "neutral"],
+                },
+                "confidence": {
+                    "type": "number",
+                    "description": "Confidence score between 0 and 1",
+                },
+                "reasoning": {
+                    "type": "string",
+                    "description": "Brief explanation for the classification",
+                },
+            },
+            "required": ["sentiment", "confidence", "reasoning"],
+            "additionalProperties": False,
+        },
+    },
+}
+
+result = client.llm.chat(
+    model=og.TEE_LLM.GPT_4O,
+    messages=[
+        {
+            "role": "system",
+            "content": "You are a sentiment analysis assistant. Analyze the sentiment of the given text.",
+        },
+        {
+            "role": "user",
+            "content": "I absolutely love this new feature, it makes everything so much easier!",
+        },
+    ],
+    max_tokens=200,
+    response_format=response_format,
+)
+
+# Decode the reply; with the schema applied it is expected to be a JSON object holding these three keys
+output = json.loads(result.chat_output["content"])
+print(f"Sentiment:  {output['sentiment']}")
+print(f"Confidence: {output['confidence']}")
+print(f"Reasoning:  {output['reasoning']}")
diff --git a/src/opengradient/__init__.py b/src/opengradient/__init__.py
index 562ad4e..76da42a 100644
--- a/src/opengradient/__init__.py
+++ b/src/opengradient/__init__.py
@@ -102,6 +102,7 @@
     InferenceResult,
     ModelOutput,
     ModelRepository,
+    ResponseFormat,
     SchedulerParams,
     TextGenerationOutput,
     TextGenerationStream,
@@ -164,6 +165,7 @@ def init(
     "SchedulerParams",
     "CandleType",
     "CandleOrder",
+    "ResponseFormat",
     "TextGenerationOutput",
     "TextGenerationStream",
     "x402SettlementMode",
diff --git a/src/opengradient/cli.py b/src/opengradient/cli.py
index 2c5b07e..d3b617f 100644
--- a/src/opengradient/cli.py
+++ b/src/opengradient/cli.py
@@ -359,6 +359,43 @@ def infer(ctx, model_cid: str, inference_mode: str, input_data, input_file: Path
         click.echo(f"Error running inference: {str(e)}")
 
 
+def _parse_response_format(ctx, response_format: Optional[str], response_format_file: Optional[Path]) -> Optional[dict]:
+    """Parse --response-format / --response-format-file into a dict (or None).
+
+    Echoes a message and calls ``ctx.exit(1)`` for conflicting options, unloadable JSON, or a non-object value.
+    """
+    if response_format and response_format_file:
+        click.echo("Cannot specify both --response-format and --response-format-file")
+        ctx.exit(1)
+        return None
+    if not response_format and not response_format_file:
+        return None
+    # Type check runs after the try blocks: click's ctx.exit() raises, and a handler around it would echo a second error.
+    if response_format:
+        try:
+            parsed = json.loads(response_format)
+        except json.JSONDecodeError as e:
+            click.echo(f"Failed to parse --response-format JSON: {e}")
+            ctx.exit(1)
+            return None
+        not_object_message = "--response-format must be a JSON object"
+    else:
+        try:
+            with response_format_file.open("r", encoding="utf-8") as f:
+                parsed = json.load(f)
+        except (OSError, ValueError) as e:  # unreadable file, bad encoding, or invalid JSON
+            click.echo(f"Failed to load response format from file: {e}")
+            ctx.exit(1)
+            return None
+        not_object_message = "Response format file must contain a JSON object"
+
+    if not isinstance(parsed, dict):
+        click.echo(not_object_message)
+        ctx.exit(1)
+        return None
+    return parsed
+
+
 @cli.command()
 @click.option(
     "--model",
@@ -378,6 +415,13 @@ def infer(ctx, model_cid: str, inference_mode: str, input_data, input_file: Path
     default="settle-batch",
     help="Settlement mode for x402 payments: settle (payment only), settle-batch (batched, default), settle-metadata (full data)",
 )
+@click.option("--response-format", type=str, default=None, help="Response format config as JSON for structured outputs")
+@click.option(
+    "--response-format-file",
+    type=click.Path(exists=True, path_type=Path),
+    default=None,
+    help="Path to JSON file containing response format configuration",
+)
 @click.pass_context
 def completion(
     ctx,
@@ -387,6 +431,8 @@ def completion(
     max_tokens: int,
     stop_sequence: List[str],
     temperature: float,
+    response_format: Optional[str],
+    response_format_file: Optional[Path],
 ):
     """
     Run completion inference on an LLM model via TEE.
@@ -404,6 +450,9 @@ def completion(
     try:
         click.echo(f'Running TEE LLM completion for model "{model_cid}"\n')
 
+        # Parse response format
+        parsed_response_format = _parse_response_format(ctx, response_format, response_format_file)
+
         completion_output = client.llm.completion(
             model=model_cid,
             prompt=prompt,
@@ -411,6 +460,7 @@ def completion(
             stop_sequence=list(stop_sequence),
             temperature=temperature,
             x402_settlement_mode=x402SettlementModes[x402_settlement_mode],
+            response_format=parsed_response_format,
         )
 
         print_llm_completion_result(model_cid, completion_output.transaction_hash, completion_output.completion_output, is_vanilla=False)
@@ -472,6 +522,13 @@ def print_llm_completion_result(model_cid, tx_hash, llm_output, is_vanilla=True)
     help="Settlement mode for x402 payments: settle (payment only), settle-batch (batched, default), settle-metadata (full data)",
 )
 @click.option("--stream", is_flag=True, default=False, help="Stream the output from the LLM")
+@click.option("--response-format", type=str, default=None, help="Response format config as JSON for structured outputs")
+@click.option(
+    "--response-format-file",
+    type=click.Path(exists=True, path_type=Path),
+    default=None,
+    help="Path to JSON file containing response format configuration",
+)
 @click.pass_context
 def chat(
     ctx,
@@ -486,6 +543,8 @@ def chat(
     tool_choice: Optional[str],
     x402_settlement_mode: Optional[str],
     stream: bool,
+    response_format: Optional[str],
+    response_format_file: Optional[Path],
 ):
     """
     Run chat inference on an LLM model via TEE.
@@ -562,6 +621,9 @@ def chat(
         if not tools and not tools_file:
             parsed_tools = None
 
+        # Parse response format
+        parsed_response_format = _parse_response_format(ctx, response_format, response_format_file)
+
         result = client.llm.chat(
             model=model_cid,
             messages=messages,
@@ -572,6 +634,7 @@ def chat(
             tool_choice=tool_choice,
             x402_settlement_mode=x402SettlementModes[x402_settlement_mode],
             stream=stream,
+            response_format=parsed_response_format,
         )
 
         # Handle response based on streaming flag
diff --git a/src/opengradient/client/llm.py b/src/opengradient/client/llm.py
index e1490cb..106230a 100644
--- a/src/opengradient/client/llm.py
+++ b/src/opengradient/client/llm.py
@@ -14,7 +14,7 @@
 from x402v2.mechanisms.evm.exact.register import register_exact_evm_client as register_exact_evm_clientv2
 from x402v2.mechanisms.evm.upto.register import register_upto_evm_client as register_upto_evm_clientv2
 
-from ..types import TEE_LLM, StreamChunk, TextGenerationOutput, TextGenerationStream, x402SettlementMode
+from ..types import TEE_LLM, ResponseFormat, StreamChunk, TextGenerationOutput, TextGenerationStream, x402SettlementMode
 from .exceptions import OpenGradientError
 from .opg_token import Permit2ApprovalResult, ensure_opg_approval
 
@@ -148,6 +148,7 @@ def completion(
         stop_sequence: Optional[List[str]] = None,
         temperature: float = 0.0,
         x402_settlement_mode: Optional[x402SettlementMode] = x402SettlementMode.SETTLE_BATCH,
+        response_format: Optional[Union[Dict, ResponseFormat]] = None,
     ) -> TextGenerationOutput:
         """
         Perform inference on an LLM model using completions via TEE.
@@ -163,6 +164,10 @@ def completion(
                 - SETTLE_BATCH: Aggregates multiple inferences into batch hashes (most cost-efficient).
                 - SETTLE_METADATA: Records full model info, complete input/output data, and all metadata.
                 Defaults to SETTLE_BATCH.
+            response_format (dict or ResponseFormat, optional): Constrain the output format.
+                Use ``{"type": "json_object"}`` for freeform JSON, or provide a full JSON schema
+                via ``ResponseFormat`` / a dict with ``type`` and ``json_schema`` keys.
+                See `OpenAI structured outputs <https://platform.openai.com/docs/guides/structured-outputs>`_.
 
         Returns:
             TextGenerationOutput: Generated text results including:
@@ -180,6 +185,7 @@ def completion(
             stop_sequence=stop_sequence,
             temperature=temperature,
             x402_settlement_mode=x402_settlement_mode,
+            response_format=response_format,
         )
 
     def _tee_llm_completion(
@@ -190,6 +196,7 @@ def _tee_llm_completion(
         stop_sequence: Optional[List[str]] = None,
         temperature: float = 0.0,
         x402_settlement_mode: Optional[x402SettlementMode] = x402SettlementMode.SETTLE_BATCH,
+        response_format: Optional[Union[Dict, ResponseFormat]] = None,
     ) -> TextGenerationOutput:
         """
         Route completion request to OpenGradient TEE LLM server with x402 payments.
@@ -212,6 +219,9 @@ async def make_request_v2():
             if stop_sequence:
                 payload["stop"] = stop_sequence
 
+            if response_format is not None:
+                payload["response_format"] = response_format.to_dict() if isinstance(response_format, ResponseFormat) else response_format
+
             try:
                 response = await self._request_client.post(
                     self._og_llm_server_url + "/v1/completions", json=payload, headers=headers, timeout=60
@@ -246,6 +256,7 @@ def chat(
         tool_choice: Optional[str] = None,
         x402_settlement_mode: Optional[x402SettlementMode] = x402SettlementMode.SETTLE_BATCH,
         stream: bool = False,
+        response_format: Optional[Union[Dict, ResponseFormat]] = None,
     ) -> Union[TextGenerationOutput, TextGenerationStream]:
         """
         Perform inference on an LLM model using chat via TEE.
@@ -264,6 +275,10 @@ def chat(
                 - SETTLE_METADATA: Records full model info, complete input/output data, and all metadata.
                 Defaults to SETTLE_BATCH.
             stream (bool, optional): Whether to stream the response. Default is False.
+            response_format (dict or ResponseFormat, optional): Constrain the output format.
+                Use ``{"type": "json_object"}`` for freeform JSON, or provide a full JSON schema
+                via ``ResponseFormat`` / a dict with ``type`` and ``json_schema`` keys.
+                See `OpenAI structured outputs <https://platform.openai.com/docs/guides/structured-outputs>`_.
 
         Returns:
             Union[TextGenerationOutput, TextGenerationStream]:
@@ -284,6 +299,7 @@ def chat(
                 tools=tools,
                 tool_choice=tool_choice,
                 x402_settlement_mode=x402_settlement_mode,
+                response_format=response_format,
             )
         else:
             # Non-streaming
@@ -296,6 +312,7 @@ def chat(
                 tools=tools,
                 tool_choice=tool_choice,
                 x402_settlement_mode=x402_settlement_mode,
+                response_format=response_format,
             )
 
     def _tee_llm_chat(
@@ -308,6 +325,7 @@ def _tee_llm_chat(
         tools: Optional[List[Dict]] = None,
         tool_choice: Optional[str] = None,
         x402_settlement_mode: x402SettlementMode = x402SettlementMode.SETTLE_BATCH,
+        response_format: Optional[Union[Dict, ResponseFormat]] = None,
     ) -> TextGenerationOutput:
         """
         Route chat request to OpenGradient TEE LLM server with x402 payments.
@@ -334,6 +352,9 @@ async def make_request_v2():
                 payload["tools"] = tools
                 payload["tool_choice"] = tool_choice or "auto"
 
+            if response_format is not None:
+                payload["response_format"] = response_format.to_dict() if isinstance(response_format, ResponseFormat) else response_format
+
             try:
                 endpoint = "/v1/chat/completions"
                 response = await self._request_client.post(
@@ -374,6 +395,7 @@ def _tee_llm_chat_stream_sync(
         tools: Optional[List[Dict]] = None,
         tool_choice: Optional[str] = None,
         x402_settlement_mode: x402SettlementMode = x402SettlementMode.SETTLE_BATCH,
+        response_format: Optional[Union[Dict, ResponseFormat]] = None,
     ):
         """
         Sync streaming using threading bridge - TRUE real-time streaming.
@@ -395,6 +417,7 @@ async def _stream():
                     tools=tools,
                     tool_choice=tool_choice,
                     x402_settlement_mode=x402_settlement_mode,
+                    response_format=response_format,
                 ):
                     queue.put(chunk)
             except Exception as e:
@@ -430,6 +453,7 @@ async def _tee_llm_chat_stream_async(
         tools: Optional[List[Dict]] = None,
         tool_choice: Optional[str] = None,
         x402_settlement_mode: x402SettlementMode = x402SettlementMode.SETTLE_BATCH,
+        response_format: Optional[Union[Dict, ResponseFormat]] = None,
     ):
         """
         Internal async streaming implementation for TEE LLM with x402 payments.
@@ -455,6 +479,8 @@ async def _tee_llm_chat_stream_async(
         if tools:
             payload["tools"] = tools
             payload["tool_choice"] = tool_choice or "auto"
+        if response_format is not None:
+            payload["response_format"] = response_format.to_dict() if isinstance(response_format, ResponseFormat) else response_format
 
         async def _parse_sse_response(response) -> AsyncGenerator[StreamChunk, None]:
             status_code = getattr(response, "status_code", None)
diff --git a/src/opengradient/types.py b/src/opengradient/types.py
index 866bfba..cab35d7 100644
--- a/src/opengradient/types.py
+++ b/src/opengradient/types.py
@@ -10,6 +10,48 @@
 import numpy as np
 
 
+@dataclass
+class ResponseFormat:
+    """Configuration for structured LLM output following the OpenAI structured outputs spec.
+
+    Constrains the model to produce output matching a JSON schema, ensuring
+    predictable, machine-readable responses.
+
+    Attributes:
+        type: The response format type. Use ``"json_schema"`` for schema-constrained
+            output, or ``"json_object"`` for freeform JSON.
+        json_schema: Schema definition when ``type`` is ``"json_schema"``.
+            Must include ``name`` (str) and ``schema`` (JSON Schema object).
+            May include ``description`` (str) and ``strict`` (bool); this class fills in no defaults.
+
+    Examples:
+        >>> fmt = ResponseFormat(
+        ...     type="json_schema",
+        ...     json_schema={
+        ...         "name": "sentiment",
+        ...         "schema": {
+        ...             "type": "object",
+        ...             "properties": {
+        ...                 "label": {"type": "string", "enum": ["positive", "negative", "neutral"]},
+        ...                 "score": {"type": "number"},
+        ...             },
+        ...             "required": ["label", "score"],
+        ...         },
+        ...     },
+        ... )
+    """
+
+    type: str
+    json_schema: Optional[Dict] = None
+
+    def to_dict(self) -> Dict:
+        """Return ``{"type": ...}``, plus ``json_schema`` (the same dict object, not a copy) when it is set."""
+        d: Dict = {"type": self.type}
+        if self.json_schema is not None:
+            d["json_schema"] = self.json_schema
+        return d
+
+
 class x402SettlementMode(str, Enum):
     """
     Settlement modes for x402 payment protocol transactions.
diff --git a/tests/client_test.py b/tests/client_test.py
index f17283b..f1ad927 100644
--- a/tests/client_test.py
+++ b/tests/client_test.py
@@ -10,6 +10,7 @@
 from src.opengradient.client import Client
 from src.opengradient.types import (
     TEE_LLM,
+    ResponseFormat,
     StreamChunk,
     TextGenerationOutput,
     x402SettlementMode,
@@ -319,3 +320,95 @@ def test_settlement_mode_aliases(self):
         """Test settlement mode aliases."""
         assert x402SettlementMode.SETTLE_INDIVIDUAL == x402SettlementMode.SETTLE
         assert x402SettlementMode.SETTLE_INDIVIDUAL_WITH_METADATA == x402SettlementMode.SETTLE_METADATA
+
+
+# --- ResponseFormat Tests ---
+
+
+class TestResponseFormat:
+    def test_json_object_format(self):
+        """A format built with only ``type`` serializes to a dict holding just that key."""
+        fmt = ResponseFormat(type="json_object")
+        assert fmt.to_dict() == {"type": "json_object"}
+
+    def test_json_schema_format(self):
+        """A json_schema format carries its schema dict through ``to_dict``."""
+        schema = {
+            "name": "sentiment",
+            "schema": {
+                "type": "object",
+                "properties": {
+                    "label": {"type": "string", "enum": ["positive", "negative"]},
+                    "score": {"type": "number"},
+                },
+                "required": ["label", "score"],
+            },
+        }
+        fmt = ResponseFormat(type="json_schema", json_schema=schema)
+        result = fmt.to_dict()
+
+        assert result["type"] == "json_schema"
+        assert result["json_schema"]["name"] == "sentiment"
+        assert "properties" in result["json_schema"]["schema"]
+
+    def test_json_schema_none_omitted(self):
+        """``to_dict`` leaves the ``json_schema`` key out when none was provided."""
+        fmt = ResponseFormat(type="json_object")
+        result = fmt.to_dict()
+        assert "json_schema" not in result
+
+    def test_completion_passes_response_format(self, client):
+        """``completion`` passes the same ResponseFormat object to ``_tee_llm_completion`` as a keyword."""
+        fmt = ResponseFormat(type="json_object")
+        with patch.object(client.llm, "_tee_llm_completion") as mock_tee:
+            mock_tee.return_value = TextGenerationOutput(
+                transaction_hash="external",
+                completion_output='{"result": "ok"}',
+            )
+
+            client.llm.completion(
+                model=TEE_LLM.GPT_4O,
+                prompt="Return JSON",
+                response_format=fmt,
+            )
+
+            call_kwargs = mock_tee.call_args[1]  # keyword arguments of the recorded call
+            assert call_kwargs["response_format"] is fmt
+
+    def test_chat_passes_response_format(self, client):
+        """Non-streaming ``chat`` passes the same dict object to ``_tee_llm_chat`` as a keyword."""
+        fmt = {"type": "json_object"}
+        with patch.object(client.llm, "_tee_llm_chat") as mock_tee:
+            mock_tee.return_value = TextGenerationOutput(
+                transaction_hash="external",
+                chat_output={"role": "assistant", "content": '{"ok": true}'},
+                finish_reason="stop",
+            )
+
+            client.llm.chat(
+                model=TEE_LLM.GPT_4O,
+                messages=[{"role": "user", "content": "Return JSON"}],
+                response_format=fmt,
+            )
+
+            call_kwargs = mock_tee.call_args[1]  # keyword arguments of the recorded call
+            assert call_kwargs["response_format"] is fmt
+
+    def test_chat_stream_passes_response_format(self, client):
+        """Streaming ``chat`` passes the same object to ``_tee_llm_chat_stream_sync`` as a keyword."""
+        fmt = ResponseFormat(type="json_object")
+        with patch.object(client.llm, "_tee_llm_chat_stream_sync") as mock_stream:
+            mock_stream.return_value = iter([
+                StreamChunk(choices=[], model="gpt-4o", is_final=True),
+            ])
+
+            result = client.llm.chat(
+                model=TEE_LLM.GPT_4O,
+                messages=[{"role": "user", "content": "Return JSON"}],
+                stream=True,
+                response_format=fmt,
+            )
+            list(result)  # consume the returned iterator before inspecting the mock
+
+            call_kwargs = mock_stream.call_args[1]  # keyword arguments of the recorded call
+            assert call_kwargs["response_format"] is fmt