From 06d3d921f367fcd35c4ae008b7869fd2a8995279 Mon Sep 17 00:00:00 2001 From: Ridwannurudeen Date: Tue, 24 Feb 2026 16:16:07 +0100 Subject: [PATCH] feat: add structured LLM output support (response_format) Add response_format parameter to chat() and completion() methods, enabling JSON schema enforcement for predictable, machine-readable LLM output. Follows the OpenAI structured outputs specification. Changes: - Add ResponseFormat dataclass to types.py - Thread response_format through all LLM methods (public + internal) - Add --response-format and --response-format-file CLI options - Export ResponseFormat from opengradient package - Add llm_structured_output.py example with sentiment analysis demo Closes #155 --- examples/README.md | 12 ++++ examples/llm_structured_output.py | 70 +++++++++++++++++++++++ src/opengradient/__init__.py | 2 + src/opengradient/cli.py | 63 +++++++++++++++++++++ src/opengradient/client/llm.py | 28 +++++++++- src/opengradient/types.py | 42 ++++++++++++++ tests/client_test.py | 93 +++++++++++++++++++++++++++++++ 7 files changed, 309 insertions(+), 1 deletion(-) create mode 100644 examples/llm_structured_output.py diff --git a/examples/README.md b/examples/README.md index acb97be..c48ba97 100644 --- a/examples/README.md +++ b/examples/README.md @@ -73,6 +73,18 @@ python examples/llm_chat_streaming.py - Demonstrates real-time token streaming - Returns chunks as they arrive from the model +#### `llm_structured_output.py` +Demonstrates structured LLM output via JSON schema enforcement. + +```bash +python examples/llm_structured_output.py +``` + +**What it does:** +- Defines a JSON schema for sentiment analysis output +- Uses `response_format` to constrain the model's response to match the schema +- Parses the guaranteed-structured JSON response + #### `llm_tool_calling.py` Demonstrates LLM tool/function calling. 
diff --git a/examples/llm_structured_output.py b/examples/llm_structured_output.py new file mode 100644 index 0000000..c5e8c22 --- /dev/null +++ b/examples/llm_structured_output.py @@ -0,0 +1,70 @@ +""" +Structured LLM output via JSON schema enforcement. + +Constrains the model to return a response matching a predefined JSON schema, +ensuring predictable, machine-readable output. + +Usage: + export OG_PRIVATE_KEY="your_private_key" + python examples/llm_structured_output.py +""" + +import json +import os + +import opengradient as og + +client = og.Client( + private_key=os.environ.get("OG_PRIVATE_KEY"), +) +client.llm.ensure_opg_approval(opg_amount=2) + +# Define a JSON schema for sentiment analysis output +response_format = { + "type": "json_schema", + "json_schema": { + "name": "sentiment_analysis", + "strict": True, + "schema": { + "type": "object", + "properties": { + "sentiment": { + "type": "string", + "enum": ["positive", "negative", "neutral"], + }, + "confidence": { + "type": "number", + "description": "Confidence score between 0 and 1", + }, + "reasoning": { + "type": "string", + "description": "Brief explanation for the classification", + }, + }, + "required": ["sentiment", "confidence", "reasoning"], + "additionalProperties": False, + }, + }, +} + +result = client.llm.chat( + model=og.TEE_LLM.GPT_4O, + messages=[ + { + "role": "system", + "content": "You are a sentiment analysis assistant. 
Analyze the sentiment of the given text.", + }, + { + "role": "user", + "content": "I absolutely love this new feature, it makes everything so much easier!", + }, + ], + max_tokens=200, + response_format=response_format, +) + +# The response content is guaranteed to match the schema +output = json.loads(result.chat_output["content"]) +print(f"Sentiment: {output['sentiment']}") +print(f"Confidence: {output['confidence']}") +print(f"Reasoning: {output['reasoning']}") diff --git a/src/opengradient/__init__.py b/src/opengradient/__init__.py index 562ad4e..76da42a 100644 --- a/src/opengradient/__init__.py +++ b/src/opengradient/__init__.py @@ -102,6 +102,7 @@ InferenceResult, ModelOutput, ModelRepository, + ResponseFormat, SchedulerParams, TextGenerationOutput, TextGenerationStream, @@ -164,6 +165,7 @@ def init( "SchedulerParams", "CandleType", "CandleOrder", + "ResponseFormat", "TextGenerationOutput", "TextGenerationStream", "x402SettlementMode", diff --git a/src/opengradient/cli.py b/src/opengradient/cli.py index 2c5b07e..d3b617f 100644 --- a/src/opengradient/cli.py +++ b/src/opengradient/cli.py @@ -359,6 +359,43 @@ def infer(ctx, model_cid: str, inference_mode: str, input_data, input_file: Path click.echo(f"Error running inference: {str(e)}") +def _parse_response_format(ctx, response_format: Optional[str], response_format_file: Optional[Path]) -> Optional[dict]: + """Parse --response-format / --response-format-file into a dict (or None).""" + if response_format and response_format_file: + click.echo("Cannot specify both --response-format and --response-format-file") + ctx.exit(1) + return None + + if response_format: + try: + parsed = json.loads(response_format) + if not isinstance(parsed, dict): + click.echo("--response-format must be a JSON object") + ctx.exit(1) + return None + return parsed + except json.JSONDecodeError as e: + click.echo(f"Failed to parse --response-format JSON: {e}") + ctx.exit(1) + return None + + if response_format_file: + try: + with 
response_format_file.open("r") as f: + parsed = json.load(f) + if not isinstance(parsed, dict): + click.echo("Response format file must contain a JSON object") + ctx.exit(1) + return None + return parsed + except Exception as e: + click.echo(f"Failed to load response format from file: {e}") + ctx.exit(1) + return None + + return None + + @cli.command() @click.option( "--model", @@ -378,6 +415,13 @@ def infer(ctx, model_cid: str, inference_mode: str, input_data, input_file: Path default="settle-batch", help="Settlement mode for x402 payments: settle (payment only), settle-batch (batched, default), settle-metadata (full data)", ) +@click.option("--response-format", type=str, default=None, help="Response format config as JSON for structured outputs") +@click.option( + "--response-format-file", + type=click.Path(exists=True, path_type=Path), + default=None, + help="Path to JSON file containing response format configuration", +) @click.pass_context def completion( ctx, @@ -387,6 +431,8 @@ def completion( max_tokens: int, stop_sequence: List[str], temperature: float, + response_format: Optional[str], + response_format_file: Optional[Path], ): """ Run completion inference on an LLM model via TEE. 
@@ -404,6 +450,9 @@ def completion( try: click.echo(f'Running TEE LLM completion for model "{model_cid}"\n') + # Parse response format + parsed_response_format = _parse_response_format(ctx, response_format, response_format_file) + completion_output = client.llm.completion( model=model_cid, prompt=prompt, @@ -411,6 +460,7 @@ def completion( stop_sequence=list(stop_sequence), temperature=temperature, x402_settlement_mode=x402SettlementModes[x402_settlement_mode], + response_format=parsed_response_format, ) print_llm_completion_result(model_cid, completion_output.transaction_hash, completion_output.completion_output, is_vanilla=False) @@ -472,6 +522,13 @@ def print_llm_completion_result(model_cid, tx_hash, llm_output, is_vanilla=True) help="Settlement mode for x402 payments: settle (payment only), settle-batch (batched, default), settle-metadata (full data)", ) @click.option("--stream", is_flag=True, default=False, help="Stream the output from the LLM") +@click.option("--response-format", type=str, default=None, help="Response format config as JSON for structured outputs") +@click.option( + "--response-format-file", + type=click.Path(exists=True, path_type=Path), + default=None, + help="Path to JSON file containing response format configuration", +) @click.pass_context def chat( ctx, @@ -486,6 +543,8 @@ def chat( tool_choice: Optional[str], x402_settlement_mode: Optional[str], stream: bool, + response_format: Optional[str], + response_format_file: Optional[Path], ): """ Run chat inference on an LLM model via TEE. 
@@ -562,6 +621,9 @@ def chat( if not tools and not tools_file: parsed_tools = None + # Parse response format + parsed_response_format = _parse_response_format(ctx, response_format, response_format_file) + result = client.llm.chat( model=model_cid, messages=messages, @@ -572,6 +634,7 @@ def chat( tool_choice=tool_choice, x402_settlement_mode=x402SettlementModes[x402_settlement_mode], stream=stream, + response_format=parsed_response_format, ) # Handle response based on streaming flag diff --git a/src/opengradient/client/llm.py b/src/opengradient/client/llm.py index e1490cb..106230a 100644 --- a/src/opengradient/client/llm.py +++ b/src/opengradient/client/llm.py @@ -14,7 +14,7 @@ from x402v2.mechanisms.evm.exact.register import register_exact_evm_client as register_exact_evm_clientv2 from x402v2.mechanisms.evm.upto.register import register_upto_evm_client as register_upto_evm_clientv2 -from ..types import TEE_LLM, StreamChunk, TextGenerationOutput, TextGenerationStream, x402SettlementMode +from ..types import TEE_LLM, ResponseFormat, StreamChunk, TextGenerationOutput, TextGenerationStream, x402SettlementMode from .exceptions import OpenGradientError from .opg_token import Permit2ApprovalResult, ensure_opg_approval @@ -148,6 +148,7 @@ def completion( stop_sequence: Optional[List[str]] = None, temperature: float = 0.0, x402_settlement_mode: Optional[x402SettlementMode] = x402SettlementMode.SETTLE_BATCH, + response_format: Optional[Union[Dict, ResponseFormat]] = None, ) -> TextGenerationOutput: """ Perform inference on an LLM model using completions via TEE. @@ -163,6 +164,10 @@ def completion( - SETTLE_BATCH: Aggregates multiple inferences into batch hashes (most cost-efficient). - SETTLE_METADATA: Records full model info, complete input/output data, and all metadata. Defaults to SETTLE_BATCH. + response_format (dict or ResponseFormat, optional): Constrain the output format. 
+ Use ``{"type": "json_object"}`` for freeform JSON, or provide a full JSON schema + via ``ResponseFormat`` / a dict with ``type`` and ``json_schema`` keys. + See the OpenAI structured outputs guide: https://platform.openai.com/docs/guides/structured-outputs Returns: TextGenerationOutput: Generated text results including: @@ -180,6 +185,7 @@ def completion( stop_sequence=stop_sequence, temperature=temperature, x402_settlement_mode=x402_settlement_mode, + response_format=response_format, ) def _tee_llm_completion( @@ -190,6 +196,7 @@ def _tee_llm_completion( stop_sequence: Optional[List[str]] = None, temperature: float = 0.0, x402_settlement_mode: Optional[x402SettlementMode] = x402SettlementMode.SETTLE_BATCH, + response_format: Optional[Union[Dict, ResponseFormat]] = None, ) -> TextGenerationOutput: """ Route completion request to OpenGradient TEE LLM server with x402 payments. @@ -212,6 +219,9 @@ async def make_request_v2(): if stop_sequence: payload["stop"] = stop_sequence + if response_format is not None: + payload["response_format"] = response_format.to_dict() if isinstance(response_format, ResponseFormat) else response_format + try: response = await self._request_client.post( self._og_llm_server_url + "/v1/completions", json=payload, headers=headers, timeout=60 @@ -246,6 +256,7 @@ def chat( tool_choice: Optional[str] = None, x402_settlement_mode: Optional[x402SettlementMode] = x402SettlementMode.SETTLE_BATCH, stream: bool = False, + response_format: Optional[Union[Dict, ResponseFormat]] = None, ) -> Union[TextGenerationOutput, TextGenerationStream]: """ Perform inference on an LLM model using chat via TEE. @@ -264,6 +275,10 @@ def chat( - SETTLE_METADATA: Records full model info, complete input/output data, and all metadata. Defaults to SETTLE_BATCH. stream (bool, optional): Whether to stream the response. Default is False. + response_format (dict or ResponseFormat, optional): Constrain the output format. 
+ Use ``{"type": "json_object"}`` for freeform JSON, or provide a full JSON schema + via ``ResponseFormat`` / a dict with ``type`` and ``json_schema`` keys. + See the OpenAI structured outputs guide: https://platform.openai.com/docs/guides/structured-outputs Returns: Union[TextGenerationOutput, TextGenerationStream]: @@ -284,6 +299,7 @@ def chat( tools=tools, tool_choice=tool_choice, x402_settlement_mode=x402_settlement_mode, + response_format=response_format, ) else: # Non-streaming @@ -296,6 +312,7 @@ def chat( tools=tools, tool_choice=tool_choice, x402_settlement_mode=x402_settlement_mode, + response_format=response_format, ) def _tee_llm_chat( @@ -308,6 +325,7 @@ def _tee_llm_chat( tools: Optional[List[Dict]] = None, tool_choice: Optional[str] = None, x402_settlement_mode: x402SettlementMode = x402SettlementMode.SETTLE_BATCH, + response_format: Optional[Union[Dict, ResponseFormat]] = None, ) -> TextGenerationOutput: """ Route chat request to OpenGradient TEE LLM server with x402 payments. @@ -334,6 +352,9 @@ async def make_request_v2(): payload["tools"] = tools payload["tool_choice"] = tool_choice or "auto" + if response_format is not None: + payload["response_format"] = response_format.to_dict() if isinstance(response_format, ResponseFormat) else response_format + try: endpoint = "/v1/chat/completions" response = await self._request_client.post( @@ -374,6 +395,7 @@ def _tee_llm_chat_stream_sync( tools: Optional[List[Dict]] = None, tool_choice: Optional[str] = None, x402_settlement_mode: x402SettlementMode = x402SettlementMode.SETTLE_BATCH, + response_format: Optional[Union[Dict, ResponseFormat]] = None, ): """ Sync streaming using threading bridge - TRUE real-time streaming. 
@@ -395,6 +417,7 @@ async def _stream(): tools=tools, tool_choice=tool_choice, x402_settlement_mode=x402_settlement_mode, + response_format=response_format, ): queue.put(chunk) except Exception as e: @@ -430,6 +453,7 @@ async def _tee_llm_chat_stream_async( tools: Optional[List[Dict]] = None, tool_choice: Optional[str] = None, x402_settlement_mode: x402SettlementMode = x402SettlementMode.SETTLE_BATCH, + response_format: Optional[Union[Dict, ResponseFormat]] = None, ): """ Internal async streaming implementation for TEE LLM with x402 payments. @@ -455,6 +479,8 @@ async def _tee_llm_chat_stream_async( if tools: payload["tools"] = tools payload["tool_choice"] = tool_choice or "auto" + if response_format is not None: + payload["response_format"] = response_format.to_dict() if isinstance(response_format, ResponseFormat) else response_format async def _parse_sse_response(response) -> AsyncGenerator[StreamChunk, None]: status_code = getattr(response, "status_code", None) diff --git a/src/opengradient/types.py b/src/opengradient/types.py index 866bfba..cab35d7 100644 --- a/src/opengradient/types.py +++ b/src/opengradient/types.py @@ -10,6 +10,48 @@ import numpy as np +@dataclass +class ResponseFormat: + """Configuration for structured LLM output following the OpenAI structured outputs spec. + + Constrains the model to produce output matching a JSON schema, ensuring + predictable, machine-readable responses. + + Attributes: + type: The response format type. Use ``"json_schema"`` for schema-constrained + output, or ``"json_object"`` for freeform JSON. + json_schema: Schema definition when ``type`` is ``"json_schema"``. + Must include ``name`` (str) and ``schema`` (JSON Schema object). + May include ``description`` (str) and ``strict`` (bool, default True). + + Examples: + >>> fmt = ResponseFormat( + ... type="json_schema", + ... json_schema={ + ... "name": "sentiment", + ... "schema": { + ... "type": "object", + ... "properties": { + ... 
"label": {"type": "string", "enum": ["positive", "negative", "neutral"]}, + ... "score": {"type": "number"}, + ... }, + ... "required": ["label", "score"], + ... }, + ... }, + ... ) + """ + + type: str + json_schema: Optional[Dict] = None + + def to_dict(self) -> Dict: + """Serialize to the payload format expected by the API.""" + d: Dict = {"type": self.type} + if self.json_schema is not None: + d["json_schema"] = self.json_schema + return d + + class x402SettlementMode(str, Enum): """ Settlement modes for x402 payment protocol transactions. diff --git a/tests/client_test.py b/tests/client_test.py index f17283b..f1ad927 100644 --- a/tests/client_test.py +++ b/tests/client_test.py @@ -10,6 +10,7 @@ from src.opengradient.client import Client from src.opengradient.types import ( TEE_LLM, + ResponseFormat, StreamChunk, TextGenerationOutput, x402SettlementMode, @@ -319,3 +320,95 @@ def test_settlement_mode_aliases(self): """Test settlement mode aliases.""" assert x402SettlementMode.SETTLE_INDIVIDUAL == x402SettlementMode.SETTLE assert x402SettlementMode.SETTLE_INDIVIDUAL_WITH_METADATA == x402SettlementMode.SETTLE_METADATA + + +# --- ResponseFormat Tests --- + + +class TestResponseFormat: + def test_json_object_format(self): + """Test basic json_object response format serialization.""" + fmt = ResponseFormat(type="json_object") + assert fmt.to_dict() == {"type": "json_object"} + + def test_json_schema_format(self): + """Test json_schema response format with full schema.""" + schema = { + "name": "sentiment", + "schema": { + "type": "object", + "properties": { + "label": {"type": "string", "enum": ["positive", "negative"]}, + "score": {"type": "number"}, + }, + "required": ["label", "score"], + }, + } + fmt = ResponseFormat(type="json_schema", json_schema=schema) + result = fmt.to_dict() + + assert result["type"] == "json_schema" + assert result["json_schema"]["name"] == "sentiment" + assert "properties" in result["json_schema"]["schema"] + + def 
test_json_schema_none_omitted(self): + """Test that json_schema key is omitted when not provided.""" + fmt = ResponseFormat(type="json_object") + result = fmt.to_dict() + assert "json_schema" not in result + + def test_completion_passes_response_format(self, client): + """Test that completion forwards response_format to the internal method.""" + fmt = ResponseFormat(type="json_object") + with patch.object(client.llm, "_tee_llm_completion") as mock_tee: + mock_tee.return_value = TextGenerationOutput( + transaction_hash="external", + completion_output='{"result": "ok"}', + ) + + client.llm.completion( + model=TEE_LLM.GPT_4O, + prompt="Return JSON", + response_format=fmt, + ) + + call_kwargs = mock_tee.call_args[1] + assert call_kwargs["response_format"] is fmt + + def test_chat_passes_response_format(self, client): + """Test that chat forwards response_format to the internal method.""" + fmt = {"type": "json_object"} + with patch.object(client.llm, "_tee_llm_chat") as mock_tee: + mock_tee.return_value = TextGenerationOutput( + transaction_hash="external", + chat_output={"role": "assistant", "content": '{"ok": true}'}, + finish_reason="stop", + ) + + client.llm.chat( + model=TEE_LLM.GPT_4O, + messages=[{"role": "user", "content": "Return JSON"}], + response_format=fmt, + ) + + call_kwargs = mock_tee.call_args[1] + assert call_kwargs["response_format"] is fmt + + def test_chat_stream_passes_response_format(self, client): + """Test that streaming chat forwards response_format.""" + fmt = ResponseFormat(type="json_object") + with patch.object(client.llm, "_tee_llm_chat_stream_sync") as mock_stream: + mock_stream.return_value = iter([ + StreamChunk(choices=[], model="gpt-4o", is_final=True), + ]) + + result = client.llm.chat( + model=TEE_LLM.GPT_4O, + messages=[{"role": "user", "content": "Return JSON"}], + stream=True, + response_format=fmt, + ) + list(result) + + call_kwargs = mock_stream.call_args[1] + assert call_kwargs["response_format"] is fmt