Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,18 @@ python examples/llm_chat_streaming.py
- Demonstrates real-time token streaming
- Returns chunks as they arrive from the model

#### `llm_structured_output.py`
Demonstrates structured LLM output via JSON schema enforcement.

```bash
python examples/llm_structured_output.py
```

**What it does:**
- Defines a JSON schema for sentiment analysis output
- Uses `response_format` to constrain the model's response to match the schema
- Parses the JSON response, which is constrained to match the schema

#### `llm_tool_calling.py`
Demonstrates LLM tool/function calling.

Expand Down
70 changes: 70 additions & 0 deletions examples/llm_structured_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""
Structured LLM output via JSON schema enforcement.

Constrains the model to return a response matching a predefined JSON schema,
ensuring predictable, machine-readable output.

Usage:
export OG_PRIVATE_KEY="your_private_key"
python examples/llm_structured_output.py
"""

import json
import os

import opengradient as og

# Create the client from the OG_PRIVATE_KEY environment variable, then call
# ensure_opg_approval with an amount of 2 before issuing the request.
client = og.Client(private_key=os.environ.get("OG_PRIVATE_KEY"))
client.llm.ensure_opg_approval(opg_amount=2)

# JSON schema describing the object the model must return: a sentiment label
# restricted to three values, a numeric confidence, and a short explanation.
sentiment_schema = {
    "type": "object",
    "properties": {
        "sentiment": {
            "type": "string",
            "enum": ["positive", "negative", "neutral"],
        },
        "confidence": {
            "type": "number",
            "description": "Confidence score between 0 and 1",
        },
        "reasoning": {
            "type": "string",
            "description": "Brief explanation for the classification",
        },
    },
    "required": ["sentiment", "confidence", "reasoning"],
    "additionalProperties": False,
}

# Wrap the schema in the response_format envelope passed to the chat call.
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "sentiment_analysis",
        "strict": True,
        "schema": sentiment_schema,
    },
}

# Conversation: a system instruction followed by the text to classify.
messages = [
    {
        "role": "system",
        "content": "You are a sentiment analysis assistant. Analyze the sentiment of the given text.",
    },
    {
        "role": "user",
        "content": "I absolutely love this new feature, it makes everything so much easier!",
    },
]

result = client.llm.chat(
    model=og.TEE_LLM.GPT_4O,
    messages=messages,
    max_tokens=200,
    response_format=response_format,
)

# The message content is a JSON string; decode it and print each schema field.
output = json.loads(result.chat_output["content"])
for label, key in (
    ("Sentiment", "sentiment"),
    ("Confidence", "confidence"),
    ("Reasoning", "reasoning"),
):
    print(f"{label}: {output[key]}")
2 changes: 2 additions & 0 deletions src/opengradient/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@
InferenceResult,
ModelOutput,
ModelRepository,
ResponseFormat,
SchedulerParams,
TextGenerationOutput,
TextGenerationStream,
Expand Down Expand Up @@ -164,6 +165,7 @@ def init(
"SchedulerParams",
"CandleType",
"CandleOrder",
"ResponseFormat",
"TextGenerationOutput",
"TextGenerationStream",
"x402SettlementMode",
Expand Down
63 changes: 63 additions & 0 deletions src/opengradient/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,43 @@ def infer(ctx, model_cid: str, inference_mode: str, input_data, input_file: Path
click.echo(f"Error running inference: {str(e)}")


def _parse_response_format(ctx, response_format: Optional[str], response_format_file: Optional[Path]) -> Optional[dict]:
    """Parse --response-format / --response-format-file into a dict (or None).

    Args:
        ctx: Click context. ``ctx.exit(1)`` is called after an error message
            has been echoed for any invalid input.
        response_format: Inline JSON text from ``--response-format``, if given.
        response_format_file: Path from ``--response-format-file``, if given.

    Returns:
        The parsed JSON object as a dict, or ``None`` when neither option was
        supplied (or when ``ctx.exit`` returns instead of terminating).
    """
    if response_format and response_format_file:
        click.echo("Cannot specify both --response-format and --response-format-file")
        ctx.exit(1)
        return None

    if response_format:
        try:
            parsed = json.loads(response_format)
        except json.JSONDecodeError as e:
            click.echo(f"Failed to parse --response-format JSON: {e}")
            ctx.exit(1)
            return None
        if not isinstance(parsed, dict):
            click.echo("--response-format must be a JSON object")
            ctx.exit(1)
            return None
        return parsed

    if response_format_file:
        try:
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with response_format_file.open("r", encoding="utf-8") as f:
                parsed = json.load(f)
        except Exception as e:
            click.echo(f"Failed to load response format from file: {e}")
            ctx.exit(1)
            return None
        # Validate outside the try block: ctx.exit() terminates by raising, so
        # calling it under the broad handler above would be intercepted and
        # reported a second time as a bogus "Failed to load" error.
        if not isinstance(parsed, dict):
            click.echo("Response format file must contain a JSON object")
            ctx.exit(1)
            return None
        return parsed

    return None


@cli.command()
@click.option(
"--model",
Expand All @@ -378,6 +415,13 @@ def infer(ctx, model_cid: str, inference_mode: str, input_data, input_file: Path
default="settle-batch",
help="Settlement mode for x402 payments: settle (payment only), settle-batch (batched, default), settle-metadata (full data)",
)
@click.option("--response-format", type=str, default=None, help="Response format config as JSON for structured outputs")
@click.option(
"--response-format-file",
type=click.Path(exists=True, path_type=Path),
default=None,
help="Path to JSON file containing response format configuration",
)
@click.pass_context
def completion(
ctx,
Expand All @@ -387,6 +431,8 @@ def completion(
max_tokens: int,
stop_sequence: List[str],
temperature: float,
response_format: Optional[str],
response_format_file: Optional[Path],
):
"""
Run completion inference on an LLM model via TEE.
Expand All @@ -404,13 +450,17 @@ def completion(
try:
click.echo(f'Running TEE LLM completion for model "{model_cid}"\n')

# Parse response format
parsed_response_format = _parse_response_format(ctx, response_format, response_format_file)

completion_output = client.llm.completion(
model=model_cid,
prompt=prompt,
max_tokens=max_tokens,
stop_sequence=list(stop_sequence),
temperature=temperature,
x402_settlement_mode=x402SettlementModes[x402_settlement_mode],
response_format=parsed_response_format,
)

print_llm_completion_result(model_cid, completion_output.transaction_hash, completion_output.completion_output, is_vanilla=False)
Expand Down Expand Up @@ -472,6 +522,13 @@ def print_llm_completion_result(model_cid, tx_hash, llm_output, is_vanilla=True)
help="Settlement mode for x402 payments: settle (payment only), settle-batch (batched, default), settle-metadata (full data)",
)
@click.option("--stream", is_flag=True, default=False, help="Stream the output from the LLM")
@click.option("--response-format", type=str, default=None, help="Response format config as JSON for structured outputs")
@click.option(
"--response-format-file",
type=click.Path(exists=True, path_type=Path),
default=None,
help="Path to JSON file containing response format configuration",
)
@click.pass_context
def chat(
ctx,
Expand All @@ -486,6 +543,8 @@ def chat(
tool_choice: Optional[str],
x402_settlement_mode: Optional[str],
stream: bool,
response_format: Optional[str],
response_format_file: Optional[Path],
):
"""
Run chat inference on an LLM model via TEE.
Expand Down Expand Up @@ -562,6 +621,9 @@ def chat(
if not tools and not tools_file:
parsed_tools = None

# Parse response format
parsed_response_format = _parse_response_format(ctx, response_format, response_format_file)

result = client.llm.chat(
model=model_cid,
messages=messages,
Expand All @@ -572,6 +634,7 @@ def chat(
tool_choice=tool_choice,
x402_settlement_mode=x402SettlementModes[x402_settlement_mode],
stream=stream,
response_format=parsed_response_format,
)

# Handle response based on streaming flag
Expand Down
28 changes: 27 additions & 1 deletion src/opengradient/client/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from x402v2.mechanisms.evm.exact.register import register_exact_evm_client as register_exact_evm_clientv2
from x402v2.mechanisms.evm.upto.register import register_upto_evm_client as register_upto_evm_clientv2

from ..types import TEE_LLM, StreamChunk, TextGenerationOutput, TextGenerationStream, x402SettlementMode
from ..types import TEE_LLM, ResponseFormat, StreamChunk, TextGenerationOutput, TextGenerationStream, x402SettlementMode
from .exceptions import OpenGradientError
from .opg_token import Permit2ApprovalResult, ensure_opg_approval

Expand Down Expand Up @@ -148,6 +148,7 @@ def completion(
stop_sequence: Optional[List[str]] = None,
temperature: float = 0.0,
x402_settlement_mode: Optional[x402SettlementMode] = x402SettlementMode.SETTLE_BATCH,
response_format: Optional[Union[Dict, ResponseFormat]] = None,
) -> TextGenerationOutput:
"""
Perform inference on an LLM model using completions via TEE.
Expand All @@ -163,6 +164,10 @@ def completion(
- SETTLE_BATCH: Aggregates multiple inferences into batch hashes (most cost-efficient).
- SETTLE_METADATA: Records full model info, complete input/output data, and all metadata.
Defaults to SETTLE_BATCH.
response_format (dict or ResponseFormat, optional): Constrain the output format.
Use ``{"type": "json_object"}`` for freeform JSON, or provide a full JSON schema
via ``ResponseFormat`` / a dict with ``type`` and ``json_schema`` keys.
See `OpenAI structured outputs <https://platform.openai.com/docs/guides/structured-outputs>`_.

Returns:
TextGenerationOutput: Generated text results including:
Expand All @@ -180,6 +185,7 @@ def completion(
stop_sequence=stop_sequence,
temperature=temperature,
x402_settlement_mode=x402_settlement_mode,
response_format=response_format,
)

def _tee_llm_completion(
Expand All @@ -190,6 +196,7 @@ def _tee_llm_completion(
stop_sequence: Optional[List[str]] = None,
temperature: float = 0.0,
x402_settlement_mode: Optional[x402SettlementMode] = x402SettlementMode.SETTLE_BATCH,
response_format: Optional[Union[Dict, ResponseFormat]] = None,
) -> TextGenerationOutput:
"""
Route completion request to OpenGradient TEE LLM server with x402 payments.
Expand All @@ -212,6 +219,9 @@ async def make_request_v2():
if stop_sequence:
payload["stop"] = stop_sequence

if response_format is not None:
payload["response_format"] = response_format.to_dict() if isinstance(response_format, ResponseFormat) else response_format

try:
response = await self._request_client.post(
self._og_llm_server_url + "/v1/completions", json=payload, headers=headers, timeout=60
Expand Down Expand Up @@ -246,6 +256,7 @@ def chat(
tool_choice: Optional[str] = None,
x402_settlement_mode: Optional[x402SettlementMode] = x402SettlementMode.SETTLE_BATCH,
stream: bool = False,
response_format: Optional[Union[Dict, ResponseFormat]] = None,
) -> Union[TextGenerationOutput, TextGenerationStream]:
"""
Perform inference on an LLM model using chat via TEE.
Expand All @@ -264,6 +275,10 @@ def chat(
- SETTLE_METADATA: Records full model info, complete input/output data, and all metadata.
Defaults to SETTLE_BATCH.
stream (bool, optional): Whether to stream the response. Default is False.
response_format (dict or ResponseFormat, optional): Constrain the output format.
Use ``{"type": "json_object"}`` for freeform JSON, or provide a full JSON schema
via ``ResponseFormat`` / a dict with ``type`` and ``json_schema`` keys.
See `OpenAI structured outputs <https://platform.openai.com/docs/guides/structured-outputs>`_.

Returns:
Union[TextGenerationOutput, TextGenerationStream]:
Expand All @@ -284,6 +299,7 @@ def chat(
tools=tools,
tool_choice=tool_choice,
x402_settlement_mode=x402_settlement_mode,
response_format=response_format,
)
else:
# Non-streaming
Expand All @@ -296,6 +312,7 @@ def chat(
tools=tools,
tool_choice=tool_choice,
x402_settlement_mode=x402_settlement_mode,
response_format=response_format,
)

def _tee_llm_chat(
Expand All @@ -308,6 +325,7 @@ def _tee_llm_chat(
tools: Optional[List[Dict]] = None,
tool_choice: Optional[str] = None,
x402_settlement_mode: x402SettlementMode = x402SettlementMode.SETTLE_BATCH,
response_format: Optional[Union[Dict, ResponseFormat]] = None,
) -> TextGenerationOutput:
"""
Route chat request to OpenGradient TEE LLM server with x402 payments.
Expand All @@ -334,6 +352,9 @@ async def make_request_v2():
payload["tools"] = tools
payload["tool_choice"] = tool_choice or "auto"

if response_format is not None:
payload["response_format"] = response_format.to_dict() if isinstance(response_format, ResponseFormat) else response_format

try:
endpoint = "/v1/chat/completions"
response = await self._request_client.post(
Expand Down Expand Up @@ -374,6 +395,7 @@ def _tee_llm_chat_stream_sync(
tools: Optional[List[Dict]] = None,
tool_choice: Optional[str] = None,
x402_settlement_mode: x402SettlementMode = x402SettlementMode.SETTLE_BATCH,
response_format: Optional[Union[Dict, ResponseFormat]] = None,
):
"""
Sync streaming using threading bridge - TRUE real-time streaming.
Expand All @@ -395,6 +417,7 @@ async def _stream():
tools=tools,
tool_choice=tool_choice,
x402_settlement_mode=x402_settlement_mode,
response_format=response_format,
):
queue.put(chunk)
except Exception as e:
Expand Down Expand Up @@ -430,6 +453,7 @@ async def _tee_llm_chat_stream_async(
tools: Optional[List[Dict]] = None,
tool_choice: Optional[str] = None,
x402_settlement_mode: x402SettlementMode = x402SettlementMode.SETTLE_BATCH,
response_format: Optional[Union[Dict, ResponseFormat]] = None,
):
"""
Internal async streaming implementation for TEE LLM with x402 payments.
Expand All @@ -455,6 +479,8 @@ async def _tee_llm_chat_stream_async(
if tools:
payload["tools"] = tools
payload["tool_choice"] = tool_choice or "auto"
if response_format is not None:
payload["response_format"] = response_format.to_dict() if isinstance(response_format, ResponseFormat) else response_format

async def _parse_sse_response(response) -> AsyncGenerator[StreamChunk, None]:
status_code = getattr(response, "status_code", None)
Expand Down
Loading