Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ pip install verdikt-sdk
## Usage

```python
from verdikt_sdk import VerdiktClient, EvaluationType, Question
from verdikt_sdk import AnswerWithCost, VerdiktClient, EvaluationType, Question
from yalc import LLMModel

client = VerdiktClient(
Expand All @@ -28,18 +28,29 @@ await client.add_questions("my-app", [
Question(question="What is the capital of France?", human_answer="Paris"),
])

# Your callback returns the answer plus the cost it took your app to produce it.
# `cost` is optional — pass None when you do not track it.
async def my_llm_function(question: str) -> AnswerWithCost:
answer, cost = await my_app(question)
return AnswerWithCost(answer=answer, cost=cost)

# Run an evaluation cycle
await client.run_evaluation(
app_slug="my-app",
app_version="v1.2.0",
callback=my_llm_function, # async fn(question: str) -> str
callback=my_llm_function,
evaluation_type=EvaluationType.LLM_ONLY,
llm_judge_models=[LLMModel.gpt_4o_mini],
)
```

`run_evaluation` calls your `callback` concurrently for every question in the dataset, then submits all answers to Verdikt for judgment.

> **Breaking change in 0.2.0:** the `callback` now returns
> `AnswerWithCost(answer=..., cost=...)` instead of a bare `str`. Callers
> upgrading from 0.1.x must wrap their return value
> (`return AnswerWithCost(answer=ans)` is a drop-in equivalent of the old
> behaviour).

## Authentication

The SDK authenticates via Zitadel OAuth2 client credentials. Create a machine user in your Zitadel project and pass its `client_id` and `client_secret` to `VerdiktClient`.
8 changes: 6 additions & 2 deletions SDK.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,17 @@ Idempotent — safe to call on every deploy. Uses SHA-256 of the question text a
### `run_evaluation(app_slug, app_version, callback, ...)`
1. Resolve `app_slug` → `app_id` via `GET /v1/app/by-slug/{slug}` (cached per client instance)
2. `GET /v1/app/{id}/datasets` → full question list
3. For each dataset item: `answer = callback(item["question"])`
3. For each dataset item: `result = await callback(item["question"])` where
`result` is an `AnswerWithCost(answer: str, cost: float | None)`
4. `POST /v1/app/{id}/evaluation` with:
```json
{
"app_version": "<app_version>",
"evaluation_type": "<evaluation_type>",
"app_answers": { "<dataset_id>": "<answer>", ... },
"app_answers": {
"<dataset_id>": { "answer": "<answer>", "cost": 0.0123 },
"<dataset_id>": { "answer": "<answer>", "cost": null }
},
"llm_judge_models": ["gpt-4o-mini"]
}
```
Expand Down
7 changes: 5 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
[project]
name = "verdikt-sdk"
version = "0.1.1"
version = "0.2.0"
description = "Python SDK for the Verdikt Evaluation API"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
"httpx>=0.28.1",
"pydantic>=2.0",
"yalc>=0.2.1",
"yalc>=0.3.2",
]

[build-system]
Expand All @@ -33,5 +33,8 @@ plugins = [
"pydantic.mypy"
]

[tool.pytest.ini_options]
asyncio_mode = "auto"

[tool.ruff]
line-length = 88
112 changes: 112 additions & 0 deletions tests/test_run_evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import json

import httpx
import pytest
from yalc import LLMModel

from verdikt_sdk import AnswerWithCost, EvaluationType, VerdiktClient


def _build_handler(captured: dict) -> httpx.MockTransport:
    """Build an ``httpx.MockTransport`` that stands in for the remote services.

    The transport answers, in this order of precedence:

    * any URL ending in ``/.well-known`` with a fixed issuer document,
    * any URL containing ``/oauth/v2/token`` with a fixed bearer token,
    * any URL containing ``/v1/app/by-slug/`` with app ``7`` (``my-app``),
    * ``GET .../v1/app/7/datasets`` with two dataset items (ids 11 and 12),
    * ``POST .../v1/app/7/evaluation`` with ``201``.

    Every other request gets a ``404``.

    Args:
        captured: Mutable dict owned by the test. The decoded JSON body of
            the evaluation POST is stored under ``captured["body"]`` so the
            test can assert on it afterwards.

    Returns:
        A mock transport wrapping the request handler described above.
    """
    token_payload = {
        "access_token": "tok",
        "id_token": "id",
        "token_type": "Bearer",
        "expires_in": 3600,
    }
    app_payload = {"id": 7, "slug": "my-app", "name": "My App"}
    dataset_payload = [
        {"id": 11, "question": "Q1", "human_answer": "A1"},
        {"id": 12, "question": "Q2", "human_answer": "A2"},
    ]

    def respond(request: httpx.Request) -> httpx.Response:
        target = str(request.url)
        verb = request.method

        # Branch order matters: the substring checks are deliberately loose,
        # so the first matching rule wins.
        if target.endswith("/.well-known"):
            return httpx.Response(200, json={"issuer": "http://issuer.test"})
        if "/oauth/v2/token" in target:
            return httpx.Response(200, json=token_payload)
        if "/v1/app/by-slug/" in target:
            return httpx.Response(200, json=app_payload)
        if target.endswith("/v1/app/7/datasets") and verb == "GET":
            return httpx.Response(200, json=dataset_payload)
        if target.endswith("/v1/app/7/evaluation") and verb == "POST":
            # Record what the SDK actually sent so the test can inspect it.
            captured["body"] = json.loads(request.content)
            return httpx.Response(201)
        return httpx.Response(404)

    return httpx.MockTransport(respond)


@pytest.mark.asyncio
async def test_run_evaluation_posts_answers_with_cost_per_dataset_id():
    """Callback results are posted keyed by dataset id, each with its cost."""
    # Arrange
    posted: dict = {}
    client = VerdiktClient(
        base_url="http://verdikt.test",
        client_id="cid",
        client_secret="csec",
    )
    # Route both the API client and its auth helper through the mock transport.
    mock_http = httpx.AsyncClient(transport=_build_handler(posted))
    client._http = mock_http
    client._auth._http = mock_http

    cost_by_question = {"Q1": 0.5}

    async def answer_with_cost(question: str) -> AnswerWithCost:
        # Q1 costs 0.5; every other question costs 1.25.
        price = cost_by_question.get(question, 1.25)
        return AnswerWithCost(answer=f"answer-for-{question}", cost=price)

    # Act
    await client.run_evaluation(
        app_slug="my-app",
        app_version="v1.0.0",
        callback=answer_with_cost,
        evaluation_type=EvaluationType.LLM_ONLY,
        llm_judge_models=[LLMModel.gpt_4o_mini],
    )

    # Assert
    expected_answers = {
        "11": {"answer": "answer-for-Q1", "cost": 0.5},
        "12": {"answer": "answer-for-Q2", "cost": 1.25},
    }
    assert posted["body"]["app_answers"] == expected_answers


@pytest.mark.asyncio
async def test_run_evaluation_serializes_null_cost_when_callback_omits_it():
    """A callback that leaves ``cost`` unset is posted with ``cost: null``."""
    # Arrange
    posted: dict = {}
    client = VerdiktClient(
        base_url="http://verdikt.test",
        client_id="cid",
        client_secret="csec",
    )
    # Route both the API client and its auth helper through the mock transport.
    mock_http = httpx.AsyncClient(transport=_build_handler(posted))
    client._http = mock_http
    client._auth._http = mock_http

    async def answer_without_cost(question: str) -> AnswerWithCost:
        text = f"answer-for-{question}"
        return AnswerWithCost(answer=text)

    # Act
    await client.run_evaluation(
        app_slug="my-app",
        app_version="v1.0.0",
        callback=answer_without_cost,
        evaluation_type=EvaluationType.LLM_ONLY,
        llm_judge_models=[LLMModel.gpt_4o_mini],
    )

    # Assert
    expected_answers = {
        "11": {"answer": "answer-for-Q1", "cost": None},
        "12": {"answer": "answer-for-Q2", "cost": None},
    }
    assert posted["body"]["app_answers"] == expected_answers
28 changes: 7 additions & 21 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion verdikt_sdk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

from verdikt_sdk.client import VerdiktClient
from verdikt_sdk.models import (
AnswerWithCost,
EvaluationType,
Question,
)

__all__ = [
"VerdiktClient",
"AnswerWithCost",
"EvaluationType",
"Question",
"VerdiktClient",
]
6 changes: 4 additions & 2 deletions verdikt_sdk/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from verdikt_sdk.auth import TokenAuth
from verdikt_sdk.http import raise_for_status
from verdikt_sdk.models import (
AnswerWithCost,
AppResponse,
CreateAppRequest,
CreateDatasetRequest,
Expand Down Expand Up @@ -139,7 +140,7 @@ async def run_evaluation(
self,
app_slug: str,
app_version: str,
callback: Callable[[str], Coroutine[None, None, str]],
callback: Callable[[str], Coroutine[None, None, AnswerWithCost]],
evaluation_type: EvaluationType,
llm_judge_models: list[LLMModel],
) -> None:
Expand All @@ -151,7 +152,8 @@ async def run_evaluation(
app_slug: Slug of the target app.
app_version: Semantic version string identifying this build.
callback: Async function that receives a question string and returns
an answer string.
an :class:`AnswerWithCost` (the answer plus the optional cost
of producing it).
evaluation_type: Whether to use LLM scoring only or both human and
LLM scoring.
llm_judge_models: List of model identifiers to use as judges.
Expand Down
13 changes: 12 additions & 1 deletion verdikt_sdk/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,21 @@ class EvaluationType(StrEnum):
HUMAN_AND_LLM = "HUMAN_AND_LLM"


class AnswerWithCost(BaseModel):
    """Result of one ``run_evaluation`` callback invocation.

    Pairs the answer text with the cost incurred producing it. Cost tracking
    is opt-in: leave ``cost`` out (or set it to ``None``) when it is not
    relevant.
    """

    # Answer text the callback produced for the question.
    answer: str
    # Cost of producing ``answer``; ``None`` when the caller does not track it.
    cost: float | None = None


class CreateEvaluationRequest(BaseModel):
"""Request body for ``POST /v1/app/{id}/evaluation``."""

app_version: str
evaluation_type: EvaluationType
app_answers: dict[str, str]
app_answers: dict[str, AnswerWithCost]
llm_judge_models: list[LLMModel]
Loading