diff --git a/README.md b/README.md index 61a3bd4..32908b6 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ pip install verdikt-sdk ## Usage ```python -from verdikt_sdk import VerdiktClient, EvaluationType, Question +from verdikt_sdk import AnswerWithCost, VerdiktClient, EvaluationType, Question from yalc import LLMModel client = VerdiktClient( @@ -28,11 +28,17 @@ await client.add_questions("my-app", [ Question(question="What is the capital of France?", human_answer="Paris"), ]) +# Your callback returns the answer plus the cost it took your app to produce it. +# `cost` is optional — pass None when you do not track it. +async def my_llm_function(question: str) -> AnswerWithCost: + answer, cost = await my_app(question) + return AnswerWithCost(answer=answer, cost=cost) + # Run an evaluation cycle await client.run_evaluation( app_slug="my-app", app_version="v1.2.0", - callback=my_llm_function, # async fn(question: str) -> str + callback=my_llm_function, evaluation_type=EvaluationType.LLM_ONLY, llm_judge_models=[LLMModel.gpt_4o_mini], ) @@ -40,6 +46,11 @@ await client.run_evaluation( `run_evaluation` calls your `callback` concurrently for every question in the dataset, then submits all answers to Verdikt for judgment. +> **Breaking change in 0.2.0:** the `callback` now returns +> `AnswerWithCost(answer=..., cost=...)` instead of a bare `str`. Callers on +> 0.1.x must wrap their return value (`return AnswerWithCost(answer=ans)` is +> a drop-in equivalent of the old behaviour). + ## Authentication The SDK authenticates via Zitadel OAuth2 client credentials. Create a machine user in your Zitadel project and pass its `client_id` and `client_secret` to `VerdiktClient`. diff --git a/SDK.md b/SDK.md index 7662d00..b9ca420 100644 --- a/SDK.md +++ b/SDK.md @@ -97,13 +97,17 @@ Idempotent — safe to call on every deploy. Uses SHA-256 of the question text a ### `run_evaluation(app_slug, app_version, callback, ...)` 1. 
Resolve `app_slug` → `app_id` via `GET /v1/app/by-slug/{slug}` (cached per client instance) 2. `GET /v1/app/{id}/datasets` → full question list -3. For each dataset item: `answer = callback(item["question"])` +3. For each dataset item: `result = await callback(item["question"])` where + `result` is an `AnswerWithCost(answer: str, cost: float | None)` 4. `POST /v1/app/{id}/evaluation` with: ```json { "app_version": "<app_version>", "evaluation_type": "<evaluation_type>", - "app_answers": { "<dataset_id>": "<answer>", ... }, + "app_answers": { + "<dataset_id_1>": { "answer": "<answer>", "cost": 0.0123 }, + "<dataset_id_2>": { "answer": "<answer>", "cost": null } + }, "llm_judge_models": ["gpt-4o-mini"] } ``` diff --git a/pyproject.toml b/pyproject.toml index 4516e76..da532c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,13 @@ [project] name = "verdikt-sdk" -version = "0.1.1" +version = "0.2.0" description = "Python SDK for the Verdikt Evaluation API" readme = "README.md" requires-python = ">=3.13" dependencies = [ "httpx>=0.28.1", "pydantic>=2.0", - "yalc>=0.2.1", + "yalc>=0.3.2", ] [build-system] @@ -33,5 +33,8 @@ plugins = [ "pydantic.mypy" ] +[tool.pytest.ini_options] +asyncio_mode = "auto" + [tool.ruff] line-length = 88 diff --git a/tests/test_run_evaluation.py b/tests/test_run_evaluation.py new file mode 100644 index 0000000..91a53a1 --- /dev/null +++ b/tests/test_run_evaluation.py @@ -0,0 +1,112 @@ +import json + +import httpx +import pytest +from yalc import LLMModel + +from verdikt_sdk import AnswerWithCost, EvaluationType, VerdiktClient + + +def _build_handler(captured: dict) -> httpx.MockTransport: + """Mock transport that fakes the auth + datasets + evaluation endpoints. + + Captures the evaluation POST body in *captured* so tests can assert it. 
+ """ + + def handler(request: httpx.Request) -> httpx.Response: + url = str(request.url) + if url.endswith("/.well-known"): + return httpx.Response(200, json={"issuer": "http://issuer.test"}) + if "/oauth/v2/token" in url: + return httpx.Response( + 200, + json={ + "access_token": "tok", + "id_token": "id", + "token_type": "Bearer", + "expires_in": 3600, + }, + ) + if "/v1/app/by-slug/" in url: + return httpx.Response( + 200, json={"id": 7, "slug": "my-app", "name": "My App"} + ) + if url.endswith("/v1/app/7/datasets") and request.method == "GET": + return httpx.Response( + 200, + json=[ + {"id": 11, "question": "Q1", "human_answer": "A1"}, + {"id": 12, "question": "Q2", "human_answer": "A2"}, + ], + ) + if url.endswith("/v1/app/7/evaluation") and request.method == "POST": + captured["body"] = json.loads(request.content) + return httpx.Response(201) + return httpx.Response(404) + + return httpx.MockTransport(handler) + + +@pytest.mark.asyncio +async def test_run_evaluation_posts_answers_with_cost_per_dataset_id(): + # Arrange + captured: dict = {} + client = VerdiktClient( + base_url="http://verdikt.test", + client_id="cid", + client_secret="csec", + ) + client._http = httpx.AsyncClient(transport=_build_handler(captured)) + client._auth._http = client._http + + async def callback(question: str) -> AnswerWithCost: + return AnswerWithCost( + answer=f"answer-for-{question}", + cost=0.5 if question == "Q1" else 1.25, + ) + + # Act + await client.run_evaluation( + app_slug="my-app", + app_version="v1.0.0", + callback=callback, + evaluation_type=EvaluationType.LLM_ONLY, + llm_judge_models=[LLMModel.gpt_4o_mini], + ) + + # Assert + assert captured["body"]["app_answers"] == { + "11": {"answer": "answer-for-Q1", "cost": 0.5}, + "12": {"answer": "answer-for-Q2", "cost": 1.25}, + } + + +@pytest.mark.asyncio +async def test_run_evaluation_serializes_null_cost_when_callback_omits_it(): + # Arrange + captured: dict = {} + client = VerdiktClient( + 
base_url="http://verdikt.test", + client_id="cid", + client_secret="csec", + ) + client._http = httpx.AsyncClient(transport=_build_handler(captured)) + client._auth._http = client._http + + async def callback(question: str) -> AnswerWithCost: + return AnswerWithCost(answer=f"answer-for-{question}") + + # Act + await client.run_evaluation( + app_slug="my-app", + app_version="v1.0.0", + callback=callback, + evaluation_type=EvaluationType.LLM_ONLY, + llm_judge_models=[LLMModel.gpt_4o_mini], + ) + + # Assert + assert captured["body"]["app_answers"] == { + "11": {"answer": "answer-for-Q1", "cost": None}, + "12": {"answer": "answer-for-Q2", "cost": None}, + } diff --git a/uv.lock b/uv.lock index 62c6ad2..828a628 100644 --- a/uv.lock +++ b/uv.lock @@ -561,12 +561,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/45/04/e442e1356c97b03a6d30d2b462f7c0bdfbf207e75f6833815fd1225a75b4/instructor-1.14.5-py3-none-any.whl", hash = "sha256:2a5a31222b008c0989be1cc001e33a237f49506e80ac5833f6d36d7690bae7b1", size = 177445, upload-time = "2026-01-29T14:18:53.641Z" }, ] -[package.optional-dependencies] -anthropic = [ - { name = "anthropic" }, - { name = "xmltodict" }, -] - [[package]] name = "jinja2" version = "3.1.6" @@ -1603,7 +1597,7 @@ wheels = [ [[package]] name = "verdikt-sdk" -version = "0.1.1" +version = "0.2.0" source = { editable = "." 
} dependencies = [ { name = "httpx" }, @@ -1626,7 +1620,7 @@ dev = [ requires-dist = [ { name = "httpx", specifier = ">=0.28.1" }, { name = "pydantic", specifier = ">=2.0" }, - { name = "yalc", specifier = ">=0.2.1" }, + { name = "yalc", specifier = ">=0.3.2" }, ] [package.metadata.requires-dev] @@ -1655,27 +1649,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/55/896b06bf93a49bec0f4ae2a6f1ed12bd05c8860744ac3a70eda041064e4d/virtualenv-21.1.0-py3-none-any.whl", hash = "sha256:164f5e14c5587d170cf98e60378eb91ea35bf037be313811905d3a24ea33cc07", size = 5825072, upload-time = "2026-02-27T08:49:27.516Z" }, ] -[[package]] -name = "xmltodict" -version = "1.0.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/19/70/80f3b7c10d2630aa66414bf23d210386700aa390547278c789afa994fd7e/xmltodict-1.0.4.tar.gz", hash = "sha256:6d94c9f834dd9e44514162799d344d815a3a4faec913717a9ecbfa5be1bb8e61", size = 26124, upload-time = "2026-02-22T02:21:22.074Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/38/34/98a2f52245f4d47be93b580dae5f9861ef58977d73a79eb47c58f1ad1f3a/xmltodict-1.0.4-py3-none-any.whl", hash = "sha256:a4a00d300b0e1c59fc2bfccb53d7b2e88c32f200df138a0dd2229f842497026a", size = 13580, upload-time = "2026-02-22T02:21:21.039Z" }, -] - [[package]] name = "yalc" -version = "0.2.1" +version = "0.3.2" source = { registry = "https://pypi.org/simple" } dependencies = [ + { name = "anthropic" }, { name = "cachetools" }, - { name = "instructor", extra = ["anthropic"] }, + { name = "instructor" }, { name = "litellm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b6/ca/e6b13d5b9261342feb0dc18caa277f7e4eefc43cf486f1665749411f6625/yalc-0.2.1.tar.gz", hash = "sha256:8ce5cb4fd2101b1b575d3138137a82c5c5a8be7e0f05bdc98ee27ad5ec4f0026", size = 7217, upload-time = "2026-02-21T13:54:54.071Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/63/f3/ba1a90f2f35243ca7d0f2b3bb78ddd087c18aabbe7842305098c6ee34a80/yalc-0.3.2.tar.gz", hash = "sha256:71eb3f87e385dd15c589bffa366242dba7ffa357e2bff2c62d1b383e9710bc69", size = 110616, upload-time = "2026-04-26T09:52:34.923Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ea/24/9c4af6a39a1b7af3c125c5ff914ebfb4ce0a795ed1ccfd3ad2a08d1f8875/yalc-0.2.1-py3-none-any.whl", hash = "sha256:23cb5608b518a08d516cecceab95cc76c844878d8da1f1ad5d9c0a9fe8877bdf", size = 9178, upload-time = "2026-02-21T13:54:52.75Z" }, + { url = "https://files.pythonhosted.org/packages/f4/e6/79373059f254a6917c1dd7220e2a175f231dea17244cea94a29caed7eb8f/yalc-0.3.2-py3-none-any.whl", hash = "sha256:07035b68f180e2f56d4ea249a6a4adbddd531bd7fecf708bfbf1e99ee88c48b1", size = 8993, upload-time = "2026-04-26T09:52:33.462Z" }, ] [[package]] diff --git a/verdikt_sdk/__init__.py b/verdikt_sdk/__init__.py index 40c3e53..7c96022 100644 --- a/verdikt_sdk/__init__.py +++ b/verdikt_sdk/__init__.py @@ -2,12 +2,14 @@ from verdikt_sdk.client import VerdiktClient from verdikt_sdk.models import ( + AnswerWithCost, EvaluationType, Question, ) __all__ = [ - "VerdiktClient", + "AnswerWithCost", "EvaluationType", "Question", + "VerdiktClient", ] diff --git a/verdikt_sdk/client.py b/verdikt_sdk/client.py index 38b94c5..e88395f 100644 --- a/verdikt_sdk/client.py +++ b/verdikt_sdk/client.py @@ -12,6 +12,7 @@ from verdikt_sdk.auth import TokenAuth from verdikt_sdk.http import raise_for_status from verdikt_sdk.models import ( + AnswerWithCost, AppResponse, CreateAppRequest, CreateDatasetRequest, @@ -139,7 +140,7 @@ async def run_evaluation( self, app_slug: str, app_version: str, - callback: Callable[[str], Coroutine[None, None, str]], + callback: Callable[[str], Coroutine[None, None, AnswerWithCost]], evaluation_type: EvaluationType, llm_judge_models: list[LLMModel], ) -> None: @@ -151,7 +152,8 @@ async def run_evaluation( app_slug: Slug of the target app. 
app_version: Semantic version string identifying this build. callback: Async function that receives a question string and returns - an answer string. + an :class:`AnswerWithCost` (the answer plus the optional cost + of producing it). evaluation_type: Whether to use LLM scoring only or both human and LLM scoring. llm_judge_models: List of model identifiers to use as judges. diff --git a/verdikt_sdk/models.py b/verdikt_sdk/models.py index c8585a4..c3d25f9 100644 --- a/verdikt_sdk/models.py +++ b/verdikt_sdk/models.py @@ -99,10 +99,21 @@ class EvaluationType(StrEnum): HUMAN_AND_LLM = "HUMAN_AND_LLM" +class AnswerWithCost(BaseModel): + """Answer plus the cost incurred producing it. + + Returned by ``run_evaluation`` callbacks. ``cost`` is optional — pass + ``None`` (or omit) when cost tracking is not relevant. + """ + + answer: str + cost: float | None = None + + class CreateEvaluationRequest(BaseModel): """Request body for ``POST /v1/app/{id}/evaluation``.""" app_version: str evaluation_type: EvaluationType - app_answers: dict[str, str] + app_answers: dict[str, AnswerWithCost] llm_judge_models: list[LLMModel]