cognitai-labs-dev · SamoKopecky · May 9, 2026 · May 9, 2026
diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@ pip install verdikt-sdk
 ## Usage
 
 ```python
-from verdikt_sdk import VerdiktClient, EvaluationType, Question
+from verdikt_sdk import AnswerWithCost, VerdiktClient, EvaluationType, Question
 from yalc import LLMModel
 
 client = VerdiktClient(
@@ -28,18 +28,29 @@ await client.add_questions("my-app", [
     Question(question="What is the capital of France?", human_answer="Paris"),
 ])
 
+# Your callback returns the answer plus the cost it took your app to produce it.
+# `cost` is optional — pass None when you do not track it.
+async def my_llm_function(question: str) -> AnswerWithCost:
+    answer, cost = await my_app(question)
+    return AnswerWithCost(answer=answer, cost=cost)
+
 # Run an evaluation cycle
 await client.run_evaluation(
     app_slug="my-app",
     app_version="v1.2.0",
-    callback=my_llm_function,  # async fn(question: str) -> str
+    callback=my_llm_function,
     evaluation_type=EvaluationType.LLM_ONLY,
     llm_judge_models=[LLMModel.gpt_4o_mini],
 )
 ```
 
 `run_evaluation` calls your `callback` concurrently for every question in the dataset, then submits all answers to Verdikt for judgment.
 
+> **Breaking change in 0.2.0:** the `callback` now returns
+> `AnswerWithCost(answer=..., cost=...)` instead of a bare `str`. Callers on
+> 0.1.x must wrap their return value (`return AnswerWithCost(answer=ans)` is
+> a drop-in equivalent of the old behaviour).
+
 ## Authentication
 
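-The SDK authenticates via Zitadel OAuth2 client credentials. Create a machine user in your Zitadel project and pass its `client_id` and `client_secret` to `EvaluationClient`.
+The SDK authenticates via Zitadel OAuth2 client credentials. Create a machine user in your Zitadel project and pass its `client_id` and `client_secret` to `VerdiktClient`.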
diff --git a/SDK.md b/SDK.md
@@ -97,13 +97,17 @@ Idempotent — safe to call on every deploy. Uses SHA-256 of the question text a
 ### `run_evaluation(app_slug, app_version, callback, ...)`
 1. Resolve `app_slug` → `app_id` via `GET /v1/app/by-slug/{slug}` (cached per client instance)
 2. `GET /v1/app/{id}/datasets` → full question list
-3. For each dataset item: `answer = callback(item["question"])`
+3. For each dataset item: `result = await callback(item["question"])` where
+   `result` is an `AnswerWithCost(answer: str, cost: float | None)`
 4. `POST /v1/app/{id}/evaluation` with:
    ```json
    {
      "app_version": "<app_version>",
      "evaluation_type": "<evaluation_type>",
-     "app_answers": { "<dataset_id>": "<answer>", ... },
+     "app_answers": {
+       "<dataset_id_1>": { "answer": "<answer>", "cost": 0.0123 },
+       "<dataset_id_2>": { "answer": "<answer>", "cost": null }
+     },
      "llm_judge_models": ["gpt-4o-mini"]
    }
    ```

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,13 +1,13 @@
 [project]
 name = "verdikt-sdk"
-version = "0.1.1"
+version = "0.2.0"
 description = "Python SDK for the Verdikt Evaluation API"
 readme = "README.md"
 requires-python = ">=3.13"
 dependencies = [
     "httpx>=0.28.1",
     "pydantic>=2.0",
-    "yalc>=0.2.1",
+    "yalc>=0.3.2",
 ]
 
 [build-system]
@@ -33,5 +33,8 @@ plugins = [
     "pydantic.mypy"
 ]
 
+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+
 [tool.ruff]
 line-length = 88
diff --git a/tests/test_run_evaluation.py b/tests/test_run_evaluation.py
@@ -0,0 +1,112 @@
+import json
+
+import httpx
+import pytest
+from yalc import LLMModel
+
+from verdikt_sdk import AnswerWithCost, EvaluationType, VerdiktClient
+
+
+def _build_handler(captured: dict) -> httpx.MockTransport:
+    """Return a mock transport faking the auth, app, datasets and evaluation routes.
+
+    Stores the evaluation POST body in ``captured["body"]``; unmatched requests get 404.
+    """
+
+    def handler(request: httpx.Request) -> httpx.Response:
+        url = str(request.url)
+        if url.endswith("/.well-known"):
+            return httpx.Response(200, json={"issuer": "http://issuer.test"})
+        if "/oauth/v2/token" in url:
+            return httpx.Response(
+                200,
+                json={
+                    "access_token": "tok",
+                    "id_token": "id",
+                    "token_type": "Bearer",
+                    "expires_in": 3600,
+                },
+            )
+        if "/v1/app/by-slug/" in url:
+            return httpx.Response(
+                200, json={"id": 7, "slug": "my-app", "name": "My App"}
+            )
+        if url.endswith("/v1/app/7/datasets") and request.method == "GET":
+            return httpx.Response(
+                200,
+                json=[
+                    {"id": 11, "question": "Q1", "human_answer": "A1"},
+                    {"id": 12, "question": "Q2", "human_answer": "A2"},
+                ],
+            )
+        if url.endswith("/v1/app/7/evaluation") and request.method == "POST":
+            captured["body"] = json.loads(request.content)  # parsed JSON, for asserts
+            return httpx.Response(201)
+        return httpx.Response(404)  # no fake route matched
+
+    return httpx.MockTransport(handler)
+
+
+@pytest.mark.asyncio
+async def test_run_evaluation_posts_answers_with_cost_per_dataset_id():
+    # Arrange: route the SDK's HTTP and auth calls through the mock transport.
+    captured: dict = {}
+    client = VerdiktClient(
+        base_url="http://verdikt.test",
+        client_id="cid",
+        client_secret="csec",
+    )
+    client._http = httpx.AsyncClient(transport=_build_handler(captured))
+    client._auth._http = client._http
+
+    async def callback(question: str) -> AnswerWithCost:
+        return AnswerWithCost(
+            answer=f"answer-for-{question}",
+            cost=0.5 if question == "Q1" else 1.25,
+        )
+
+    # Act: run one evaluation cycle with a callback that reports a cost per answer.
+    await client.run_evaluation(
+        app_slug="my-app",
+        app_version="v1.0.0",
+        callback=callback,
+        evaluation_type=EvaluationType.LLM_ONLY,
+        llm_judge_models=[LLMModel.gpt_4o_mini],
+    )
+
+    # Assert: answers are keyed by the stringified dataset id, each with its cost.
+    assert captured["body"]["app_answers"] == {
+        "11": {"answer": "answer-for-Q1", "cost": 0.5},
+        "12": {"answer": "answer-for-Q2", "cost": 1.25},
+    }
+
+
+@pytest.mark.asyncio
+async def test_run_evaluation_serializes_null_cost_when_callback_omits_it():
+    # Arrange: same mock wiring; the callback below never sets ``cost``.
+    captured: dict = {}
+    client = VerdiktClient(
+        base_url="http://verdikt.test",
+        client_id="cid",
+        client_secret="csec",
+    )
+    client._http = httpx.AsyncClient(transport=_build_handler(captured))
+    client._auth._http = client._http
+
+    async def callback(question: str) -> AnswerWithCost:
+        return AnswerWithCost(answer=f"answer-for-{question}")
+
+    # Act: run one evaluation cycle with the cost-less callback.
+    await client.run_evaluation(
+        app_slug="my-app",
+        app_version="v1.0.0",
+        callback=callback,
+        evaluation_type=EvaluationType.LLM_ONLY,
+        llm_judge_models=[LLMModel.gpt_4o_mini],
+    )
+
+    # Assert: the omitted cost is still sent, as JSON null (None after json.loads).
+    assert captured["body"]["app_answers"] == {
+        "11": {"answer": "answer-for-Q1", "cost": None},
+        "12": {"answer": "answer-for-Q2", "cost": None},
+    }
diff --git a/uv.lock b/uv.lock
diff --git a/verdikt_sdk/__init__.py b/verdikt_sdk/__init__.py
@@ -2,12 +2,14 @@
 
 from verdikt_sdk.client import VerdiktClient
 from verdikt_sdk.models import (
+    AnswerWithCost,
     EvaluationType,
     Question,
 )
 
 __all__ = [
-    "VerdiktClient",
+    "AnswerWithCost",
     "EvaluationType",
     "Question",
+    "VerdiktClient",
 ]
diff --git a/verdikt_sdk/client.py b/verdikt_sdk/client.py
@@ -12,6 +12,7 @@
 from verdikt_sdk.auth import TokenAuth
 from verdikt_sdk.http import raise_for_status
 from verdikt_sdk.models import (
+    AnswerWithCost,
     AppResponse,
     CreateAppRequest,
     CreateDatasetRequest,
@@ -139,7 +140,7 @@ async def run_evaluation(
         self,
         app_slug: str,
         app_version: str,
-        callback: Callable[[str], Coroutine[None, None, str]],
+        callback: Callable[[str], Coroutine[None, None, AnswerWithCost]],
         evaluation_type: EvaluationType,
         llm_judge_models: list[LLMModel],
     ) -> None:
@@ -151,7 +152,8 @@ async def run_evaluation(
             app_slug: Slug of the target app.
             app_version: Semantic version string identifying this build.
             callback: Async function that receives a question string and returns
-                an answer string.
+                an :class:`AnswerWithCost` (the answer plus the optional cost
+                of producing it).
             evaluation_type: Whether to use LLM scoring only or both human and
                 LLM scoring.
             llm_judge_models: List of model identifiers to use as judges.

diff --git a/verdikt_sdk/models.py b/verdikt_sdk/models.py
@@ -99,10 +99,21 @@ class EvaluationType(StrEnum):
     HUMAN_AND_LLM = "HUMAN_AND_LLM"
 
 
+class AnswerWithCost(BaseModel):
+    """Answer plus the cost incurred producing it.
+
+    Returned by ``run_evaluation`` callbacks and used as each ``app_answers``
+    value. ``cost`` defaults to ``None``; omit it when cost is not tracked.
+    """
+
+    answer: str
+    cost: float | None = None
+
+
 class CreateEvaluationRequest(BaseModel):
     """Request body for ``POST /v1/app/{id}/evaluation``."""
 
     app_version: str
     evaluation_type: EvaluationType
-    app_answers: dict[str, str]
+    app_answers: dict[str, AnswerWithCost]
     llm_judge_models: list[LLMModel]