diff --git a/README.md b/README.md
index 61a3bd4..32908b6 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ pip install verdikt-sdk
 ## Usage
 
 ```python
-from verdikt_sdk import VerdiktClient, EvaluationType, Question
+from verdikt_sdk import AnswerWithCost, VerdiktClient, EvaluationType, Question
 from yalc import LLMModel
 
 client = VerdiktClient(
@@ -28,11 +28,17 @@ await client.add_questions("my-app", [
     Question(question="What is the capital of France?", human_answer="Paris"),
 ])
 
+# Your callback returns the answer plus the cost it took your app to produce it.
+# `cost` is optional — pass None when you do not track it.
+async def my_llm_function(question: str) -> AnswerWithCost:
+    answer, cost = await my_app(question)
+    return AnswerWithCost(answer=answer, cost=cost)
+
 # Run an evaluation cycle
 await client.run_evaluation(
     app_slug="my-app",
     app_version="v1.2.0",
-    callback=my_llm_function,  # async fn(question: str) -> str
+    callback=my_llm_function,
     evaluation_type=EvaluationType.LLM_ONLY,
     llm_judge_models=[LLMModel.gpt_4o_mini],
 )
@@ -40,6 +46,11 @@ await client.run_evaluation(
 
 `run_evaluation` calls your `callback` concurrently for every question in the dataset, then submits all answers to Verdikt for judgment.
 
+> **Breaking change in 0.2.0:** the `callback` now returns
+> `AnswerWithCost(answer=..., cost=...)` instead of a bare `str`. Callers on
+> 0.1.x must wrap their return value (`return AnswerWithCost(answer=ans)` is
+> a drop-in equivalent of the old behaviour).
+
 ## Authentication
 
 The SDK authenticates via Zitadel OAuth2 client credentials. Create a machine user in your Zitadel project and pass its `client_id` and `client_secret` to `EvaluationClient`.
diff --git a/SDK.md b/SDK.md
index 7662d00..b9ca420 100644
--- a/SDK.md
+++ b/SDK.md
@@ -97,13 +97,17 @@ Idempotent — safe to call on every deploy. Uses SHA-256 of the question text a
 ### `run_evaluation(app_slug, app_version, callback, ...)`
 1. Resolve `app_slug` → `app_id` via `GET /v1/app/by-slug/{slug}` (cached per client instance)
 2. `GET /v1/app/{id}/datasets` → full question list
-3. For each dataset item: `answer = callback(item["question"])`
+3. For each dataset item: `result = await callback(item["question"])` where
+   `result` is an `AnswerWithCost(answer: str, cost: float | None)`
 4. `POST /v1/app/{id}/evaluation` with:
    ```json
    {
      "app_version": "<app_version>",
      "evaluation_type": "<evaluation_type>",
-     "app_answers": { "<dataset_id>": "<answer>", ... },
+     "app_answers": {
+       "<dataset_id>": { "answer": "<answer>", "cost": 0.0123 },
+       "<dataset_id>": { "answer": "<answer>", "cost": null }
+     },
      "llm_judge_models": ["gpt-4o-mini"]
    }
    ```
diff --git a/pyproject.toml b/pyproject.toml
index 4516e76..da532c1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,13 +1,13 @@
 [project]
 name = "verdikt-sdk"
-version = "0.1.1"
+version = "0.2.0"
 description = "Python SDK for the Verdikt Evaluation API"
 readme = "README.md"
 requires-python = ">=3.13"
 dependencies = [
     "httpx>=0.28.1",
     "pydantic>=2.0",
-    "yalc>=0.2.1",
+    "yalc>=0.3.2",
 ]
 
 [build-system]
@@ -33,5 +33,8 @@ plugins = [
     "pydantic.mypy"
 ]
 
+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+
 [tool.ruff]
 line-length = 88
diff --git a/tests/test_run_evaluation.py b/tests/test_run_evaluation.py
new file mode 100644
index 0000000..91a53a1
--- /dev/null
+++ b/tests/test_run_evaluation.py
@@ -0,0 +1,112 @@
+import json
+
+import httpx
+import pytest
+from yalc import LLMModel
+
+from verdikt_sdk import AnswerWithCost, EvaluationType, VerdiktClient
+
+
+def _build_handler(captured: dict) -> httpx.MockTransport:
+    """Mock transport that fakes the auth + datasets + evaluation endpoints.
+
+    Captures the evaluation POST body in *captured* so tests can assert it.
+    """
+
+    def handler(request: httpx.Request) -> httpx.Response:
+        url = str(request.url)
+        if url.endswith("/.well-known"):
+            return httpx.Response(200, json={"issuer": "http://issuer.test"})
+        if "/oauth/v2/token" in url:
+            return httpx.Response(
+                200,
+                json={
+                    "access_token": "tok",
+                    "id_token": "id",
+                    "token_type": "Bearer",
+                    "expires_in": 3600,
+                },
+            )
+        if "/v1/app/by-slug/" in url:
+            return httpx.Response(
+                200, json={"id": 7, "slug": "my-app", "name": "My App"}
+            )
+        if url.endswith("/v1/app/7/datasets") and request.method == "GET":
+            return httpx.Response(
+                200,
+                json=[
+                    {"id": 11, "question": "Q1", "human_answer": "A1"},
+                    {"id": 12, "question": "Q2", "human_answer": "A2"},
+                ],
+            )
+        if url.endswith("/v1/app/7/evaluation") and request.method == "POST":
+            captured["body"] = json.loads(request.content)
+            return httpx.Response(201)
+        return httpx.Response(404)
+
+    return httpx.MockTransport(handler)
+
+
+@pytest.mark.asyncio
+async def test_run_evaluation_posts_answers_with_cost_per_dataset_id():
+    # Arrange
+    captured: dict = {}
+    client = VerdiktClient(
+        base_url="http://verdikt.test",
+        client_id="cid",
+        client_secret="csec",
+    )
+    client._http = httpx.AsyncClient(transport=_build_handler(captured))
+    client._auth._http = client._http
+
+    async def callback(question: str) -> AnswerWithCost:
+        return AnswerWithCost(
+            answer=f"answer-for-{question}",
+            cost=0.5 if question == "Q1" else 1.25,
+        )
+
+    # Act
+    await client.run_evaluation(
+        app_slug="my-app",
+        app_version="v1.0.0",
+        callback=callback,
+        evaluation_type=EvaluationType.LLM_ONLY,
+        llm_judge_models=[LLMModel.gpt_4o_mini],
+    )
+
+    # Assert
+    assert captured["body"]["app_answers"] == {
+        "11": {"answer": "answer-for-Q1", "cost": 0.5},
+        "12": {"answer": "answer-for-Q2", "cost": 1.25},
+    }
+
+
+@pytest.mark.asyncio
+async def test_run_evaluation_serializes_null_cost_when_callback_omits_it():
+    # Arrange
+    captured: dict = {}
+    client = VerdiktClient(
+        base_url="http://verdikt.test",
+        client_id="cid",
+        client_secret="csec",
+    )
+    client._http = httpx.AsyncClient(transport=_build_handler(captured))
+    client._auth._http = client._http
+
+    async def callback(question: str) -> AnswerWithCost:
+        return AnswerWithCost(answer=f"answer-for-{question}")
+
+    # Act
+    await client.run_evaluation(
+        app_slug="my-app",
+        app_version="v1.0.0",
+        callback=callback,
+        evaluation_type=EvaluationType.LLM_ONLY,
+        llm_judge_models=[LLMModel.gpt_4o_mini],
+    )
+
+    # Assert
+    assert captured["body"]["app_answers"] == {
+        "11": {"answer": "answer-for-Q1", "cost": None},
+        "12": {"answer": "answer-for-Q2", "cost": None},
+    }
diff --git a/uv.lock b/uv.lock
index 62c6ad2..828a628 100644
--- a/uv.lock
+++ b/uv.lock
@@ -561,12 +561,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/45/04/e442e1356c97b03a6d30d2b462f7c0bdfbf207e75f6833815fd1225a75b4/instructor-1.14.5-py3-none-any.whl", hash = "sha256:2a5a31222b008c0989be1cc001e33a237f49506e80ac5833f6d36d7690bae7b1", size = 177445, upload-time = "2026-01-29T14:18:53.641Z" },
 ]
 
-[package.optional-dependencies]
-anthropic = [
-    { name = "anthropic" },
-    { name = "xmltodict" },
-]
-
 [[package]]
 name = "jinja2"
 version = "3.1.6"
@@ -1603,7 +1597,7 @@ wheels = [
 
 [[package]]
 name = "verdikt-sdk"
-version = "0.1.1"
+version = "0.2.0"
 source = { editable = "." }
 dependencies = [
     { name = "httpx" },
@@ -1626,7 +1620,7 @@ dev = [
 requires-dist = [
     { name = "httpx", specifier = ">=0.28.1" },
     { name = "pydantic", specifier = ">=2.0" },
-    { name = "yalc", specifier = ">=0.2.1" },
+    { name = "yalc", specifier = ">=0.3.2" },
 ]
 
 [package.metadata.requires-dev]
@@ -1655,27 +1649,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/78/55/896b06bf93a49bec0f4ae2a6f1ed12bd05c8860744ac3a70eda041064e4d/virtualenv-21.1.0-py3-none-any.whl", hash = "sha256:164f5e14c5587d170cf98e60378eb91ea35bf037be313811905d3a24ea33cc07", size = 5825072, upload-time = "2026-02-27T08:49:27.516Z" },
 ]
 
-[[package]]
-name = "xmltodict"
-version = "1.0.4"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/19/70/80f3b7c10d2630aa66414bf23d210386700aa390547278c789afa994fd7e/xmltodict-1.0.4.tar.gz", hash = "sha256:6d94c9f834dd9e44514162799d344d815a3a4faec913717a9ecbfa5be1bb8e61", size = 26124, upload-time = "2026-02-22T02:21:22.074Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/38/34/98a2f52245f4d47be93b580dae5f9861ef58977d73a79eb47c58f1ad1f3a/xmltodict-1.0.4-py3-none-any.whl", hash = "sha256:a4a00d300b0e1c59fc2bfccb53d7b2e88c32f200df138a0dd2229f842497026a", size = 13580, upload-time = "2026-02-22T02:21:21.039Z" },
-]
-
 [[package]]
 name = "yalc"
-version = "0.2.1"
+version = "0.3.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
+    { name = "anthropic" },
     { name = "cachetools" },
-    { name = "instructor", extra = ["anthropic"] },
+    { name = "instructor" },
     { name = "litellm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/b6/ca/e6b13d5b9261342feb0dc18caa277f7e4eefc43cf486f1665749411f6625/yalc-0.2.1.tar.gz", hash = "sha256:8ce5cb4fd2101b1b575d3138137a82c5c5a8be7e0f05bdc98ee27ad5ec4f0026", size = 7217, upload-time = "2026-02-21T13:54:54.071Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/63/f3/ba1a90f2f35243ca7d0f2b3bb78ddd087c18aabbe7842305098c6ee34a80/yalc-0.3.2.tar.gz", hash = "sha256:71eb3f87e385dd15c589bffa366242dba7ffa357e2bff2c62d1b383e9710bc69", size = 110616, upload-time = "2026-04-26T09:52:34.923Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ea/24/9c4af6a39a1b7af3c125c5ff914ebfb4ce0a795ed1ccfd3ad2a08d1f8875/yalc-0.2.1-py3-none-any.whl", hash = "sha256:23cb5608b518a08d516cecceab95cc76c844878d8da1f1ad5d9c0a9fe8877bdf", size = 9178, upload-time = "2026-02-21T13:54:52.75Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/e6/79373059f254a6917c1dd7220e2a175f231dea17244cea94a29caed7eb8f/yalc-0.3.2-py3-none-any.whl", hash = "sha256:07035b68f180e2f56d4ea249a6a4adbddd531bd7fecf708bfbf1e99ee88c48b1", size = 8993, upload-time = "2026-04-26T09:52:33.462Z" },
 ]
 
 [[package]]
diff --git a/verdikt_sdk/__init__.py b/verdikt_sdk/__init__.py
index 40c3e53..7c96022 100644
--- a/verdikt_sdk/__init__.py
+++ b/verdikt_sdk/__init__.py
@@ -2,12 +2,14 @@
 
 from verdikt_sdk.client import VerdiktClient
 from verdikt_sdk.models import (
+    AnswerWithCost,
     EvaluationType,
     Question,
 )
 
 __all__ = [
-    "VerdiktClient",
+    "AnswerWithCost",
     "EvaluationType",
     "Question",
+    "VerdiktClient",
 ]
diff --git a/verdikt_sdk/client.py b/verdikt_sdk/client.py
index 38b94c5..e88395f 100644
--- a/verdikt_sdk/client.py
+++ b/verdikt_sdk/client.py
@@ -12,6 +12,7 @@
 from verdikt_sdk.auth import TokenAuth
 from verdikt_sdk.http import raise_for_status
 from verdikt_sdk.models import (
+    AnswerWithCost,
     AppResponse,
     CreateAppRequest,
     CreateDatasetRequest,
@@ -139,7 +140,7 @@ async def run_evaluation(
         self,
         app_slug: str,
         app_version: str,
-        callback: Callable[[str], Coroutine[None, None, str]],
+        callback: Callable[[str], Coroutine[None, None, AnswerWithCost]],
         evaluation_type: EvaluationType,
         llm_judge_models: list[LLMModel],
     ) -> None:
@@ -151,7 +152,8 @@ async def run_evaluation(
             app_slug: Slug of the target app.
             app_version: Semantic version string identifying this build.
             callback: Async function that receives a question string and returns
-                an answer string.
+                an :class:`AnswerWithCost` (the answer plus the optional cost
+                of producing it).
             evaluation_type: Whether to use LLM scoring only or both human and
                 LLM scoring.
             llm_judge_models: List of model identifiers to use as judges.
diff --git a/verdikt_sdk/models.py b/verdikt_sdk/models.py
index c8585a4..c3d25f9 100644
--- a/verdikt_sdk/models.py
+++ b/verdikt_sdk/models.py
@@ -99,10 +99,21 @@ class EvaluationType(StrEnum):
     HUMAN_AND_LLM = "HUMAN_AND_LLM"
 
 
+class AnswerWithCost(BaseModel):
+    """Answer plus the cost incurred producing it.
+
+    Returned by ``run_evaluation`` callbacks. ``cost`` is optional — pass
+    ``None`` (or omit) when cost tracking is not relevant.
+    """
+
+    answer: str
+    cost: float | None = None
+
+
 class CreateEvaluationRequest(BaseModel):
     """Request body for ``POST /v1/app/{id}/evaluation``."""
 
     app_version: str
     evaluation_type: EvaluationType
-    app_answers: dict[str, str]
+    app_answers: dict[str, AnswerWithCost]
     llm_judge_models: list[LLMModel]