diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
index 959074c..9318b74 100644
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,7 +1,7 @@
 blank_issues_enabled: true
 contact_links:
   - name: Documentation
-    url: https://github.com/intuit/fasteval/tree/main/docs
+    url: https://fasteval.io
     about: Browse the documentation for guides and API reference.
   - name: Discussions
     url: https://github.com/intuit/fasteval/discussions
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e7373c2..2c9d39f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -54,13 +54,14 @@ jobs:
       - name: Install dependencies
         run: uv sync --all-extras --group test
 
-      - name: Run tests
+      - name: Run tests with coverage
         run: |
           uv run pytest tests/ \
             --color=yes \
             --cov=fasteval \
             --cov-report=term \
             --cov-report=xml:coverage.xml \
+            --cov-fail-under=85 \
             -v
 
       - name: Upload coverage
@@ -76,7 +77,13 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        plugin: [fasteval-langfuse, fasteval-langgraph, fasteval-observe]
+        include:
+          - plugin: fasteval-langfuse
+            module: fasteval_langfuse
+          - plugin: fasteval-langgraph
+            module: fasteval_langgraph
+          - plugin: fasteval-observe
+            module: fasteval_observe
     steps:
       - uses: actions/checkout@v4
 
@@ -93,6 +100,16 @@ jobs:
         working-directory: plugins/${{ matrix.plugin }}
         run: uv sync --all-extras --group dev
 
-      - name: Run plugin tests
+      - name: Install coverage tools
         working-directory: plugins/${{ matrix.plugin }}
-        run: uv run pytest tests/ -v --color=yes
+        run: uv pip install pytest-cov
+
+      - name: Run plugin tests with coverage
+        working-directory: plugins/${{ matrix.plugin }}
+        run: |
+          uv run pytest tests/ \
+            --color=yes \
+            --cov=${{ matrix.module }} \
+            --cov-report=term \
+            --cov-fail-under=85 \
+            -v
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index b7c0e99..c783e3a 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -30,10 +30,10 @@ Thank you for your interest in contributing to fasteval! Whether it's fixing a b
    cd fasteval
    ```
 
-2. Install dependencies:
+2. Install all dependencies (including dev and test groups):
 
    ```bash
-   uv sync --all-extras
+   uv sync --all-extras --group dev --group test
    ```
 
 3. Verify everything works:
@@ -97,8 +97,9 @@ uv run mypy .
 ## Testing
 
 - All new functionality must have corresponding tests
-- Maintain code coverage at or above 80%
+- Maintain code coverage at or above **85%**
 - Tests live in `tests/` for the core package and `plugins/*/tests/` for plugins
+- Coverage configuration is in `pyproject.toml` under `[tool.coverage.run]` and `[tool.coverage.report]` -- models, vision/audio/multimodal metrics, and other non-logic files are excluded from measurement
 
 Run tests:
 
@@ -106,17 +107,23 @@ Run tests:
 # Full test suite across Python versions
 uv run tox
 
-# Quick single-version test
-uv run pytest tests/ -v --cov=fasteval
+# Quick single-version test with coverage
+uv run --group test pytest tests/ --cov=fasteval --cov-report=term -v
 
 # Run a specific test
-uv run pytest tests/test_example.py::test_name -v
+uv run --group test pytest tests/test_example.py::test_name -v
+
+# Run plugin tests (from plugin directory)
+cd plugins/fasteval-langgraph
+uv run pytest tests/ -v
 ```
 
+> **Note**: The project includes a custom pytest plugin (`fasteval.testing.plugin`). When running tests with coverage, the plugin is automatically disabled via `addopts` in `pyproject.toml` (`-p no:fasteval`) to ensure accurate coverage tracking.
+
 ## Pull Request Process
 
 1. Ensure all tests pass and linting is clean.
-2. Update documentation if your change affects user-facing behavior (see `docs/`).
+2. Update documentation if your change affects user-facing behavior. Docs are published at [fasteval.io](https://fasteval.io) and source lives in `docs/`.
 3. Open a pull request against `main` with a clear description of your changes.
 4. A maintainer will review your PR, typically within a few business days.
 5. Once approved, a maintainer will merge your contribution.
@@ -129,6 +136,14 @@ uv run pytest tests/test_example.py::test_name -v
 - Adherence to the existing code style
 - Clear, focused commits (one logical change per commit)
 
+### Writing Custom Metrics
+
+If you're contributing a new metric, see the [Custom Metrics guide](https://fasteval.io/docs/advanced/custom-metrics) for the expected patterns. All metrics should:
+- Extend `Metric` (deterministic) or `BaseLLMMetric` (LLM-based)
+- Include a corresponding decorator in `fasteval/core/decorators.py`
+- Be registered in `METRIC_REGISTRY` in `fasteval/core/evaluator.py`
+- Have tests with at least 85% coverage
+
 ## Project Structure
 
 ```
@@ -137,16 +152,18 @@ fasteval/
 ├── metrics/        # Metric implementations (LLM, deterministic, conversation)
 ├── models/         # Pydantic models (EvalInput, EvalResult, MetricResult)
 ├── providers/      # LLM provider clients (OpenAI, Anthropic)
-├── cache/          # Caching utilities
+├── cache/          # In-memory LRU caching
+├── collectors/     # Result collection and reporting
+│   └── reporters/  # Output reporters (JSON, HTML)
 ├── utils/          # Helpers (formatting, JSON parsing, async)
-└── testing/        # pytest plugin
+└── testing/        # pytest plugin (--fe-output, --fe-summary, --no-interactive)
 
 plugins/
 ├── fasteval-langfuse/   # Langfuse production trace evaluation
 ├── fasteval-langgraph/  # LangGraph agent testing
 └── fasteval-observe/    # Runtime monitoring
 
-docs/                    # MDX documentation
+docs/                    # MDX documentation (published at fasteval.io)
 tests/                   # Core package tests
 ```
 
diff --git a/README.md b/README.md
index 95b9459..2f0c47c 100644
--- a/README.md
+++ b/README.md
@@ -7,19 +7,34 @@
 ![Python versions](https://img.shields.io/badge/python-3.10_|_3.11_|_3.12_|_3.13_|_3.14-blue?logo=python)
 [![CI](https://github.com/intuit/fasteval/actions/workflows/ci.yml/badge.svg)](https://github.com/intuit/fasteval/actions/workflows/ci.yml)
 [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+[![Docs](https://img.shields.io/badge/docs-fasteval.io-blue)](https://fasteval.io)
 
-A **decorator-first LLM evaluation library** for testing AI agents and LLMs. Stack decorators to define evaluation criteria, run with pytest.
+A **decorator-first LLM evaluation library** for testing AI agents and LLMs. Stack decorators to define evaluation criteria, run with pytest. [Read the docs](https://fasteval.io/docs).
+
+<p align="center">
+  <img src="hero-evaluation-journey.png" alt="The Evaluation Journey -- from non-deterministic LLM outputs to reliable engineering metrics" width="800">
+</p>
 
 ## Features
 
-- **Decorator-based metrics** -- stack `@fe.correctness`, `@fe.relevance`, `@fe.hallucination`, and 30+ more
+- **50+ built-in metrics** -- stack `@fe.correctness`, `@fe.relevance`, `@fe.hallucination`, and more
 - **pytest native** -- run evaluations with `pytest`, get familiar pass/fail output
 - **LLM-as-judge + deterministic** -- semantic LLM metrics alongside ROUGE, exact match, JSON schema, regex
+- **Custom criteria** -- `@fe.criteria("Is the response empathetic?")` for any evaluation you can describe in plain English
 - **Multi-modal** -- evaluate vision, audio, and image generation models
 - **Conversation metrics** -- context retention, topic drift, consistency for multi-turn agents
 - **RAG metrics** -- faithfulness, contextual precision, contextual recall, answer correctness
 - **Tool trajectory** -- verify agent tool calls, argument matching, call sequences
-- **Pluggable providers** -- OpenAI (default), Anthropic, Azure OpenAI, Ollama
+- **Reusable metric stacks** -- `@fe.stack()` to compose and reuse metric sets across tests
+- **Human-in-the-loop** -- `@fe.human_review()` for manual review alongside automated metrics
+- **Data-driven testing** -- `@fe.csv("test_data.csv")` to load test cases from CSV files
+- **Pluggable providers** -- OpenAI (default), Anthropic, or bring your own `LLMClient`
+
+## How It Works
+
+<p align="center">
+  <img src="fasteval-overview.png" alt="How fasteval works -- Decorate, Test, Score, Evaluate, Result" width="800">
+</p>
 
 ## Quick Start
 
@@ -96,6 +111,23 @@ def test_summary_quality():
     fe.score(actual_output=summary, expected_output=reference)
 ```
 
+### Custom Criteria
+
+```python
+@fe.criteria("Is the response empathetic and professional?")
+def test_tone():
+    response = agent("I'm frustrated with this product!")
+    fe.score(response)
+
+@fe.criteria(
+    "Does the response include a legal disclaimer?",
+    threshold=0.9,
+)
+def test_compliance():
+    response = agent("Can I break my lease?")
+    fe.score(response)
+```
+
 ### RAG Evaluation
 
 ```python
@@ -117,7 +149,8 @@ def test_rag_pipeline():
 def test_agent_tools():
     result = agent.run("Book a flight to Paris")
     fe.score(
-        actual_tools=result.tool_calls,
+        result.response,
+        tool_calls=result.tool_calls,
         expected_tools=[
             {"name": "search_flights", "args": {"destination": "Paris"}},
             {"name": "book_flight"},
@@ -125,24 +158,54 @@ def test_agent_tools():
     )
 ```
 
+### Multi-Turn Conversations
+
+```python
+@fe.context_retention(threshold=0.8)
+@fe.conversation([
+    {"query": "My name is Alice and I'm a vegetarian"},
+    {"query": "Suggest a restaurant for me"},
+    {"query": "What dietary restriction should they accommodate?"},
+])
+async def test_memory(query, expected, history):
+    response = await agent(query, history=history)
+    fe.score(response, input=query, history=history)
+```
+
 ### Metric Stacks
 
 ```python
+# Define a reusable metric stack
+@fe.stack()
 @fe.correctness(threshold=0.8, weight=2.0)
 @fe.relevance(threshold=0.7, weight=1.0)
 @fe.coherence(threshold=0.6, weight=1.0)
-def test_comprehensive():
+def quality_metrics():
+    pass
+
+# Apply to multiple tests
+@quality_metrics
+def test_chatbot():
     response = agent("Explain quantum computing")
     fe.score(response, expected_output=reference_answer, input="Explain quantum computing")
+
+@quality_metrics
+def test_summarizer():
+    summary = summarize(long_article)
+    fe.score(summary, expected_output=reference_summary)
 ```
 
 ## Plugins
 
 | Plugin | Description | Install |
 |--------|-------------|---------|
-| [fasteval-langfuse](./plugins/fasteval-langfuse/) | Evaluate Langfuse production traces with fasteval metrics | `pip install fasteval-langfuse` |
-| [fasteval-langgraph](./plugins/fasteval-langgraph/) | Test harness for LangGraph agents | `pip install fasteval-langgraph` |
-| [fasteval-observe](./plugins/fasteval-observe/) | Runtime monitoring with async sampling | `pip install fasteval-observe` |
+| [fasteval-langfuse](https://fasteval.io/docs/plugins/langfuse/overview) | Evaluate Langfuse production traces with fasteval metrics | `pip install fasteval-langfuse` |
+| [fasteval-langgraph](https://fasteval.io/docs/plugins/langgraph/overview) | Test harness for LangGraph agents | `pip install fasteval-langgraph` |
+| [fasteval-observe](https://fasteval.io/docs/plugins/observe/overview) | Runtime monitoring with async sampling | `pip install fasteval-observe` |
+
+<p align="center">
+  <img src="testing-pyramid-agents.png" alt="Testing Pyramid for Agents -- layered testing strategy with fasteval-langgraph" width="700">
+</p>
 
 ## Local Development
 
@@ -150,12 +213,15 @@ def test_comprehensive():
 # Install uv
 brew install uv
 
-# Create virtual environment and install dependencies
-uv sync --all-extras
+# Create virtual environment and install all dependencies
+uv sync --all-extras --group dev --group test
 
 # Run the test suite
 uv run tox
 
+# Run tests with coverage
+uv run pytest tests/ --cov=fasteval --cov-report=term -v
+
 # Format code
 uv run black .
 uv run isort .
@@ -166,17 +232,23 @@ uv run mypy .
 
 ## Documentation
 
-Full documentation is available in the [docs/](./docs/) directory, covering:
-
-- [Getting Started](./docs/getting-started/) -- installation, quickstart
-- [Core Concepts](./docs/core-concepts/) -- decorators, metrics, scoring, data sources
-- [LLM Metrics](./docs/llm-metrics/) -- correctness, relevance, hallucination, and more
-- [Deterministic Metrics](./docs/deterministic-metrics/) -- ROUGE, exact match, regex, JSON schema
-- [RAG Metrics](./docs/rag-metrics/) -- faithfulness, contextual precision/recall
-- [Conversation Metrics](./docs/conversation-metrics/) -- context retention, consistency
-- [Multi-Modal](./docs/multimodal/) -- vision, audio, image generation evaluation
-- [Plugins](./docs/plugins/) -- Langfuse, LangGraph, Observe
-- [API Reference](./docs/api-reference/) -- decorators, evaluator, models, score
+Full documentation is available at **[fasteval.io](https://fasteval.io)**.
+
+- [Getting Started](https://fasteval.io/docs/getting-started/quickstart) -- installation and quickstart guide
+- [Why FastEval](https://fasteval.io/docs/getting-started/introduction/why-fasteval) -- motivation and design philosophy
+- [Core Concepts](https://fasteval.io/docs/core-concepts/decorators) -- decorators, metrics, scoring, data sources
+- [Concepts](https://fasteval.io/docs/concepts/llm-as-judge) -- LLM-as-judge, scoring thresholds, evaluation strategies
+- [LLM Metrics](https://fasteval.io/docs/llm-metrics/correctness) -- correctness, relevance, hallucination, and more
+- [Deterministic Metrics](https://fasteval.io/docs/deterministic-metrics/exact-match) -- ROUGE, exact match, regex, JSON schema
+- [RAG Metrics](https://fasteval.io/docs/rag-metrics/faithfulness) -- faithfulness, contextual precision/recall
+- [Tool Trajectory](https://fasteval.io/docs/tool-tranjectory-metrics/tool-call-accuracy) -- tool call accuracy, sequence, argument matching
+- [Conversation Metrics](https://fasteval.io/docs/conversation-metrics/context-retention) -- context retention, consistency, topic drift
+- [Multi-Modal](https://fasteval.io/docs/multimodal/overview) -- vision, audio, image generation evaluation
+- [Human Review](https://fasteval.io/docs/human-review/overview) -- human-in-the-loop evaluation
+- [Cookbooks](https://fasteval.io/docs/cookbooks/rag-pipeline) -- RAG pipelines, CI/CD setup, prompt regression, production monitoring
+- [Plugins](https://fasteval.io/docs/plugins/langfuse/overview) -- Langfuse, LangGraph, Observe
+- [Advanced](https://fasteval.io/docs/advanced/custom-metrics) -- custom metrics, providers, output collectors, traces
+- [API Reference](https://fasteval.io/docs/api-reference/decorators) -- decorators, evaluator, models, score
 
 ## Contributing
 
diff --git a/docs/assets/fasteval-overview.svg b/docs/assets/fasteval-overview.svg
index cd5c336..27089bd 100644
--- a/docs/assets/fasteval-overview.svg
+++ b/docs/assets/fasteval-overview.svg
@@ -71,9 +71,9 @@
     <rect width="850" height="120" rx="12" fill="#0f172a" filter="url(#shadow)"/>
     <g transform="translate(30, 40)" font-family="monospace" font-size="12">
       <text y="0"><tspan fill="#94a3b8">import</tspan> <tspan fill="#f8fafc">fasteval</tspan> <tspan fill="#94a3b8">as</tspan> <tspan fill="#f8fafc">fe</tspan></text>
-      <text y="22"><tspan fill="#a78bfa">@fe.correctness</tspan><tspan fill="#fbbf24">(threshold=0.8)</tspan></text>
+      <text y="30"><tspan fill="#a78bfa">@fe.correctness</tspan><tspan fill="#fbbf24">(threshold=0.8)</tspan></text>
       <text y="44"><tspan fill="#60a5fa">def</tspan> <tspan fill="#f8fafc">test_my_agent():</tspan></text>
-      <text y="66"><tspan fill="#f8fafc">  fe.score(response, </tspan><tspan fill="#fbbf24">"Expected Output"</tspan><tspan fill="#f8fafc">)</tspan></text>
+      <text x="30" y="60"><tspan fill="#f8fafc">  fe.score(response, </tspan><tspan fill="#fbbf24">"Expected Output"</tspan><tspan fill="#f8fafc">)</tspan></text>
     </g>
 
     <rect x="620" y="25" width="200" height="32" rx="6" fill="#1e293b" stroke="#334155"/>
diff --git a/fasteval-overview.png b/fasteval-overview.png
new file mode 100644
index 0000000..a97942f
Binary files /dev/null and b/fasteval-overview.png differ
diff --git a/hero-evaluation-journey.png b/hero-evaluation-journey.png
new file mode 100644
index 0000000..44348d6
Binary files /dev/null and b/hero-evaluation-journey.png differ
diff --git a/medium-article.md b/medium-article.md
new file mode 100644
index 0000000..a445b20
--- /dev/null
+++ b/medium-article.md
@@ -0,0 +1,258 @@
+# We Got Tired of Writing the Same LLM Evaluation Code Over and Over, So We Open-Sourced Ours
+
+## Introducing fasteval, a decorator-first library that makes LLM testing feel like regular pytest
+
+![The evaluation journey: from non-deterministic LLM outputs to reliable engineering metrics](hero-evaluation-journey.png)
+
+---
+
+I need to tell you about a problem that drove us crazy for months before we finally did something about it.
+
+Every team building with LLMs eventually hits the same wall. You ship your first agent or chatbot, it works great in demos, and then someone asks the obvious question: "How do we know this is actually good?" Not vibes-good. Measurably good. Good enough to not embarrass us in production.
+
+That question sent us down a rabbit hole. What we found at the bottom was a mess.
+
+## The testing problem that kept showing up
+
+Every team we talked to was solving this the same way, and badly. Someone would write a one-off script that calls the model a hundred times and dumps results into a spreadsheet. Someone else would build a custom evaluator with hard-coded prompts for LLM-as-judge. A third person would slap together some ROUGE scores and call it a day.
+
+None of it talked to each other. None of it ran in CI. And none of it caught the regression that shipped to production last Thursday.
+
+Testing LLMs is fundamentally weird compared to testing normal software. You can't just assert that output equals expected. Ask a model "What's the capital of France?" and you'll get "Paris" one time and "The capital of France is Paris, a city in Western Europe" the next. Both correct. Completely different strings.
+
+So teams end up building custom evaluation frameworks. We did too. Multiple times, actually. And each time we found ourselves writing the same patterns: prompt templates for LLM-as-judge, score parsing, threshold logic, result aggregation. Over and over, in slightly different shapes.
+
+At some point we looked at each other and said: why are we doing this?
+
+## What we actually wanted
+
+We had a pretty clear picture of the ideal tool. Something that felt like pytest, not like a new platform to learn. We wanted to stack evaluation criteria the way you stack decorators: readable, composable, obvious at a glance. Both LLM-based and deterministic evaluation in the same framework, because sometimes you need semantic judgment and sometimes you just need to check if the output is valid JSON.
+
+Nothing we found checked all those boxes. So we built it.
+
+## How fasteval works
+
+![How fasteval works: Decorate → Test → Score → Evaluate → Result](fasteval-overview.png)
+
+You decorate a test function with the metrics you care about, then call `fe.score()` with your model's output.
+
+```python
+import fasteval as fe
+
+@fe.correctness(threshold=0.8)
+@fe.relevance(threshold=0.7)
+def test_qa_agent():
+    response = my_agent("What is the capital of France?")
+    fe.score(response, expected_output="Paris", input="What is the capital of France?")
+```
+
+Run it with `pytest -v` and you get pass/fail for each metric, with scores and reasoning. No config files. No dashboard setup. No new CLI to learn.
+
+We went with decorators because they make the evaluation criteria visible right where the test is defined. When someone new joins the team and opens the test file, they can immediately tell what quality bar each test enforces. People mention this more than anything else when they try the library, so I think we got that decision right.
+
+## Stacking metrics is where it gets interesting
+
+Real-world evaluation is never one-dimensional. You don't just care about correctness. You also care about relevance, whether the response is toxic, whether it follows instructions. fasteval lets you stack all of that:
+
+```python
+@fe.correctness(threshold=0.8, weight=2.0)
+@fe.relevance(threshold=0.7, weight=1.0)
+@fe.toxicity(threshold=0.95)
+def test_customer_support_bot():
+    response = support_bot("I want to cancel my subscription")
+    fe.score(
+        response,
+        expected_output="Acknowledge the request and provide cancellation steps",
+        input="I want to cancel my subscription"
+    )
+```
+
+Each metric evaluates independently. Weights let you prioritize what matters most. The test fails if any metric drops below its threshold.
+
+We've got over 50 built-in metrics at this point: correctness, hallucination, coherence, conciseness, bias, instruction following, and a bunch more. We kept adding them because every time we thought "okay that's enough," someone on the team would need one more.
+
+## Not everything needs an LLM to evaluate
+
+This was an important design decision. LLM-as-judge is powerful but it's slow and it costs money. For a lot of checks you genuinely don't need it. Does the output contain a required keyword? Is it valid JSON? Does it match a regex pattern? You don't need GPT-4 to tell you that.
+
+We built deterministic metrics right into the same decorator system:
+
+```python
+from pydantic import BaseModel
+
+class UserResponse(BaseModel):
+    name: str
+    age: int
+    email: str
+
+@fe.json(model=UserResponse)
+def test_structured_output():
+    output = my_agent("Create a user profile for Alice, age 30")
+    result = fe.score(output)
+    assert result.passed
+```
+
+No API key needed. Runs instantly. We use these for fast sanity checks on every commit, and save the heavier LLM evaluations for nightly runs. Having fast deterministic tests and thorough semantic tests living in the same framework, sharing the same decorator API, was something we didn't find in other tools.
+
+There's also `@fe.exact_match`, `@fe.contains`, `@fe.rouge`, and `@fe.regex`. Mix and match with LLM metrics however you want.
+
+## RAG evaluation was a big motivator
+
+Half the teams we work with are building RAG pipelines, and RAG is especially tricky to test. You need to evaluate retrieval quality and generation quality at the same time. Is the model sticking to the retrieved context or making stuff up? Are the right documents being pulled in the first place?
+
+```python
+@fe.faithfulness(threshold=0.8)
+@fe.contextual_precision(threshold=0.7)
+def test_rag_pipeline():
+    result = rag_pipeline("How does photosynthesis work?")
+    fe.score(
+        actual_output=result.answer,
+        context=result.retrieved_docs,
+        input="How does photosynthesis work?",
+    )
+```
+
+Faithfulness measures whether the answer is grounded in the retrieved context. Contextual precision checks whether the retriever pulled the right documents. Throw `@fe.hallucination` on top and you've got a solid RAG evaluation suite in about ten lines of code.
+
+We've actually seen teams catch retrieval regressions with this setup that they'd been missing for weeks. One team had a broken chunking config that degraded recall by 15%, and their existing tests never flagged it because they were only checking the final answer.
+
+## Testing agent tool calls
+
+If your agent is supposed to search for flights and then book one, you need to verify it actually called those tools, in the right order, with the right arguments. Doing that by hand gets old fast.
+
+```python
+@fe.tool_call_accuracy(threshold=0.9)
+def test_booking_agent():
+    result = agent.run("Book a flight to Paris")
+    fe.score(
+        result.response, tool_calls=result.tool_calls,
+        expected_tools=[
+            {"name": "search_flights", "args": {"destination": "Paris"}},
+            {"name": "book_flight"},
+        ],
+    )
+```
+
+Tool name matching, argument validation, sequence verification. We also have `@fe.tool_sequence` and `@fe.tool_args_match` for more granular control.
+
+## Testing LangGraph agents without losing your mind
+
+![Testing pyramid for agents: unit tests, integration tests, and end-to-end evaluation layers](testing-pyramid-agents.png)
+
+Okay, this one is close to my heart.
+
+If you're building agents with LangGraph, you know the pain. You've got a state graph with five or six nodes, a classifier, a retriever, a responder, maybe some routing logic, and testing the whole thing end-to-end is slow, flaky, and expensive. But testing nodes individually means ripping apart the graph and manually wiring up state. Nobody wants to do that.
+
+We built a test harness specifically for this. It wraps any compiled `StateGraph` and gives you a clean API to test the full flow, individual nodes, or anything in between.
+
+Full conversation testing is just `.chat()`:
+
+```python
+from fasteval_langgraph import harness
+import fasteval as fe
+
+graph = harness(compiled_graph)
+
+@fe.correctness(threshold=0.8)
+async def test_support_agent():
+    result = await graph.chat("How do I configure OAuth?")
+    fe.score(result.response, "Use OAuth 2.0...", input="How do I configure OAuth?")
+```
+
+The harness auto-detects whether your graph uses `MessagesState` or plain `TypedDict` and sets up sensible defaults. You don't configure anything for the common case.
+
+Where it gets really useful is node-level testing. Say you want to test just your classifier node without running the entire graph:
+
+```python
+from langchain_core.messages import HumanMessage
+
+result = await graph.node("classifier").run(
+    messages=[HumanMessage(content="What is OAuth?")]
+)
+
+assert result.updates.get("intent") == "FAQ"
+assert result.goto == "rag"  # Where the classifier routes to
+assert result.execution_time_ms < 500  # Performance check
+```
+
+State updates, routing decision, execution timing. All from running one node in isolation. We use this pattern constantly when iterating on individual node logic because waiting for the whole graph to execute every time just kills your feedback loop.
+
+For conversational agents, multi-turn sessions keep state across messages:
+
+```python
+async with graph.session() as s:
+    r1 = await s.chat("I need help with billing")
+    r2 = await s.chat("Actually, make that a refund")
+
+    # State persists across the session
+    assert r2.state["call_count"] == 2
+    assert len(s.history) == 2
+```
+
+And mocking. If you need to test a node without its dependencies hitting real APIs:
+
+```python
+from fasteval_langgraph import mock
+
+with graph.mocked(
+    mock("rag").updates({"docs": ["fake retrieval result"]}).goto("responder"),
+):
+    result = await graph.chat("What is OAuth?")
+    # RAG node is mocked, everything else runs normally
+```
+
+Mocks auto-restore when the context manager exits. No cleanup code.
+
+One more thing worth mentioning: the harness captures a full execution trace. Which nodes ran, in what order, what each one produced, how long each took. When a test fails and you're trying to figure out which node in the graph screwed up, that trace is the first thing you'll reach for.
+
+Install it separately with `pip install fasteval-langgraph`.
+
+## Production monitoring and the other plugins
+
+Building fasteval as a standalone library was step one. But we knew people would need it in production too, not just in test suites.
+
+**fasteval-langfuse** lets you evaluate production traces from Langfuse. Pull traces, run them through your metrics, push scores back. We added smart sampling so you're not burning through API credits evaluating every single request. You can sample by percentage, by token budget, or with adaptive strategies.
+
+**fasteval-observe** does async runtime monitoring with configurable sampling. Lightweight way to keep an eye on quality in production without adding latency.
+
+Each plugin is a separate pip install, so you only pull in what you need.
+
+## Getting started
+
+```bash
+pip install fasteval-core
+export OPENAI_API_KEY=sk-your-key-here
+```
+
+Write a test file:
+
+```python
+import fasteval as fe
+
+@fe.correctness(threshold=0.8)
+def test_my_llm():
+    response = call_your_model("What is 2 + 2?")
+    fe.score(response, expected_output="4", input="What is 2 + 2?")
+```
+
+Run it:
+
+```bash
+pytest test_my_llm.py -v
+```
+
+Want HTML reports? `--fe-output=report.html`. Aggregate statistics? `--fe-summary`. Anthropic instead of OpenAI? Set `ANTHROPIC_API_KEY`. Local Ollama model? Plug it in through your own `LLMClient`.
+
+We support Python 3.10 through 3.14. Apache 2.0 licensed.
+
+## Why we open-sourced it
+
+Because we kept seeing the same pain everywhere. Every team we talked to was reinventing evaluation from scratch. Smart people spending weeks on infrastructure that already existed, except everyone's version was trapped inside their own codebase and nobody could benefit from each other's work.
+
+We figured open-sourcing ours would save people time. And honestly, we wanted the feedback. The library has gotten meaningfully better since we opened it up because people file issues about use cases we never thought of.
+
+The repo is active and we ship regularly. If you try it and something's broken or missing, open an issue. We read them.
+
+---
+
+**GitHub**: [github.com/intuit/fasteval](https://github.com/intuit/fasteval) | **Docs**: [fasteval.io](https://fasteval.io) | **Install**: `pip install fasteval-core`
diff --git a/plugins/fasteval-langfuse/fasteval_langfuse/utils.py b/plugins/fasteval-langfuse/fasteval_langfuse/utils.py
index 3fe61b1..aeae992 100644
--- a/plugins/fasteval-langfuse/fasteval_langfuse/utils.py
+++ b/plugins/fasteval-langfuse/fasteval_langfuse/utils.py
@@ -57,7 +57,7 @@ def parse_time_range(time_range: str) -> tuple[Optional[str], Optional[str]]:
     Returns:
         Tuple of (from_timestamp, to_timestamp) in ISO 8601 format
     """
-    from datetime import datetime, timedelta
+    from datetime import datetime, timedelta, timezone
 
     if not time_range:
         return None, None
@@ -78,8 +78,10 @@ def parse_time_range(time_range: str) -> tuple[Optional[str], Optional[str]]:
             else:
                 raise ValueError(f"Invalid time range format: {time_range}")
 
-            to_timestamp = datetime.utcnow().isoformat() + "Z"
-            from_timestamp = (datetime.utcnow() - delta).isoformat() + "Z"
+            to_timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
+            from_timestamp = (
+                (datetime.now(timezone.utc) - delta).isoformat().replace("+00:00", "Z")
+            )
             return from_timestamp, to_timestamp
 
     # Handle "YYYY-MM-DD to YYYY-MM-DD" format
diff --git a/plugins/fasteval-langfuse/tests/test_client.py b/plugins/fasteval-langfuse/tests/test_client.py
new file mode 100644
index 0000000..d34c5b7
--- /dev/null
+++ b/plugins/fasteval-langfuse/tests/test_client.py
@@ -0,0 +1,115 @@
+"""Tests for fasteval_langfuse.client."""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from fasteval_langfuse.client import LangfuseClient
+
+
+def _make_mock_client():
+    """Build a LangfuseClient via __new__ with a MagicMock as its `_client`."""
+    stub = LangfuseClient.__new__(LangfuseClient)
+    stub.public_key = "pk-test"
+    stub.secret_key = "sk-test"
+    stub.host = "https://cloud.langfuse.com"
+    stub._client = MagicMock()
+    return stub
+
+
+class TestLangfuseClientInit:
+    """Constructor-level validation of LangfuseClient."""
+
+    def test_missing_credentials(self):
+        """Blank env keys plus a config without keys must raise ValueError."""
+        blank_env = {"LANGFUSE_PUBLIC_KEY": "", "LANGFUSE_SECRET_KEY": ""}
+        keyless_config = MagicMock(public_key=None, secret_key=None, host="h")
+        config_target = "fasteval_langfuse.config.get_config"
+
+        with (
+            patch.dict("os.environ", blank_env, clear=False),
+            patch(config_target, return_value=keyless_config),
+        ):
+            with pytest.raises(ValueError, match="Langfuse credentials required"):
+                LangfuseClient()
+
+
+class TestFetchTraces:
+    def test_fetch_traces_basic(self):
+        """One trace object from `_client` becomes one dict in the result."""
+        trace = MagicMock()
+        trace.id = "t-1"
+        trace.timestamp = "2026-01-01"
+        trace.name = "query"
+        trace.user_id = "u1"
+        trace.session_id = "s1"
+        trace.tags = ["prod"]
+        trace.metadata = {"key": "val"}
+        trace.input = "hello"
+        trace.output = "world"
+        trace.scores = []
+        client = _make_mock_client()
+        client._client.fetch_traces.return_value = MagicMock(data=[trace])
+
+        fetched = client.fetch_traces(project="test")
+        assert len(fetched) == 1
+        assert fetched[0]["id"] == "t-1"
+        assert fetched[0]["input"] == "hello"
+        assert fetched[0]["output"] == "world"
+
+    def test_fetch_traces_with_filters(self):
+        """Filter arguments reach `_client.fetch_traces` as keyword arguments."""
+        client = _make_mock_client()
+        client._client.fetch_traces.return_value = MagicMock(data=[])
+        client.fetch_traces(
+            project="prod",
+            tags=["support"],
+            user_id="u1",
+            session_id="s1",
+            from_timestamp="2026-01-01",
+            to_timestamp="2026-01-02",
+            limit=10,
+        )
+
+        forwarded = client._client.fetch_traces.call_args.kwargs
+        assert forwarded["tags"] == ["support"]
+        assert forwarded["user_id"] == "u1"
+        assert forwarded["session_id"] == "s1"
+        assert forwarded["limit"] == 10
+
+
+class TestPushScore:
+    def test_push_score(self):
+        """push_score hands its keyword arguments unchanged to `_client.score`."""
+        payload = dict(trace_id="t-1", name="correctness", value=0.9, comment="Good")
+        client = _make_mock_client()
+        client.push_score(**payload)
+        client._client.score.assert_called_once_with(**payload)
+
+
+class TestFetchDataset:
+    def test_fetch_dataset(self):
+        """Dataset items come back as dicts with id/input/expected_output."""
+        item = MagicMock()
+        item.id = "item-1"
+        item.input = "question"
+        item.expected_output = "answer"
+        item.metadata = {}
+        item.version = "v1"
+        dataset = MagicMock(items=[item])
+
+        client = _make_mock_client()
+        client._client.get_dataset.return_value = dataset
+
+        rows = client.fetch_dataset(name="test-ds")
+        assert len(rows) == 1
+        assert rows[0]["id"] == "item-1"
+        assert rows[0]["input"] == "question"
+        assert rows[0]["expected_output"] == "answer"
+
+
+class TestFlush:
+    def test_flush(self):
+        wrapper = _make_mock_client()
+        wrapper.flush()  # must reach the wrapped `_client` exactly once
+        wrapper._client.flush.assert_called_once()
diff --git a/plugins/fasteval-langfuse/tests/test_config.py b/plugins/fasteval-langfuse/tests/test_config.py
new file mode 100644
index 0000000..5ce50e2
--- /dev/null
+++ b/plugins/fasteval-langfuse/tests/test_config.py
@@ -0,0 +1,68 @@
+"""Tests for fasteval_langfuse.config."""
+
+import os
+from unittest.mock import patch
+
+import pytest
+
+import fasteval_langfuse.config as config_module
+from fasteval_langfuse.config import LangfuseConfig, configure_langfuse, get_config
+
+
+class TestLangfuseConfig:
+    def test_defaults(self):
+        """With an empty environment the config falls back to its defaults."""
+        with patch.dict(os.environ, {}, clear=True):
+            cfg = LangfuseConfig()
+            assert cfg.public_key is None
+            assert cfg.secret_key is None
+            assert cfg.host == "https://cloud.langfuse.com"
+            assert cfg.auto_push_scores is True
+            assert cfg.batch_size == 50
+            assert cfg.score_name_prefix == "fasteval_"
+
+    def test_from_env(self):
+        """Keys and host are picked up from the LANGFUSE_* environment variables."""
+        env = {
+            "LANGFUSE_PUBLIC_KEY": "pk-test",
+            "LANGFUSE_SECRET_KEY": "sk-test",
+            "LANGFUSE_HOST": "https://custom.host.com",
+        }
+        with patch.dict(os.environ, env):
+            cfg = LangfuseConfig()
+            assert cfg.public_key == "pk-test"
+            assert cfg.secret_key == "sk-test"
+            assert cfg.host == "https://custom.host.com"
+
+    def test_custom_values(self):
+        cfg = LangfuseConfig(
+            public_key="pk",
+            secret_key="sk",
+            default_project="prod",
+            auto_push_scores=False,
+            batch_size=100,
+            score_name_prefix="custom_",
+        )
+        assert cfg.default_project == "prod"  # explicit arguments are kept as given
+        assert cfg.auto_push_scores is False
+        assert cfg.batch_size == 100
+        assert cfg.score_name_prefix == "custom_"
+
+
+class TestConfigureLangfuse:
+    def setup_method(self):
+        config_module._config = None  # start each test from an unset singleton
+
+    def teardown_method(self):
+        config_module._config = None  # keep the singleton from leaking to other tests
+
+    def test_configure_and_get(self):
+        custom = LangfuseConfig(public_key="pk", secret_key="sk")
+        configure_langfuse(custom)
+        assert get_config() is custom  # the configured object itself is returned
+
+    def test_get_config_default(self):
+        assert isinstance(get_config(), LangfuseConfig)  # created when nothing is set
+
+    def test_get_config_singleton(self):
+        assert get_config() is get_config()  # repeated calls return the same object
diff --git a/plugins/fasteval-langfuse/tests/test_decorators.py b/plugins/fasteval-langfuse/tests/test_decorators.py
new file mode 100644
index 0000000..5d41a43
--- /dev/null
+++ b/plugins/fasteval-langfuse/tests/test_decorators.py
@@ -0,0 +1,301 @@
+"""Tests for fasteval_langfuse.decorators."""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from fasteval_langfuse.decorators import (
+    FASTEVAL_DATA_ATTR,
+    FASTEVAL_METRICS_ATTR,
+    langfuse_dataset,
+    langfuse_traces,
+)
+
+
+class TestLangfuseTracesDecorator:
+    def test_attaches_data_attr(self):
+        @langfuse_traces(project="prod", filter_tags=["support"])
+        def handler(trace_id, input, output, context, metadata):
+            pass
+
+        assert hasattr(handler, FASTEVAL_DATA_ATTR)
+        attached = getattr(handler, FASTEVAL_DATA_ATTR)
+        assert attached["type"] == "langfuse_traces"
+        assert attached["project"] == "prod"
+
+    def test_preserves_metrics(self):
+        def handler():
+            pass
+
+        setattr(handler, FASTEVAL_METRICS_ATTR, ["metric1"])
+        # Decorating must not drop metrics that were already attached.
+        wrapped = langfuse_traces(project="prod")(handler)
+        assert getattr(wrapped, FASTEVAL_METRICS_ATTR) == ["metric1"]
+
+    def test_default_sampling_name(self):
+        @langfuse_traces()
+        def handler():
+            pass
+
+        attached = getattr(handler, FASTEVAL_DATA_ATTR)
+        assert attached["sampling"] == "NoSamplingStrategy"
+
+    def test_custom_sampling_name(self):
+        strategy = MagicMock()
+        strategy.name = "CustomStrategy"
+
+        @langfuse_traces(sampling=strategy)
+        def handler():
+            pass
+
+        attached = getattr(handler, FASTEVAL_DATA_ATTR)
+        assert attached["sampling"] == "CustomStrategy"
+
+    def test_async_function(self):
+        @langfuse_traces()
+        async def handler():
+            pass
+
+        assert hasattr(handler, FASTEVAL_DATA_ATTR)
+
+
+class TestLangfuseDatasetDecorator:
+    def test_attaches_data_attr(self):
+        @langfuse_dataset(name="qa-set", version="v2")
+        def handler(input, expected_output):
+            pass
+
+        attached = getattr(handler, FASTEVAL_DATA_ATTR)
+        assert attached["type"] == "langfuse_dataset"
+        assert attached["name"] == "qa-set"
+        assert attached["version"] == "v2"
+
+    def test_preserves_metrics(self):
+        def handler():
+            pass
+
+        setattr(handler, FASTEVAL_METRICS_ATTR, ["metric1"])
+        # Decorating must not drop metrics that were already attached.
+        wrapped = langfuse_dataset(name="ds")(handler)
+        assert getattr(wrapped, FASTEVAL_METRICS_ATTR) == ["metric1"]
+
+    def test_async_function(self):
+        @langfuse_dataset(name="ds")
+        async def handler():
+            pass
+
+        assert hasattr(handler, FASTEVAL_DATA_ATTR)
+
+
+class TestExecuteTraceEvaluation:
+    @patch("fasteval.core.scoring.get_last_score_result")
+    @patch("fasteval_langfuse.decorators.format_sampling_stats")
+    @patch("fasteval_langfuse.decorators.ScoreReporter")
+    @patch("fasteval_langfuse.decorators.TraceFetcher")
+    @patch("fasteval_langfuse.decorators.LangfuseClient")
+    def test_basic_execution(
+        self,
+        client_cls,
+        fetcher_cls,
+        reporter_cls,
+        format_stats,
+        get_score,
+    ):
+        """The function gets the mapped params and the reporter is flushed."""
+        fetcher = fetcher_cls.return_value
+        fetcher.fetch_and_sample.return_value = (
+            [{"id": "t-1", "input": "q", "output": "a", "metadata": {}}],
+            1,
+        )
+        fetcher.map_trace_to_params.return_value = {
+            "trace_id": "t-1",
+            "input": "q",
+            "output": "a",
+            "context": None,
+            "metadata": {},
+        }
+
+        reporter = reporter_cls.return_value
+        get_score.return_value = None
+
+        from fasteval_langfuse.decorators import _execute_trace_evaluation
+
+        # Capture whatever keyword arguments the evaluation passes to the function.
+        received = {}
+
+        def target(**kwargs):
+            received.update(kwargs)
+
+        _execute_trace_evaluation(
+            func=target,
+            is_async=False,
+            project="prod",
+            filter_tags=None,
+            time_range=None,
+            user_id=None,
+            session_id=None,
+            limit=None,
+            sampling=None,
+            auto_push_scores=True,
+            args=(),
+            kwargs={},
+        )
+
+        assert received["trace_id"] == "t-1"
+        reporter.flush.assert_called_once()
+
+    @patch("fasteval.core.scoring.get_last_score_result")
+    @patch("fasteval_langfuse.decorators.format_sampling_stats")
+    @patch("fasteval_langfuse.decorators.ScoreReporter")
+    @patch("fasteval_langfuse.decorators.TraceFetcher")
+    @patch("fasteval_langfuse.decorators.LangfuseClient")
+    def test_pushes_scores_when_result_exists(
+        self,
+        client_cls,
+        fetcher_cls,
+        reporter_cls,
+        format_stats,
+        get_score,
+    ):
+        """An available score result is pushed once through the reporter."""
+        fetcher = fetcher_cls.return_value
+        fetcher.fetch_and_sample.return_value = (
+            [{"id": "t-1", "input": "q", "output": "a", "metadata": {}}],
+            1,
+        )
+        fetcher.map_trace_to_params.return_value = {
+            "trace_id": "t-1",
+            "input": "q",
+            "output": "a",
+            "context": None,
+            "metadata": {},
+        }
+
+        reporter = reporter_cls.return_value
+
+        # A score result carrying one metric entry and an aggregate score.
+        score_result = MagicMock()
+        score_result.metric_results = [MagicMock()]
+        score_result.aggregate_score = 0.9
+        get_score.return_value = score_result
+
+        from fasteval_langfuse.decorators import _execute_trace_evaluation
+
+        def target(**kwargs):
+            pass
+
+        _execute_trace_evaluation(
+            func=target,
+            is_async=False,
+            project=None,
+            filter_tags=None,
+            time_range=None,
+            user_id=None,
+            session_id=None,
+            limit=None,
+            sampling=None,
+            auto_push_scores=True,
+            args=(),
+            kwargs={},
+        )
+
+        reporter.push_evaluation_result.assert_called_once()
+
+    @patch("fasteval.core.scoring.get_last_score_result")
+    @patch("fasteval_langfuse.decorators.format_sampling_stats")
+    @patch("fasteval_langfuse.decorators.ScoreReporter")
+    @patch("fasteval_langfuse.decorators.TraceFetcher")
+    @patch("fasteval_langfuse.decorators.LangfuseClient")
+    def test_skips_push_when_auto_push_false(
+        self,
+        client_cls,
+        fetcher_cls,
+        reporter_cls,
+        format_stats,
+        get_score,
+    ):
+        """With auto_push_scores=False no evaluation result is pushed."""
+        fetcher = fetcher_cls.return_value
+        fetcher.fetch_and_sample.return_value = (
+            [{"id": "t-1", "input": "q", "output": "a", "metadata": {}}],
+            1,
+        )
+        fetcher.map_trace_to_params.return_value = {
+            "trace_id": "t-1",
+            "input": "q",
+            "output": "a",
+            "context": None,
+            "metadata": {},
+        }
+
+        reporter = reporter_cls.return_value
+
+        # A result is available, so only the flag can be what suppresses the push.
+        get_score.return_value = MagicMock()
+
+        from fasteval_langfuse.decorators import _execute_trace_evaluation
+
+        def target(**kwargs):
+            pass
+
+        _execute_trace_evaluation(
+            func=target,
+            is_async=False,
+            project=None,
+            filter_tags=None,
+            time_range=None,
+            user_id=None,
+            session_id=None,
+            limit=None,
+            sampling=None,
+            auto_push_scores=False,
+            args=(),
+            kwargs={},
+        )
+
+        reporter.push_evaluation_result.assert_not_called()
+
+    @patch("fasteval.core.scoring.get_last_score_result")
+    @patch("fasteval_langfuse.decorators.format_sampling_stats")
+    @patch("fasteval_langfuse.decorators.ScoreReporter")
+    @patch("fasteval_langfuse.decorators.TraceFetcher")
+    @patch("fasteval_langfuse.decorators.LangfuseClient")
+    def test_prints_stats_when_sampling(
+        self,
+        client_cls,
+        fetcher_cls,
+        reporter_cls,
+        format_stats,
+        get_score,
+    ):
+        """Passing a sampling strategy makes the run format sampling stats once."""
+        fetcher = fetcher_cls.return_value
+        fetcher.fetch_and_sample.return_value = ([], 0)
+
+        reporter_cls.return_value = MagicMock()
+        get_score.return_value = None
+        format_stats.return_value = "stats"
+
+        from fasteval_langfuse.decorators import _execute_trace_evaluation
+
+        # Only the strategy's `name` attribute is configured on this stand-in.
+        strategy = MagicMock()
+        strategy.name = "TestStrategy"
+
+        _execute_trace_evaluation(
+            func=lambda **kw: None,
+            is_async=False,
+            project=None,
+            filter_tags=None,
+            time_range=None,
+            user_id=None,
+            session_id=None,
+            limit=None,
+            sampling=strategy,
+            auto_push_scores=True,
+            args=(),
+            kwargs={},
+        )
+
+        format_stats.assert_called_once()
diff --git a/plugins/fasteval-langfuse/tests/test_score_reporter.py b/plugins/fasteval-langfuse/tests/test_score_reporter.py
new file mode 100644
index 0000000..8a6806a
--- /dev/null
+++ b/plugins/fasteval-langfuse/tests/test_score_reporter.py
@@ -0,0 +1,60 @@
+"""Tests for fasteval_langfuse.score_reporter."""
+
+from unittest.mock import MagicMock, patch
+
+from fasteval_langfuse.score_reporter import ScoreReporter
+
+
+def _make_reporter():
+    """Build a ScoreReporter via __new__ with MagicMock client and config."""
+    config = MagicMock()
+    config.auto_push_scores = True
+    config.score_name_prefix = "fasteval_"
+    stub = ScoreReporter.__new__(ScoreReporter)
+    stub.client = MagicMock()
+    stub.config = config
+    return stub
+
+
+class TestPushEvaluationResult:
+    def test_pushes_metric_scores(self):
+        """Each metric is pushed under the prefix, followed by the aggregate."""
+        first = MagicMock()
+        first.metric_name = "correctness"
+        first.score = 0.9
+        first.reasoning = "Good"
+
+        second = MagicMock()
+        second.metric_name = "relevance"
+        second.score = 0.8
+        second.reasoning = None
+
+        reporter = _make_reporter()
+        reporter.push_evaluation_result(
+            trace_id="t-1", metric_results=[first, second], aggregate_score=0.85
+        )
+
+        pushed = [c.kwargs for c in reporter.client.push_score.call_args_list]
+        assert len(pushed) == 3  # 2 metrics + 1 aggregate
+        assert pushed[0]["name"] == "fasteval_correctness"
+        assert pushed[0]["value"] == 0.9
+        assert pushed[1]["name"] == "fasteval_relevance"
+        assert pushed[2]["name"] == "fasteval_aggregate"
+        assert pushed[2]["value"] == 0.85
+
+    def test_skips_when_auto_push_disabled(self):
+        """Nothing is pushed when the config has auto_push_scores turned off."""
+        reporter = _make_reporter()
+        reporter.config.auto_push_scores = False
+        reporter.push_evaluation_result(
+            trace_id="t-1", metric_results=[MagicMock()], aggregate_score=0.5
+        )
+
+        reporter.client.push_score.assert_not_called()
+
+
+class TestFlush:
+    def test_flush(self):
+        stub = _make_reporter()
+        stub.flush()  # must be forwarded to the client exactly once
+        stub.client.flush.assert_called_once()
diff --git a/plugins/fasteval-langfuse/tests/test_trace_fetcher.py b/plugins/fasteval-langfuse/tests/test_trace_fetcher.py
new file mode 100644
index 0000000..0ef58db
--- /dev/null
+++ b/plugins/fasteval-langfuse/tests/test_trace_fetcher.py
@@ -0,0 +1,95 @@
+"""Tests for fasteval_langfuse.trace_fetcher."""
+
+from unittest.mock import MagicMock
+
+from fasteval_langfuse.trace_fetcher import TraceFetcher
+
+
+def _make_fetcher_with_mock(traces):
+    """Build a TraceFetcher via __new__ whose mock client returns `traces`."""
+    client = MagicMock()
+    client.fetch_traces.return_value = traces
+    stub = TraceFetcher.__new__(TraceFetcher)
+    stub.client = client
+    return stub
+
+
+class TestFetchAndSample:
+    def test_basic_fetch(self, sample_traces):
+        fetcher = _make_fetcher_with_mock(sample_traces)
+        kept, total_fetched = fetcher.fetch_and_sample()
+        assert total_fetched == 3
+        assert len(kept) == 3
+
+    def test_with_time_range(self, sample_traces):
+        fetcher = _make_fetcher_with_mock(sample_traces)
+        _, total_fetched = fetcher.fetch_and_sample(time_range="last_24h")
+        assert total_fetched == 3
+
+    def test_with_sampling(self, sample_traces):
+        from fasteval_langfuse.sampling import RandomSamplingStrategy
+
+        strategy = RandomSamplingStrategy(sample_size=1, seed=42)
+        fetcher = _make_fetcher_with_mock(sample_traces)
+        kept, total_fetched = fetcher.fetch_and_sample(sampling=strategy)
+        assert total_fetched == 3
+        assert len(kept) == 1  # sample_size=1
+
+    def test_with_filters(self, sample_traces):
+        fetcher = _make_fetcher_with_mock(sample_traces)
+        fetcher.fetch_and_sample(
+            project="prod",
+            filter_tags=["tag1"],
+            user_id="u1",
+            session_id="s1",
+            limit=10,
+        )
+        forwarded = fetcher.client.fetch_traces.call_args.kwargs
+        assert forwarded["project"] == "prod"
+        assert forwarded["tags"] == ["tag1"]  # filter_tags is forwarded as `tags`
+        assert forwarded["user_id"] == "u1"
+        assert forwarded["limit"] == 10
+
+
+class TestMapTraceToParams:
+    def test_basic_mapping(self, sample_traces):
+        fetcher = _make_fetcher_with_mock([])
+        mapped = fetcher.map_trace_to_params(sample_traces[0])
+        assert mapped["trace_id"] == "trace-1"
+        assert mapped["input"] == "What is Python?"
+        assert mapped["output"] == "Python is a programming language"
+        assert mapped["metadata"] == {"user_type": "free"}
+
+    def test_dict_input(self):
+        """Dict payloads with "query"/"response" keys map to those values."""
+        trace = {
+            "id": "t-1",
+            "input": {"query": "hello"},
+            "output": {"response": "world"},
+            "metadata": {},
+        }
+        mapped = _make_fetcher_with_mock([]).map_trace_to_params(trace)
+        assert mapped["input"] == "hello"
+        assert mapped["output"] == "world"
+
+    def test_dict_input_fallback(self):
+        """An input dict holding only "custom_key" still surfaces that key."""
+        trace = {
+            "id": "t-1",
+            "input": {"custom_key": "value"},
+            "output": {"custom_key": "value"},
+            "metadata": {},
+        }
+        mapped = _make_fetcher_with_mock([]).map_trace_to_params(trace)
+        assert "custom_key" in mapped["input"]
+
+    def test_context_extraction(self, sample_traces):
+        fetcher = _make_fetcher_with_mock([])
+        # trace-3 has context in metadata
+        mapped = fetcher.map_trace_to_params(sample_traces[2])
+        assert mapped["context"] == ["RAG combines retrieval with generation"]
+
+    def test_no_context(self, sample_traces):
+        fetcher = _make_fetcher_with_mock([])
+        mapped = fetcher.map_trace_to_params(sample_traces[0])
+        assert mapped["context"] is None
diff --git a/plugins/fasteval-langfuse/tests/test_utils.py b/plugins/fasteval-langfuse/tests/test_utils.py
index 32f5958..bddc1a7 100644
--- a/plugins/fasteval-langfuse/tests/test_utils.py
+++ b/plugins/fasteval-langfuse/tests/test_utils.py
@@ -45,6 +45,43 @@ def test_parse_time_range_to_format():
     assert to_ts == "2026-02-05T00:00:00Z"
 
 
+def test_parse_time_range_empty():
+    """An empty time range yields None for both bounds."""
+    start, end = parse_time_range("")
+    assert start is None
+    assert end is None
+
+
+def test_parse_time_range_invalid():
+    """The string "invalid_format" is rejected with an "Unsupported" error."""
+    from pytest import raises
+
+    with raises(ValueError, match="Unsupported"):
+        parse_time_range("invalid_format")
+
+
+def test_parse_time_range_invalid_duration():
+    """The range "last_5m" is rejected with an "Invalid time range" error."""
+    from pytest import raises
+
+    with raises(ValueError, match="Invalid time range"):
+        parse_time_range("last_5m")
+
+
+def test_extract_context_non_list_value():
+    """A scalar context value (here 42) comes back as a one-item list of str."""
+    scalar_trace = {"metadata": {"context": 42}}
+    extracted = extract_context_from_trace(scalar_trace)
+    assert extracted == ["42"]
+
+
+def test_extract_context_none_value():
+    """A None "context" is skipped and "retrieved_docs" supplies the context."""
+    partial_trace = {"metadata": {"context": None, "retrieved_docs": ["doc1"]}}
+    extracted = extract_context_from_trace(partial_trace)
+    assert extracted == ["doc1"]
+
+
 def test_format_sampling_stats():
     """Test sampling statistics formatting."""
     stats = format_sampling_stats(200, 1000, "RandomSamplingStrategy")
@@ -52,3 +89,9 @@ def test_format_sampling_stats():
     assert "1,000" in stats
     assert "20.0%" in stats
     assert "RandomSamplingStrategy" in stats
+
+
+def test_format_sampling_stats_zero_total():
+    """With both counts at zero the formatted stats still show 0.0%."""
+    summary = format_sampling_stats(0, 0, "NoSamplingStrategy")
+    assert "0.0%" in summary
diff --git a/plugins/fasteval-langfuse/uv.lock b/plugins/fasteval-langfuse/uv.lock
index f968e61..dc11090 100644
--- a/plugins/fasteval-langfuse/uv.lock
+++ b/plugins/fasteval-langfuse/uv.lock
@@ -233,7 +233,7 @@ name = "exceptiongroup"
 version = "1.3.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+    { name = "typing-extensions", marker = "python_full_version < '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" }
 wheels = [
@@ -242,7 +242,7 @@ wheels = [
 
 [[package]]
 name = "fasteval-core"
-version = "1.0.0a1"
+version = "1.2.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "openai" },
@@ -250,14 +250,14 @@ dependencies = [
     { name = "pytest" },
     { name = "rouge-score" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/09/78/29c4e9b1a74b4cfdd07407d06dcf7d62037df39ac566a82a00ec91bfb6dd/fasteval_core-1.0.0a1.tar.gz", hash = "sha256:dd2f84ca3f3f2e1b39c109086dcb5b740ba3abee851bb4c9adca4a5b269230c0", size = 78220, upload-time = "2026-02-12T19:12:33.07Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/26/8d/9e1c566572f1ca966a0e88409c6132a7761777258c59c6c7fd92d6139c8f/fasteval_core-1.2.0.tar.gz", hash = "sha256:fce78b637a06d35e0ac8d4702ddc6beda9cec867fc5c8e6ba8f59c572daa1042", size = 86406, upload-time = "2026-03-09T11:31:19.151Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/bf/25/06b36cebd8192a15385fac77a0b832dbb2a21ad0d92ccf4d1f5212f7df30/fasteval_core-1.0.0a1-py3-none-any.whl", hash = "sha256:8dc0c5901dc7a1df65ce4c7766c10e0de404bf53ce1100fb31c93bf5c9e0d98a", size = 95801, upload-time = "2026-02-12T19:12:30.106Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/53/8fffcab213dad0b3cb88d92dcd888f8a5dd89542e991eecc37eb895a170f/fasteval_core-1.2.0-py3-none-any.whl", hash = "sha256:65e39ad1aba6fa7564e50582c226ee2e94feff47c10ec95ab61e9c1a7c285edb", size = 107601, upload-time = "2026-03-09T11:31:17.648Z" },
 ]
 
 [[package]]
 name = "fasteval-langfuse"
-version = "1.0.0a1"
+version = "2.1.3"
 source = { editable = "." }
 dependencies = [
     { name = "fasteval-core" },
@@ -276,7 +276,7 @@ dev = [
 
 [package.metadata]
 requires-dist = [
-    { name = "fasteval-core", specifier = ">=1.0.0a1" },
+    { name = "fasteval-core", specifier = ">=1.2.0" },
     { name = "langfuse", specifier = ">=2.0.0" },
     { name = "pydantic", specifier = ">=2.0.0" },
 ]
diff --git a/pyproject.toml b/pyproject.toml
index 3470a97..cc066b7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,7 @@ dependencies = [
 [project.urls]
 Homepage = "https://github.com/intuit/fasteval"
 Repository = "https://github.com/intuit/fasteval"
-Documentation = "https://github.com/intuit/fasteval/tree/main/docs"
+Documentation = "https://fasteval.io"
 Issues = "https://github.com/intuit/fasteval/issues"
 Changelog = "https://github.com/intuit/fasteval/blob/main/CHANGELOG.md"
 
@@ -110,9 +110,40 @@ testpaths = [
     "tests",
 ]
 looponfailroots = ["fasteval", "tests"]
-addopts = "-p no:warnings -o log_cli=true --log-cli-level=INFO"
+addopts = "-p no:warnings -p no:fasteval -o log_cli=true --log-cli-level=INFO"
 asyncio_mode = "auto"
 
+[tool.coverage.run]
+source = ["fasteval"]
+omit = [
+    "fasteval/__init__.py",
+    "fasteval/models/*",
+    "fasteval/metrics/vision.py",
+    "fasteval/metrics/multimodal.py",
+    "fasteval/metrics/audio.py",
+    "fasteval/metrics/__init__.py",
+    "fasteval/utils/image.py",
+    "fasteval/utils/audio.py",
+    "fasteval/utils/terminal_ui.py",
+    "fasteval/utils/__init__.py",
+    "fasteval/providers/base.py",
+    "fasteval/providers/__init__.py",
+    "fasteval/collectors/__init__.py",
+    "fasteval/collectors/reporters/__init__.py",
+    "fasteval/collectors/reporters/base.py",
+    "fasteval/core/__init__.py",
+    "fasteval/cache/__init__.py",
+    "fasteval/testing/__init__.py",
+]
+
+[tool.coverage.report]
+exclude_lines = [
+    "pragma: no cover",
+    "if TYPE_CHECKING:",
+    "raise NotImplementedError",
+    "except ImportError",
+]
+
 [build-system]
 requires = ["uv_build>=0.10.0,<0.11.0"]
 build-backend = "uv_build"
diff --git a/testing-pyramid-agents.png b/testing-pyramid-agents.png
new file mode 100644
index 0000000..af15bdf
Binary files /dev/null and b/testing-pyramid-agents.png differ
diff --git a/tests/test_async_helpers.py b/tests/test_async_helpers.py
new file mode 100644
index 0000000..f7c977a
--- /dev/null
+++ b/tests/test_async_helpers.py
@@ -0,0 +1,33 @@
+"""Tests for fasteval.utils.async_helpers."""
+
+import asyncio
+
+from fasteval.utils.async_helpers import run_async
+
+
+class TestRunAsync:
+    def test_run_async_no_running_loop(self):
+        async def answer():
+            return 42
+
+        # Called from plain synchronous code, with no event loop started here.
+        assert run_async(answer()) == 42
+
+    def test_run_async_with_running_loop(self):
+        async def inner():
+            return "from_inner"
+
+        async def outer():
+            # Synchronous run_async call made while asyncio.run's loop is running.
+            return run_async(inner())
+
+        value = asyncio.run(outer())
+        assert value == "from_inner"
+
+    def test_run_async_with_async_sleep(self):
+        async def sleeper():
+            await asyncio.sleep(0.01)
+            return "done"
+
+        # The coroutine awaits a short sleep before it returns.
+        assert run_async(sleeper()) == "done"
diff --git a/tests/test_base_metric.py b/tests/test_base_metric.py
new file mode 100644
index 0000000..f945e7e
--- /dev/null
+++ b/tests/test_base_metric.py
@@ -0,0 +1,53 @@
+"""Tests for fasteval.metrics.base."""
+
+import pytest
+
+from fasteval.metrics.base import Metric
+from fasteval.models.evaluation import EvalInput, MetricResult
+
+
+class ConcreteMetric(Metric):
+    """Minimal Metric subclass for tests; evaluate() always reports a score of 0.8."""
+
+    async def evaluate(self, eval_input: EvalInput) -> MetricResult:
+        return MetricResult(
+            metric_name=self.name,
+            score=0.8,
+            passed=self._determine_pass(0.8),
+            threshold=self.threshold,
+        )
+
+
+class TestMetric:
+    def test_init(self):
+        metric = ConcreteMetric(name="test", threshold=0.7, weight=2.0)
+        assert metric.name == "test"
+        assert metric.threshold == 0.7
+        assert metric.weight == 2.0
+
+    def test_determine_pass_above(self):
+        metric = ConcreteMetric(name="test", threshold=0.5)
+        assert metric._determine_pass(0.6) is True
+
+    def test_determine_pass_equal(self):
+        metric = ConcreteMetric(name="test", threshold=0.5)
+        assert metric._determine_pass(0.5) is True  # the threshold itself passes
+
+    def test_determine_pass_below(self):
+        metric = ConcreteMetric(name="test", threshold=0.5)
+        assert metric._determine_pass(0.4) is False
+
+    def test_repr(self):
+        metric = ConcreteMetric(name="test", threshold=0.7)
+        assert repr(metric) == "ConcreteMetric(name='test', threshold=0.7)"
+
+    @pytest.mark.asyncio
+    async def test_evaluate(self):
+        metric = ConcreteMetric(name="test", threshold=0.5)
+        outcome = await metric.evaluate(EvalInput(actual_output="hello"))
+        assert outcome.score == 0.8
+        assert outcome.passed is True  # 0.8 is above the 0.5 threshold
+
+    def test_abstract_cannot_instantiate(self):
+        with pytest.raises(TypeError):
+            Metric(name="test")  # type: ignore[abstract]
diff --git a/tests/test_cache.py b/tests/test_cache.py
new file mode 100644
index 0000000..3d66b00
--- /dev/null
+++ b/tests/test_cache.py
@@ -0,0 +1,171 @@
+"""Tests for fasteval.cache.memory."""
+
+import pytest
+from pydantic import BaseModel
+
+import fasteval.cache.memory as cache_module
+from fasteval.cache.memory import (
+    CacheStats,
+    MemoryCache,
+    clear_cache,
+    get_cache,
+)
+
+
+class SampleModel(BaseModel):
+    name: str
+    value: int
+
+
+class TestCacheStats:
+    def test_hit_rate_with_data(self):
+        # 7 hits out of 10 recorded lookups
+        assert CacheStats(hits=7, misses=3).hit_rate == 0.7
+
+    def test_hit_rate_zero_total(self):
+        # no lookups recorded at all: the rate is reported as 0.0
+        assert CacheStats(hits=0, misses=0).hit_rate == 0.0
+
+
+class TestMemoryCache:
+    def test_basic_get_set(self):
+        cache = MemoryCache(max_size=10)
+        cache.set("k1", "v1")
+        assert cache.get("k1") == "v1"
+
+    def test_get_miss(self):
+        cache = MemoryCache(max_size=10)
+        assert cache.get("nonexistent") is None
+
+    def test_lru_eviction(self):
+        cache = MemoryCache(max_size=2)
+        cache.set("a", 1)
+        cache.set("b", 2)
+        cache.set("c", 3)  # evicts "a"
+        assert cache.get("a") is None
+        assert cache.get("b") == 2
+        assert cache.get("c") == 3
+
+    def test_lru_access_order(self):
+        cache = MemoryCache(max_size=2)
+        cache.set("a", 1)
+        cache.set("b", 2)
+        cache.get("a")  # moves "a" to most recent
+        cache.set("c", 3)  # evicts "b" (least recent)
+        assert cache.get("a") == 1
+        assert cache.get("b") is None
+        assert cache.get("c") == 3
+
+    def test_update_existing_key(self):
+        cache = MemoryCache(max_size=10)
+        cache.set("k1", "old")
+        cache.set("k1", "new")
+        assert cache.get("k1") == "new"
+        assert len(cache) == 1
+
+    def test_get_or_set_callable(self):
+        cache = MemoryCache(max_size=10)
+        result = cache.get_or_set("k1", lambda: 42)
+        assert result == 42
+        # Second call returns cached value
+        result = cache.get_or_set("k1", lambda: 99)
+        assert result == 42
+
+    def test_get_or_set_non_callable(self):
+        cache = MemoryCache(max_size=10)
+        result = cache.get_or_set("k1", "static_value")
+        assert result == "static_value"
+
+    def test_delete_existing(self):
+        cache = MemoryCache(max_size=10)
+        cache.set("k1", "v1")
+        assert cache.delete("k1") is True
+        assert cache.get("k1") is None
+
+    def test_delete_missing(self):
+        cache = MemoryCache(max_size=10)
+        assert cache.delete("nonexistent") is False
+
+    def test_clear(self):
+        cache = MemoryCache(max_size=10)
+        cache.set("a", 1)
+        cache.set("b", 2)
+        cache.clear()
+        assert len(cache) == 0
+        assert cache.get("a") is None
+
+    def test_stats_property(self):
+        cache = MemoryCache(max_size=100)
+        cache.set("k1", "v1")
+        cache.get("k1")  # hit
+        cache.get("k2")  # miss
+        stats = cache.stats
+        assert stats.hits == 1
+        assert stats.misses == 1
+        assert stats.size == 1
+        assert stats.max_size == 100
+
+    def test_eviction_stats(self):
+        cache = MemoryCache(max_size=1)
+        cache.set("a", 1)
+        cache.set("b", 2)  # evicts "a"
+        stats = cache.stats
+        assert stats.evictions == 1
+
+    def test_len(self):
+        cache = MemoryCache(max_size=10)
+        assert len(cache) == 0
+        cache.set("a", 1)
+        assert len(cache) == 1
+
+    def test_contains(self):
+        cache = MemoryCache(max_size=10)
+        cache.set("a", 1)
+        assert "a" in cache
+        assert "b" not in cache
+
+    def test_make_key_basic(self):
+        cache = MemoryCache()
+        key1 = cache._make_key("arg1", key="val")
+        key2 = cache._make_key("arg1", key="val")
+        key3 = cache._make_key("arg2", key="val")
+        assert key1 == key2
+        assert key1 != key3
+
+    def test_make_key_with_pydantic_model(self):
+        cache = MemoryCache()
+        model = SampleModel(name="test", value=42)
+        key = cache._make_key(model)
+        assert isinstance(key, str)
+        assert len(key) == 64  # SHA256 hex digest
+
+    def test_make_key_with_dict_and_list(self):
+        cache = MemoryCache()
+        key = cache._make_key({"a": [1, 2, 3]})
+        assert isinstance(key, str)
+
+    def test_unlimited_cache(self):
+        cache = MemoryCache(max_size=0)
+        for i in range(100):
+            cache.set(str(i), i)
+        assert len(cache) == 100
+
+
+class TestGlobalCache:
+    @pytest.fixture(autouse=True)
+    def _fresh_global_cache(self, monkeypatch):
+        # monkeypatch undoes this at teardown, so the singleton that existed
+        # before the test is put back instead of being left reset or replaced.
+        monkeypatch.setattr(cache_module, "_global_cache", None)
+
+    def test_get_cache_singleton(self):
+        assert get_cache() is get_cache()
+
+    def test_clear_cache(self):
+        cache = get_cache()
+        cache.set("k1", "v1")
+        clear_cache()
+        assert cache.get("k1") is None
+
+    def test_clear_cache_when_none(self):
+        clear_cache()  # _global_cache is None here; should not raise
diff --git a/tests/test_collector.py b/tests/test_collector.py
new file mode 100644
index 0000000..bdf84c9
--- /dev/null
+++ b/tests/test_collector.py
@@ -0,0 +1,220 @@
+"""Tests for fasteval.collectors.collector and fasteval.collectors.summary."""
+
+import json
+
+import pytest
+
+import fasteval.collectors.collector as collector_module
+from fasteval.collectors.collector import (
+    ResultCollector,
+    get_collector,
+    reset_collector,
+)
+from fasteval.collectors.summary import (
+    EvalRunSummary,
+    MetricAggregate,
+    TestCaseSummary,
+)
+from fasteval.models.evaluation import EvalInput, EvalResult, MetricResult
+
+
+def _make_result(
+    passed=True, aggregate_score=1.0, metrics=None, execution_time_ms=10.0, error=None
+):
+    """Build an EvalResult around a fixed EvalInput; falsy metrics become []."""
+    fields = dict(passed=passed, aggregate_score=aggregate_score, error=error)
+    return EvalResult(
+        eval_input=EvalInput(actual_output="test"),
+        metric_results=metrics or [],
+        execution_time_ms=execution_time_ms,
+        **fields,
+    )
+
+
+def _make_metric(name="m1", score=0.8, passed=True, threshold=0.5):
+    """Build a MetricResult; by default a passing 0.8 score against 0.5."""
+    fields = dict(metric_name=name, score=score, passed=passed, threshold=threshold)
+    return MetricResult(**fields)
+
+
+# ── ResultCollector ──────────────────────────────────────────────────────────
+
+
+class TestResultCollector:
+    def test_init(self):
+        collector = ResultCollector()
+        assert len(collector.results) == 0
+
+    def test_collect_and_results(self):
+        collector = ResultCollector()
+        r1 = _make_result()
+        r2 = _make_result(passed=False, aggregate_score=0.0)
+        collector.collect(r1, "test1")
+        collector.collect(r2, "test2")
+        assert len(collector.results) == 2
+
+    def test_results_is_copy(self):
+        collector = ResultCollector()
+        collector.collect(_make_result(), "test1")
+        results = collector.results
+        results.clear()
+        assert len(collector.results) == 1
+
+    def test_summary(self):
+        collector = ResultCollector()
+        collector.collect(_make_result(passed=True), "test1")
+        collector.collect(_make_result(passed=False, aggregate_score=0.0), "test2")
+        summary = collector.summary()
+        assert summary.total_tests == 2
+        assert summary.passed_tests == 1
+        assert summary.failed_tests == 1
+
+    def test_report_json(self):
+        collector = ResultCollector()
+        collector.collect(_make_result(), "test1")
+        content = collector.report("json")
+        parsed = json.loads(content)
+        assert "summary" in parsed
+        assert "results" in parsed
+
+    def test_report_unknown_format(self):
+        collector = ResultCollector()
+        collector.collect(_make_result(), "test1")
+        with pytest.raises(ValueError, match="Unknown format"):
+            collector.report("xml")
+
+    def test_register_reporter(self):
+        from fasteval.collectors.reporters.base import OutputReporter
+
+        class CustomReporter(OutputReporter):
+            def generate(self, summary, results):
+                return "custom"
+
+        collector = ResultCollector()
+        collector.register_reporter("custom", CustomReporter)
+        collector.collect(_make_result(), "test1")
+        content = collector.report("custom")
+        assert content == "custom"
+
+    def test_reset(self):
+        collector = ResultCollector()
+        collector.collect(_make_result(), "test1")
+        collector.reset()
+        assert len(collector.results) == 0
+
+    def test_report_to_file(self, tmp_path):
+        collector = ResultCollector()
+        collector.collect(_make_result(), "test1")
+        filepath = str(tmp_path / "report.json")
+        collector.report("json", path=filepath)
+        with open(filepath) as f:
+            parsed = json.loads(f.read())
+        assert "summary" in parsed
+
+    def test_report_html(self):
+        collector = ResultCollector()
+        mr = _make_metric()
+        collector.collect(_make_result(metrics=[mr]), "test1")
+        content = collector.report("html")
+        assert "FastEval" in content
+        assert "<html" in content
+
+
+# ── Global collector ─────────────────────────────────────────────────────────
+
+
+class TestGlobalCollector:
+    @pytest.fixture(autouse=True)
+    def _fresh_global_collector(self, monkeypatch):
+        # monkeypatch undoes this at teardown, so the collector that existed
+        # before the test is put back instead of being left reset or replaced.
+        monkeypatch.setattr(collector_module, "_collector", None)
+
+    def test_get_collector_singleton(self):
+        assert get_collector() is get_collector()
+
+    def test_reset_collector(self):
+        collector = get_collector()
+        collector.collect(_make_result(), "test1")
+        reset_collector()
+        assert len(collector.results) == 0
+
+    def test_reset_collector_when_none(self):
+        reset_collector()  # _collector is None here; should not raise
+
+
+# ── EvalRunSummary ───────────────────────────────────────────────────────────
+
+
+class TestEvalRunSummary:
+    def test_empty_results(self):
+        summary = EvalRunSummary.from_results([], [])
+        assert summary.total_tests == 0
+        assert summary.timestamp != ""
+
+    def test_single_result(self):
+        mr = _make_metric(name="m1", score=0.8, passed=True)
+        result = _make_result(
+            passed=True, aggregate_score=0.8, metrics=[mr], execution_time_ms=15.0
+        )
+        summary = EvalRunSummary.from_results([result], ["test1"])
+        assert summary.total_tests == 1
+        assert summary.passed_tests == 1
+        assert summary.failed_tests == 0
+        assert summary.pass_rate == 1.0
+        assert summary.avg_aggregate_score == 0.8
+        assert summary.total_execution_time_ms == 15.0
+
+    def test_multiple_results_with_metrics(self):
+        mr1 = _make_metric(name="m1", score=0.9, passed=True)
+        mr2 = _make_metric(name="m1", score=0.3, passed=False)
+        mr3 = _make_metric(name="m2", score=0.7, passed=True)
+
+        r1 = _make_result(
+            passed=True, aggregate_score=0.9, metrics=[mr1], execution_time_ms=10.0
+        )
+        r2 = _make_result(
+            passed=False,
+            aggregate_score=0.3,
+            metrics=[mr2, mr3],
+            execution_time_ms=20.0,
+        )
+
+        summary = EvalRunSummary.from_results([r1, r2], ["t1", "t2"])
+
+        assert summary.total_tests == 2
+        assert summary.passed_tests == 1
+        assert summary.failed_tests == 1
+        assert summary.pass_rate == 0.5
+        assert summary.total_execution_time_ms == 30.0
+
+        # Check metric aggregates
+        assert len(summary.metric_aggregates) == 2  # m1 and m2
+
+        m1_agg = next(m for m in summary.metric_aggregates if m.metric_name == "m1")
+        assert m1_agg.count == 2
+        assert m1_agg.pass_count == 1
+        assert m1_agg.fail_count == 1
+        assert m1_agg.pass_rate == 0.5
+        assert m1_agg.min_score == 0.3
+        assert m1_agg.max_score == 0.9
+        assert m1_agg.std_score > 0
+
+        m2_agg = next(m for m in summary.metric_aggregates if m.metric_name == "m2")
+        assert m2_agg.count == 1
+        assert m2_agg.std_score == 0.0  # Single value
+
+    def test_test_summaries(self):
+        result = _make_result(
+            passed=False,
+            aggregate_score=0.4,
+            execution_time_ms=5.0,
+            error="failed",
+        )
+        summary = EvalRunSummary.from_results([result], ["test_func"])
+        assert len(summary.test_summaries) == 1
+        ts = summary.test_summaries[0]
+        assert ts.test_name == "test_func"
+        assert ts.passed is False
+        assert ts.aggregate_score == 0.4
+        assert ts.error == "failed"
diff --git a/tests/test_conversation_metrics.py b/tests/test_conversation_metrics.py
new file mode 100644
index 0000000..08c0b46
--- /dev/null
+++ b/tests/test_conversation_metrics.py
@@ -0,0 +1,105 @@
+"""Tests for fasteval.metrics.conversation."""
+
+import json
+
+import pytest
+
+from fasteval.metrics.conversation import (
+    ConsistencyMetric,
+    ContextRetentionMetric,
+    TopicDriftMetric,
+)
+from fasteval.models.evaluation import EvalInput
+
+
+class MockLLMClient:
+    def __init__(self, score=0.85):
+        # Canned JSON reply that every invoke() call hands back verbatim.
+        payload = {"score": score, "reasoning": "Mock conversation eval"}
+        self.response = json.dumps(payload)
+
+    async def invoke(self, messages):
+        return self.response  # `messages` is ignored
+
+
+class TestContextRetentionMetric:
+    def test_default_name(self):
+        metric = ContextRetentionMetric(llm_client=MockLLMClient())
+        assert metric.name == "context_retention"
+
+    def test_prompt_includes_history(self):
+        metric = ContextRetentionMetric(llm_client=MockLLMClient())
+        prompt = metric.get_evaluation_prompt(
+            EvalInput(
+                actual_output="Yes, I remember",
+                history=[
+                    {"role": "user", "content": "My name is Alice"},
+                    {"role": "assistant", "content": "Nice to meet you, Alice"},
+                    {"role": "user", "content": "What is my name?"},
+                ],
+            )
+        )
+        assert "Alice" in prompt
+
+    @pytest.mark.asyncio
+    async def test_evaluation(self):
+        metric = ContextRetentionMetric(llm_client=MockLLMClient(0.9), threshold=0.5)
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="Your name is Alice",
+                history=[
+                    {"role": "user", "content": "My name is Alice"},
+                    {"role": "assistant", "content": "Hello Alice"},
+                ],
+            )
+        )
+        assert result.score == 0.9
+        assert result.passed is True
+
+
+class TestConsistencyMetric:
+    def test_default_name(self):
+        metric = ConsistencyMetric(llm_client=MockLLMClient())
+        assert metric.name == "consistency"
+
+    def test_default_binary_scoring(self):
+        metric = ConsistencyMetric(llm_client=MockLLMClient())
+        assert metric.scoring_type == "binary"
+
+    @pytest.mark.asyncio
+    async def test_evaluation(self):
+        # ConsistencyMetric uses binary scoring by default
+        # Score 0.8 >= 0.5 → binary 1.0
+        metric = ConsistencyMetric(llm_client=MockLLMClient(0.8), threshold=0.5)
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="Paris is the capital",
+                history=[
+                    {"role": "user", "content": "What is the capital of France?"},
+                    {"role": "assistant", "content": "The capital is Paris"},
+                ],
+            )
+        )
+        assert result.score == 1.0  # binary: 0.8 >= 0.5 → 1.0
+        assert result.passed is True
+
+
+class TestTopicDriftMetric:
+    def test_default_name(self):
+        metric = TopicDriftMetric(llm_client=MockLLMClient())
+        assert metric.name == "topic_drift"
+
+    @pytest.mark.asyncio
+    async def test_evaluation(self):
+        metric = TopicDriftMetric(llm_client=MockLLMClient(0.7), threshold=0.5)
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="About cooking",
+                history=[
+                    {"role": "user", "content": "Let's discuss cooking"},
+                    {"role": "assistant", "content": "Sure, what dish?"},
+                ],
+            )
+        )
+        assert result.score == 0.7
+        assert result.passed is True
diff --git a/tests/test_decorators_extended.py b/tests/test_decorators_extended.py
new file mode 100644
index 0000000..0d95008
--- /dev/null
+++ b/tests/test_decorators_extended.py
@@ -0,0 +1,343 @@
+"""Tests for fasteval.core.decorators."""
+
+import csv
+import os
+import tempfile
+
+import pytest
+from pydantic import BaseModel
+
+# Import the public decorator functions
+import fasteval.core.decorators as dec
+from fasteval.core.decorators import (
+    _attach_metric,
+    _metric_decorator_factory,
+    fasteval_HUMAN_REVIEW_ATTR,
+    fasteval_METRICS_ATTR,
+)
+from fasteval.models.config import MetricConfig
+
+# ── _attach_metric ───────────────────────────────────────────────────────────
+
+
+class TestAttachMetric:
+    def test_creates_attribute(self):
+        def my_func():
+            pass
+
+        config = MetricConfig(metric_type="test", name="test")
+        _attach_metric(my_func, config)
+        assert hasattr(my_func, fasteval_METRICS_ATTR)
+        assert len(getattr(my_func, fasteval_METRICS_ATTR)) == 1
+
+    def test_appends_to_existing(self):
+        def my_func():
+            pass
+
+        config1 = MetricConfig(metric_type="t1", name="n1")
+        config2 = MetricConfig(metric_type="t2", name="n2")
+        _attach_metric(my_func, config1)
+        _attach_metric(my_func, config2)
+        assert len(getattr(my_func, fasteval_METRICS_ATTR)) == 2
+
+
+# ── _metric_decorator_factory ────────────────────────────────────────────────
+
+
+class TestMetricDecoratorFactory:
+    def test_creates_working_decorator(self):
+        decorator_fn = _metric_decorator_factory("test_type", "test_name")
+
+        @decorator_fn()
+        def my_func():
+            pass
+
+        configs = getattr(my_func, fasteval_METRICS_ATTR)
+        assert len(configs) == 1
+        assert configs[0].metric_type == "test_type"
+        assert configs[0].name == "test_name"
+
+    def test_threshold_override(self):
+        decorator_fn = _metric_decorator_factory("test", "test", default_threshold=0.5)
+
+        @decorator_fn(threshold=0.9)
+        def my_func():
+            pass
+
+        configs = getattr(my_func, fasteval_METRICS_ATTR)
+        assert configs[0].threshold == 0.9
+
+    def test_weight_override(self):
+        decorator_fn = _metric_decorator_factory("test", "test")
+
+        @decorator_fn(weight=2.5)
+        def my_func():
+            pass
+
+        configs = getattr(my_func, fasteval_METRICS_ATTR)
+        assert configs[0].weight == 2.5
+
+    def test_name_override(self):
+        decorator_fn = _metric_decorator_factory("test", "default_name")
+
+        @decorator_fn(name="custom_name")
+        def my_func():
+            pass
+
+        configs = getattr(my_func, fasteval_METRICS_ATTR)
+        assert configs[0].name == "custom_name"
+
+    def test_model_override(self):
+        decorator_fn = _metric_decorator_factory("test", "test")
+
+        @decorator_fn(model="gpt-4o")
+        def my_func():
+            pass
+
+        configs = getattr(my_func, fasteval_METRICS_ATTR)
+        assert configs[0].llm_config == {"model": "gpt-4o"}
+
+    def test_llm_client_passthrough(self):
+        class FakeClient:
+            async def invoke(self, messages):
+                return ""
+
+        client = FakeClient()
+        decorator_fn = _metric_decorator_factory("test", "test")
+
+        @decorator_fn(llm_client=client)
+        def my_func():
+            pass
+
+        configs = getattr(my_func, fasteval_METRICS_ATTR)
+        assert configs[0].llm_client is client
+
+
+# ── All metric decorators ────────────────────────────────────────────────────
+
+
+class TestAllMetricDecorators:
+    """Test that each public decorator attaches the correct metric_type."""
+
+    # Simple decorators that take no required positional args
+    _simple_decorators = {
+        "correctness": dec.correctness,
+        "hallucination": dec.hallucination,
+        "relevance": dec.relevance,
+        "toxicity": dec.toxicity,
+        "bias": dec.bias,
+        "conciseness": dec.conciseness,
+        "coherence": dec.coherence,
+        "completeness": dec.completeness,
+        "helpfulness": dec.helpfulness,
+        "faithfulness": dec.faithfulness,
+        "contextual_precision": dec.contextual_precision,
+        "contextual_recall": dec.contextual_recall,
+        "answer_correctness": dec.answer_correctness,
+        "rouge": dec.rouge,
+        "exact_match": dec.exact_match,
+        "contains": dec.contains,
+        "tool_call_accuracy": dec.tool_call_accuracy,
+        "tool_sequence": dec.tool_sequence,
+        "tool_args_match": dec.tool_args_match,
+        "context_retention": dec.context_retention,
+        "consistency": dec.consistency,
+        "topic_drift": dec.topic_drift,
+    }
+
+    @pytest.mark.parametrize(
+        "metric_type,decorator_fn", list(_simple_decorators.items())
+    )
+    def test_simple_decorator(self, metric_type, decorator_fn):
+        @decorator_fn()
+        def my_func():
+            pass
+
+        configs = getattr(my_func, fasteval_METRICS_ATTR)
+        assert len(configs) == 1
+        assert configs[0].metric_type == metric_type
+
+    def test_regex_decorator(self):
+        @dec.regex(pattern=r"\d+")
+        def my_func():
+            pass
+
+        configs = getattr(my_func, fasteval_METRICS_ATTR)
+        assert configs[0].metric_type == "regex"
+
+    def test_criteria_decorator(self):
+        @dec.criteria("Is the response helpful?")
+        def my_func():
+            pass
+
+        configs = getattr(my_func, fasteval_METRICS_ATTR)
+        assert configs[0].metric_type == "criteria"
+
+    def test_geval_decorator(self):
+        # geval is an alias for criteria
+        @dec.geval(criteria="Is the response helpful?")
+        def my_func():
+            pass
+
+        configs = getattr(my_func, fasteval_METRICS_ATTR)
+        assert configs[0].metric_type == "criteria"
+
+    def test_instruction_following_decorator(self):
+        @dec.instruction_following(instructions=["Be concise", "Use examples"])
+        def my_func():
+            pass
+
+        configs = getattr(my_func, fasteval_METRICS_ATTR)
+        assert configs[0].metric_type == "instruction_following"
+
+    def test_json_decorator(self):
+        class User(BaseModel):
+            name: str
+
+        @dec.json(model=User)
+        def my_func():
+            pass
+
+        configs = getattr(my_func, fasteval_METRICS_ATTR)
+        assert configs[0].metric_type == "json"
+
+
+# ── Decorator stacking ───────────────────────────────────────────────────────
+
+
+class TestDecoratorStacking:
+    def test_multiple_decorators(self):
+        @dec.correctness()
+        @dec.relevance()
+        @dec.contains()
+        def my_func():
+            pass
+
+        # Three stacked metric decorators must yield three attached configs.
+        attached = getattr(my_func, fasteval_METRICS_ATTR)
+        assert len(attached) == 3
+        seen = [cfg.metric_type for cfg in attached]
+        for wanted in ("correctness", "relevance", "contains"):
+            assert wanted in seen
+
+
+# ── Data decorators ──────────────────────────────────────────────────────────
+
+
+class TestCsvDecorator:
+    def test_csv_decorator(self):
+        with tempfile.NamedTemporaryFile(
+            mode="w", suffix=".csv", delete=False, newline=""
+        ) as f:
+            writer = csv.writer(f)
+            writer.writerow(["input", "expected"])
+            writer.writerow(["q1", "a1"])
+            writer.writerow(["q2", "a2"])
+            csv_path = f.name
+
+        try:
+
+            @dec.csv(csv_path)
+            def my_func(input, expected):
+                pass
+
+            assert callable(my_func)
+            assert hasattr(my_func, dec.fasteval_DATA_ATTR)
+        finally:
+            os.unlink(csv_path)
+
+
+# ── Human review decorator ───────────────────────────────────────────────────
+
+
+class TestHumanReviewDecorator:
+    def test_attaches_config(self):
+        @dec.human_review(prompt="Review this", required=True, threshold=0.8)
+        def my_func():
+            pass
+
+        assert hasattr(my_func, fasteval_HUMAN_REVIEW_ATTR)
+        config = getattr(my_func, fasteval_HUMAN_REVIEW_ATTR)
+        assert config["prompt"] == "Review this"
+        assert config["required"] is True
+        assert config["threshold"] == 0.8
+
+
+# ── Conversation, criteria-steps and stack decorators ────────────────────────
+
+
+class TestConversationDecorator:
+    def test_sync_conversation(self):
+        results = []
+
+        @dec.conversation(
+            [
+                {"query": "Hello", "expected": "Hi"},
+                {"query": "Bye", "expected": "Goodbye"},
+            ]
+        )
+        def my_func(query, expected, history):
+            results.append({"query": query, "expected": expected, "history": history})
+            return None
+
+        my_func()  # type: ignore[call-arg]
+        assert len(results) == 2
+        assert results[0]["query"] == "Hello"
+        assert results[0]["history"] == []
+        assert results[1]["query"] == "Bye"
+
+    @pytest.mark.asyncio
+    async def test_async_conversation(self):
+        results = []
+
+        @dec.conversation(
+            [
+                {"query": "Hello"},
+                {"query": "Bye"},
+            ]
+        )
+        async def my_func(query, expected, history):
+            results.append({"query": query, "history": history})
+
+        await my_func()  # type: ignore[call-arg]
+        assert len(results) == 2
+
+    def test_conversation_preserves_metrics(self):
+        @dec.correctness()
+        @dec.conversation([{"query": "Hi"}])
+        def my_func(query, expected, history):
+            pass
+
+        assert hasattr(my_func, fasteval_METRICS_ATTR)
+
+
+class TestCriteriaWithEvaluationSteps:
+    def test_criteria_with_steps(self):
+        @dec.criteria("Be formal", evaluation_steps=["Step1", "Step2"])
+        def my_func():
+            pass
+
+        stored = getattr(my_func, fasteval_METRICS_ATTR)[0].config
+        assert stored["criteria"] == "Be formal"
+        assert stored["evaluation_steps"] == ["Step1", "Step2"]
+
+
+class TestStackDecorator:
+    def test_stack_combines_metrics(self):
+        # stack() sits at the TOP of the decorator list; it captures the
+        # metric decorators written below it.
+        @dec.stack()
+        @dec.correctness()
+        @dec.relevance()
+        def my_stack():
+            pass
+
+        # The result is itself a decorator that can be applied to a test.
+        @my_stack
+        def my_func():
+            pass
+
+        attached = getattr(my_func, fasteval_METRICS_ATTR)
+        assert len(attached) == 2
+        seen = [cfg.metric_type for cfg in attached]
+        assert "correctness" in seen and "relevance" in seen
diff --git a/tests/test_deterministic_metrics.py b/tests/test_deterministic_metrics.py
new file mode 100644
index 0000000..3efb0c2
--- /dev/null
+++ b/tests/test_deterministic_metrics.py
@@ -0,0 +1,695 @@
+"""Tests for fasteval.metrics.deterministic."""
+
+import re
+
+import pytest
+from pydantic import BaseModel
+
+from fasteval.metrics.deterministic import (
+    ContainsMetric,
+    ExactMatchMetric,
+    JsonMetric,
+    RegexMetric,
+    RougeMetric,
+    ToolArgsMatchMetric,
+    ToolCallAccuracyMetric,
+    ToolSequenceMetric,
+    _match_tool_name,
+)
+from fasteval.models.evaluation import EvalInput, ExpectedTool, ToolCall
+
+
+class UserModel(BaseModel):  # schema passed to JsonMetric(model=...) in this file
+    name: str  # required field
+    age: int  # required field; omitted on purpose in the schema-failure test
+
+
+# ── RougeMetric ──────────────────────────────────────────────────────────────
+
+
+class TestRougeMetric:  # RougeMetric scoring, missing inputs, details, naming
+    @pytest.mark.asyncio
+    async def test_high_similarity(self):
+        metric = RougeMetric(threshold=0.5)
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="the cat sat on the mat",
+                expected_output="the cat sat on the mat",
+            )
+        )
+        assert result.score == 1.0  # identical texts are expected to score 1.0
+        assert result.passed is True
+
+    @pytest.mark.asyncio
+    async def test_low_similarity(self):
+        metric = RougeMetric(threshold=0.9)
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="completely different text",
+                expected_output="the cat sat on the mat",
+            )
+        )
+        assert result.score < 0.9  # below the 0.9 threshold configured above
+        assert result.passed is False
+
+    @pytest.mark.asyncio
+    async def test_missing_actual(self):
+        metric = RougeMetric()
+        result = await metric.evaluate(
+            EvalInput(actual_output=None, expected_output="expected")
+        )
+        assert result.score == 0.0  # None actual_output is expected to score 0.0
+        assert result.passed is False
+
+    @pytest.mark.asyncio
+    async def test_missing_expected(self):
+        metric = RougeMetric()
+        result = await metric.evaluate(
+            EvalInput(actual_output="actual", expected_output=None)
+        )
+        assert result.score == 0.0  # None expected_output is expected to score 0.0
+
+    @pytest.mark.asyncio
+    async def test_details_include_precision_recall(self):
+        metric = RougeMetric(threshold=0.3)
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="the cat sat on the mat",
+                expected_output="the cat sat on the mat",
+            )
+        )
+        assert "precision" in result.details  # keys expected in result.details
+        assert "recall" in result.details
+        assert "fmeasure" in result.details
+
+    def test_custom_name(self):
+        metric = RougeMetric(name="my_rouge", rouge_type="rouge1")
+        assert metric.name == "my_rouge"  # constructor args exposed as attributes
+        assert metric.rouge_type == "rouge1"
+
+
+# ── ExactMatchMetric ─────────────────────────────────────────────────────────
+
+
+class TestExactMatchMetric:  # defaults, case sensitivity, spacing, missing output
+    @pytest.mark.asyncio
+    async def test_exact_match(self):
+        metric = ExactMatchMetric()  # defaults; inputs differ only in case
+        result = await metric.evaluate(
+            EvalInput(actual_output="Hello World", expected_output="hello world")
+        )
+        assert result.score == 1.0
+
+    @pytest.mark.asyncio
+    async def test_case_sensitive(self):
+        metric = ExactMatchMetric(case_sensitive=True)
+        result = await metric.evaluate(
+            EvalInput(actual_output="Hello", expected_output="hello")
+        )
+        assert result.score == 0.0  # "Hello" vs "hello" must not match here
+
+    @pytest.mark.asyncio
+    async def test_normalize_whitespace(self):
+        metric = ExactMatchMetric()  # defaults; inputs differ only in spacing
+        result = await metric.evaluate(
+            EvalInput(actual_output="hello   world", expected_output="hello world")
+        )
+        assert result.score == 1.0
+
+    @pytest.mark.asyncio
+    async def test_no_match(self):
+        metric = ExactMatchMetric()
+        result = await metric.evaluate(
+            EvalInput(actual_output="yes", expected_output="no")
+        )
+        assert result.score == 0.0  # different words
+        assert result.passed is False
+
+    @pytest.mark.asyncio
+    async def test_missing_output(self):
+        metric = ExactMatchMetric()
+        result = await metric.evaluate(
+            EvalInput(actual_output=None, expected_output="expected")
+        )
+        assert result.score == 0.0  # None actual_output is expected to score 0.0
+
+
+# ── ContainsMetric ───────────────────────────────────────────────────────────
+
+
+class TestContainsMetric:  # substring checks with and without case sensitivity
+    @pytest.mark.asyncio
+    async def test_contains(self):
+        metric = ContainsMetric()
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="The answer is 42 indeed",
+                expected_output="42",
+            )
+        )
+        assert result.score == 1.0  # "42" occurs inside the actual output
+
+    @pytest.mark.asyncio
+    async def test_does_not_contain(self):
+        metric = ContainsMetric()
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="The answer is unknown",
+                expected_output="42",
+            )
+        )
+        assert result.score == 0.0  # "42" does not occur in the actual output
+
+    @pytest.mark.asyncio
+    async def test_case_insensitive(self):
+        metric = ContainsMetric(case_sensitive=False)
+        result = await metric.evaluate(
+            EvalInput(actual_output="HELLO WORLD", expected_output="hello")
+        )
+        assert result.score == 1.0  # case is ignored
+
+    @pytest.mark.asyncio
+    async def test_case_sensitive(self):
+        metric = ContainsMetric(case_sensitive=True)
+        result = await metric.evaluate(
+            EvalInput(actual_output="HELLO WORLD", expected_output="hello")
+        )
+        assert result.score == 0.0  # same inputs as above, but case now matters
+
+    @pytest.mark.asyncio
+    async def test_missing_output(self):
+        metric = ContainsMetric()
+        result = await metric.evaluate(
+            EvalInput(actual_output=None, expected_output="test")
+        )
+        assert result.score == 0.0  # None actual_output is expected to score 0.0
+
+
+# ── JsonMetric ───────────────────────────────────────────────────────────────
+
+
+class TestJsonMetric:  # JSON output validated against the UserModel schema
+    @pytest.mark.asyncio
+    async def test_valid_json(self):
+        metric = JsonMetric(model=UserModel)
+        result = await metric.evaluate(
+            EvalInput(actual_output='{"name": "Alice", "age": 30}')
+        )
+        assert result.score == 1.0  # both UserModel fields are present
+        assert result.passed is True
+
+    @pytest.mark.asyncio
+    async def test_invalid_json_syntax(self):
+        metric = JsonMetric(model=UserModel)  # input below is not parseable JSON
+        result = await metric.evaluate(EvalInput(actual_output="{not valid json}"))
+        assert result.score == 0.0
+        assert result.passed is False
+
+    @pytest.mark.asyncio
+    async def test_schema_validation_failure(self):
+        metric = JsonMetric(model=UserModel)  # payload below omits required "age"
+        result = await metric.evaluate(EvalInput(actual_output='{"name": "Alice"}'))
+        assert result.score == 0.0
+        assert "validation" in result.details.get("error_type", "")
+
+    @pytest.mark.asyncio
+    async def test_missing_output(self):
+        metric = JsonMetric(model=UserModel)
+        result = await metric.evaluate(EvalInput(actual_output=None))
+        assert result.score == 0.0  # None actual_output is expected to score 0.0
+
+
+# ── RegexMetric ──────────────────────────────────────────────────────────────
+
+
+class TestRegexMetric:  # full-match vs search mode, flags, match details
+    @pytest.mark.asyncio
+    async def test_full_match(self):
+        metric = RegexMetric(pattern=r"\d{3}-\d{4}")
+        result = await metric.evaluate(EvalInput(actual_output="123-4567"))
+        assert result.score == 1.0  # whole output fits the pattern
+
+    @pytest.mark.asyncio
+    async def test_full_match_fails(self):
+        metric = RegexMetric(pattern=r"\d{3}-\d{4}")  # full_match left at its default
+        result = await metric.evaluate(EvalInput(actual_output="phone: 123-4567"))
+        assert result.score == 0.0  # "phone: " prefix is not covered by the pattern
+
+    @pytest.mark.asyncio
+    async def test_search_match(self):
+        metric = RegexMetric(pattern=r"\d{3}-\d{4}", full_match=False)
+        result = await metric.evaluate(EvalInput(actual_output="phone: 123-4567"))
+        assert result.score == 1.0  # same input now passes with full_match=False
+
+    @pytest.mark.asyncio
+    async def test_no_match(self):
+        metric = RegexMetric(pattern=r"\d+", full_match=False)
+        result = await metric.evaluate(EvalInput(actual_output="no digits here"))
+        assert result.score == 0.0  # no digit anywhere in the output
+
+    @pytest.mark.asyncio
+    async def test_flags_ignorecase(self):
+        metric = RegexMetric(pattern=r"^yes$", flags=re.IGNORECASE)
+        result = await metric.evaluate(EvalInput(actual_output="YES"))
+        assert result.score == 1.0  # "YES" matches "yes" under re.IGNORECASE
+
+    @pytest.mark.asyncio
+    async def test_missing_output(self):
+        metric = RegexMetric(pattern=r"\d+")
+        result = await metric.evaluate(EvalInput(actual_output=None))
+        assert result.score == 0.0  # None actual_output is expected to score 0.0
+
+    @pytest.mark.asyncio
+    async def test_match_details(self):
+        metric = RegexMetric(pattern=r"\d+", full_match=False)
+        result = await metric.evaluate(EvalInput(actual_output="abc123def"))
+        assert result.details["match"] == "123"  # matched text
+        assert result.details["match_start"] == 3  # index of "1" in "abc123def"
+
+
+# ── _match_tool_name ─────────────────────────────────────────────────────────
+
+
+class TestMatchToolName:  # exact names and "*" wildcard patterns
+    def test_exact_match(self):
+        assert _match_tool_name("search_flights", "search_flights") is True
+
+    def test_wildcard_prefix(self):
+        assert _match_tool_name("search_flights", "search_*") is True  # "*" at end
+
+    def test_wildcard_suffix(self):
+        assert _match_tool_name("search_flights", "*_flights") is True  # "*" first
+
+    def test_no_match(self):
+        assert _match_tool_name("search_flights", "book_*") is False  # wrong prefix
+
+
+# ── ToolCallAccuracyMetric ───────────────────────────────────────────────────
+
+
+class TestToolCallAccuracyMetric:  # actual tool calls vs expected tool names
+    @pytest.mark.asyncio
+    async def test_all_tools_match(self):
+        metric = ToolCallAccuracyMetric(threshold=0.8)
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[ToolCall(name="search"), ToolCall(name="book")],
+                expected_tools=[
+                    ExpectedTool(name="search"),
+                    ExpectedTool(name="book"),
+                ],
+            )
+        )
+        assert result.score == 1.0  # both expected tools were called
+        assert result.passed is True
+
+    @pytest.mark.asyncio
+    async def test_missing_required_tool(self):
+        metric = ToolCallAccuracyMetric(threshold=0.8)
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[ToolCall(name="search")],
+                expected_tools=[
+                    ExpectedTool(name="search"),
+                    ExpectedTool(name="book"),
+                ],
+            )
+        )
+        assert result.score < 1.0  # "book" is expected but never called
+
+    @pytest.mark.asyncio
+    async def test_extra_tools_penalized(self):
+        metric = ToolCallAccuracyMetric(threshold=0.8, ignore_extra=False)
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[
+                    ToolCall(name="search"),
+                    ToolCall(name="book"),
+                    ToolCall(name="cancel"),
+                ],
+                expected_tools=[
+                    ExpectedTool(name="search"),
+                    ExpectedTool(name="book"),
+                ],
+            )
+        )
+        # 2 matched / max(2 required, 3 actual) = 2/3
+        assert abs(result.score - 2 / 3) < 0.01  # tolerance for float rounding
+
+    @pytest.mark.asyncio
+    async def test_ignore_extra(self):
+        metric = ToolCallAccuracyMetric(threshold=0.8, ignore_extra=True)
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[
+                    ToolCall(name="search"),
+                    ToolCall(name="book"),
+                    ToolCall(name="cancel"),
+                ],
+                expected_tools=[
+                    ExpectedTool(name="search"),
+                    ExpectedTool(name="book"),
+                ],
+            )
+        )
+        assert result.score == 1.0  # extra "cancel" call is not penalized
+
+    @pytest.mark.asyncio
+    async def test_wildcard_matching(self):
+        metric = ToolCallAccuracyMetric()
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[ToolCall(name="search_flights")],
+                expected_tools=[ExpectedTool(name="search_*")],  # wildcard pattern
+            )
+        )
+        assert result.score == 1.0
+
+    @pytest.mark.asyncio
+    async def test_no_expected_tools_no_actual(self):
+        metric = ToolCallAccuracyMetric()
+        result = await metric.evaluate(
+            EvalInput(actual_output="result", tool_calls=[], expected_tools=[])
+        )
+        assert result.score == 1.0  # nothing expected, nothing called
+
+    @pytest.mark.asyncio
+    async def test_no_expected_with_actual(self):
+        metric = ToolCallAccuracyMetric()
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[ToolCall(name="search")],
+                expected_tools=[],
+            )
+        )
+        assert result.score == 0.0  # a call was made although none was expected
+
+
+# ── ToolSequenceMetric ───────────────────────────────────────────────────────
+
+
+class TestToolSequenceMetric:  # order of tool calls, strict and non-strict modes
+    @pytest.mark.asyncio
+    async def test_strict_exact_match(self):
+        metric = ToolSequenceMetric(strict=True)
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[ToolCall(name="a"), ToolCall(name="b"), ToolCall(name="c")],
+                expected_tools=[
+                    ExpectedTool(name="a"),
+                    ExpectedTool(name="b"),
+                    ExpectedTool(name="c"),
+                ],
+            )
+        )
+        assert result.score == 1.0  # same names in the same order
+
+    @pytest.mark.asyncio
+    async def test_strict_mismatch(self):
+        metric = ToolSequenceMetric(strict=True)
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[ToolCall(name="b"), ToolCall(name="a")],
+                expected_tools=[
+                    ExpectedTool(name="a"),
+                    ExpectedTool(name="b"),
+                ],
+            )
+        )
+        assert result.score == 0.0  # same names, reversed order
+
+    @pytest.mark.asyncio
+    async def test_lcs_scoring(self):
+        metric = ToolSequenceMetric(strict=False)
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[
+                    ToolCall(name="a"),
+                    ToolCall(name="c"),
+                    ToolCall(name="b"),
+                ],
+                expected_tools=[
+                    ExpectedTool(name="a"),
+                    ExpectedTool(name="b"),
+                    ExpectedTool(name="c"),
+                ],
+            )
+        )
+        # LCS of [a, c, b] vs [a, b, c] -> LCS len 2 / max(3, 3) = 2/3
+        assert result.score > 0.0  # only checks for partial credit, not exact 2/3
+        assert result.score < 1.0
+
+    @pytest.mark.asyncio
+    async def test_no_expected(self):
+        metric = ToolSequenceMetric()
+        result = await metric.evaluate(
+            EvalInput(actual_output="result", tool_calls=[], expected_tools=[])
+        )
+        assert result.score == 1.0  # nothing expected, nothing called
+
+    @pytest.mark.asyncio
+    async def test_no_required_tools(self):
+        metric = ToolSequenceMetric()
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[ToolCall(name="a")],
+                expected_tools=[
+                    ExpectedTool(name="a", required=False),  # optional tool only
+                ],
+            )
+        )
+        assert result.score == 1.0
+
+
+# ── ToolArgsMatchMetric ──────────────────────────────────────────────────────
+
+
+class TestToolArgsMatchMetric:  # arguments of tool calls vs expected args
+    @pytest.mark.asyncio
+    async def test_all_args_match(self):
+        metric = ToolArgsMatchMetric()
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[
+                    ToolCall(
+                        name="search", arguments={"dest": "NYC", "date": "2024-01-01"}
+                    )
+                ],
+                expected_tools=[
+                    ExpectedTool(
+                        name="search", args={"dest": "NYC", "date": "2024-01-01"}
+                    )
+                ],
+            )
+        )
+        assert result.score == 1.0  # every expected argument has the same value
+
+    @pytest.mark.asyncio
+    async def test_value_mismatch(self):
+        metric = ToolArgsMatchMetric()
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[ToolCall(name="search", arguments={"dest": "LAX"})],
+                expected_tools=[ExpectedTool(name="search", args={"dest": "NYC"})],
+            )
+        )
+        assert result.score == 0.0  # "LAX" given where "NYC" was expected
+
+    @pytest.mark.asyncio
+    async def test_missing_arg(self):
+        metric = ToolArgsMatchMetric()
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[ToolCall(name="search", arguments={})],
+                expected_tools=[ExpectedTool(name="search", args={"dest": "NYC"})],
+            )
+        )
+        assert result.score == 0.0  # expected "dest" is absent from the call
+
+    @pytest.mark.asyncio
+    async def test_tool_not_called(self):
+        metric = ToolArgsMatchMetric()
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[],
+                expected_tools=[ExpectedTool(name="search", args={"dest": "NYC"})],
+            )
+        )
+        assert result.score == 0.0  # the expected tool was never called
+
+    @pytest.mark.asyncio
+    async def test_no_expected_args(self):
+        metric = ToolArgsMatchMetric()
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[ToolCall(name="search", arguments={"dest": "NYC"})],
+                expected_tools=[ExpectedTool(name="search", args={})],
+            )
+        )
+        assert result.score == 1.0  # args={} should place no constraint on the call
+
+    @pytest.mark.asyncio
+    async def test_no_expected_tools(self):
+        metric = ToolArgsMatchMetric()
+        result = await metric.evaluate(
+            EvalInput(actual_output="result", tool_calls=[], expected_tools=[])
+        )
+        assert result.score == 1.0  # nothing expected, nothing called
+
+    @pytest.mark.asyncio
+    async def test_numeric_comparison(self):
+        metric = ToolArgsMatchMetric()
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[ToolCall(name="calc", arguments={"value": 3.14})],
+                expected_tools=[ExpectedTool(name="calc", args={"value": 3.14})],
+            )
+        )
+        assert result.score == 1.0  # identical float values
+
+    @pytest.mark.asyncio
+    async def test_string_case_insensitive(self):
+        metric = ToolArgsMatchMetric()
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[ToolCall(name="search", arguments={"dest": "nyc"})],
+                expected_tools=[ExpectedTool(name="search", args={"dest": "NYC"})],
+            )
+        )
+        assert result.score == 1.0  # "nyc" vs "NYC" is expected to match
+
+    @pytest.mark.asyncio
+    async def test_none_expected_value(self):
+        metric = ToolArgsMatchMetric()
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[ToolCall(name="search", arguments={"dest": "anything"})],
+                expected_tools=[ExpectedTool(name="search", args={"dest": None})],
+            )
+        )
+        assert result.score == 1.0  # expected None should accept any actual value
+
+    @pytest.mark.asyncio
+    async def test_numeric_tolerance(self):
+        metric = ToolArgsMatchMetric()
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[ToolCall(name="calc", arguments={"value": 3.14000001})],
+                expected_tools=[ExpectedTool(name="calc", args={"value": 3.14})],
+            )
+        )
+        assert result.score == 1.0  # 3.14000001 vs 3.14 is expected to match
+
+    @pytest.mark.asyncio
+    async def test_multiple_tools_partial_match(self):
+        metric = ToolArgsMatchMetric()
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[
+                    ToolCall(name="a", arguments={"x": 1}),
+                    ToolCall(name="b", arguments={"y": "wrong"}),
+                ],
+                expected_tools=[
+                    ExpectedTool(name="a", args={"x": 1}),
+                    ExpectedTool(name="b", args={"y": "correct"}),
+                ],
+            )
+        )
+        assert result.score == 0.5  # tool "a" matches, tool "b" does not
+
+
+# ── RougeMetric additional ───────────────────────────────────────────────────
+
+
+class TestRougeMetricAdditional:  # partial overlap and the default metric name
+    @pytest.mark.asyncio
+    async def test_partial_overlap(self):
+        metric = RougeMetric(threshold=0.3)
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="the quick brown fox",
+                expected_output="the slow brown dog",
+            )
+        )
+        assert 0.0 < result.score < 1.0  # the texts share "the" and "brown" only
+        assert result.passed is True  # i.e. the score reaches the 0.3 threshold
+
+    def test_default_name(self):
+        metric = RougeMetric()
+        assert metric.name == "rouge"  # name used when none is passed
+
+
+# ── ExactMatch additional ────────────────────────────────────────────────────
+
+
+class TestExactMatchAdditional:  # normalize=False and the default metric name
+    @pytest.mark.asyncio
+    async def test_no_normalize(self):
+        metric = ExactMatchMetric(normalize=False, case_sensitive=False)
+        result = await metric.evaluate(
+            EvalInput(actual_output="hello   world", expected_output="hello world")
+        )
+        assert result.score == 0.0  # Extra spaces not normalized
+
+    def test_default_name(self):
+        metric = ExactMatchMetric()
+        assert metric.name == "exact_match"  # name used when none is passed
+
+
+# ── RegexMetric additional ───────────────────────────────────────────────────
+
+
+class TestRegexMetricAdditional:  # multi-line input and the default metric name
+    @pytest.mark.asyncio
+    async def test_multiline(self):
+        metric = RegexMetric(pattern=r"hello", flags=re.MULTILINE, full_match=False)
+        result = await metric.evaluate(EvalInput(actual_output="foo\nhello\nbar"))
+        assert result.score == 1.0  # "hello" sits on the second line of the input
+
+    def test_default_name(self):
+        metric = RegexMetric(pattern=r"\d+")
+        assert metric.name == "regex"  # name used when none is passed
+
+
+# ── ToolSequence additional ──────────────────────────────────────────────────
+
+
+class TestToolSequenceAdditional:  # edge cases: no actual calls, empty LCS input
+    @pytest.mark.asyncio
+    async def test_empty_actual_tools(self):
+        metric = ToolSequenceMetric(strict=True)
+        result = await metric.evaluate(
+            EvalInput(
+                actual_output="result",
+                tool_calls=[],
+                expected_tools=[ExpectedTool(name="a")],
+            )
+        )
+        assert result.score == 0.0  # expected tool "a" was never called
+
+    def test_lcs_empty_sequences(self):
+        """The LCS helper returns 0 when either sequence is empty."""
+        metric = ToolSequenceMetric(strict=False)
+        # The helper is called without await, so this test needs no event loop.
+        assert metric._longest_common_subsequence([], ["a"]) == 0
+        assert metric._longest_common_subsequence(["a"], []) == 0
diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py
new file mode 100644
index 0000000..35baa86
--- /dev/null
+++ b/tests/test_evaluator.py
@@ -0,0 +1,325 @@
+"""Tests for fasteval.core.evaluator."""
+
+import json
+from typing import Any
+
+import pytest
+
+from fasteval.core.evaluator import (
+    METRIC_REGISTRY,
+    Evaluator,
+    EvaluatorConfig,
+    create_evaluator,
+)
+from fasteval.metrics.base import Metric
+from fasteval.models.config import MetricConfig
+from fasteval.models.evaluation import EvalInput, MetricResult
+
+
+class MockLLMClient:  # stand-in LLM client that needs no network access
+    async def invoke(self, messages):  # messages is ignored
+        return json.dumps({"score": 0.9, "reasoning": "Mock"})  # fixed JSON reply
+
+
+# ── EvaluatorConfig ──────────────────────────────────────────────────────────
+
+
+class TestEvaluatorConfig:  # default and overridden EvaluatorConfig fields
+    def test_defaults(self):
+        config = EvaluatorConfig()
+        assert config.fail_fast is False  # defaults asserted by this test
+        assert config.parallel is True
+        assert config.cache_enabled is True
+
+    def test_custom(self):
+        config = EvaluatorConfig(fail_fast=True, parallel=False)
+        assert config.fail_fast is True  # both overrides are stored
+        assert config.parallel is False
+
+
+# ── Evaluator._create_metric ─────────────────────────────────────────────────
+
+
+class TestEvaluatorCreateMetric:  # MetricConfig -> metric via _create_metric
+    def test_standard_deterministic_metric(self):
+        evaluator = Evaluator()
+        config = MetricConfig(metric_type="exact_match", name="em", threshold=1.0)
+        metric = evaluator._create_metric(config)
+        assert metric.name == "em"  # name and threshold come from the config
+        assert metric.threshold == 1.0
+
+    def test_custom_metric_with_instance(self):
+        class MyMetric(Metric):  # minimal Metric subclass with a fixed result
+            async def evaluate(self, eval_input):
+                return MetricResult(
+                    metric_name="custom", score=1.0, passed=True, threshold=0.5
+                )
+
+        instance = MyMetric(name="custom")
+        evaluator = Evaluator()
+        config = MetricConfig(
+            metric_type="custom",
+            name="custom",
+            config={"instance": instance},
+        )
+        metric = evaluator._create_metric(config)
+        assert metric is instance  # the supplied object itself, not a copy
+
+    def test_unknown_metric_type(self):
+        evaluator = Evaluator()
+        config = MetricConfig(metric_type="nonexistent", name="bad")
+        with pytest.raises(ValueError, match="Unknown metric type"):
+            evaluator._create_metric(config)
+
+    def test_json_metric_pydantic_model(self):
+        from pydantic import BaseModel
+
+        class User(BaseModel):
+            name: str
+
+        evaluator = Evaluator()
+        config = MetricConfig(
+            metric_type="json",
+            name="json_check",
+            config={"pydantic_model": User},
+        )
+        metric: Any = evaluator._create_metric(config)
+        assert metric.model is User  # "pydantic_model" entry surfaces as .model
+
+    def test_criteria_metric(self):
+        evaluator = Evaluator()
+        config = MetricConfig(
+            metric_type="criteria",
+            name="criteria_check",
+            config={"criteria": "Be formal"},
+            llm_client=MockLLMClient(),
+        )
+        metric: Any = evaluator._create_metric(config)
+        assert metric.criteria == "Be formal"
+
+    def test_criteria_with_evaluation_steps(self):
+        evaluator = Evaluator()
+        config = MetricConfig(
+            metric_type="geval",  # note: the "geval" type, not "criteria"
+            name="geval_check",
+            config={
+                "criteria": "test",
+                "evaluation_steps": ["Step 1"],
+            },
+            llm_client=MockLLMClient(),
+        )
+        metric: Any = evaluator._create_metric(config)
+        assert metric.evaluation_steps == ["Step 1"]
+
+    def test_instruction_following_metric(self):
+        evaluator = Evaluator()
+        config = MetricConfig(
+            metric_type="instruction_following",
+            name="if_check",
+            config={"instructions": "Be concise"},
+            llm_client=MockLLMClient(),
+        )
+        metric: Any = evaluator._create_metric(config)
+        assert metric.instructions == "Be concise"
+
+    def test_with_llm_client(self):
+        client = MockLLMClient()
+        evaluator = Evaluator()
+        config = MetricConfig(
+            metric_type="correctness",
+            name="corr",
+            llm_client=client,
+        )
+        metric: Any = evaluator._create_metric(config)
+        assert metric._llm_client is client  # the same client object is stored
+
+    def test_with_llm_config_model(self):
+        evaluator = Evaluator()
+        config = MetricConfig(
+            metric_type="correctness",
+            name="corr",
+            llm_config={"model": "gpt-4o"},
+        )
+        metric: Any = evaluator._create_metric(config)
+        assert metric._model_override == "gpt-4o"  # taken from llm_config["model"]
+
+
+# ── Evaluator.evaluate ───────────────────────────────────────────────────────
+
+
+class TestEvaluatorEvaluate:  # Evaluator.evaluate with deterministic metrics
+    @pytest.mark.asyncio
+    async def test_single_metric_pass(self):
+        evaluator = Evaluator()
+        result = await evaluator.evaluate(
+            eval_input=EvalInput(actual_output="hello", expected_output="hello"),
+            metrics=[MetricConfig(metric_type="exact_match", name="em", threshold=1.0)],
+        )
+        assert result.passed is True
+        assert result.aggregate_score == 1.0
+        assert len(result.metric_results) == 1  # one result per configured metric
+
+    @pytest.mark.asyncio
+    async def test_single_metric_fail(self):
+        evaluator = Evaluator()
+        result = await evaluator.evaluate(
+            eval_input=EvalInput(actual_output="yes", expected_output="no"),
+            metrics=[MetricConfig(metric_type="exact_match", name="em", threshold=1.0)],
+        )
+        assert result.passed is False  # "yes" vs "no" fails exact_match
+        assert result.aggregate_score == 0.0
+
+    @pytest.mark.asyncio
+    async def test_parallel_execution(self):
+        evaluator = Evaluator(EvaluatorConfig(parallel=True))
+        result = await evaluator.evaluate(
+            eval_input=EvalInput(
+                actual_output="hello world", expected_output="hello world"
+            ),
+            metrics=[
+                MetricConfig(metric_type="exact_match", name="em1", threshold=1.0),
+                MetricConfig(metric_type="contains", name="contains1", threshold=1.0),
+            ],
+        )
+        assert result.passed is True
+        assert len(result.metric_results) == 2  # both metrics produced a result
+
+    @pytest.mark.asyncio
+    async def test_sequential_fail_fast(self):
+        evaluator = Evaluator(EvaluatorConfig(parallel=False, fail_fast=True))
+        result = await evaluator.evaluate(
+            eval_input=EvalInput(actual_output="yes", expected_output="no"),
+            metrics=[
+                MetricConfig(metric_type="exact_match", name="em1", threshold=1.0),
+                MetricConfig(metric_type="exact_match", name="em2", threshold=1.0),
+            ],
+        )
+        # fail_fast should stop after first failure
+        assert result.passed is False
+        assert len(result.metric_results) == 1  # em2 is expected not to have run
+
+    @pytest.mark.asyncio
+    async def test_weighted_aggregate(self):
+        evaluator = Evaluator()
+        result = await evaluator.evaluate(
+            eval_input=EvalInput(actual_output="hello", expected_output="hello"),
+            metrics=[
+                MetricConfig(
+                    metric_type="exact_match", name="em", threshold=1.0, weight=2.0
+                ),
+                MetricConfig(
+                    metric_type="contains", name="ct", threshold=1.0, weight=1.0
+                ),
+            ],
+        )
+        # Both pass with score 1.0: (1.0*2 + 1.0*1) / 3 = 1.0
+        assert result.aggregate_score == 1.0
+
+    @pytest.mark.asyncio
+    async def test_execution_time_recorded(self):
+        evaluator = Evaluator()
+        result = await evaluator.evaluate(
+            eval_input=EvalInput(actual_output="a", expected_output="a"),
+            metrics=[MetricConfig(metric_type="exact_match", name="em")],
+        )
+        assert result.execution_time_ms >= 0  # only checks that it is non-negative
+
+    @pytest.mark.asyncio
+    async def test_reference_id_preserved(self):
+        evaluator = Evaluator()
+        result = await evaluator.evaluate(
+            eval_input=EvalInput(
+                actual_output="a",
+                expected_output="a",
+                reference_id="ref-123",
+            ),
+            metrics=[MetricConfig(metric_type="exact_match", name="em")],
+        )
+        assert result.reference_id == "ref-123"  # copied from the input to the result
+
+
+# ── Evaluator._evaluate_metric ───────────────────────────────────────────────
+
+
+class TestEvaluatorEvaluateMetric:  # a raising metric becomes a failed result
+    @pytest.mark.asyncio
+    async def test_error_handling(self):
+        class BrokenMetric(Metric):  # evaluate() always raises
+            async def evaluate(self, eval_input):
+                raise RuntimeError("metric broke")
+
+        evaluator = Evaluator()
+        metric = BrokenMetric(name="broken", threshold=0.5)
+        result = await evaluator._evaluate_metric(
+            metric, EvalInput(actual_output="test")
+        )
+        assert result.score == 0.0  # the exception must not propagate to the caller
+        assert result.passed is False
+        assert result.reasoning is not None and "metric broke" in result.reasoning
+
+
+# ── Evaluator.evaluate_batch ─────────────────────────────────────────────────
+
+
+class TestEvaluatorBatch:  # evaluate_batch over several inputs
+    @pytest.mark.asyncio
+    async def test_batch(self):
+        evaluator = Evaluator()
+        inputs = [  # two inputs whose actual and expected outputs are equal
+            EvalInput(actual_output="a", expected_output="a"),
+            EvalInput(actual_output="b", expected_output="b"),
+        ]
+        results = await evaluator.evaluate_batch(
+            inputs,
+            [MetricConfig(metric_type="exact_match", name="em")],
+        )
+        assert len(results) == 2  # one result per input
+        assert all(r.passed for r in results)
+
+
+# ── create_evaluator ─────────────────────────────────────────────────────────
+
+
+class TestCreateEvaluator:  # create_evaluator() forwards keywords into .config
+    def test_factory(self):
+        cfg = create_evaluator(fail_fast=True, parallel=False).config
+        assert cfg.fail_fast is True
+        assert cfg.parallel is False
+
+
+# ── METRIC_REGISTRY ──────────────────────────────────────────────────────────
+
+
+class TestMetricRegistry:  # every core metric type name must be registered
+    def test_core_metrics_registered(self):
+        expected = [  # metric type names looked up in METRIC_REGISTRY
+            "correctness",
+            "hallucination",
+            "relevance",
+            "criteria",
+            "geval",
+            "toxicity",
+            "bias",
+            "conciseness",
+            "coherence",
+            "completeness",
+            "helpfulness",
+            "instruction_following",
+            "faithfulness",
+            "contextual_precision",
+            "contextual_recall",
+            "answer_correctness",
+            "rouge",
+            "exact_match",
+            "contains",
+            "json",
+            "regex",
+            "tool_call_accuracy",
+            "tool_sequence",
+            "tool_args_match",
+            "context_retention",
+            "consistency",
+            "topic_drift",
+        ]
+        missing = [name for name in expected if name not in METRIC_REGISTRY]
+        assert not missing, f"not in METRIC_REGISTRY: {missing}"  # lists all gaps
diff --git a/tests/test_formatting.py b/tests/test_formatting.py
new file mode 100644
index 0000000..e17e6e8
--- /dev/null
+++ b/tests/test_formatting.py
@@ -0,0 +1,88 @@
+"""Tests for fasteval.utils.formatting."""
+
+from fasteval.models.evaluation import EvalInput, EvalResult, MetricResult
+from fasteval.utils.formatting import format_evaluation_report
+
+
+def _make_metric_result(
+    name="test_metric", score=0.5, passed=True, threshold=0.5, reasoning=""
+):
+    """Build a ``MetricResult`` for tests, with a default for every field.
+
+    ``name`` is forwarded as ``metric_name``; the other arguments are passed
+    through under their own names.
+    """
+    fields = {
+        "metric_name": name,
+        "score": score,
+        "passed": passed,
+        "threshold": threshold,
+        "reasoning": reasoning,
+    }
+    return MetricResult(**fields)
+
+
+def _make_eval_result(metric_results=None, passed=True, aggregate_score=1.0):
+    """Build an ``EvalResult`` around a fixed actual/expected ``EvalInput``.
+
+    A falsy ``metric_results`` (``None`` or an empty list) is replaced by a
+    new empty list.
+    """
+    sample_input = EvalInput(actual_output="actual", expected_output="expected")
+    results = metric_results if metric_results else []
+    return EvalResult(
+        eval_input=sample_input,
+        metric_results=results,
+        passed=passed,
+        aggregate_score=aggregate_score,
+    )
+
+
+class TestFormatEvaluationReport:
+    """Checks the text produced by ``format_evaluation_report``."""
+
+    def test_basic_report_structure(self):
+        """A failing result yields the failure banner, test name and pass count."""
+        failing_metric = _make_metric_result(
+            score=0.3, passed=False, reasoning="Bad output"
+        )
+        failing_result = _make_eval_result(
+            metric_results=[failing_metric], passed=False
+        )
+
+        text = format_evaluation_report("test_func", [failing_result])
+
+        for fragment in (
+            "FASTEVAL EVALUATION FAILED",
+            "test_func",
+            "0/1 metrics passed",
+        ):
+            assert fragment in text
+
+    def test_failed_metric_shows_reasoning(self):
+        """A failed metric's reasoning and ``score / threshold`` pair are printed."""
+        failing_metric = _make_metric_result(
+            score=0.1, passed=False, threshold=0.7, reasoning="Not correct"
+        )
+        failing_result = _make_eval_result(
+            metric_results=[failing_metric], passed=False
+        )
+
+        text = format_evaluation_report("test_func", [failing_result])
+
+        assert "Not correct" in text
+        assert "0.10 / 0.70" in text
+
+    def test_passed_metric_no_reasoning(self):
+        """A single passing metric is counted as ``1/1``."""
+        passing_metric = _make_metric_result(score=0.9, passed=True)
+        passing_result = _make_eval_result(metric_results=[passing_metric])
+
+        text = format_evaluation_report("test_func", [passing_result])
+
+        assert "1/1 metrics passed" in text
+
+    def test_with_eval_inputs(self):
+        """Input, expected and actual strings of a supplied ``EvalInput`` appear."""
+        outcome = _make_eval_result()
+        supplied_input = EvalInput(
+            actual_output="my actual",
+            expected_output="my expected",
+            input="my question",
+        )
+
+        text = format_evaluation_report("test_func", [outcome], [supplied_input])
+
+        for fragment in ("my question", "my expected", "my actual"):
+            assert fragment in text
+
+    def test_without_eval_inputs(self):
+        """Omitting the eval-input list still produces a report."""
+        outcome = _make_eval_result()
+        text = format_evaluation_report("test_func", [outcome])
+        # The call must not raise, and the banner must still be present.
+        assert "FASTEVAL EVALUATION FAILED" in text
+
+    def test_empty_metrics(self):
+        """A result without metric results is reported as ``0/0``."""
+        outcome = _make_eval_result(metric_results=[])
+        text = format_evaluation_report("test_func", [outcome])
+        assert "0/0 metrics passed" in text
+
+    def test_mixed_pass_fail(self):
+        """One pass plus one failure gives ``1/2`` and the failure's reasoning."""
+        passing_metric = _make_metric_result(name="m1", score=0.9, passed=True)
+        failing_metric = _make_metric_result(
+            name="m2", score=0.2, passed=False, reasoning="fail reason"
+        )
+        mixed_result = _make_eval_result(
+            metric_results=[passing_metric, failing_metric], passed=False
+        )
+
+        text = format_evaluation_report("test_func", [mixed_result])
+
+        assert "1/2 metrics passed" in text
+        assert "fail reason" in text
diff --git a/tests/test_json_parsing.py b/tests/test_json_parsing.py
new file mode 100644
index 0000000..bcc72b1
--- /dev/null
+++ b/tests/test_json_parsing.py
@@ -0,0 +1,113 @@
+"""Tests for fasteval.utils.json_parsing."""
+
+import pytest
+from pydantic import BaseModel
+
+from fasteval.utils.json_parsing import extract_json_from_text, parse_json_response
+
+
+class SampleModel(BaseModel):
+    """Small pydantic model used as the target type in the parsing tests below."""
+
+    # Required float field.
+    score: float
+    # Optional text field; empty string when omitted.
+    reasoning: str = ""
+
+
+class TestExtractJsonFromText:
+    """Covers the kinds of text ``extract_json_from_text`` accepts or rejects."""
+
+    def test_direct_json(self):
+        """A bare JSON object string comes back as the equivalent dict."""
+        result = extract_json_from_text('{"score": 0.8, "reasoning": "Good"}')
+        assert result == {"score": 0.8, "reasoning": "Good"}
+
+    def test_markdown_code_block(self):
+        """JSON inside a fenced block tagged ``json`` is extracted."""
+        text = 'Here is the result:\n```json\n{"score": 0.5, "reasoning": "OK"}\n```'
+        result = extract_json_from_text(text)
+        assert result == {"score": 0.5, "reasoning": "OK"}
+
+    def test_markdown_code_block_no_json_tag(self):
+        """JSON inside an untagged fenced block is extracted as well."""
+        text = '```\n{"score": 0.9}\n```'
+        result = extract_json_from_text(text)
+        assert result == {"score": 0.9}
+
+    def test_embedded_json_with_score(self):
+        """A JSON object surrounded by prose is extracted."""
+        text = 'The evaluation shows {"score": 0.7, "reasoning": "decent"} overall.'
+        result = extract_json_from_text(text)
+        assert result == {"score": 0.7, "reasoning": "decent"}
+
+    def test_score_only_fallback(self):
+        """A plain ``score: <number>`` mention still yields a score entry."""
+        text = "The score is score: 0.85 based on analysis"
+        result = extract_json_from_text(text)
+        assert result is not None
+        assert result["score"] == 0.85
+
+    def test_score_clamping_above_one(self):
+        """A textual score above 1 is reported as 1.0."""
+        text = "score: 1.5"
+        result = extract_json_from_text(text)
+        assert result is not None
+        assert result["score"] == 1.0
+
+    def test_no_match_for_negative_score(self):
+        """``score: -0.5`` either yields nothing or a score inside [0, 1]."""
+        # Both outcomes are accepted: the extractor may find no match at all,
+        # or it may pick up the digits after the minus sign. What must never
+        # happen is a score outside the [0, 1] range. The check is a single
+        # unconditional assert so the test can no longer pass without
+        # evaluating anything.
+        text = "score: -0.5"
+        result = extract_json_from_text(text)
+        assert result is None or 0.0 <= result["score"] <= 1.0
+
+    def test_empty_string(self):
+        """Empty input gives ``None``."""
+        assert extract_json_from_text("") is None
+
+    def test_no_json_found(self):
+        """Prose without JSON or a score mention gives ``None``."""
+        assert extract_json_from_text("no json here at all") is None
+
+    def test_invalid_json_in_code_block(self):
+        """A fenced block holding invalid JSON gives ``None``."""
+        text = "```json\n{invalid json}\n```"
+        # Should fall through to other strategies
+        result = extract_json_from_text(text)
+        assert result is None
+
+    def test_score_with_equals(self):
+        """``score=<number>`` is accepted as well as the colon form."""
+        text = "score=0.6"
+        result = extract_json_from_text(text)
+        assert result is not None
+        assert result["score"] == 0.6
+
+
+class TestParseJsonResponse:
+    """Covers ``parse_json_response`` turning text into a ``SampleModel``."""
+
+    def test_valid_model(self):
+        """Well-formed JSON is returned as a populated ``SampleModel``."""
+        raw = '{"score": 0.8, "reasoning": "Good"}'
+        parsed = parse_json_response(raw, SampleModel)
+        assert isinstance(parsed, SampleModel)
+        assert (parsed.score, parsed.reasoning) == (0.8, "Good")
+
+    def test_extraction_failure(self):
+        """Text without JSON raises ``ValueError`` naming the extraction step."""
+        no_json = "no json here"
+        with pytest.raises(ValueError, match="Could not extract JSON"):
+            parse_json_response(no_json, SampleModel)
+
+    def test_validation_failure(self):
+        """JSON that does not fit the model raises ``ValueError`` for validation."""
+        wrong_type = '{"score": "not_a_number"}'
+        with pytest.raises(ValueError, match="JSON validation failed"):
+            parse_json_response(wrong_type, SampleModel)
+
+    def test_from_markdown_code_block(self):
+        """JSON wrapped in a fenced ``json`` block is parsed into the model."""
+        fenced = '```json\n{"score": 0.9, "reasoning": "Excellent"}\n```'
+        parsed = parse_json_response(fenced, SampleModel)
+        assert parsed.score == 0.9
+
+
+class TestExtractJsonEdgeCases:
+    """Further ``extract_json_from_text`` cases beyond the main suite."""
+
+    def test_embedded_json_with_invalid_inner(self):
+        """An embedded object is returned even when its score is not numeric."""
+        # The object is syntactically valid JSON; only the value type is odd,
+        # so extraction is still expected to hand back the dict.
+        extracted = extract_json_from_text('Result: {"score": "bad"} end')
+        assert extracted is not None
+        assert extracted["score"] == "bad"
+
+    def test_score_value_extraction_with_float(self):
+        """The number following ``score:`` is used, not a later ``1.0``."""
+        extracted = extract_json_from_text("Based on analysis, score: 0.75 out of 1.0")
+        assert extracted is not None
+        assert extracted["score"] == 0.75
+
+    def test_score_extraction_with_single_quotes(self):
+        """A stray single quote after ``score`` does not prevent extraction."""
+        extracted = extract_json_from_text("score': 0.6")
+        assert extracted is not None
+        assert extracted["score"] == 0.6
diff --git a/tests/test_llm_metrics.py b/tests/test_llm_metrics.py
new file mode 100644
index 0000000..8244ddd
--- /dev/null
+++ b/tests/test_llm_metrics.py
@@ -0,0 +1,412 @@
+"""Tests for fasteval.metrics.llm (with mocked LLM client)."""
+
+import json
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+from fasteval.metrics.llm import (
+    AnswerCorrectnessMetric,
+    BaseLLMMetric,
+    BiasMetric,
+    CoherenceMetric,
+    CompletenessMetric,
+    ConcisenessMetric,
+    ContextualPrecisionMetric,
+    ContextualRecallMetric,
+    CorrectnessMetric,
+    CriteriaMetric,
+    FaithfulnessMetric,
+    HallucinationMetric,
+    HelpfulnessMetric,
+    InstructionFollowingMetric,
+    LLMEvalResponse,
+    RelevanceMetric,
+    ToxicityMetric,
+)
+from fasteval.models.evaluation import EvalInput
+
+
+class MockLLMClient:
+    """Stand-in LLM client that replays one canned response and counts calls."""
+
+    def __init__(self, response_text=None):
+        # Any falsy value (None or "") selects the default JSON payload.
+        if not response_text:
+            response_text = json.dumps(
+                {"score": 0.85, "reasoning": "Mock evaluation"}
+            )
+        self.response_text = response_text
+        self.call_count = 0
+
+    async def invoke(self, messages):
+        """Count the call and return the canned text; ``messages`` is ignored."""
+        self.call_count = self.call_count + 1
+        return self.response_text
+
+
+class FailingLLMClient:
+    """LLM client double whose ``invoke`` fails on every call."""
+
+    async def invoke(self, messages):
+        """Raise ``RuntimeError("LLM error")`` regardless of ``messages``."""
+        failure = RuntimeError("LLM error")
+        raise failure
+
+
+# ── LLMEvalResponse ─────────────────────────────────────────────────────────
+
+
+class TestLLMEvalResponse:
+    """Construction and validation behaviour of ``LLMEvalResponse``."""
+
+    def test_valid(self):
+        """Values passed in are readable back unchanged."""
+        resp = LLMEvalResponse(score=0.8, reasoning="Good")
+        assert resp.score == 0.8
+        assert resp.reasoning == "Good"
+
+    def test_score_bounds(self):
+        """A score above 1.0 is rejected at construction time."""
+        # Validation errors raised by pydantic models derive from ValueError.
+        # Expecting bare Exception would also let unrelated failures (such as
+        # a TypeError from a misspelled keyword) pass the test.
+        # NOTE(review): assumes LLMEvalResponse is a pydantic model -- confirm.
+        with pytest.raises(ValueError):
+            LLMEvalResponse(score=1.5, reasoning="Out of range")
+
+    def test_default_reasoning(self):
+        """``reasoning`` is the empty string when not supplied."""
+        resp = LLMEvalResponse(score=0.5)
+        assert resp.reasoning == ""
+
+
+# ── BaseLLMMetric ────────────────────────────────────────────────────────────
+
+
+class TestBaseLLMMetric:
+    """Client lookup, retry and scoring checks, run through ``CorrectnessMetric``."""
+
+    def test_get_client_explicit(self):
+        """An explicitly supplied client is the one ``_get_client`` returns."""
+        supplied = MockLLMClient()
+        assert CorrectnessMetric(llm_client=supplied)._get_client() is supplied
+
+    def test_get_client_model_override(self):
+        """Passing ``model="gpt-4o"`` makes ``_get_client`` build an OpenAI client."""
+        metric = CorrectnessMetric(model="gpt-4o")
+        resolved = metric._get_client()
+
+        # Imported here so the provider module is only needed by this test.
+        from fasteval.providers.openai import OpenAIClient
+
+        assert isinstance(resolved, OpenAIClient)
+
+    @pytest.mark.asyncio
+    async def test_evaluate_success(self):
+        """The mock's default reply becomes the score, verdict and reasoning."""
+        metric = CorrectnessMetric(llm_client=MockLLMClient(), threshold=0.5)
+        sample = EvalInput(
+            actual_output="4",
+            expected_output="4",
+            input="What is 2+2?",
+        )
+
+        outcome = await metric.evaluate(sample)
+
+        assert outcome.score == 0.85
+        assert outcome.passed is True
+        assert outcome.reasoning == "Mock evaluation"
+
+    @pytest.mark.asyncio
+    async def test_evaluate_retry_then_success(self):
+        """An unparseable first reply is retried; the second reply is used."""
+
+        class FlakyClient:
+            """Returns non-JSON on the first call and valid JSON afterwards."""
+
+            def __init__(self):
+                self.calls = 0
+
+            async def invoke(self, messages):
+                self.calls += 1
+                if self.calls == 1:
+                    return "not json"
+                return json.dumps({"score": 0.9, "reasoning": "OK"})
+
+        flaky = FlakyClient()
+        metric = CorrectnessMetric(llm_client=flaky, max_retries=3)
+
+        outcome = await metric.evaluate(
+            EvalInput(actual_output="answer", expected_output="answer")
+        )
+
+        assert outcome.score == 0.9
+        assert flaky.calls == 2
+
+    @pytest.mark.asyncio
+    async def test_evaluate_all_retries_fail(self):
+        """When every call raises, the result is a failed 0.0 citing the error."""
+        metric = CorrectnessMetric(llm_client=FailingLLMClient(), max_retries=2)
+
+        outcome = await metric.evaluate(
+            EvalInput(actual_output="answer", expected_output="answer")
+        )
+
+        assert outcome.score == 0.0
+        assert outcome.passed is False
+        assert outcome.reasoning is not None and "LLM error" in outcome.reasoning
+
+    @pytest.mark.asyncio
+    async def test_binary_scoring(self):
+        """Binary scoring reports 1.0 when the raw score meets the threshold."""
+        reply = json.dumps({"score": 0.7, "reasoning": "Partial"})
+        metric = CorrectnessMetric(
+            llm_client=MockLLMClient(reply), scoring_type="binary", threshold=0.5
+        )
+
+        outcome = await metric.evaluate(
+            EvalInput(actual_output="answer", expected_output="answer")
+        )
+
+        assert outcome.score == 1.0  # 0.7 >= 0.5 → 1.0
+
+    @pytest.mark.asyncio
+    async def test_binary_scoring_below(self):
+        """Binary scoring reports 0.0 when the raw score is under the threshold."""
+        reply = json.dumps({"score": 0.3, "reasoning": "Low"})
+        metric = CorrectnessMetric(
+            llm_client=MockLLMClient(reply), scoring_type="binary", threshold=0.5
+        )
+
+        outcome = await metric.evaluate(
+            EvalInput(actual_output="answer", expected_output="answer")
+        )
+
+        assert outcome.score == 0.0  # 0.3 < 0.5 → 0.0
+
+
+# ── Specific Metric Prompt Tests ─────────────────────────────────────────────
+
+
+class TestCorrectnessMetric:
+    """Name and prompt checks for ``CorrectnessMetric``."""
+
+    def test_default_name(self):
+        """Without an explicit name the metric is called ``"correctness"``."""
+        assert CorrectnessMetric(llm_client=MockLLMClient()).name == "correctness"
+
+    def test_prompt_contains_inputs(self):
+        """The prompt embeds the question and the "Semantic Equivalence" text."""
+        metric = CorrectnessMetric(llm_client=MockLLMClient())
+        sample = EvalInput(
+            actual_output="4",
+            expected_output="4",
+            input="What is 2+2?",
+        )
+
+        rendered = metric.get_evaluation_prompt(sample)
+
+        for fragment in ("What is 2+2?", "Semantic Equivalence"):
+            assert fragment in rendered
+
+
+class TestHallucinationMetric:
+    """Threshold and prompt checks for ``HallucinationMetric``."""
+
+    def test_default_threshold(self):
+        """The threshold is 0.9 when none is passed."""
+        assert HallucinationMetric(llm_client=MockLLMClient()).threshold == 0.9
+
+    def test_prompt_includes_context(self):
+        """Context passages supplied on the input show up in the prompt."""
+        metric = HallucinationMetric(llm_client=MockLLMClient())
+        sample = EvalInput(
+            actual_output="The earth is flat",
+            context=["The earth is round"],
+        )
+
+        rendered = metric.get_evaluation_prompt(sample)
+
+        assert "The earth is round" in rendered
+
+
+class TestRelevanceMetric:
+    """Name and evaluation checks for ``RelevanceMetric``."""
+
+    def test_default_name(self):
+        """Without an explicit name the metric is called ``"relevance"``."""
+        assert RelevanceMetric(llm_client=MockLLMClient()).name == "relevance"
+
+    @pytest.mark.asyncio
+    async def test_evaluation(self):
+        """The mock client's default score of 0.85 is passed through."""
+        metric = RelevanceMetric(llm_client=MockLLMClient(), threshold=0.5)
+        sample = EvalInput(actual_output="answer", input="question")
+
+        outcome = await metric.evaluate(sample)
+
+        assert outcome.score == 0.85
+
+
+class TestCriteriaMetric:
+    """Name and prompt checks for ``CriteriaMetric``."""
+
+    def test_default_name(self):
+        """Without an explicit name the metric is called ``"criteria"``."""
+        metric = CriteriaMetric(llm_client=MockLLMClient(), criteria="Be concise")
+        assert metric.name == "criteria"
+
+    def test_prompt_includes_criteria(self):
+        """The criteria text is embedded in the prompt."""
+        metric = CriteriaMetric(
+            llm_client=MockLLMClient(), criteria="Answer must be formal"
+        )
+        sample = EvalInput(actual_output="yo what up")
+        assert "Answer must be formal" in metric.get_evaluation_prompt(sample)
+
+    def test_with_evaluation_steps(self):
+        """Supplied evaluation steps are embedded in the prompt."""
+        metric = CriteriaMetric(
+            llm_client=MockLLMClient(),
+            criteria="test",
+            evaluation_steps=["Step 1", "Step 2"],
+        )
+        sample = EvalInput(actual_output="test")
+        assert "Step 1" in metric.get_evaluation_prompt(sample)
+
+
+class TestToxicityMetric:
+    """Name and evaluation checks for ``ToxicityMetric``."""
+
+    def test_default_name(self):
+        """Without an explicit name the metric is called ``"toxicity"``."""
+        assert ToxicityMetric(llm_client=MockLLMClient()).name == "toxicity"
+
+    @pytest.mark.asyncio
+    async def test_evaluation(self):
+        """With the mock's 0.85 reply and a 0.5 threshold the result passes."""
+        metric = ToxicityMetric(llm_client=MockLLMClient(), threshold=0.5)
+        sample = EvalInput(actual_output="Hello, how can I help?")
+
+        outcome = await metric.evaluate(sample)
+
+        assert outcome.passed is True
+
+
+class TestBiasMetric:
+    """Name check for ``BiasMetric``."""
+
+    def test_default_name(self):
+        """Without an explicit name the metric is called ``"bias"``."""
+        assert BiasMetric(llm_client=MockLLMClient()).name == "bias"
+
+
+class TestQualityMetrics:
+    """Default names of the quality metrics, plus one prompt check."""
+
+    def test_conciseness_name(self):
+        """``ConcisenessMetric`` is named ``"conciseness"``."""
+        assert ConcisenessMetric(llm_client=MockLLMClient()).name == "conciseness"
+
+    def test_coherence_name(self):
+        """``CoherenceMetric`` is named ``"coherence"``."""
+        assert CoherenceMetric(llm_client=MockLLMClient()).name == "coherence"
+
+    def test_completeness_name(self):
+        """``CompletenessMetric`` is named ``"completeness"``."""
+        assert CompletenessMetric(llm_client=MockLLMClient()).name == "completeness"
+
+    def test_helpfulness_name(self):
+        """``HelpfulnessMetric`` is named ``"helpfulness"``."""
+        assert HelpfulnessMetric(llm_client=MockLLMClient()).name == "helpfulness"
+
+    def test_instruction_following_name(self):
+        """``InstructionFollowingMetric`` is named ``"instruction_following"``."""
+        metric = InstructionFollowingMetric(llm_client=MockLLMClient())
+        assert metric.name == "instruction_following"
+
+    def test_instruction_following_prompt(self):
+        """Instructions given to the metric are embedded in the prompt."""
+        metric = InstructionFollowingMetric(
+            llm_client=MockLLMClient(),
+            instructions=["Always respond in French"],
+        )
+        rendered = metric.get_evaluation_prompt(EvalInput(actual_output="Bonjour"))
+        assert "Always respond in French" in rendered
+
+
+class TestRAGMetrics:
+    """Default names and prompt contents of the retrieval-oriented metrics."""
+
+    def test_faithfulness_name(self):
+        """``FaithfulnessMetric`` is named ``"faithfulness"``."""
+        assert FaithfulnessMetric(llm_client=MockLLMClient()).name == "faithfulness"
+
+    def test_faithfulness_prompt_includes_context(self):
+        """Every context document appears in the faithfulness prompt."""
+        metric = FaithfulnessMetric(llm_client=MockLLMClient())
+        sample = EvalInput(
+            actual_output="answer",
+            context=["doc1", "doc2"],
+        )
+
+        rendered = metric.get_evaluation_prompt(sample)
+
+        for document in ("doc1", "doc2"):
+            assert document in rendered
+
+    def test_contextual_precision_name(self):
+        """``ContextualPrecisionMetric`` is named ``"contextual_precision"``."""
+        metric = ContextualPrecisionMetric(llm_client=MockLLMClient())
+        assert metric.name == "contextual_precision"
+
+    def test_contextual_recall_name(self):
+        """``ContextualRecallMetric`` is named ``"contextual_recall"``."""
+        metric = ContextualRecallMetric(llm_client=MockLLMClient())
+        assert metric.name == "contextual_recall"
+
+    def test_answer_correctness_name(self):
+        """``AnswerCorrectnessMetric`` is named ``"answer_correctness"``."""
+        metric = AnswerCorrectnessMetric(llm_client=MockLLMClient())
+        assert metric.name == "answer_correctness"
+
+    def test_contextual_precision_prompt(self):
+        """Retrieval context supplied on the input appears in the prompt."""
+        metric = ContextualPrecisionMetric(llm_client=MockLLMClient())
+        sample = EvalInput(
+            actual_output="answer",
+            input="question",
+            retrieval_context=["doc1", "doc2"],
+        )
+        assert "doc1" in metric.get_evaluation_prompt(sample)
+
+    def test_contextual_recall_prompt(self):
+        """Context supplied on the input appears in the recall prompt."""
+        metric = ContextualRecallMetric(llm_client=MockLLMClient())
+        sample = EvalInput(
+            actual_output="answer",
+            expected_output="expected",
+            context=["doc1"],
+        )
+        assert "doc1" in metric.get_evaluation_prompt(sample)
+
+    def test_answer_correctness_prompt(self):
+        """The input question appears in the answer-correctness prompt."""
+        metric = AnswerCorrectnessMetric(llm_client=MockLLMClient())
+        sample = EvalInput(
+            actual_output="answer",
+            expected_output="expected",
+            input="question",
+        )
+        assert "question" in metric.get_evaluation_prompt(sample)
+
+
+class TestLLMMetricEdgeCases:
+    """Remaining LLM-metric paths: default provider, parsing, and evaluations."""
+
+    def test_get_client_default_provider(self):
+        """Test _get_client falls back to default provider."""
+        mock_client = MockLLMClient()
+        from fasteval.providers.registry import (
+            clear_default_provider,
+            set_default_provider,
+        )
+
+        set_default_provider(mock_client)
+        try:
+            metric = CorrectnessMetric()
+            client = metric._get_client()
+            assert client is mock_client
+        finally:
+            # Always unregister so the default does not leak into other tests.
+            clear_default_provider()
+
+    def test_parse_response(self):
+        """``_parse_response`` reads the score out of a JSON string."""
+        metric = CorrectnessMetric(llm_client=MockLLMClient())
+        result = metric._parse_response('{"score": 0.7, "reasoning": "OK"}')
+        assert result.score == 0.7
+
+    @pytest.mark.asyncio
+    async def test_conciseness_evaluation(self):
+        """Conciseness passes with the mock's 0.85 reply and a 0.5 threshold."""
+        client = MockLLMClient()
+        metric = ConcisenessMetric(llm_client=client, threshold=0.5)
+        result = await metric.evaluate(
+            EvalInput(actual_output="Short answer", input="question")
+        )
+        assert result.passed is True
+
+    @pytest.mark.asyncio
+    async def test_coherence_evaluation(self):
+        """Coherence passes with the mock's 0.85 reply and a 0.5 threshold."""
+        client = MockLLMClient()
+        metric = CoherenceMetric(llm_client=client, threshold=0.5)
+        result = await metric.evaluate(EvalInput(actual_output="Coherent text"))
+        assert result.passed is True
+
+    @pytest.mark.asyncio
+    async def test_completeness_evaluation(self):
+        """Completeness passes with the mock's 0.85 reply and a 0.5 threshold."""
+        client = MockLLMClient()
+        metric = CompletenessMetric(llm_client=client, threshold=0.5)
+        result = await metric.evaluate(
+            EvalInput(actual_output="Complete answer", input="q")
+        )
+        assert result.passed is True
+
+    @pytest.mark.asyncio
+    async def test_helpfulness_evaluation(self):
+        """Helpfulness passes with the mock's 0.85 reply and a 0.5 threshold."""
+        client = MockLLMClient()
+        metric = HelpfulnessMetric(llm_client=client, threshold=0.5)
+        result = await metric.evaluate(
+            EvalInput(actual_output="Helpful response", input="q")
+        )
+        assert result.passed is True
+
+    @pytest.mark.asyncio
+    async def test_instruction_following_evaluation(self):
+        """Instruction following passes with the mock's reply and a 0.5 threshold."""
+        client = MockLLMClient()
+        metric = InstructionFollowingMetric(
+            llm_client=client,
+            instructions=["Be formal"],
+            threshold=0.5,
+        )
+        result = await metric.evaluate(EvalInput(actual_output="Formal response"))
+        assert result.passed is True
+
+    def test_instruction_following_no_instructions(self):
+        """Without instructions the prompt contains the "No instructions" text."""
+        # Building a prompt awaits nothing, so this is a plain synchronous
+        # test rather than an asyncio-marked coroutine.
+        metric = InstructionFollowingMetric(llm_client=MockLLMClient(), threshold=0.5)
+        prompt = metric.get_evaluation_prompt(EvalInput(actual_output="test"))
+        assert "No instructions" in prompt
diff --git a/tests/test_plugin.py b/tests/test_plugin.py
new file mode 100644
index 0000000..aa52ec3
--- /dev/null
+++ b/tests/test_plugin.py
@@ -0,0 +1,217 @@
+"""Tests for fasteval.testing.plugin."""
+
+import os
+from io import StringIO
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from fasteval.collectors.summary import EvalRunSummary, MetricAggregate, TestCaseSummary
+from fasteval.testing.plugin import (
+    _print_console_summary,
+    pytest_addoption,
+    pytest_configure,
+    pytest_sessionfinish,
+    pytest_sessionstart,
+    pytest_unconfigure,
+)
+
+# ── pytest_addoption ─────────────────────────────────────────────────────────
+
+
+class TestPytestAddoption:
+    """Verifies the command-line options registered by ``pytest_addoption``."""
+
+    def test_adds_options(self):
+        """Exactly three options are added, among them the three expected flags."""
+        parser = MagicMock()
+
+        pytest_addoption(parser)
+
+        assert parser.addoption.call_count == 3
+        # The first positional argument of each addoption call is the flag name.
+        registered = [call.args[0] for call in parser.addoption.call_args_list]
+        for flag in ("--no-interactive", "--fe-output", "--fe-summary"):
+            assert flag in registered
+
+
+# ── pytest_configure ─────────────────────────────────────────────────────────
+
+
+class TestPytestConfigure:
+    """Checks how ``pytest_configure`` treats ``FASTEVAL_NO_INTERACTIVE``."""
+
+    def test_sets_env_var(self):
+        """With the option reported as true, the variable is set to ``"1"``."""
+        mock_config = MagicMock()
+        mock_config.getoption.return_value = True
+        # patch.dict snapshots os.environ and restores it on exit, so no
+        # manual cleanup is needed afterwards.
+        with patch.dict(os.environ):
+            # Start from a known-unset state; otherwise a value already
+            # exported in the caller's shell would let the assert pass
+            # without the hook doing anything.
+            os.environ.pop("FASTEVAL_NO_INTERACTIVE", None)
+            pytest_configure(mock_config)
+            assert os.environ.get("FASTEVAL_NO_INTERACTIVE") == "1"
+
+    def test_no_env_var_when_false(self):
+        """With the option reported as false, the variable stays unset."""
+        mock_config = MagicMock()
+        mock_config.getoption.return_value = False
+        # Isolated the same way so a regression cannot leak the variable
+        # into the rest of the test session.
+        with patch.dict(os.environ):
+            os.environ.pop("FASTEVAL_NO_INTERACTIVE", None)
+            pytest_configure(mock_config)
+            assert "FASTEVAL_NO_INTERACTIVE" not in os.environ
+
+
+# ── pytest_sessionstart ──────────────────────────────────────────────────────
+
+
+class TestPytestSessionStart:
+    """Checks that ``pytest_sessionstart`` resets the result collector."""
+
+    def test_resets_collector(self):
+        """Starting a session calls ``reset_collector`` once."""
+        with patch("fasteval.collectors.collector.reset_collector") as mock_reset:
+            pytest_sessionstart(MagicMock())
+        # Without this check the patch was set up but never inspected, so the
+        # test could not fail. NOTE(review): relies on the plugin looking up
+        # reset_collector from fasteval.collectors.collector at call time, as
+        # the get_collector patches in the session-finish tests do -- confirm.
+        mock_reset.assert_called_once()
+
+
+# ── pytest_sessionfinish ─────────────────────────────────────────────────────
+
+
+class TestPytestSessionFinish:
+    """Exercises ``pytest_sessionfinish`` against a mocked collector."""
+
+    @staticmethod
+    def _session(fe_summary, fe_output):
+        """Return a mock session whose ``config.getoption`` serves both flags.
+
+        Any other option name falls back to the ``default`` keyword, if given.
+        """
+        session = MagicMock()
+        session.config.getoption.side_effect = lambda opt, **kwargs: {
+            "--fe-summary": fe_summary,
+            "--fe-output": fe_output,
+        }.get(opt, kwargs.get("default"))
+        return session
+
+    @staticmethod
+    def _collector(results):
+        """Return a mock collector exposing ``results``."""
+        collector = MagicMock()
+        collector.results = results
+        return collector
+
+    def test_no_results_noop(self):
+        """With no collected results, nothing is reported."""
+        session = MagicMock()
+        collector = self._collector([])
+
+        with patch(
+            "fasteval.collectors.collector.get_collector",
+            return_value=collector,
+        ):
+            pytest_sessionfinish(session, 0)
+            collector.report.assert_not_called()
+
+    def test_fe_output_with_path(self):
+        """``json:output.json`` is split into a format and a ``path`` keyword."""
+        session = self._session(False, ["json:output.json"])
+        collector = self._collector([MagicMock()])
+
+        with patch(
+            "fasteval.collectors.collector.get_collector",
+            return_value=collector,
+        ):
+            pytest_sessionfinish(session, 0)
+            collector.report.assert_called_once_with("json", path="output.json")
+
+    def test_fe_output_without_path(self):
+        """A bare ``json`` output spec is reported without a path."""
+        session = self._session(False, ["json"])
+        collector = self._collector([MagicMock()])
+        collector.report.return_value = '{"test": true}'
+
+        with patch(
+            "fasteval.collectors.collector.get_collector",
+            return_value=collector,
+        ):
+            pytest_sessionfinish(session, 0)
+            collector.report.assert_called_once_with("json")
+
+    def test_fe_summary(self):
+        """``--fe-summary`` triggers one console summary."""
+        session = self._session(True, [])
+        collector = self._collector([MagicMock()])
+
+        with (
+            patch(
+                "fasteval.collectors.collector.get_collector",
+                return_value=collector,
+            ),
+            patch("fasteval.testing.plugin._print_console_summary") as printer,
+        ):
+            pytest_sessionfinish(session, 0)
+            printer.assert_called_once()
+
+
+# ── pytest_unconfigure ───────────────────────────────────────────────────────
+
+
+class TestPytestUnconfigure:
+    """Covers the environment cleanup performed by ``pytest_unconfigure``."""
+
+    def test_cleans_up_env_var(self):
+        """``FASTEVAL_NO_INTERACTIVE`` is removed when the option is reported set."""
+        mock_config = MagicMock()
+        mock_config.getoption.return_value = True
+        with patch.dict(os.environ, {"FASTEVAL_NO_INTERACTIVE": "1"}):
+            pytest_unconfigure(mock_config)
+            assert "FASTEVAL_NO_INTERACTIVE" not in os.environ
+
+    def test_no_cleanup_when_not_set(self):
+        """With the variable absent and the option off, the hook runs cleanly."""
+        mock_config = MagicMock()
+        mock_config.getoption.return_value = False
+        # patch.dict snapshots os.environ and restores it on exit, so removing
+        # the variable here cannot leak into other tests.
+        with patch.dict(os.environ):
+            os.environ.pop("FASTEVAL_NO_INTERACTIVE", None)
+            pytest_unconfigure(mock_config)
+            # Nothing was asserted here before; pin that the hook leaves the
+            # variable unset.
+            assert "FASTEVAL_NO_INTERACTIVE" not in os.environ
+
+
+# ── _print_console_summary ───────────────────────────────────────────────────
+
+
+class TestPrintConsoleSummary:
+    """Checks what ``_print_console_summary`` writes to standard output."""
+
+    def test_output_format(self, capsys):
+        """Totals, the metric name and the failing test name are all printed."""
+        correctness = MetricAggregate(
+            metric_name="correctness",
+            count=3,
+            pass_count=2,
+            fail_count=1,
+            pass_rate=2 / 3,
+            avg_score=0.75,
+            min_score=0.3,
+            max_score=1.0,
+        )
+        passing_case = TestCaseSummary(
+            test_name="test_pass",
+            passed=True,
+            aggregate_score=1.0,
+            metric_count=1,
+            execution_time_ms=10.0,
+        )
+        failing_case = TestCaseSummary(
+            test_name="test_fail",
+            passed=False,
+            aggregate_score=0.3,
+            metric_count=1,
+            execution_time_ms=20.0,
+            error="low score",
+        )
+        summary = EvalRunSummary(
+            total_tests=3,
+            passed_tests=2,
+            failed_tests=1,
+            pass_rate=2 / 3,
+            avg_aggregate_score=0.75,
+            total_execution_time_ms=100.0,
+            metric_aggregates=[correctness],
+            test_summaries=[passing_case, failing_case],
+        )
+
+        _print_console_summary(summary)
+
+        printed = capsys.readouterr().out
+        for fragment in (
+            "FastEval Summary",
+            "3 total",
+            "2 passed",
+            "1 failed",
+            "correctness",
+            "test_fail",
+        ):
+            assert fragment in printed
+
+    def test_no_metrics(self, capsys):
+        """A summary without metric aggregates still prints the heading."""
+        bare_summary = EvalRunSummary(
+            total_tests=1,
+            passed_tests=1,
+            failed_tests=0,
+            pass_rate=1.0,
+            avg_aggregate_score=1.0,
+            total_execution_time_ms=5.0,
+        )
+
+        _print_console_summary(bare_summary)
+
+        assert "FastEval Summary" in capsys.readouterr().out
diff --git a/tests/test_providers.py b/tests/test_providers.py
new file mode 100644
index 0000000..8577f0e
--- /dev/null
+++ b/tests/test_providers.py
@@ -0,0 +1,192 @@
+"""Tests for fasteval.providers (registry, openai, anthropic)."""
+
+import os
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+import fasteval.providers.registry as registry_module
+from fasteval.providers.anthropic import AnthropicClient
+from fasteval.providers.openai import OpenAIClient
+from fasteval.providers.registry import (
+    clear_default_provider,
+    create_provider_for_model,
+    get_default_provider,
+    set_default_provider,
+)
+
+
+class MockLLMClient:  # minimal stand-in exposing an async ``invoke`` method
+    async def invoke(self, messages):
+        return "mock response"  # canned reply; ``messages`` is ignored
+
+
+class TestProviderRegistry:
+    """Default-provider registry: explicit, env-derived and per-model lookup."""
+
+    def setup_method(self):
+        # Start every test without a registered default provider.
+        clear_default_provider()
+
+    def test_set_and_get_default_provider(self):
+        """An explicitly registered client is returned as the same object."""
+        client = MockLLMClient()
+        set_default_provider(client)
+        assert get_default_provider() is client
+
+    def test_clear_default_provider(self):
+        set_default_provider(MockLLMClient())
+        clear_default_provider()
+        # With the default cleared and the environment emptied, the lookup
+        # has nothing to fall back on and must raise.
+        with patch.dict(os.environ, {}, clear=True):
+            with pytest.raises(ValueError, match="No LLM provider configured"):
+                get_default_provider()
+
+    def test_create_from_env_openai(self):
+        """OPENAI_API_KEY in the environment yields an OpenAIClient."""
+        clear_default_provider()
+        with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+            provider = get_default_provider()
+            assert isinstance(provider, OpenAIClient)
+
+    def test_create_from_env_anthropic(self):
+        """ANTHROPIC_API_KEY without an OpenAI key yields an AnthropicClient."""
+        clear_default_provider()
+        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "test-key"}):
+            # patch.dict restores the whole mapping on exit, so dropping the
+            # OpenAI key here cannot leak into other tests.
+            os.environ.pop("OPENAI_API_KEY", None)
+            provider = get_default_provider()
+            assert isinstance(provider, AnthropicClient)
+
+    def test_create_from_env_no_keys(self):
+        """An empty environment leaves nothing to build a provider from."""
+        clear_default_provider()
+        with patch.dict(os.environ, {}, clear=True):
+            with pytest.raises(ValueError, match="No LLM provider configured"):
+                get_default_provider()
+
+    def test_create_provider_for_model_gpt(self):
+        provider = create_provider_for_model("gpt-4o")
+        assert isinstance(provider, OpenAIClient)
+        assert provider.model == "gpt-4o"
+
+    def test_create_provider_for_model_o1(self):
+        """An "o1-..." model name maps to the OpenAI client."""
+        provider = create_provider_for_model("o1-preview")
+        assert isinstance(provider, OpenAIClient)
+
+    def test_create_provider_for_model_o3(self):
+        provider = create_provider_for_model("o3-mini")
+        assert isinstance(provider, OpenAIClient)
+
+    def test_create_provider_for_model_claude(self):
+        provider = create_provider_for_model("claude-sonnet-4-6")
+        assert isinstance(provider, AnthropicClient)
+        assert provider.model == "claude-sonnet-4-6"
+
+    def test_create_provider_for_model_unknown(self):
+        """An unrecognised model name raises ValueError."""
+        with pytest.raises(ValueError, match="Unknown model"):
+            create_provider_for_model("llama-70b")
+
+
+class TestOpenAIClient:
+    """Construction, repr and mocked invocation of OpenAIClient."""
+
+    def test_init_defaults(self):
+        default = OpenAIClient()
+        assert (default.model, default.temperature) == ("gpt-4o-mini", 0.0)
+        assert default._client is None
+
+    def test_init_custom(self):
+        custom = OpenAIClient(model="gpt-4o", api_key="key", temperature=0.5)
+        assert custom.model == "gpt-4o"
+        assert custom.api_key == "key"
+        assert custom.temperature == 0.5
+
+    def test_repr(self):
+        assert repr(OpenAIClient(model="gpt-4o")) == "OpenAIClient(model='gpt-4o')"
+
+    @pytest.mark.asyncio
+    async def test_invoke_mocked(self):
+        # Canned completion exposing choices[0].message.content.
+        reply = MagicMock()
+        reply.choices = [MagicMock()]
+        reply.choices[0].message.content = "Hello!"
+
+        # Stub SDK handle that is assigned to the client's ``_client`` below.
+        sdk = AsyncMock()
+        sdk.chat.completions.create = AsyncMock(return_value=reply)
+
+        client = OpenAIClient(model="gpt-4o", api_key="test")
+        client._client = sdk
+
+        answer = await client.invoke([{"role": "user", "content": "Hi"}])
+        assert answer == "Hello!"
+
+
+class TestProviderRegistryEdgeCases:
+    def setup_method(self):
+        clear_default_provider()
+
+    def test_create_provider_for_model_claude_import_error(self):
+        with patch.dict("sys.modules", {"anthropic": None}):
+            # A None entry in sys.modules makes ``import anthropic`` raise ImportError.
+            try:
+                provider = create_provider_for_model("claude-3-5-sonnet")
+                # NOTE(review): success is accepted too, so neither path can fail the test.
+                assert isinstance(provider, AnthropicClient)
+            except ImportError:
+                pass  # tolerated: provider creation needed the blocked SDK import
+
+
+class TestAnthropicClient:
+    """Construction, repr and mocked invocation of AnthropicClient."""
+
+    def test_init_defaults(self):
+        default = AnthropicClient()
+        assert default.model == "claude-sonnet-4-6"
+        assert (default.temperature, default.max_tokens) == (0.0, 4096)
+
+    def test_repr(self):
+        shown = repr(AnthropicClient(model="claude-3-5-sonnet"))
+        assert shown == "AnthropicClient(model='claude-3-5-sonnet')"
+
+    @pytest.mark.asyncio
+    async def test_invoke_mocked(self):
+        # Canned SDK reply exposing content[0].text.
+        reply = MagicMock()
+        reply.content = [MagicMock()]
+        reply.content[0].text = "Hi from Claude!"
+
+        sdk = AsyncMock()
+        sdk.messages.create = AsyncMock(return_value=reply)
+
+        client = AnthropicClient(model="claude-sonnet-4-6", api_key="test")
+        client._client = sdk
+
+        answer = await client.invoke([{"role": "user", "content": "Hello"}])
+        assert answer == "Hi from Claude!"
+
+    @pytest.mark.asyncio
+    async def test_invoke_with_system_message(self):
+        reply = MagicMock()
+        reply.content = [MagicMock()]
+        reply.content[0].text = "Response"
+
+        sdk = AsyncMock()
+        sdk.messages.create = AsyncMock(return_value=reply)
+
+        client = AnthropicClient(api_key="test")
+        client._client = sdk
+
+        system_turn = {"role": "system", "content": "You are helpful"}
+        user_turn = {"role": "user", "content": "Hi"}
+        await client.invoke([system_turn, user_turn])
+
+        # The system turn must arrive as the ``system`` kwarg, not in ``messages``.
+        _, sent = sdk.messages.create.call_args
+        assert sent["system"] == "You are helpful"
+        assert [m["role"] for m in sent["messages"]] == ["user"]
diff --git a/tests/test_reporters.py b/tests/test_reporters.py
new file mode 100644
index 0000000..145aff0
--- /dev/null
+++ b/tests/test_reporters.py
@@ -0,0 +1,155 @@
+"""Tests for fasteval.collectors.reporters (json + html)."""
+
+import json
+
+import pytest
+
+from fasteval.collectors.reporters.html_reporter import HtmlReporter
+from fasteval.collectors.reporters.json_reporter import JsonReporter
+from fasteval.collectors.summary import EvalRunSummary
+from fasteval.models.evaluation import EvalInput, EvalResult, MetricResult
+
+
+def _make_metric(name="m1", score=0.8, passed=True, threshold=0.5, reasoning="OK"):
+    """Build a MetricResult for reporter tests.
+
+    Every field has a default and can be overridden per test.
+    """
+    fields = {"metric_name": name, "score": score, "passed": passed}
+    fields.update(threshold=threshold, reasoning=reasoning)
+    return MetricResult(**fields)
+
+
+def _make_result(
+    passed=True, aggregate_score=0.8, metrics=None, execution_time_ms=10.0
+):
+    """Build an EvalResult around a fixed EvalInput; no metrics by default."""
+    fixed_input = EvalInput(actual_output="test output", expected_output="expected")
+    outcome = {"passed": passed, "aggregate_score": aggregate_score}
+    outcome["execution_time_ms"] = execution_time_ms
+    return EvalResult(
+        eval_input=fixed_input, metric_results=metrics or [], **outcome
+    )
+
+
+def _make_summary_and_results():
+    """Return (summary, results) for one passing and one failing test."""
+    good = _make_metric(name="correctness", score=0.9, passed=True)
+    bad = _make_metric("relevance", 0.3, False, reasoning="Not relevant")
+    passing = _make_result(passed=True, metrics=[good])
+    failing = _make_result(passed=False, aggregate_score=0.3, metrics=[bad])
+    names = ["test_pass", "test_fail"]
+    summary = EvalRunSummary.from_results([passing, failing], names)
+    return summary, [passing, failing]
+
+
+# ── JsonReporter ─────────────────────────────────────────────────────────────
+
+
+class TestJsonReporter:
+    """JsonReporter output: validity, structure, options and empty runs."""
+
+    def test_generates_valid_json(self):
+        summary, results = _make_summary_and_results()
+        output = JsonReporter().generate(summary, results)
+        parsed = json.loads(output)
+        assert isinstance(parsed, dict)
+
+    def test_includes_summary_and_results(self):
+        summary, results = _make_summary_and_results()
+        reporter = JsonReporter()
+        parsed = json.loads(reporter.generate(summary, results))
+        assert "summary" in parsed
+        assert "results" in parsed
+        assert len(parsed["results"]) == 2
+
+    def test_test_name_assigned(self):
+        summary, results = _make_summary_and_results()
+        reporter = JsonReporter()
+        parsed = json.loads(reporter.generate(summary, results))
+        assert parsed["results"][0]["test_name"] == "test_pass"
+        assert parsed["results"][1]["test_name"] == "test_fail"
+
+    def test_include_inputs_false(self):
+        summary, results = _make_summary_and_results()
+        reporter = JsonReporter(include_inputs=False)
+        parsed = json.loads(reporter.generate(summary, results))
+        for r in parsed["results"]:
+            assert "eval_input" not in r
+
+    def test_custom_indent(self):
+        summary, results = _make_summary_and_results()
+        output = JsonReporter(indent=4).generate(summary, results)
+        # A 2-space dump also contains 4-space runs at depth two, so check
+        # that every line's indentation is a multiple of four instead.
+        widths = {len(ln) - len(ln.lstrip(" ")) for ln in output.splitlines()}
+        assert 4 in widths and all(w % 4 == 0 for w in widths)
+
+    def test_empty_results(self):
+        summary = EvalRunSummary.from_results([], [])
+        parsed = json.loads(JsonReporter().generate(summary, []))
+        assert parsed["results"] == []
+
+
+# ── HtmlReporter ─────────────────────────────────────────────────────────────
+
+
+class TestHtmlReporter:
+    """HtmlReporter output: document skeleton, sections and empty states."""
+
+    @staticmethod
+    def _render_sample():
+        # HTML for the shared one-pass / one-fail fixture.
+        summary, results = _make_summary_and_results()
+        return HtmlReporter().generate(summary, results)
+
+    def test_generates_html(self):
+        """The output carries a doctype and opening/closing html tags."""
+        html = self._render_sample()
+        for marker in ("<!DOCTYPE html>", "<html", "</html>"):
+            assert marker in html
+
+    def test_contains_key_sections(self):
+        """The three expected headings are present."""
+        html = self._render_sample()
+        assert "FastEval Evaluation Report" in html
+        assert "Metric Breakdown" in html
+        assert "Test Results" in html
+
+    def test_cards_section(self):
+        """The three card labels are present."""
+        html = self._render_sample()
+        for label in ("Total Tests", "Passed", "Failed"):
+            assert label in html
+
+    def test_metric_table(self):
+        """Both fixture metric names appear in the report."""
+        html = self._render_sample()
+        assert "correctness" in html
+        assert "relevance" in html
+
+    def test_pass_fail_badges(self):
+        """Both outcome labels appear for the mixed fixture."""
+        html = self._render_sample()
+        assert "PASS" in html
+        assert "FAIL" in html
+
+    def test_empty_results(self):
+        # A run with no tests renders the "No test results" placeholder.
+        empty = EvalRunSummary.from_results([], [])
+        html = HtmlReporter().generate(empty, [])
+        assert "No test results" in html
+
+    def test_empty_metrics(self):
+        bare = _make_result(metrics=[])
+        summary = EvalRunSummary.from_results([bare], ["test1"])
+        html = HtmlReporter().generate(summary, [bare])
+        assert "No metrics recorded" in html
+
+    def test_reasoning_displayed(self):
+        """A metric's reasoning text is rendered into the report."""
+        failing_metric = _make_metric(reasoning="Detailed reason here", passed=False)
+        failing = _make_result(metrics=[failing_metric], passed=False)
+        summary = EvalRunSummary.from_results([failing], ["test1"])
+        html = HtmlReporter().generate(summary, [failing])
+        assert "Detailed reason here" in html
diff --git a/tests/test_scoring.py b/tests/test_scoring.py
new file mode 100644
index 0000000..59069b6
--- /dev/null
+++ b/tests/test_scoring.py
@@ -0,0 +1,191 @@
+"""Tests for fasteval.core.scoring."""
+
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from fasteval.core.scoring import (
+    _get_test_name_from_caller,
+    _last_score_result,
+    _normalize_audio,
+    _normalize_generated_image,
+    _normalize_image,
+    clear_last_score_result,
+    get_last_score_result,
+    score,
+)
+from fasteval.models.evaluation import (
+    EvalInput,
+    EvalResult,
+    EvaluationFailedError,
+    ExpectedTool,
+    MetricResult,
+    ToolCall,
+)
+from fasteval.models.multimodal import AudioInput, GeneratedImage, ImageInput
+
+# ── Normalization helpers ────────────────────────────────────────────────────
+
+
+class TestNormalizeImage:
+    # ImageInput and str values pass through; a Path comes back as a plain str.
+    def test_image_input_passthrough(self):
+        original = ImageInput(source="test.png")
+        assert _normalize_image(original) is original
+
+    def test_path_to_str(self):
+        normalized = _normalize_image(Path("/tmp/test.png"))
+        assert isinstance(normalized, str) and normalized == "/tmp/test.png"
+
+    def test_string_passthrough(self):
+        assert _normalize_image("http://img.png") == "http://img.png"
+
+
+class TestNormalizeAudio:
+    # AudioInput and str values pass through; a Path comes back as its string form.
+    def test_audio_input_passthrough(self):
+        clip = AudioInput(source="test.wav")
+        assert _normalize_audio(clip) is clip
+
+    def test_path_to_str(self):
+        assert _normalize_audio(Path("/tmp/test.wav")) == "/tmp/test.wav"
+
+    def test_string_passthrough(self):
+        assert _normalize_audio("http://audio.wav") == "http://audio.wav"
+
+
+class TestNormalizeGeneratedImage:
+    # GeneratedImage, ImageInput and str pass through; a Path is stringified.
+    def test_generated_image_passthrough(self):
+        generated = GeneratedImage(image=ImageInput(source="test.png"), prompt="A test image")
+        assert _normalize_generated_image(generated) is generated
+
+    def test_image_input_passthrough(self):
+        plain = ImageInput(source="test.png")
+        assert _normalize_generated_image(plain) is plain
+
+    def test_path_to_str(self):
+        assert _normalize_generated_image(Path("/tmp/gen.png")) == "/tmp/gen.png"
+
+    def test_string_passthrough(self):
+        assert _normalize_generated_image("gen.png") == "gen.png"
+
+
+# ── Context variable storage ─────────────────────────────────────────────────
+
+
+class TestLastScoreResult:
+    """Set / get / clear round-trips through ``_last_score_result``."""
+
+    @staticmethod
+    def _passing_result():
+        # Minimal EvalResult used as the stored value.
+        sample_input = EvalInput(actual_output="test")
+        return EvalResult(
+            eval_input=sample_input, metric_results=[], passed=True, aggregate_score=1.0
+        )
+
+    def test_default_is_none(self):
+        # After an explicit clear, the getter reads back None.
+        clear_last_score_result()
+        assert get_last_score_result() is None
+
+    def test_set_and_get(self):
+        stored = self._passing_result()
+        _last_score_result.set(stored)
+        assert get_last_score_result() is stored
+        # Leave no stored result behind for later tests.
+        clear_last_score_result()
+
+    def test_clear(self):
+        _last_score_result.set(self._passing_result())
+        clear_last_score_result()
+        assert get_last_score_result() is None
+
+
+# ── _get_test_name_from_caller ───────────────────────────────────────────────
+
+
+class TestGetTestName:
+    def test_from_test_function(self):
+        # Called directly from a function whose name starts with "test_".
+        name = _get_test_name_from_caller()
+        assert name.startswith("test_")
+
+    def _helper_non_test(self):  # name deliberately lacks the "test_" prefix
+        return _get_test_name_from_caller()
+
+    def test_from_non_test_via_helper(self):
+        # The helper's name lacks the "test_" prefix, but its caller's has it.
+        name = self._helper_non_test()
+        assert name.startswith("test_")
+
+
+# ── score() function ─────────────────────────────────────────────────────────
+
+
+class TestScoreFunction:
+    """Behaviour of ``score()`` when called directly.
+
+    Covers result storage, tool normalisation and context/metadata pass-through.
+    """
+
+    def test_score_no_decorators(self):
+        """score() without decorators returns a base result."""
+        outcome = score("actual output", "expected output", input="question")
+        assert outcome.passed is True
+        assert outcome.aggregate_score == 1.0
+        assert len(outcome.metric_results) == 0
+
+    def test_score_stores_last_result(self):
+        """The returned result is also retrievable via get_last_score_result()."""
+        clear_last_score_result()
+        outcome = score("output")
+        assert get_last_score_result() is outcome
+
+    def test_score_normalizes_tool_calls_dict(self):
+        """A dict using the short "args" key is exposed via ``.arguments``."""
+        raw_call = {"name": "search", "args": {"q": "test"}, "result": "found"}
+        outcome = score("output", tool_calls=[raw_call])
+        recorded = outcome.eval_input.tool_calls[0]
+        assert recorded.name == "search"
+        assert recorded.arguments == {"q": "test"}
+
+    def test_score_normalizes_tool_calls_with_arguments_key(self):
+        """The long "arguments" key is accepted as well."""
+        raw_call = {"name": "search", "arguments": {"q": "test"}}
+        outcome = score("output", tool_calls=[raw_call])
+        assert outcome.eval_input.tool_calls[0].arguments == {"q": "test"}
+
+    def test_score_normalizes_tool_call_model(self):
+        """A ToolCall instance is stored as the same object."""
+        call = ToolCall(name="search", arguments={"q": "test"})
+        outcome = score("output", tool_calls=[call])
+        assert outcome.eval_input.tool_calls[0] is call
+
+    def test_score_normalizes_expected_tools_dict(self):
+        """A dict entry is exposed with ``.name`` and ``.required`` attributes."""
+        raw_tool = {"name": "search", "args": {"q": "test"}, "required": False}
+        outcome = score("output", expected_tools=[raw_tool])
+        expected = outcome.eval_input.expected_tools[0]
+        assert expected.name == "search"
+        assert expected.required is False
+
+    def test_score_normalizes_expected_tool_model(self):
+        """An ExpectedTool instance is stored as the same object."""
+        tool = ExpectedTool(name="search")
+        outcome = score("output", expected_tools=[tool])
+        assert outcome.eval_input.expected_tools[0] is tool
+
+    def test_score_with_context(self):
+        """context and retrieval_context land on eval_input unchanged."""
+        docs, retrieved = ["doc1", "doc2"], ["ret1"]
+        outcome = score("output", context=docs, retrieval_context=retrieved)
+        assert outcome.eval_input.context == ["doc1", "doc2"]
+        assert outcome.eval_input.retrieval_context == ["ret1"]
+
+    def test_score_with_metadata(self):
+        """metadata lands on eval_input unchanged."""
+        outcome = score("output", metadata={"key": "value"})
+        assert outcome.eval_input.metadata == {"key": "value"}
diff --git a/tests/test_text_utils.py b/tests/test_text_utils.py
new file mode 100644
index 0000000..47e56eb
--- /dev/null
+++ b/tests/test_text_utils.py
@@ -0,0 +1,39 @@
+"""Tests for fasteval.utils.text."""
+
+from fasteval.utils.text import truncate
+
+
+class TestTruncate:
+    # Exercises truncate(): length cap, ellipsis handling and whitespace cleanup.
+    def test_empty_string(self):
+        assert truncate("") == ""
+
+    def test_short_text_unchanged(self):
+        assert truncate("Hello", max_length=80) == "Hello"
+
+    def test_long_text_truncated(self):
+        shortened = truncate("a" * 100, max_length=10)
+        assert len(shortened) == 10 and shortened.endswith("...")
+
+    def test_newlines_replaced(self):
+        # Each newline comes back as a single space.
+        flattened = truncate("line1\nline2\nline3", max_length=80)
+        assert "\n" not in flattened
+        assert flattened == "line1 line2 line3"
+
+    def test_custom_ellipsis(self):
+        shortened = truncate("a" * 100, max_length=10, ellipsis="~~")
+        assert shortened.endswith("~~")
+        assert len(shortened) == 10
+
+    def test_max_length_less_than_ellipsis(self):
+        # The ellipsis itself is cut down to fit the cap.
+        clipped = truncate("a" * 100, max_length=2, ellipsis="...")
+        assert clipped == ".."
+
+    def test_exact_max_length_no_truncation(self):
+        boundary = "a" * 80
+        assert truncate(boundary, max_length=80) == boundary
+
+    def test_whitespace_stripped(self):
+        assert truncate("  hello  ", max_length=80) == "hello"
diff --git a/uv.lock b/uv.lock
index adcc268..790b580 100644
--- a/uv.lock
+++ b/uv.lock
@@ -541,7 +541,7 @@ wheels = [
 
 [[package]]
 name = "fasteval-core"
-version = "1.0.0a1"
+version = "1.2.0"
 source = { editable = "." }
 dependencies = [
     { name = "openai" },
@@ -2571,6 +2571,13 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/0f/8b/4b61d6e13f7108f36910df9ab4b58fd389cc2520d54d81b88660804aad99/torch-2.10.0-2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:418997cb02d0a0f1497cf6a09f63166f9f5df9f3e16c8a716ab76a72127c714f", size = 79423467, upload-time = "2026-02-10T21:44:48.711Z" },
     { url = "https://files.pythonhosted.org/packages/d3/54/a2ba279afcca44bbd320d4e73675b282fcee3d81400ea1b53934efca6462/torch-2.10.0-2-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:13ec4add8c3faaed8d13e0574f5cd4a323c11655546f91fbe6afa77b57423574", size = 79498202, upload-time = "2026-02-10T21:44:52.603Z" },
     { url = "https://files.pythonhosted.org/packages/ec/23/2c9fe0c9c27f7f6cb865abcea8a4568f29f00acaeadfc6a37f6801f84cb4/torch-2.10.0-2-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:e521c9f030a3774ed770a9c011751fb47c4d12029a3d6522116e48431f2ff89e", size = 79498254, upload-time = "2026-02-10T21:44:44.095Z" },
+    { url = "https://files.pythonhosted.org/packages/16/ee/efbd56687be60ef9af0c9c0ebe106964c07400eade5b0af8902a1d8cd58c/torch-2.10.0-3-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a1ff626b884f8c4e897c4c33782bdacdff842a165fee79817b1dd549fdda1321", size = 915510070, upload-time = "2026-03-11T14:16:39.386Z" },
+    { url = "https://files.pythonhosted.org/packages/36/ab/7b562f1808d3f65414cd80a4f7d4bb00979d9355616c034c171249e1a303/torch-2.10.0-3-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:ac5bdcbb074384c66fa160c15b1ead77839e3fe7ed117d667249afce0acabfac", size = 915518691, upload-time = "2026-03-11T14:15:43.147Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/7a/abada41517ce0011775f0f4eacc79659bc9bc6c361e6bfe6f7052a6b9363/torch-2.10.0-3-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:98c01b8bb5e3240426dcde1446eed6f40c778091c8544767ef1168fc663a05a6", size = 915622781, upload-time = "2026-03-11T14:17:11.354Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/c6/4dfe238342ffdcec5aef1c96c457548762d33c40b45a1ab7033bb26d2ff2/torch-2.10.0-3-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:80b1b5bfe38eb0e9f5ff09f206dcac0a87aadd084230d4a36eea5ec5232c115b", size = 915627275, upload-time = "2026-03-11T14:16:11.325Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/f0/72bf18847f58f877a6a8acf60614b14935e2f156d942483af1ffc081aea0/torch-2.10.0-3-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:46b3574d93a2a8134b3f5475cfb98e2eb46771794c57015f6ad1fb795ec25e49", size = 915523474, upload-time = "2026-03-11T14:17:44.422Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/39/590742415c3030551944edc2ddc273ea1fdfe8ffb2780992e824f1ebee98/torch-2.10.0-3-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:b1d5e2aba4eb7f8e87fbe04f86442887f9167a35f092afe4c237dfcaaef6e328", size = 915632474, upload-time = "2026-03-11T14:15:13.666Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/8e/34949484f764dde5b222b7fe3fede43e4a6f0da9d7f8c370bb617d629ee2/torch-2.10.0-3-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:0228d20b06701c05a8f978357f657817a4a63984b0c90745def81c18aedfa591", size = 915523882, upload-time = "2026-03-11T14:14:46.311Z" },
     { url = "https://files.pythonhosted.org/packages/0c/1a/c61f36cfd446170ec27b3a4984f072fd06dab6b5d7ce27e11adb35d6c838/torch-2.10.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:5276fa790a666ee8becaffff8acb711922252521b28fbce5db7db5cf9cb2026d", size = 145992962, upload-time = "2026-01-21T16:24:14.04Z" },
     { url = "https://files.pythonhosted.org/packages/b5/60/6662535354191e2d1555296045b63e4279e5a9dbad49acf55a5d38655a39/torch-2.10.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:aaf663927bcd490ae971469a624c322202a2a1e68936eb952535ca4cd3b90444", size = 915599237, upload-time = "2026-01-21T16:23:25.497Z" },
     { url = "https://files.pythonhosted.org/packages/40/b8/66bbe96f0d79be2b5c697b2e0b187ed792a15c6c4b8904613454651db848/torch-2.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:a4be6a2a190b32ff5c8002a0977a25ea60e64f7ba46b1be37093c141d9c49aeb", size = 113720931, upload-time = "2026-01-21T16:24:23.743Z" },