From 5a88d153050726b017d821b22d3a27755ee3146b Mon Sep 17 00:00:00 2001
From: Akihiko Kuroda <akihikokuroda2020@gmail.com>
Date: Wed, 10 Jun 2026 16:24:47 -0400
Subject: [PATCH 1/4] add debug plugin collections

Signed-off-by: Akihiko Kuroda <akihikokuroda2020@gmail.com>
---
 docs/docs/how-to/debug-with-plugins.md        | 310 ++++++++++++++++++
 .../plugins/builtin_complete_diagnostics.py   | 121 +++++++
 .../plugins/builtin_full_pipeline_tracing.py  | 118 +++++++
 .../plugins/builtin_generation_tracing.py     |  60 ++++
 .../plugins/builtin_sampling_diagnostics.py   | 105 ++++++
 .../plugins/builtin_validation_failures.py    |  91 +++++
 .../plugins/builtin_validation_strict.py      |  95 ++++++
 .../plugins/builtin_validation_tracing.py     | 127 +++++++
 mellea/plugins/builtin_debug/__init__.py      |  55 ++++
 mellea/plugins/builtin_debug/generation.py    | 178 ++++++++++
 mellea/plugins/builtin_debug/sampling.py      | 132 ++++++++
 mellea/plugins/builtin_debug/validation.py    |  91 +++++
 12 files changed, 1483 insertions(+)
 create mode 100644 docs/docs/how-to/debug-with-plugins.md
 create mode 100644 docs/examples/plugins/builtin_complete_diagnostics.py
 create mode 100644 docs/examples/plugins/builtin_full_pipeline_tracing.py
 create mode 100644 docs/examples/plugins/builtin_generation_tracing.py
 create mode 100644 docs/examples/plugins/builtin_sampling_diagnostics.py
 create mode 100644 docs/examples/plugins/builtin_validation_failures.py
 create mode 100644 docs/examples/plugins/builtin_validation_strict.py
 create mode 100644 docs/examples/plugins/builtin_validation_tracing.py
 create mode 100644 mellea/plugins/builtin_debug/__init__.py
 create mode 100644 mellea/plugins/builtin_debug/generation.py
 create mode 100644 mellea/plugins/builtin_debug/sampling.py
 create mode 100644 mellea/plugins/builtin_debug/validation.py

diff --git a/docs/docs/how-to/debug-with-plugins.md b/docs/docs/how-to/debug-with-plugins.md
new file mode 100644
index 000000000..8a63f45fa
--- /dev/null
+++ b/docs/docs/how-to/debug-with-plugins.md
@@ -0,0 +1,310 @@
+---
+title: "Debug with Plugins"
+description: "Use built-in debug plugins to trace generation, validation, and sampling behavior in detail."
+# diataxis: how-to
+---
+
+**Prerequisites:** [The Requirements System](../concepts/requirements-system),
+[Sampling Strategies](../concepts/sampling-strategies), `pip install mellea`.
+
+Mellea's plugin system provides debug hooks that trace the full lifecycle of
+generation, validation, and sampling. Use these plugins to understand:
+
+- What prompts are sent to the LLM
+- Model latency and token usage
+- Which requirements pass/fail and why
+- When repair strategies trigger and what feedback they provide
+- End-to-end flow through the sampling loop
+
+## Built-in debug plugins
+
+Mellea ships with three categories of debug plugins in `mellea.plugins.builtin_debug`:
+
+### Generation pipeline plugins
+
+Trace all LLM backend calls with request/response inspection, latency, and tokens.
+
+```python
+from mellea.plugins.builtin_debug.generation import (
+    log_generation_pre_call,
+    log_generation_post_call,
+)
+from mellea.plugins import register
+
+register([
+    log_generation_pre_call,
+    log_generation_post_call,
+])
+```
+
+**Output:**
+
+```text
+[📤 GEN-PRE-CALL gen_id=abc123...] model=granite4.1:3b | prompt=Write a thank you note
+[📥 GEN-POST-CALL gen_id=abc123...] model=granite4.1:3b | latency=397ms | tokens=(47+19=66) | response=hello there thank you...
+```
+
+**Logs:**
+
+- Generation ID for correlation
+- Model being called
+- Request: prompt preview (first 100 chars)
+- Response: preview, latency, token counts
+- **Repair feedback** when present (shows guidance the model receives during repair)
+
+### Validation pipeline plugins
+
+Trace requirement validation with pre-check setup and per-requirement results.
+
+```python
+from mellea.plugins.builtin_debug.validation import (
+    log_validation_pre_check,
+    log_validation_post_check,
+)
+from mellea.plugins import register
+
+register([
+    log_validation_pre_check,
+    log_validation_post_check,
+])
+```
+
+**Output:**
+
+```text
+[🔍 VALIDATION-PRE-CHECK] requirements=3 | target=ModelOutputThunk
+
+[❌ VALIDATION-POST-CHECK] MIXED RESULTS: 2/3 passed, 1/3 failed
+    ✓ Use only lowercase letters
+    ✓ Include the phrase 'thank you'
+    ❌ Start with a greeting
+       └─ validated as "no"
+```
+
+**Logs:**
+
+- Pre-check: how many requirements, what's being validated
+- Post-check: pass/fail count per requirement
+- Per-requirement status with reasons for failures
+
+### Sampling pipeline plugins
+
+Trace the sampling strategy lifecycle including iterations, validation results,
+and repair events.
+
+```python
+from mellea.plugins.builtin_debug.sampling import (
+    log_sampling_loop_start,
+    log_sampling_iteration,
+    log_sampling_repair,
+    log_sampling_loop_end,
+)
+from mellea.plugins import register
+
+register([
+    log_sampling_loop_start,
+    log_sampling_iteration,
+    log_sampling_repair,
+    log_sampling_loop_end,
+])
+```
+
+**Output:**
+
+```text
+[🎯 SAMPLING-START] strategy=RepairTemplateStrategy | loop_budget=3 | requirements=3
+
+[❌ SAMPLING-ITER 1] FAILED: 2/3 validations passed
+    ❌ Start with a greeting
+
+[🔧 REPAIR-TRIGGERED] at iteration 1
+   repair_type=template
+   failed_validations:
+     • Start with a greeting
+
+[✅ SAMPLING-ITER 2] SUCCESS: 3/3 validations passed
+
+
+[🎉 SAMPLING-END] SUCCESS in 2 iteration(s) using RepairTemplateStrategy
+   total_attempts=2
+   best_validation_score=3/3
+```
+
+**Logs:**
+
+- Loop start: strategy, budget, requirement count
+- Each iteration: pass/fail count, failed requirement names
+- Repair events: when triggered, repair type, failed requirements
+- Loop end: success/failure, iterations used, final statistics
+
+## Enabling multiple plugins together
+
+Combine plugins for complete end-to-end visibility:
+
+```python
+from mellea.plugins.builtin_debug.generation import (
+    log_generation_pre_call,
+    log_generation_post_call,
+)
+from mellea.plugins.builtin_debug.validation import (
+    log_validation_pre_check,
+    log_validation_post_check,
+)
+from mellea.plugins.builtin_debug.sampling import (
+    log_sampling_loop_start,
+    log_sampling_iteration,
+    log_sampling_repair,
+    log_sampling_loop_end,
+)
+from mellea.plugins import register
+
+register([
+    # Generation hooks
+    log_generation_pre_call,
+    log_generation_post_call,
+    # Validation hooks
+    log_validation_pre_check,
+    log_validation_post_check,
+    # Sampling hooks
+    log_sampling_loop_start,
+    log_sampling_iteration,
+    log_sampling_repair,
+    log_sampling_loop_end,
+])
+```
+
+This reveals the complete flow:
+
+```text
+[🎯 SAMPLING-START] strategy=... | loop_budget=... | requirements=...
+
+[📤 GEN-PRE-CALL] prompt=...
+[📥 GEN-POST-CALL] response=... | latency=... | tokens=...
+
+[🔍 VALIDATION-PRE-CHECK] requirements=... | target=...
+[📤 GEN-PRE-CALL] prompt=Start with a greeting (validation check)
+[📥 GEN-POST-CALL] response=no
+[❌ VALIDATION-POST-CHECK] MIXED RESULTS: 2/3 passed, 1/3 failed
+
+[❌ SAMPLING-ITER 1] FAILED: 2/3 validations passed
+
+[🔧 REPAIR-TRIGGERED] at iteration 1
+   failed_validations: Start with a greeting
+
+[📤 GEN-PRE-CALL] prompt=Write a thank you note
+   [⭐ REPAIR ATTEMPT] Repair feedback provided: ...
+[📥 GEN-POST-CALL] response=... | latency=... | tokens=...
+
+[🔍 VALIDATION-PRE-CHECK] requirements=... | target=...
+[📤 GEN-PRE-CALL] prompt=Start with a greeting
+[📥 GEN-POST-CALL] response=yes
+[✅ VALIDATION-POST-CHECK] ALL PASSED: 3/3 requirements
+
+[✅ SAMPLING-ITER 2] SUCCESS: 3/3 validations passed
+
+[🎉 SAMPLING-END] SUCCESS in 2 iteration(s)
+```
+
+## Example scripts
+
+Ready-to-run examples are available in `docs/examples/plugins/`:
+
+| Script                            | Plugins                    | Purpose                             |
+| --------------------------------- | -------------------------- | ----------------------------------- |
+| `builtin_generation_tracing.py`   | Generation                 | Basic model call tracing            |
+| `builtin_validation_tracing.py`   | Validation                 | Requirement validation              |
+| `builtin_validation_failures.py`  | Validation                 | Show validation failures            |
+| `builtin_sampling_diagnostics.py` | Sampling                   | Strategy iterations                 |
+| `builtin_full_pipeline_tracing.py`| Generation + Sampling      | End-to-end with model visibility    |
+| `builtin_complete_diagnostics.py` | All 3                      | Complete pipeline with validation   |
+
+Run any example:
+
+```bash
+uv run python docs/examples/plugins/builtin_generation_tracing.py
+uv run python docs/examples/plugins/builtin_validation_failures.py
+uv run python docs/examples/plugins/builtin_complete_diagnostics.py
+```
+
+## Common debugging scenarios
+
+### "Why is the model generating a different response than I expected?"
+
+Enable **generation tracing** to see:
+
+- Exactly what prompt was sent
+- Model's latency and token usage
+- Response preview
+- When repair feedback is provided (if using RepairTemplateStrategy)
+
+This shows whether the issue is in the prompt, model behavior, or repair strategy.
+
+### "Why are my requirements failing?"
+
+Enable **validation tracing** to see:
+
+- Each requirement being checked
+- Pass/fail status per requirement
+- Failure reason (e.g., "validated as 'no'")
+- Pass/fail counts
+
+This pinpoints which requirements are problematic and why.
+
+### "Why isn't the repair strategy helping?"
+
+Enable **all three plugin categories** to see:
+
+- Initial attempt (generation + validation)
+- What failed (validation results)
+- Repair feedback provided (in generation pre-call logs)
+- Second attempt with feedback (generation + validation)
+- Whether the repair improved the results
+
+This reveals whether the repair strategy is receiving the right feedback and the model is responding appropriately.
+
+### "Why is sampling taking so long?"
+
+Enable **sampling tracing** to see:
+
+- How many iterations ran
+- Validation results per iteration
+- When repairs were triggered
+- Total attempts before success/failure
+
+This identifies whether the issue is budget exhaustion, frequent failures, or ineffective repair.
+
+## Controlling log output
+
+By default, debug plugins log at INFO level for important events and DEBUG level
+for details. Control verbosity:
+
+```python
+import logging
+
+# Show only failures and key events
+logging.basicConfig(level=logging.INFO)
+
+# Or show all details, including passed requirements (call basicConfig only once)
+logging.basicConfig(level=logging.DEBUG)
+
+# Silence noisy third-party loggers
+logging.getLogger("httpx").setLevel(logging.ERROR)
+logging.getLogger("ollama").setLevel(logging.ERROR)
+```
+
+## Performance notes
+
+Debug plugins have minimal overhead:
+
+- Pre-hooks check whether plugins are registered before building payloads
+- Logging is formatted efficiently
+- No plugins fire in the hot path when not registered
+
+For production use, you can safely leave plugins registered — they only log when
+enabled. For maximum performance, simply don't register them.
+
+## Next steps
+
+- [Observability: Tracing](../observability/tracing.md) — export traces to Jaeger or Grafana
+- [Handling Exceptions and Failures](./handling-exceptions.md) — work with sampling failures
+- [The Requirements System](../concepts/requirements-system) — understand validation in depth
diff --git a/docs/examples/plugins/builtin_complete_diagnostics.py b/docs/examples/plugins/builtin_complete_diagnostics.py
new file mode 100644
index 000000000..aafc26238
--- /dev/null
+++ b/docs/examples/plugins/builtin_complete_diagnostics.py
@@ -0,0 +1,121 @@
+# pytest: ollama, e2e
+"""Complete diagnostics example with all debug plugins.
+
+This example combines generation, sampling, AND validation tracing for maximum
+visibility into the entire sampling + validation pipeline.
+
+You'll see:
+1. Validation pre-check: requirements about to be checked
+2. Generation pre/post-call: model called for main task + validation
+3. Validation post-check: results per requirement
+4. Sampling iteration: aggregate pass/fail count
+5. Repair events: when and why repairs triggered
+6. Final result: success/failure with statistics
+
+This provides complete end-to-end traceability for:
+- Model behavior (generation)
+- Validation logic (what passes/fails and why)
+- Repair strategy (how feedback improves results)
+- Overall sampling loop (iterations and budget)
+
+Run:
+    uv run python docs/examples/plugins/builtin_complete_diagnostics.py
+
+Watch the complete flow with all lifecycle events visible.
+"""
+
+import logging
+
+import mellea
+from mellea.core import Requirement
+from mellea.plugins import register
+from mellea.plugins.builtin_debug.generation import (
+    log_generation_post_call,
+    log_generation_pre_call,
+)
+from mellea.plugins.builtin_debug.sampling import (
+    log_sampling_iteration,
+    log_sampling_loop_end,
+    log_sampling_loop_start,
+    log_sampling_repair,
+)
+from mellea.plugins.builtin_debug.validation import (
+    log_validation_post_check,
+    log_validation_pre_check,
+)
+from mellea.stdlib.requirements import req, simple_validate
+from mellea.stdlib.sampling import RepairTemplateStrategy
+
+logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
+log = logging.getLogger(__name__)
+
+# Register all eight debug hooks at import time, before main() opens a session
+register(
+    [
+        # Generation pipeline: pre- and post-call hooks
+        log_generation_pre_call,
+        log_generation_post_call,
+        # Validation pipeline: pre- and post-check hooks
+        log_validation_pre_check,
+        log_validation_post_check,
+        # Sampling pipeline: loop start, per-iteration, repair, loop end
+        log_sampling_loop_start,
+        log_sampling_iteration,
+        log_sampling_repair,
+        log_sampling_loop_end,
+    ]
+)
+
+
+def is_lowercase_only(text: str) -> bool:
+    """Validation: True when lowercasing text leaves it unchanged (no capitals)."""
+    return text == text.lower()
+
+
+def has_thank_you(text: str) -> bool:
+    """Validation: True when 'thank you' occurs in text, ignoring letter case."""
+    return "thank you" in text.lower()
+
+
+requirements: list[Requirement | str] = [
+    req("Start with a greeting"),  # no validation_fn supplied for this one
+    req(
+        "Use only lowercase letters (no capitals)",
+        validation_fn=simple_validate(is_lowercase_only),
+    ),
+    req("Include the phrase 'thank you'", validation_fn=simple_validate(has_thank_you)),
+]
+
+
+def main() -> None:
+    """Run one repair-strategy instruct call with all debug hooks registered."""
+    log.info("=" * 70)
+    log.info("Complete Diagnostics Example (All Debug Plugins)")
+    log.info("=" * 70)
+    log.info("")
+    log.info("All debug plugins are enabled:")
+    log.info("  [📤/📥 GEN-*]        - model calls and responses")
+    log.info("  [🔍 VALIDATION-*]    - requirement validation")
+    log.info("  [❌ SAMPLING-ITER]   - iteration results")
+    log.info("  [🔧 REPAIR]          - repair events")
+    log.info("")
+
+    with mellea.start_session() as m:
+        log.info("Generating text with strict requirements and repair strategy...")
+        log.info("")
+
+        result = m.instruct(
+            "Write a thank you note",
+            requirements=requirements,
+            strategy=RepairTemplateStrategy(loop_budget=3),
+        )
+
+        log.info("")
+        log.info("=" * 70)
+        log.info("Final result:")
+        log.info(str(result)[:300] + ("..." if len(str(result)) > 300 else ""))
+        log.info("=" * 70)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/examples/plugins/builtin_full_pipeline_tracing.py b/docs/examples/plugins/builtin_full_pipeline_tracing.py
new file mode 100644
index 000000000..b3500e2b1
--- /dev/null
+++ b/docs/examples/plugins/builtin_full_pipeline_tracing.py
@@ -0,0 +1,118 @@
+# pytest: ollama, e2e
+"""Full pipeline tracing example combining generation and sampling diagnostics.
+
+This example demonstrates both the generation and sampling debug plugins working
+together to provide end-to-end visibility into the entire sampling loop.
+
+You'll see:
+1. Generation pre-call: what's being sent to the LLM
+2. Generation post-call: model response, latency, and tokens
+3. Sampling iteration: validation results per attempt
+4. Repair events: when and why repairs are triggered
+5. Final result: success/failure with statistics
+
+This combined view helps debug complex interactions between:
+- Model behavior (generation tracing)
+- Validation logic (sampling diagnostics)
+- Repair strategies (how feedback improves results)
+
+Run:
+    uv run python docs/examples/plugins/builtin_full_pipeline_tracing.py
+
+Watch the complete flow:
+    [📤 GEN-PRE-CALL] → [📥 GEN-POST-CALL] (model called)
+    [❌ SAMPLING-ITER] → [🔧 REPAIR-TRIGGERED] (validation failed)
+    [📤 GEN-PRE-CALL] → [📥 GEN-POST-CALL] (repair attempt)
+    [❌ SAMPLING-ITER] → [🔧 REPAIR-TRIGGERED] (still failing)
+    ...
+    [💥 SAMPLING-END] (final result)
+"""
+
+import logging
+
+import mellea
+from mellea.core import Requirement
+from mellea.plugins import register
+from mellea.plugins.builtin_debug.generation import (
+    log_generation_post_call,
+    log_generation_pre_call,
+)
+from mellea.plugins.builtin_debug.sampling import (
+    log_sampling_iteration,
+    log_sampling_loop_end,
+    log_sampling_loop_start,
+    log_sampling_repair,
+)
+from mellea.stdlib.requirements import req, simple_validate
+from mellea.stdlib.sampling import RepairTemplateStrategy
+
+logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
+log = logging.getLogger(__name__)
+
+# Register generation and sampling hooks at import time (no validation hooks)
+register(
+    [
+        # Generation pipeline: pre- and post-call hooks
+        log_generation_pre_call,
+        log_generation_post_call,
+        # Sampling pipeline: loop start, per-iteration, repair, loop end
+        log_sampling_loop_start,
+        log_sampling_iteration,
+        log_sampling_repair,
+        log_sampling_loop_end,
+    ]
+)
+
+
+def is_lowercase_only(text: str) -> bool:
+    """Validation: True when lowercasing text leaves it unchanged (no capitals)."""
+    return text == text.lower()
+
+
+def has_thank_you(text: str) -> bool:
+    """Validation: True when 'thank you' occurs in text, ignoring letter case."""
+    return "thank you" in text.lower()
+
+
+requirements: list[Requirement | str] = [
+    req("Start with a greeting"),  # no validation_fn supplied for this one
+    req(
+        "Use only lowercase letters (no capitals)",
+        validation_fn=simple_validate(is_lowercase_only),
+    ),
+    req("Include the phrase 'thank you'", validation_fn=simple_validate(has_thank_you)),
+]
+
+
+def main() -> None:
+    """Run one repair-strategy instruct call with generation and sampling hooks."""
+    log.info("=" * 70)
+    log.info("Full Pipeline Tracing Example")
+    log.info("=" * 70)
+    log.info("")
+    log.info("Watch for both generation AND sampling events:")
+    log.info("  [📤 GEN-PRE-CALL]     - what's sent to the LLM")
+    log.info("  [📥 GEN-POST-CALL]    - model response + latency")
+    log.info("  [❌ SAMPLING-ITER]    - validation results")
+    log.info("  [🔧 REPAIR-TRIGGERED] - repair strategy kicks in")
+    log.info("")
+
+    with mellea.start_session() as m:
+        log.info("Generating text with strict requirements and repair strategy...")
+        log.info("")
+
+        result = m.instruct(
+            "Write a thank you note",
+            requirements=requirements,
+            strategy=RepairTemplateStrategy(loop_budget=3),
+        )
+
+        log.info("")
+        log.info("=" * 70)
+        log.info("Final result:")
+        log.info(str(result)[:300] + ("..." if len(str(result)) > 300 else ""))
+        log.info("=" * 70)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/examples/plugins/builtin_generation_tracing.py b/docs/examples/plugins/builtin_generation_tracing.py
new file mode 100644
index 000000000..8e5f5c534
--- /dev/null
+++ b/docs/examples/plugins/builtin_generation_tracing.py
@@ -0,0 +1,60 @@
+# pytest: ollama, e2e
+"""Built-in generation tracing plugin example.
+
+This example demonstrates the generation hooks from mellea.plugins.builtin_debug,
+which traces all LLM backend calls with request/response inspection.
+
+The plugin logs:
+- Generation ID for correlation
+- Model being called
+- Prompt preview (first 100 chars)
+- Response preview (first 100 chars)
+- Latency in milliseconds
+- Token usage (prompt + completion = total)
+
+Run:
+    uv run python docs/examples/plugins/builtin_generation_tracing.py
+
+Watch the logs to see tracing in action:
+    [📤 GEN-PRE-CALL gen_id=...] model=... | prompt=...
+    [📥 GEN-POST-CALL gen_id=...] model=... | latency=...ms | tokens=(...) | response=...
+"""
+
+import logging
+
+import mellea
+from mellea.plugins import register
+from mellea.plugins.builtin_debug.generation import (
+    log_generation_post_call,
+    log_generation_pre_call,
+)
+
+logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
+log = logging.getLogger(__name__)
+
+# Enable generation tracing by registering both hooks at import time
+register([log_generation_pre_call, log_generation_post_call])
+
+
+def main() -> None:
+    """Example: trace a single instruct call with the generation debug hooks."""
+    log.info("=" * 70)
+    log.info("Generation Tracing Plugin Example")
+    log.info("=" * 70)
+    log.info("")
+    log.info("Watch the logs for [📤 GEN-PRE-CALL] and [📥 GEN-POST-CALL] entries.")
+    log.info("")
+
+    with mellea.start_session() as m:
+        result = m.instruct("What are the three main colors of the rainbow?")
+
+        log.info("")
+        log.info("=" * 70)
+        log.info("Final result:")
+        result_str = str(result)
+        log.info(result_str[:200] + ("..." if len(result_str) > 200 else ""))
+        log.info("=" * 70)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/examples/plugins/builtin_sampling_diagnostics.py b/docs/examples/plugins/builtin_sampling_diagnostics.py
new file mode 100644
index 000000000..ed6eab68c
--- /dev/null
+++ b/docs/examples/plugins/builtin_sampling_diagnostics.py
@@ -0,0 +1,105 @@
+# pytest: ollama, e2e
+"""Built-in sampling diagnostics plugin example.
+
+This example demonstrates the sampling hooks from mellea.plugins.builtin_debug,
+which trace the sampling strategy pipeline with iteration tracking, validation
+results, repair events, and success/failure analysis.
+
+The plugin logs:
+- Strategy initialization with loop budget and requirements count
+- Each iteration with validation pass/fail status
+- Detailed validation results per requirement
+- Repair events when triggered
+- Final sampling result with success/failure reason
+
+Run:
+    uv run python docs/examples/plugins/builtin_sampling_diagnostics.py
+
+Watch the logs to see:
+    [🎯 SAMPLING-START] strategy=... | loop_budget=... | requirements=...
+    [✅ SAMPLING-ITER 1] SUCCESS: ... validations passed
+    [❌ SAMPLING-ITER 2] FAILED: ... validations passed
+    [🔧 REPAIR-TRIGGERED] at iteration ...
+    [🎉 SAMPLING-END] SUCCESS in ... iterations
+"""
+
+import logging
+
+import mellea
+from mellea.core import Requirement
+from mellea.plugins import register
+from mellea.plugins.builtin_debug.sampling import (
+    log_sampling_iteration,
+    log_sampling_loop_end,
+    log_sampling_loop_start,
+    log_sampling_repair,
+)
+from mellea.stdlib.requirements import req, simple_validate
+from mellea.stdlib.sampling import RepairTemplateStrategy
+
+logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
+log = logging.getLogger(__name__)
+
+# Enable sampling diagnostics by registering the four sampling hooks
+register(
+    [
+        log_sampling_loop_start,
+        log_sampling_iteration,
+        log_sampling_repair,
+        log_sampling_loop_end,
+    ]
+)
+
+
+def is_lowercase_only(text: str) -> bool:
+    """Validation: True when lowercasing text leaves it unchanged (no capitals)."""
+    return text == text.lower()
+
+
+def has_thank_you(text: str) -> bool:
+    """Validation: True when 'thank you' occurs in text, ignoring letter case."""
+    return "thank you" in text.lower()
+
+
+requirements: list[Requirement | str] = [
+    req("Start with a greeting"),  # no validation_fn supplied for this one
+    req(
+        "Use only lowercase letters (no capitals)",
+        validation_fn=simple_validate(is_lowercase_only),
+    ),
+    req("Include the phrase 'thank you'", validation_fn=simple_validate(has_thank_you)),
+]
+
+
+def main() -> None:
+    """Run one instruct call under RepairTemplateStrategy with sampling hooks on."""
+    log.info("=" * 70)
+    log.info("Sampling Diagnostics Plugin Example")
+    log.info("=" * 70)
+    log.info("")
+    log.info("Watch the logs for sampling lifecycle events:")
+    log.info("  [🎯 SAMPLING-START] - strategy begins")
+    log.info("  [✅/❌ SAMPLING-ITER] - per-iteration validation results")
+    log.info("  [🔧 REPAIR-TRIGGERED] - repair invoked (RepairTemplateStrategy only)")
+    log.info("  [🎉/💥 SAMPLING-END] - final result")
+    log.info("")
+
+    with mellea.start_session() as m:
+        log.info("Generating text with strict requirements...")
+        log.info("")
+
+        result = m.instruct(
+            "Write a thank you note",
+            requirements=requirements,
+            strategy=RepairTemplateStrategy(loop_budget=3),
+        )
+
+        log.info("")
+        log.info("=" * 70)
+        log.info("Final result:")
+        log.info(str(result)[:300] + ("..." if len(str(result)) > 300 else ""))
+        log.info("=" * 70)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/examples/plugins/builtin_validation_failures.py b/docs/examples/plugins/builtin_validation_failures.py
new file mode 100644
index 000000000..a714b439e
--- /dev/null
+++ b/docs/examples/plugins/builtin_validation_failures.py
@@ -0,0 +1,91 @@
+# pytest: ollama, e2e
+"""Validation tracing example showing real failures.
+
+This example uses instruct with immediate validation (no repair strategy),
+so we'll see real validation failures in the logs.
+
+Run:
+    uv run python docs/examples/plugins/builtin_validation_failures.py
+
+Watch the logs to see:
+    [🔍 VALIDATION-PRE-CHECK] requirements setup
+    [❌ VALIDATION-POST-CHECK] with MIXED RESULTS showing:
+    ✓ Passed requirements (debug level)
+    ❌ Failed requirements (info level) with reasons
+"""
+
+import logging
+
+import mellea
+from mellea.core import Requirement
+from mellea.plugins import register
+from mellea.plugins.builtin_debug.validation import (
+    log_validation_post_check,
+    log_validation_pre_check,
+)
+from mellea.stdlib.requirements import req, simple_validate
+
+logging.basicConfig(level=logging.DEBUG, format="%(levelname)s: %(message)s")
+log = logging.getLogger(__name__)
+
+# Enable validation tracing by registering the pre- and post-check hooks
+register([log_validation_pre_check, log_validation_post_check])
+
+
+def requires_hello(text: str) -> bool:
+    """Validation: True when 'hello' occurs in text, ignoring letter case."""
+    return "hello" in text.lower()
+
+
+def no_exclamation(text: str) -> bool:
+    """Validation: True when text contains no '!' character."""
+    return "!" not in text
+
+
+def is_lowercase_only(text: str) -> bool:
+    """Validation: True when lowercasing text leaves it unchanged (no capitals)."""
+    return text == text.lower()
+
+
+def is_short(text: str) -> bool:
+    """Validation: True when text is at most 20 characters long (inclusive)."""
+    return len(text) <= 20
+
+
+requirements: list[Requirement | str] = [  # every entry has a Python validation_fn
+    req("Must contain the word 'hello'", validation_fn=simple_validate(requires_hello)),
+    req("No exclamation marks allowed", validation_fn=simple_validate(no_exclamation)),
+    req("All lowercase", validation_fn=simple_validate(is_lowercase_only)),
+    req("Must be 20 characters or less", validation_fn=simple_validate(is_short)),
+]
+
+
+def main() -> None:
+    """Log the requirements, then run one instruct call without a strategy."""
+    log.info("=" * 70)
+    log.info("Validation Failures Example")
+    log.info("=" * 70)
+    log.info("")
+    log.info("Requirements (with potential for failures):")
+    for i, req_obj in enumerate(requirements, 1):
+        req_desc = getattr(req_obj, "description", str(req_obj))
+        log.info(f"  {i}. {req_desc}")
+    log.info("")
+
+    with mellea.start_session() as m:
+        log.info("Test: Generate casual greeting (likely to fail some requirements)")
+        log.info("-" * 70)
+        log.info("")
+
+        # Use immediate validation (no repair) to see failures
+        result = m.instruct(
+            "Say a casual greeting with punctuation", requirements=requirements
+        )
+        log.info("")
+        log.info(f"Generated text: {result}")
+        log.info("")
+        log.info("=" * 70)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/examples/plugins/builtin_validation_strict.py b/docs/examples/plugins/builtin_validation_strict.py
new file mode 100644
index 000000000..6dec56fff
--- /dev/null
+++ b/docs/examples/plugins/builtin_validation_strict.py
@@ -0,0 +1,95 @@
+# pytest: ollama, e2e
+"""Strict validation tracing example with intentional failures.
+
+This example demonstrates validation tracing with STRICT requirements designed
+to trigger failures so you can see the full validation output including:
+- Failed requirement details
+- Failure reasons
+- Pass/fail counts
+
+Run:
+    uv run python docs/examples/plugins/builtin_validation_strict.py
+
+Watch the logs to see validation failures:
+    [🔍 VALIDATION-PRE-CHECK] requirements=... | target=...
+    [❌ VALIDATION-POST-CHECK] MIXED RESULTS: ... passed, ... failed
+    ✓ Passed requirement
+    ❌ Failed requirement
+       └─ reason why it failed
+"""
+
+import logging
+
+import mellea
+from mellea.core import Requirement
+from mellea.plugins import register
+from mellea.plugins.builtin_debug.validation import (
+    log_validation_post_check,
+    log_validation_pre_check,
+)
+from mellea.stdlib.requirements import req, simple_validate
+
+logging.basicConfig(level=logging.DEBUG, format="%(levelname)s: %(message)s")
+log = logging.getLogger(__name__)
+
+# Enable validation tracing (DEBUG level above also shows passed requirements)
+register([log_validation_pre_check, log_validation_post_check])
+
+
+def is_single_word(text: str) -> bool:
+    """Validation: True when text holds exactly one whitespace-separated token."""
+    return len(text.strip().split()) == 1
+
+
+def is_all_caps(text: str) -> bool:
+    """Validation: text must be letters only (no spaces/digits) and all uppercase."""
+    return text == text.upper() and text.isalpha()
+
+
+def is_very_short(text: str) -> bool:
+    """Validation: True when text is at most 5 characters long (inclusive)."""
+    return len(text) <= 5
+
+
+requirements: list[Requirement | str] = [  # every entry has a Python validation_fn
+    req(
+        "Response must be exactly one word",
+        validation_fn=simple_validate(is_single_word),
+    ),
+    req("All letters must be UPPERCASE", validation_fn=simple_validate(is_all_caps)),
+    req(
+        "Response must be 5 characters or less",
+        validation_fn=simple_validate(is_very_short),
+    ),
+]
+
+
+def main() -> None:
+    """Log the strict requirements, then run one instruct call against them."""
+    log.info("=" * 70)
+    log.info("Strict Validation Tracing Example")
+    log.info("=" * 70)
+    log.info("")
+    log.info("Intentionally strict requirements to demonstrate validation failures:")
+    for i, req_obj in enumerate(requirements, 1):
+        req_desc = getattr(req_obj, "description", str(req_obj))
+        log.info(f"  {i}. {req_desc}")
+    log.info("")
+
+    with mellea.start_session() as m:
+        log.info(
+            "Generating multi-word response (will fail 'single word' requirement):"
+        )
+        log.info("-" * 70)
+
+        result = m.instruct("Say hello world", requirements=requirements)
+        log.info(f"Result: {result}")
+        log.info("")
+
+        log.info("=" * 70)
+        log.info("Validation tracing complete!")
+        log.info("=" * 70)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/examples/plugins/builtin_validation_tracing.py b/docs/examples/plugins/builtin_validation_tracing.py
new file mode 100644
index 000000000..870d0ed14
--- /dev/null
+++ b/docs/examples/plugins/builtin_validation_tracing.py
@@ -0,0 +1,127 @@
+# pytest: ollama, e2e
+"""Validation tracing plugin example.
+
+This example demonstrates the validation debug plugins from mellea.plugins.builtin_debug,
+which trace requirement validation with pre-check setup and detailed per-requirement
+results including pass/fail status, reasons, and scores.
+
+The plugin logs:
+- Pre-check: requirements about to be validated, target being checked
+- Per-requirement: pass/fail status with reasons for failures
+- Post-check: aggregate pass/fail counts and summary
+
+Run:
+    uv run python docs/examples/plugins/builtin_validation_tracing.py
+
+Watch the logs to see:
+    [🔍 VALIDATION-PRE-CHECK] requirements=... | target=...
+    [❌ VALIDATION-POST-CHECK] MIXED RESULTS: ... passed, ... failed
+    ✓ Passed requirement
+    ❌ Failed requirement
+       └─ reason why it failed
+"""
+
+import logging
+
+import mellea
+from mellea.core import Requirement
+from mellea.plugins import register
+from mellea.plugins.builtin_debug.validation import (
+    log_validation_post_check,
+    log_validation_pre_check,
+)
+from mellea.stdlib.requirements import check, req, simple_validate
+
+logging.basicConfig(
+    level=logging.DEBUG,  # DEBUG to see passed requirements too
+    format="%(levelname)s: %(message)s",
+)
+log = logging.getLogger(__name__)
+
+# Enable validation tracing
+register([log_validation_pre_check, log_validation_post_check])
+
+
+def is_lowercase_only(text: str) -> bool:
+    """Return True when ``text`` equals its own lowercased form."""
+    return text == text.lower()
+
+
+def has_required_phrase(text: str) -> bool:
+    """Return True when ``text`` contains 'thank you', ignoring letter case."""
+    return "thank you" in text.lower()
+
+
+def has_proper_length(text: str) -> bool:
+    """Return True when ``text`` is 10 to 500 characters long, inclusive."""
+    return 10 <= len(text) <= 500
+
+
+requirements: list[Requirement | str] = [
+    req(
+        "Use only lowercase letters (no capitals)",
+        validation_fn=simple_validate(is_lowercase_only),
+    ),
+    req(
+        "Include the phrase 'thank you'",
+        validation_fn=simple_validate(has_required_phrase),
+    ),
+    req(
+        "Text length between 10 and 500 characters",
+        validation_fn=simple_validate(has_proper_length),
+    ),
+    check("Response should be helpful and polite"),
+]  # main() validates only requirements[:3]; the check() entry is just listed
+
+
+def main():
+    """Example: run three prompts with validation tracing enabled."""
+    banner = "=" * 70
+    divider = "-" * 70
+    log.info(banner)
+    log.info("Validation Tracing Plugin Example")
+    log.info(banner)
+    log.info("")
+    log.info("Watch the logs for validation lifecycle events:")
+    log.info("  [🔍 VALIDATION-PRE-CHECK]  - setup phase before validation")
+    log.info("  [❌ VALIDATION-POST-CHECK]  - results after validation")
+    log.info("")
+    log.info("Requirements being validated:")
+    for index, requirement in enumerate(requirements, 1):
+        summary = getattr(requirement, "description", str(requirement))
+        log.info(f"  {index}. {summary}")
+    log.info("")
+
+    checked = requirements[:3]  # Use first 3 requirements only
+    with mellea.start_session() as session:
+        log.info("Test 1: Good response (should pass most requirements)")
+        log.info(divider)
+
+        polite = session.instruct("Say thank you", requirements=checked)
+        log.info("")
+        log.info(f"Result:\n{polite}")
+        log.info("")
+
+        log.info("Test 2: Uppercase response (should fail lowercase requirement)")
+        log.info(divider)
+
+        shouted = session.instruct("Say THANK YOU in all caps", requirements=checked)
+        log.info("")
+        log.info(f"Result:\n{shouted}")
+        log.info("")
+
+        log.info("Test 3: Short response (should fail length requirement)")
+        log.info(divider)
+
+        terse = session.instruct("Say hi", requirements=checked)
+        log.info("")
+        log.info(f"Result:\n{terse}")
+        log.info("")
+
+        log.info(banner)
+        log.info("Validation tracing complete!")
+        log.info(banner)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mellea/plugins/builtin_debug/__init__.py b/mellea/plugins/builtin_debug/__init__.py
new file mode 100644
index 000000000..73db88899
--- /dev/null
+++ b/mellea/plugins/builtin_debug/__init__.py
@@ -0,0 +1,55 @@
+"""Built-in debug plugins for Mellea.
+
+Provides pre-built plugins for common debugging tasks: generation pipeline
+tracing (requests, responses, latency, tokens), sampling diagnostics (iterations,
+validation, repair, results), and requirement validation (pre/post-check) tracing.
+
+Examples:
+    Enable generation tracing:
+
+        from mellea.plugins.builtin_debug.generation import (
+            log_generation_pre_call,
+            log_generation_post_call,
+        )
+        from mellea.plugins import register
+
+        register([log_generation_pre_call, log_generation_post_call])
+
+    Enable sampling diagnostics:
+
+        from mellea.plugins.builtin_debug.sampling import (
+            log_sampling_loop_start,
+            log_sampling_iteration,
+            log_sampling_repair,
+            log_sampling_loop_end,
+        )
+
+        register([
+            log_sampling_loop_start,
+            log_sampling_iteration,
+            log_sampling_repair,
+            log_sampling_loop_end,
+        ])
+"""
+
+from __future__ import annotations
+
+from .generation import log_generation_post_call, log_generation_pre_call
+from .sampling import (
+    log_sampling_iteration,
+    log_sampling_loop_end,
+    log_sampling_loop_start,
+    log_sampling_repair,
+)
+from .validation import log_validation_post_check, log_validation_pre_check
+
+__all__ = [
+    "log_generation_post_call",
+    "log_generation_pre_call",
+    "log_sampling_iteration",
+    "log_sampling_loop_end",
+    "log_sampling_loop_start",
+    "log_sampling_repair",
+    "log_validation_post_check",
+    "log_validation_pre_check",
+]
diff --git a/mellea/plugins/builtin_debug/generation.py b/mellea/plugins/builtin_debug/generation.py
new file mode 100644
index 000000000..9e0a1bcac
--- /dev/null
+++ b/mellea/plugins/builtin_debug/generation.py
@@ -0,0 +1,178 @@
+"""Built-in debug plugin for generation pipeline (pre-call and post-call).
+
+Provides tracing for all LLM backend calls. Use for debugging model invocations,
+tracking latency, and understanding request/response flow.
+
+Examples:
+    Enable generation tracing:
+
+        from mellea import start_session
+        from mellea.plugins import register
+        from mellea.plugins.builtin_debug import generation as gen
+
+        register([gen.log_generation_pre_call, gen.log_generation_post_call])
+
+        with start_session() as m:
+            result = m.instruct("...")  # Tracing fires automatically
+"""
+
+from __future__ import annotations
+
+import logging
+
+from mellea.plugins import HookType, hook
+
+logger = logging.getLogger(__name__)
+
+
+def _get_prompt_preview(payload) -> str:
+    """Return a one-line preview (at most 100 chars) of the payload's prompt.
+
+    Sources are tried in order: ``action._description``, the ``description``
+    entry of ``action.format_for_llm().args``, then ``str(action)``.
+    Returns ``"(no action)"`` when the payload has no (truthy) action.
+    """
+    action = getattr(payload, "action", None)
+    if not action:
+        return "(no action)"
+
+    # Prefer the raw description when the action exposes one.
+    text = getattr(action, "_description", None) or None
+    # Otherwise ask the action for its formatted template arguments.
+    if not text and hasattr(action, "format_for_llm"):
+        try:
+            formatted = action.format_for_llm()
+            if hasattr(formatted, "args"):
+                text = formatted.args.get("description", "") or None
+        except Exception:
+            # Best-effort: a preview must never break the traced call.
+            pass
+
+    # Last resort: the action's own string form.
+    if not text:
+        text = str(action)
+    # split()/join collapses every whitespace run (newlines, tabs, 3+ spaces);
+    # chained replace() calls only halve runs of spaces.
+    text = " ".join(str(text).split())
+    if len(text) > 100:
+        text = text[:97] + "..."
+    return text
+
+
+def _get_response_preview(payload) -> str:
+    """Return a one-line preview (at most 100 chars) of the model output value."""
+    try:
+        model_output = getattr(payload, "model_output", None)
+        if not model_output:
+            return "(no output)"
+
+        value = getattr(model_output, "value", None)
+        if not value:
+            return "(no value)"
+
+        # Collapse every whitespace run so the preview stays on one log line.
+        text = " ".join(str(value).split())
+        if len(text) > 100:
+            text = text[:97] + "..."
+        return text
+    except Exception:
+        return "(error reading response)"
+
+
+def _get_token_usage(payload) -> str:
+    """Format token counts as ``(prompt+completion=total)`` or ``"unknown"``.
+
+    Counts are read from ``payload.model_output.generation.usage``.
+    """
+    try:
+        node = payload
+        for attr in ("model_output", "generation"):
+            node = getattr(node, attr, None)
+            if not node:
+                return "unknown"
+        usage = getattr(node, "usage", {})
+        if not usage:
+            return "unknown"
+        prompt, completion, total = (
+            usage.get(key, "?")
+            for key in ("prompt_tokens", "completion_tokens", "total_tokens")
+        )
+        return f"({prompt}+{completion}={total})"
+    except Exception:
+        return "unknown"
+
+
+@hook(HookType.GENERATION_PRE_CALL)
+async def log_generation_pre_call(payload, ctx):
+    """Log request details before calling the LLM."""
+    model = getattr(payload, "backend", None)
+    model_id = model.model_id if model else "unknown"
+    gen_id = payload.generation_id or "no-id"
+
+    # Pull requirements and repair feedback out of the formatted action.
+    action = getattr(payload, "action", None)
+    requirements = []
+    repair_text = ""
+
+    if action and hasattr(action, "format_for_llm"):
+        try:
+            fmt = action.format_for_llm()
+            if hasattr(fmt, "args"):
+                requirements = list(fmt.args.get("requirements") or [])
+                # args values need not be plain strings; coerce before slicing.
+                repair_text = str(fmt.args.get("repair") or "")
+        except Exception:
+            pass  # best-effort: tracing must not break the generation call
+
+    logger.info(
+        f"[📤 GEN-PRE-CALL gen_id={gen_id}] "
+        f"model={model_id} | prompt={_get_prompt_preview(payload)}"
+    )
+
+    # The requirement list is DEBUG-only to keep INFO output compact.
+    if requirements:
+        logger.debug(f"   requirements ({len(requirements)}):")
+        for i, req in enumerate(requirements, 1):
+            req_desc = getattr(req, "description", str(req))
+            logger.debug(f"     {i}. {req_desc}")
+
+    # Non-empty repair feedback indicates this call is a repair attempt.
+    if repair_text:
+        logger.info("   [⭐ REPAIR ATTEMPT] Repair feedback provided:")
+        # The preview is capped at 300 chars and flattened onto one line.
+        repair_preview = repair_text[:300].replace("\n", " ")
+        if len(repair_text) > 300:
+            repair_preview += "..."
+        logger.info(f"   {repair_preview}")
+
+
+@hook(HookType.GENERATION_POST_CALL)
+async def log_generation_post_call(payload, ctx):
+    """Log response details after LLM returns."""
+    model_output = getattr(payload, "model_output", None)
+    model_id = "unknown"
+    if model_output:
+        gen = getattr(model_output, "generation", None)
+        if gen:
+            model_id = getattr(gen, "model", "unknown")
+
+    gen_id = payload.generation_id or "no-id"  # placeholder for an empty/None id
+    latency_ms = payload.latency_ms or 0  # a missing latency is reported as 0ms
+
+    response_preview = _get_response_preview(payload)
+    tokens = _get_token_usage(payload)
+
+    logger.info(
+        f"[📥 GEN-POST-CALL gen_id={gen_id}] "
+        f"model={model_id} | latency={latency_ms:.0f}ms | "
+        f"tokens={tokens} | response={response_preview}"
+    )
+
+
+# Placeholder name only; not listed in the package's __all__.
+class GenerationTracingPlugin:
+    """Empty marker class; it defines no hooks of its own.
+
+    The hooks are the module-level ``log_generation_*`` functions above;
+    pass those to ``mellea.plugins.register`` to enable tracing.
+    """
diff --git a/mellea/plugins/builtin_debug/sampling.py b/mellea/plugins/builtin_debug/sampling.py
new file mode 100644
index 000000000..92102d024
--- /dev/null
+++ b/mellea/plugins/builtin_debug/sampling.py
@@ -0,0 +1,132 @@
+"""Built-in debug plugin for sampling pipeline.
+
+Provides tracing for sampling strategies including iteration tracking, validation
+results, repair events, and success/failure analysis.
+
+Examples:
+    Enable sampling tracing:
+
+        from mellea.plugins.builtin_debug.sampling import (
+            log_sampling_loop_start,
+            log_sampling_iteration,
+            log_sampling_repair,
+            log_sampling_loop_end,
+        )
+        from mellea.plugins import register
+
+        register([
+            log_sampling_loop_start,
+            log_sampling_iteration,
+            log_sampling_repair,
+            log_sampling_loop_end,
+        ])
+
+        with start_session() as m:
+            result = m.instruct("...", strategy=SamplingStrategy(...))
+"""
+
+from __future__ import annotations
+
+import logging
+
+from mellea.plugins import HookType, hook
+
+logger = logging.getLogger(__name__)
+
+
+@hook(HookType.SAMPLING_LOOP_START)
+async def log_sampling_loop_start(payload, ctx):
+    """Log sampling strategy initialization."""
+    strategy = payload.strategy_name
+    budget = payload.loop_budget
+    num_reqs = len(payload.requirements)
+
+    logger.info(
+        f"[🎯 SAMPLING-START] strategy={strategy} | loop_budget={budget} | "
+        f"requirements={num_reqs}"
+    )
+
+    if payload.requirements:  # per-requirement lines are DEBUG-only
+        for i, req in enumerate(payload.requirements, 1):
+            req_desc = getattr(req, "description", str(req))  # str() fallback
+            logger.debug(f"  {i}. {req_desc}")
+
+
+@hook(HookType.SAMPLING_ITERATION)
+async def log_sampling_iteration(payload, ctx):
+    """Log validation results for each sampling attempt."""
+    iteration = payload.iteration
+    passed = payload.valid_count
+    total = payload.total_count
+
+    if payload.all_validations_passed:
+        logger.info(
+            f"[✅ SAMPLING-ITER {iteration}] SUCCESS: {passed}/{total} validations passed"
+        )
+        return
+
+    logger.info(
+        f"[❌ SAMPLING-ITER {iteration}] FAILED: {passed}/{total} validations passed"
+    )
+
+    # Break the failed attempt down per requirement; passes are DEBUG-only.
+    for requirement, outcome in payload.validation_results or ():
+        label = getattr(requirement, "description", str(requirement))
+        if outcome.as_bool():
+            logger.debug(f"    ✓ {label}")
+            continue
+
+        logger.info(f"    ❌ {label}")
+        # "yes"/"no" reasons add nothing beyond the pass/fail marker.
+        detail = getattr(outcome, "reason", None)
+        if detail and detail not in ("yes", "no"):
+            logger.info(f"       └─ {detail}")
+
+
+@hook(HookType.SAMPLING_REPAIR)
+async def log_sampling_repair(payload, ctx):
+    """Log when repair is triggered (RepairTemplateStrategy only)."""
+    iteration = payload.repair_iteration
+    repair_type = payload.repair_type
+
+    logger.info(f"\n[🔧 REPAIR-TRIGGERED] at iteration {iteration}")
+    logger.info(f"   repair_type={repair_type}")
+    logger.info("   failed_validations:")
+
+    for req_obj, result in payload.failed_validations:
+        if not result.as_bool():  # list only entries that did not pass
+            req_desc = getattr(req_obj, "description", str(req_obj))
+            logger.info(f"     • {req_desc}")
+
+
+@hook(HookType.SAMPLING_LOOP_END)
+async def log_sampling_loop_end(payload, ctx):
+    """Log sampling completion and overall results."""
+    strategy = payload.strategy_name
+    iterations = payload.iterations_used
+    success = payload.success
+    failure_reason = payload.failure_reason
+
+    if success:
+        logger.info(
+            f"\n[🎉 SAMPLING-END] SUCCESS in {iterations} iteration(s) using {strategy}"
+        )
+    else:
+        logger.info(
+            f"\n[💥 SAMPLING-END] FAILED after {iterations} iteration(s): "
+            f"{failure_reason}"
+        )
+
+    # Summary statistics
+    total_results = len(payload.all_results)
+    logger.info(f"   total_attempts={total_results}")
+
+    # Best attempt = the one with the most passing validations. The guard
+    # also keeps max() and the [0] lookup away from an empty sequence.
+    if payload.all_validations:
+        best_valid_count = max(
+            sum(1 for _, result in attempt if result.as_bool())
+            for attempt in payload.all_validations
+        )
+        total_reqs = len(payload.all_validations[0])
+        logger.info(f"   best_validation_score={best_valid_count}/{total_reqs}")
diff --git a/mellea/plugins/builtin_debug/validation.py b/mellea/plugins/builtin_debug/validation.py
new file mode 100644
index 000000000..aa9938b35
--- /dev/null
+++ b/mellea/plugins/builtin_debug/validation.py
@@ -0,0 +1,91 @@
+"""Built-in debug plugin for validation pipeline.
+
+Provides tracing for requirement validation including pre-check setup, post-check
+results, and detailed requirement-by-requirement analysis.
+
+Examples:
+    Enable validation tracing:
+
+        from mellea.plugins.builtin_debug.validation import (
+            log_validation_pre_check,
+            log_validation_post_check,
+        )
+        from mellea.plugins import register
+
+        register([
+            log_validation_pre_check,
+            log_validation_post_check,
+        ])
+
+        with start_session() as m:
+            result = m.instruct("...", requirements=[...])
+"""
+
+from __future__ import annotations
+
+import logging
+
+from mellea.plugins import HookType, hook
+
+logger = logging.getLogger(__name__)
+
+
+@hook(HookType.VALIDATION_PRE_CHECK)
+async def log_validation_pre_check(payload, ctx):
+    """Log validation setup before requirements are checked."""
+    num_reqs = len(payload.requirements)
+    target_type = type(payload.target).__name__ if payload.target else "None"
+
+    logger.info(
+        f"[🔍 VALIDATION-PRE-CHECK] requirements={num_reqs} | target={target_type}"
+    )
+
+    if payload.requirements:  # the requirement listing is DEBUG-only
+        logger.debug("   Requirements to validate:")
+        for i, req in enumerate(payload.requirements, 1):
+            req_desc = getattr(req, "description", str(req))
+            req_type = type(req).__name__  # class name, shown in [brackets]
+            logger.debug(f"     {i}. [{req_type}] {req_desc}")
+
+
+@hook(HookType.VALIDATION_POST_CHECK)
+async def log_validation_post_check(payload, ctx):
+    """Log validation results after requirements are checked."""
+    passed = payload.passed_count
+    failed = payload.failed_count
+    total = len(payload.requirements)
+    all_passed = payload.all_validations_passed
+
+    if all_passed:
+        logger.info(
+            f"[✅ VALIDATION-POST-CHECK] ALL PASSED: {passed}/{total} requirements"
+        )
+    else:
+        logger.info(
+            f"[❌ VALIDATION-POST-CHECK] MIXED RESULTS: {passed}/{total} passed, "
+            f"{failed}/{total} failed"
+        )
+
+    # Log detailed results per requirement. zip() pairs requirements and results
+    # by position and stops at the shorter sequence.
+    if payload.results:
+        for req, result in zip(payload.requirements, payload.results):
+            req_desc = getattr(req, "description", str(req))
+            is_passed = result.as_bool()
+            status = "✓" if is_passed else "❌"
+
+            # Passing requirements are DEBUG-only; failures are logged at INFO.
+            if is_passed:
+                logger.debug(f"    {status} {req_desc}")
+            else:
+                logger.info(f"    {status} {req_desc}")
+
+                # An empty reason is already falsy; "yes"/"no" carry no detail.
+                reason = getattr(result, "reason", None)
+                if reason and reason not in ("yes", "no"):
+                    logger.info(f"       └─ {reason}")
+
+                # Show score if available (only reached for failed requirements)
+                score = getattr(result, "score", None)
+                if score is not None:
+                    logger.debug(f"       └─ score: {score:.2f}")

From 3a0e64aa117ca1ac23084528f7605141fb9d6e28 Mon Sep 17 00:00:00 2001
From: Akihiko Kuroda <akihikokuroda2020@gmail.com>
Date: Wed, 10 Jun 2026 17:02:23 -0400
Subject: [PATCH 2/4] add debug plugin collections

Signed-off-by: Akihiko Kuroda <akihikokuroda2020@gmail.com>
---
 mellea/plugins/builtin_debug/generation.py | 14 +++++++++--
 mellea/plugins/builtin_debug/sampling.py   | 28 ++++++++++++++++++----
 mellea/plugins/builtin_debug/validation.py | 14 +++++++++--
 3 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/mellea/plugins/builtin_debug/generation.py b/mellea/plugins/builtin_debug/generation.py
index 9e0a1bcac..bd9e6afde 100644
--- a/mellea/plugins/builtin_debug/generation.py
+++ b/mellea/plugins/builtin_debug/generation.py
@@ -104,7 +104,12 @@ def _get_token_usage(payload) -> str:
 
 @hook(HookType.GENERATION_PRE_CALL)
 async def log_generation_pre_call(payload, ctx):
-    """Log request details before calling the LLM."""
+    """Log request details before calling the LLM.
+
+    Args:
+        payload: GenerationPreCallPayload containing backend, action, generation_id.
+        ctx: Plugin context for hook execution.
+    """
     model = getattr(payload, "backend", None)
     model_id = model.model_id if model else "unknown"
     gen_id = payload.generation_id or "no-id"
@@ -148,7 +153,12 @@ async def log_generation_pre_call(payload, ctx):
 
 @hook(HookType.GENERATION_POST_CALL)
 async def log_generation_post_call(payload, ctx):
-    """Log response details after LLM returns."""
+    """Log response details after LLM returns.
+
+    Args:
+        payload: GenerationPostCallPayload containing model_output, generation_id, latency_ms.
+        ctx: Plugin context for hook execution.
+    """
     model_output = getattr(payload, "model_output", None)
     model_id = "unknown"
     if model_output:
diff --git a/mellea/plugins/builtin_debug/sampling.py b/mellea/plugins/builtin_debug/sampling.py
index 92102d024..3cd3c43c7 100644
--- a/mellea/plugins/builtin_debug/sampling.py
+++ b/mellea/plugins/builtin_debug/sampling.py
@@ -36,7 +36,12 @@
 
 @hook(HookType.SAMPLING_LOOP_START)
 async def log_sampling_loop_start(payload, ctx):
-    """Log sampling strategy initialization."""
+    """Log sampling strategy initialization with budget and requirement count.
+
+    Args:
+        payload: SamplingLoopStartPayload with strategy_name, loop_budget, requirements.
+        ctx: Plugin context for hook execution.
+    """
     strategy = payload.strategy_name
     budget = payload.loop_budget
     num_reqs = len(payload.requirements)
@@ -54,7 +59,12 @@ async def log_sampling_loop_start(payload, ctx):
 
 @hook(HookType.SAMPLING_ITERATION)
 async def log_sampling_iteration(payload, ctx):
-    """Log validation results for each sampling attempt."""
+    """Log validation results for each sampling attempt.
+
+    Args:
+        payload: SamplingIterationPayload with iteration, valid_count, validation_results.
+        ctx: Plugin context for hook execution.
+    """
     iteration = payload.iteration
     passed = payload.valid_count
     total = payload.total_count
@@ -85,7 +95,12 @@ async def log_sampling_iteration(payload, ctx):
 
 @hook(HookType.SAMPLING_REPAIR)
 async def log_sampling_repair(payload, ctx):
-    """Log when repair is triggered (RepairTemplateStrategy only)."""
+    """Log when repair is triggered during sampling iterations.
+
+    Args:
+        payload: SamplingRepairPayload with repair_iteration, repair_type, failed_validations.
+        ctx: Plugin context for hook execution.
+    """
     iteration = payload.repair_iteration
     repair_type = payload.repair_type
 
@@ -101,7 +116,12 @@ async def log_sampling_repair(payload, ctx):
 
 @hook(HookType.SAMPLING_LOOP_END)
 async def log_sampling_loop_end(payload, ctx):
-    """Log sampling completion and overall results."""
+    """Log sampling completion with success status and attempt statistics.
+
+    Args:
+        payload: SamplingLoopEndPayload with success, iterations_used, all_results, all_validations.
+        ctx: Plugin context for hook execution.
+    """
     strategy = payload.strategy_name
     iterations = payload.iterations_used
     success = payload.success
diff --git a/mellea/plugins/builtin_debug/validation.py b/mellea/plugins/builtin_debug/validation.py
index aa9938b35..2a497d0cc 100644
--- a/mellea/plugins/builtin_debug/validation.py
+++ b/mellea/plugins/builtin_debug/validation.py
@@ -32,7 +32,12 @@
 
 @hook(HookType.VALIDATION_PRE_CHECK)
 async def log_validation_pre_check(payload, ctx):
-    """Log validation setup before requirements are checked."""
+    """Log validation setup before requirements are checked.
+
+    Args:
+        payload: ValidationPreCheckPayload containing requirements and target.
+        ctx: Plugin context for hook execution.
+    """
     num_reqs = len(payload.requirements)
     target_type = type(payload.target).__name__ if payload.target else "None"
 
@@ -50,7 +55,12 @@ async def log_validation_pre_check(payload, ctx):
 
 @hook(HookType.VALIDATION_POST_CHECK)
 async def log_validation_post_check(payload, ctx):
-    """Log validation results after requirements are checked."""
+    """Log validation results after requirements are checked.
+
+    Args:
+        payload: ValidationPostCheckPayload with passed_count, failed_count, results.
+        ctx: Plugin context for hook execution.
+    """
     passed = payload.passed_count
     failed = payload.failed_count
     total = len(payload.requirements)

From d1ab9a32f651301760195e2425dc4948467cd95d Mon Sep 17 00:00:00 2001
From: Akihiko Kuroda <akihikokuroda2020@gmail.com>
Date: Wed, 10 Jun 2026 18:05:51 -0400
Subject: [PATCH 3/4] add debug plugin collections

Signed-off-by: Akihiko Kuroda <akihikokuroda2020@gmail.com>
---
 mellea/plugins/builtin_debug/generation.py | 13 ++++++++++--
 mellea/plugins/builtin_debug/sampling.py   | 23 ++++++++++++++++++----
 mellea/plugins/builtin_debug/validation.py | 13 ++++++++++--
 3 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/mellea/plugins/builtin_debug/generation.py b/mellea/plugins/builtin_debug/generation.py
index bd9e6afde..8f7d5086a 100644
--- a/mellea/plugins/builtin_debug/generation.py
+++ b/mellea/plugins/builtin_debug/generation.py
@@ -19,8 +19,13 @@
 from __future__ import annotations
 
 import logging
+from typing import Any
 
 from mellea.plugins import HookType, hook
+from mellea.plugins.hooks.generation import (
+    GenerationPostCallPayload,
+    GenerationPreCallPayload,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -103,7 +108,9 @@ def _get_token_usage(payload) -> str:
 
 
 @hook(HookType.GENERATION_PRE_CALL)
-async def log_generation_pre_call(payload, ctx):
+async def log_generation_pre_call(
+    payload: GenerationPreCallPayload, ctx: dict[str, Any]
+) -> None:
     """Log request details before calling the LLM.
 
     Args:
@@ -152,7 +159,9 @@ async def log_generation_pre_call(payload, ctx):
 
 
 @hook(HookType.GENERATION_POST_CALL)
-async def log_generation_post_call(payload, ctx):
+async def log_generation_post_call(
+    payload: GenerationPostCallPayload, ctx: dict[str, Any]
+) -> None:
     """Log response details after LLM returns.
 
     Args:
diff --git a/mellea/plugins/builtin_debug/sampling.py b/mellea/plugins/builtin_debug/sampling.py
index 3cd3c43c7..918bd4762 100644
--- a/mellea/plugins/builtin_debug/sampling.py
+++ b/mellea/plugins/builtin_debug/sampling.py
@@ -28,14 +28,23 @@
 from __future__ import annotations
 
 import logging
+from typing import Any
 
 from mellea.plugins import HookType, hook
+from mellea.plugins.hooks.sampling import (
+    SamplingIterationPayload,
+    SamplingLoopEndPayload,
+    SamplingLoopStartPayload,
+    SamplingRepairPayload,
+)
 
 logger = logging.getLogger(__name__)
 
 
 @hook(HookType.SAMPLING_LOOP_START)
-async def log_sampling_loop_start(payload, ctx):
+async def log_sampling_loop_start(
+    payload: SamplingLoopStartPayload, ctx: dict[str, Any]
+) -> None:
     """Log sampling strategy initialization with budget and requirement count.
 
     Args:
@@ -58,7 +67,9 @@ async def log_sampling_loop_start(payload, ctx):
 
 
 @hook(HookType.SAMPLING_ITERATION)
-async def log_sampling_iteration(payload, ctx):
+async def log_sampling_iteration(
+    payload: SamplingIterationPayload, ctx: dict[str, Any]
+) -> None:
     """Log validation results for each sampling attempt.
 
     Args:
@@ -94,7 +105,9 @@ async def log_sampling_iteration(payload, ctx):
 
 
 @hook(HookType.SAMPLING_REPAIR)
-async def log_sampling_repair(payload, ctx):
+async def log_sampling_repair(
+    payload: SamplingRepairPayload, ctx: dict[str, Any]
+) -> None:
     """Log when repair is triggered during sampling iterations.
 
     Args:
@@ -115,7 +128,9 @@ async def log_sampling_repair(payload, ctx):
 
 
 @hook(HookType.SAMPLING_LOOP_END)
-async def log_sampling_loop_end(payload, ctx):
+async def log_sampling_loop_end(
+    payload: SamplingLoopEndPayload, ctx: dict[str, Any]
+) -> None:
     """Log sampling completion with success status and attempt statistics.
 
     Args:
diff --git a/mellea/plugins/builtin_debug/validation.py b/mellea/plugins/builtin_debug/validation.py
index 2a497d0cc..6e5b0fd5c 100644
--- a/mellea/plugins/builtin_debug/validation.py
+++ b/mellea/plugins/builtin_debug/validation.py
@@ -24,14 +24,21 @@
 from __future__ import annotations
 
 import logging
+from typing import Any
 
 from mellea.plugins import HookType, hook
+from mellea.plugins.hooks.validation import (
+    ValidationPostCheckPayload,
+    ValidationPreCheckPayload,
+)
 
 logger = logging.getLogger(__name__)
 
 
 @hook(HookType.VALIDATION_PRE_CHECK)
-async def log_validation_pre_check(payload, ctx):
+async def log_validation_pre_check(
+    payload: ValidationPreCheckPayload, ctx: dict[str, Any]
+) -> None:
     """Log validation setup before requirements are checked.
 
     Args:
@@ -54,7 +61,9 @@ async def log_validation_pre_check(payload, ctx):
 
 
 @hook(HookType.VALIDATION_POST_CHECK)
-async def log_validation_post_check(payload, ctx):
+async def log_validation_post_check(
+    payload: ValidationPostCheckPayload, ctx: dict[str, Any]
+) -> None:
     """Log validation results after requirements are checked.
 
     Args:

From e8dea4f70eb70be987f7b8560f40a91262c44cbb Mon Sep 17 00:00:00 2001
From: Akihiko Kuroda <akihikokuroda2020@gmail.com>
Date: Wed, 10 Jun 2026 18:18:41 -0400
Subject: [PATCH 4/4] add debug plugin collections

Signed-off-by: Akihiko Kuroda <akihikokuroda2020@gmail.com>
---
 docs/docs/how-to/debug-with-plugins.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/docs/how-to/debug-with-plugins.md b/docs/docs/how-to/debug-with-plugins.md
index 8a63f45fa..1c94faa78 100644
--- a/docs/docs/how-to/debug-with-plugins.md
+++ b/docs/docs/how-to/debug-with-plugins.md
@@ -4,8 +4,7 @@ description: "Use built-in debug plugins to trace generation, validation, and sa
 # diataxis: how-to
 ---
 
-**Prerequisites:** [The Requirements System](../concepts/requirements-system),
-[Sampling Strategies](../concepts/sampling-strategies), `pip install mellea`.
+**Prerequisites:** [The Requirements System](../concepts/requirements-system), `pip install mellea`.
 
 Mellea's plugin system provides debug hooks that trace the full lifecycle of
 generation, validation, and sampling. Use these plugins to understand: