From 5a88d153050726b017d821b22d3a27755ee3146b Mon Sep 17 00:00:00 2001 From: Akihiko Kuroda Date: Wed, 10 Jun 2026 16:24:47 -0400 Subject: [PATCH 1/4] add debug plugin collections Signed-off-by: Akihiko Kuroda --- docs/docs/how-to/debug-with-plugins.md | 310 ++++++++++++++++++ .../plugins/builtin_complete_diagnostics.py | 121 +++++++ .../plugins/builtin_full_pipeline_tracing.py | 118 +++++++ .../plugins/builtin_generation_tracing.py | 60 ++++ .../plugins/builtin_sampling_diagnostics.py | 105 ++++++ .../plugins/builtin_validation_failures.py | 91 +++++ .../plugins/builtin_validation_strict.py | 95 ++++++ .../plugins/builtin_validation_tracing.py | 127 +++++++ mellea/plugins/builtin_debug/__init__.py | 55 ++++ mellea/plugins/builtin_debug/generation.py | 178 ++++++++++ mellea/plugins/builtin_debug/sampling.py | 132 ++++++++ mellea/plugins/builtin_debug/validation.py | 91 +++++ 12 files changed, 1483 insertions(+) create mode 100644 docs/docs/how-to/debug-with-plugins.md create mode 100644 docs/examples/plugins/builtin_complete_diagnostics.py create mode 100644 docs/examples/plugins/builtin_full_pipeline_tracing.py create mode 100644 docs/examples/plugins/builtin_generation_tracing.py create mode 100644 docs/examples/plugins/builtin_sampling_diagnostics.py create mode 100644 docs/examples/plugins/builtin_validation_failures.py create mode 100644 docs/examples/plugins/builtin_validation_strict.py create mode 100644 docs/examples/plugins/builtin_validation_tracing.py create mode 100644 mellea/plugins/builtin_debug/__init__.py create mode 100644 mellea/plugins/builtin_debug/generation.py create mode 100644 mellea/plugins/builtin_debug/sampling.py create mode 100644 mellea/plugins/builtin_debug/validation.py diff --git a/docs/docs/how-to/debug-with-plugins.md b/docs/docs/how-to/debug-with-plugins.md new file mode 100644 index 000000000..8a63f45fa --- /dev/null +++ b/docs/docs/how-to/debug-with-plugins.md @@ -0,0 +1,310 @@ +--- +title: "Debug with Plugins" +description: 
"Use built-in debug plugins to trace generation, validation, and sampling behavior in detail." +# diataxis: how-to +--- + +**Prerequisites:** [The Requirements System](../concepts/requirements-system), +[Sampling Strategies](../concepts/sampling-strategies), `pip install mellea`. + +Mellea's plugin system provides debug hooks that trace the full lifecycle of +generation, validation, and sampling. Use these plugins to understand: + +- What prompts are sent to the LLM +- Model latency and token usage +- Which requirements pass/fail and why +- When repair strategies trigger and what feedback they provide +- End-to-end flow through the sampling loop + +## Built-in debug plugins + +Mellea ships with three categories of debug plugins in `mellea.plugins.builtin_debug`: + +### Generation pipeline plugins + +Trace all LLM backend calls with request/response inspection, latency, and tokens. + +```python +from mellea.plugins.builtin_debug.generation import ( + log_generation_pre_call, + log_generation_post_call, +) +from mellea.plugins import register + +register([ + log_generation_pre_call, + log_generation_post_call, +]) +``` + +**Output:** + +```text +[📤 GEN-PRE-CALL gen_id=abc123...] model=granite4.1:3b | prompt=Write a thank you note +[📥 GEN-POST-CALL gen_id=abc123...] model=granite4.1:3b | latency=397ms | tokens=(47+19=66) | response=hello there thank you... +``` + +**Logs:** + +- Generation ID for correlation +- Model being called +- Request: prompt preview (first 100 chars) +- Response: preview, latency, token counts +- **Repair feedback** when present (shows guidance the model receives during repair) + +### Validation pipeline plugins + +Trace requirement validation with pre-check setup and per-requirement results. 
+ +```python +from mellea.plugins.builtin_debug.validation import ( + log_validation_pre_check, + log_validation_post_check, +) +from mellea.plugins import register + +register([ + log_validation_pre_check, + log_validation_post_check, +]) +``` + +**Output:** + +```text +[🔍 VALIDATION-PRE-CHECK] requirements=3 | target=ModelOutputThunk + +[❌ VALIDATION-POST-CHECK] MIXED RESULTS: 2/3 passed, 1/3 failed + ✓ Use only lowercase letters + ✓ Include the phrase 'thank you' + ❌ Start with a greeting + └─ validated as "no" +``` + +**Logs:** + +- Pre-check: how many requirements, what's being validated +- Post-check: pass/fail count per requirement +- Per-requirement status with reasons for failures + +### Sampling pipeline plugins + +Trace the sampling strategy lifecycle including iterations, validation results, +and repair events. + +```python +from mellea.plugins.builtin_debug.sampling import ( + log_sampling_loop_start, + log_sampling_iteration, + log_sampling_repair, + log_sampling_loop_end, +) +from mellea.plugins import register + +register([ + log_sampling_loop_start, + log_sampling_iteration, + log_sampling_repair, + log_sampling_loop_end, +]) +``` + +**Output:** + +```text +[🎯 SAMPLING-START] strategy=RepairTemplateStrategy | loop_budget=3 | requirements=3 + +[❌ SAMPLING-ITER 1] FAILED: 2/3 validations passed + ❌ Start with a greeting + +[🔧 REPAIR-TRIGGERED] at iteration 1 + repair_type=template + failed_validations: + • Start with a greeting + +[✅ SAMPLING-ITER 2] SUCCESS: 3/3 validations passed + +[🎉 SAMPLING-END] SUCCESS in 2 iteration(s) using RepairTemplateStrategy + total_attempts=2 + best_validation_score=3/3 +``` + +**Logs:** + +- Loop start: strategy, budget, requirement count +- Each iteration: pass/fail count, failed requirement names +- Repair events: when triggered, repair type, failed requirements +- Loop end: success/failure, iterations used, final statistics + +## Enabling multiple plugins together + +Combine plugins for 
complete end-to-end visibility: + +```python +from mellea.plugins.builtin_debug.generation import ( + log_generation_pre_call, + log_generation_post_call, +) +from mellea.plugins.builtin_debug.validation import ( + log_validation_pre_check, + log_validation_post_check, +) +from mellea.plugins.builtin_debug.sampling import ( + log_sampling_loop_start, + log_sampling_iteration, + log_sampling_repair, + log_sampling_loop_end, +) +from mellea.plugins import register + +register([ + # Generation hooks + log_generation_pre_call, + log_generation_post_call, + # Validation hooks + log_validation_pre_check, + log_validation_post_check, + # Sampling hooks + log_sampling_loop_start, + log_sampling_iteration, + log_sampling_repair, + log_sampling_loop_end, +]) +``` + +This reveals the complete flow: + +```text +[🎯 SAMPLING-START] strategy=... | loop_budget=... | requirements=... + +[📤 GEN-PRE-CALL] prompt=... +[📥 GEN-POST-CALL] response=... | latency=... | tokens=... + +[🔍 VALIDATION-PRE-CHECK] requirements=... | target=... +[📤 GEN-PRE-CALL] prompt=Start with a greeting (validation check) +[📥 GEN-POST-CALL] response=no +[❌ VALIDATION-POST-CHECK] MIXED RESULTS: 2/3 passed, 1/3 failed + +[❌ SAMPLING-ITER 1] FAILED: 2/3 validations passed + +[🔧 REPAIR-TRIGGERED] at iteration 1 + failed_validations: Start with a greeting + +[📤 GEN-PRE-CALL] prompt=Write a thank you note + [⭐ REPAIR ATTEMPT] Repair feedback provided: ... +[📥 GEN-POST-CALL] response=... | latency=... | tokens=... + +[🔍 VALIDATION-PRE-CHECK] requirements=... | target=... 
+[📤 GEN-PRE-CALL] prompt=Start with a greeting +[📥 GEN-POST-CALL] response=yes +[✅ VALIDATION-POST-CHECK] ALL PASSED: 3/3 requirements + +[✅ SAMPLING-ITER 2] SUCCESS: 3/3 validations passed + +[🎉 SAMPLING-END] SUCCESS in 2 iteration(s) +``` + +## Example scripts + +Ready-to-run examples are available in `docs/examples/plugins/`: + +| Script | Plugins | Purpose | +| --------------------------------- | -------------------------- | ----------------------------------- | +| `builtin_generation_tracing.py` | Generation | Basic model call tracing | +| `builtin_validation_tracing.py` | Validation | Requirement validation | +| `builtin_validation_failures.py` | Validation | Show validation failures | +| `builtin_validation_strict.py` | Validation | Strict requirements that force failures | +| `builtin_sampling_diagnostics.py` | Sampling | Strategy iterations | +| `builtin_full_pipeline_tracing.py`| Generation + Sampling | End-to-end with model visibility | +| `builtin_complete_diagnostics.py` | All 3 | Complete pipeline with validation | + +Run any example: + +```bash +uv run python docs/examples/plugins/builtin_generation_tracing.py +uv run python docs/examples/plugins/builtin_validation_failures.py +uv run python docs/examples/plugins/builtin_complete_diagnostics.py +``` + +## Common debugging scenarios + +### "Why is the model generating a different response than I expected?" + +Enable **generation tracing** to see: + +- Exactly what prompt was sent +- Model's latency and token usage +- Response preview +- When repair feedback is provided (if using RepairTemplateStrategy) + +This shows whether the issue is in the prompt, model behavior, or repair strategy. + +### "Why are my requirements failing?" + +Enable **validation tracing** to see: + +- Each requirement being checked +- Pass/fail status per requirement +- Failure reason (e.g., "validated as 'no'") +- Pass/fail counts + +This pinpoints which requirements are problematic and why. + +### "Why isn't the repair strategy helping?" 
+ +Enable **all three plugin categories** to see: + +- Initial attempt (generation + validation) +- What failed (validation results) +- Repair feedback provided (in generation pre-call logs) +- Second attempt with feedback (generation + validation) +- Whether the repair improved the results + +This reveals whether the repair strategy is receiving the right feedback and the model is responding appropriately. + +### "Why is sampling taking so long?" + +Enable **sampling tracing** to see: + +- How many iterations ran +- Validation results per iteration +- When repairs were triggered +- Total attempts before success/failure + +This identifies whether the issue is budget exhaustion, frequent failures, or ineffective repair. + +## Controlling log output + +By default, debug plugins log at INFO level for important events and DEBUG level +for details. Control verbosity: + +```python +import logging + +# Show only failures and key events +logging.basicConfig(level=logging.INFO) + +# Or show all details including passed requirements (use one basicConfig call, not both) +logging.basicConfig(level=logging.DEBUG) + +# Silence noisy third-party loggers +logging.getLogger("httpx").setLevel(logging.ERROR) +logging.getLogger("ollama").setLevel(logging.ERROR) +``` + +## Performance notes + +Debug plugins have minimal overhead: + +- Pre-hooks check whether plugins are registered before building payloads +- Logging is formatted efficiently +- No plugins fire in the hot path when not registered + +For production use, you can safely leave plugins registered — they only log when +enabled. For maximum performance, simply don't register them. 
+ +## Next steps + +- [Observability: Tracing](../observability/tracing.md) — export traces to Jaeger or Grafana +- [Handling Exceptions and Failures](./handling-exceptions.md) — work with sampling failures +- [The Requirements System](../concepts/requirements-system) — understand validation in depth diff --git a/docs/examples/plugins/builtin_complete_diagnostics.py b/docs/examples/plugins/builtin_complete_diagnostics.py new file mode 100644 index 000000000..aafc26238 --- /dev/null +++ b/docs/examples/plugins/builtin_complete_diagnostics.py @@ -0,0 +1,121 @@ +# pytest: ollama, e2e +"""Complete diagnostics example with all debug plugins. + +This example combines generation, sampling, AND validation tracing for maximum +visibility into the entire sampling + validation pipeline. + +You'll see: +1. Validation pre-check: requirements about to be checked +2. Generation pre/post-call: model called for main task + validation +3. Validation post-check: results per requirement +4. Sampling iteration: aggregate pass/fail count +5. Repair events: when and why repairs triggered +6. Final result: success/failure with statistics + +This provides complete end-to-end traceability for: +- Model behavior (generation) +- Validation logic (what passes/fails and why) +- Repair strategy (how feedback improves results) +- Overall sampling loop (iterations and budget) + +Run: + uv run python docs/examples/plugins/builtin_complete_diagnostics.py + +Watch the complete flow with all lifecycle events visible. 
+""" + +import logging + +import mellea +from mellea.core import Requirement +from mellea.plugins import register +from mellea.plugins.builtin_debug.generation import ( + log_generation_post_call, + log_generation_pre_call, +) +from mellea.plugins.builtin_debug.sampling import ( + log_sampling_iteration, + log_sampling_loop_end, + log_sampling_loop_start, + log_sampling_repair, +) +from mellea.plugins.builtin_debug.validation import ( + log_validation_post_check, + log_validation_pre_check, +) +from mellea.stdlib.requirements import req, simple_validate +from mellea.stdlib.sampling import RepairTemplateStrategy + +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +log = logging.getLogger(__name__) + +# Enable ALL debug plugins +register( + [ + # Generation pipeline + log_generation_pre_call, + log_generation_post_call, + # Validation pipeline + log_validation_pre_check, + log_validation_post_check, + # Sampling pipeline + log_sampling_loop_start, + log_sampling_iteration, + log_sampling_repair, + log_sampling_loop_end, + ] +) + + +def is_lowercase_only(text: str) -> bool: + """Validation: text must be all lowercase.""" + return text == text.lower() + + +def has_thank_you(text: str) -> bool: + """Validation: text must contain 'thank you'.""" + return "thank you" in text.lower() + + +requirements: list[Requirement | str] = [ + req("Start with a greeting"), + req( + "Use only lowercase letters (no capitals)", + validation_fn=simple_validate(is_lowercase_only), + ), + req("Include the phrase 'thank you'", validation_fn=simple_validate(has_thank_you)), +] + + +def main(): + """Complete diagnostics example.""" + log.info("=" * 70) + log.info("Complete Diagnostics Example (All Debug Plugins)") + log.info("=" * 70) + log.info("") + log.info("All debug plugins are enabled:") + log.info(" [📤/📥 GEN-*] - model calls and responses") + log.info(" [🔍 VALIDATION-*] - requirement validation") + log.info(" [❌ SAMPLING-ITER] - iteration results") + 
log.info(" [🔧 REPAIR] - repair events") + log.info("") + + with mellea.start_session() as m: + log.info("Generating text with strict requirements and repair strategy...") + log.info("") + + result = m.instruct( + "Write a thank you note", + requirements=requirements, + strategy=RepairTemplateStrategy(loop_budget=3), + ) + + log.info("") + log.info("=" * 70) + log.info("Final result:") + log.info(str(result)[:300] + ("..." if len(str(result)) > 300 else "")) + log.info("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/docs/examples/plugins/builtin_full_pipeline_tracing.py b/docs/examples/plugins/builtin_full_pipeline_tracing.py new file mode 100644 index 000000000..b3500e2b1 --- /dev/null +++ b/docs/examples/plugins/builtin_full_pipeline_tracing.py @@ -0,0 +1,118 @@ +# pytest: ollama, e2e +"""Full pipeline tracing example combining generation and sampling diagnostics. + +This example demonstrates both the generation and sampling debug plugins working +together to provide end-to-end visibility into the entire sampling loop. + +You'll see: +1. Generation pre-call: what's being sent to the LLM +2. Generation post-call: model response, latency, and tokens +3. Sampling iteration: validation results per attempt +4. Repair events: when and why repairs are triggered +5. Final result: success/failure with statistics + +This combined view helps debug complex interactions between: +- Model behavior (generation tracing) +- Validation logic (sampling diagnostics) +- Repair strategies (how feedback improves results) + +Run: + uv run python docs/examples/plugins/builtin_full_pipeline_tracing.py + +Watch the complete flow: + [📤 GEN-PRE-CALL] → [📥 GEN-POST-CALL] (model called) + [❌ SAMPLING-ITER] → [🔧 REPAIR-TRIGGERED] (validation failed) + [📤 GEN-PRE-CALL] → [📥 GEN-POST-CALL] (repair attempt) + [❌ SAMPLING-ITER] → [🔧 REPAIR-TRIGGERED] (still failing) + ... 
+ [💥 SAMPLING-END] (final result) +""" + +import logging + +import mellea +from mellea.core import Requirement +from mellea.plugins import register +from mellea.plugins.builtin_debug.generation import ( + log_generation_post_call, + log_generation_pre_call, +) +from mellea.plugins.builtin_debug.sampling import ( + log_sampling_iteration, + log_sampling_loop_end, + log_sampling_loop_start, + log_sampling_repair, +) +from mellea.stdlib.requirements import req, simple_validate +from mellea.stdlib.sampling import RepairTemplateStrategy + +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +log = logging.getLogger(__name__) + +# Enable both generation and sampling tracing +register( + [ + # Generation pipeline + log_generation_pre_call, + log_generation_post_call, + # Sampling pipeline + log_sampling_loop_start, + log_sampling_iteration, + log_sampling_repair, + log_sampling_loop_end, + ] +) + + +def is_lowercase_only(text: str) -> bool: + """Validation: text must be all lowercase.""" + return text == text.lower() + + +def has_thank_you(text: str) -> bool: + """Validation: text must contain 'thank you'.""" + return "thank you" in text.lower() + + +requirements: list[Requirement | str] = [ + req("Start with a greeting"), + req( + "Use only lowercase letters (no capitals)", + validation_fn=simple_validate(is_lowercase_only), + ), + req("Include the phrase 'thank you'", validation_fn=simple_validate(has_thank_you)), +] + + +def main(): + """Full pipeline tracing example.""" + log.info("=" * 70) + log.info("Full Pipeline Tracing Example") + log.info("=" * 70) + log.info("") + log.info("Watch for both generation AND sampling events:") + log.info(" [📤 GEN-PRE-CALL] - what's sent to the LLM") + log.info(" [📥 GEN-POST-CALL] - model response + latency") + log.info(" [❌ SAMPLING-ITER] - validation results") + log.info(" [🔧 REPAIR-TRIGGERED] - repair strategy kicks in") + log.info("") + + with mellea.start_session() as m: + log.info("Generating text with 
strict requirements and repair strategy...") + log.info("") + + result = m.instruct( + "Write a thank you note", + requirements=requirements, + strategy=RepairTemplateStrategy(loop_budget=3), + ) + + log.info("") + log.info("=" * 70) + log.info("Final result:") + log.info(str(result)[:300] + ("..." if len(str(result)) > 300 else "")) + log.info("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/docs/examples/plugins/builtin_generation_tracing.py b/docs/examples/plugins/builtin_generation_tracing.py new file mode 100644 index 000000000..8e5f5c534 --- /dev/null +++ b/docs/examples/plugins/builtin_generation_tracing.py @@ -0,0 +1,60 @@ +# pytest: ollama, e2e +"""Built-in generation tracing plugin example. + +This example demonstrates the GenerationTracingPlugin from mellea.plugins.builtin_debug, +which traces all LLM backend calls with request/response inspection. + +The plugin logs: +- Generation ID for correlation +- Model being called +- Prompt preview (first 100 chars) +- Response preview (first 100 chars) +- Latency in milliseconds +- Token usage (prompt + completion = total) + +Run: + uv run python docs/examples/plugins/builtin_generation_tracing.py + +Watch the logs to see tracing in action: + [📤 GEN-PRE-CALL gen_id=...] model=... | prompt=... + [📥 GEN-POST-CALL gen_id=...] model=... | latency=...ms | tokens=(...) | response=... 
+""" + +import logging + +import mellea +from mellea.plugins import register +from mellea.plugins.builtin_debug.generation import ( + log_generation_post_call, + log_generation_pre_call, +) + +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +log = logging.getLogger(__name__) + +# Enable generation tracing by registering the hooks +register([log_generation_pre_call, log_generation_post_call]) + + +def main(): + """Example: Use GenerationTracingPlugin to debug generation calls.""" + log.info("=" * 70) + log.info("Generation Tracing Plugin Example") + log.info("=" * 70) + log.info("") + log.info("Watch the logs for [📤 GEN-PRE-CALL] and [📥 GEN-POST-CALL] entries.") + log.info("") + + with mellea.start_session() as m: + result = m.instruct("What are the three main colors of the rainbow?") + + log.info("") + log.info("=" * 70) + log.info("Final result:") + result_str = str(result) + log.info(result_str[:200] + "..." if len(result_str) > 200 else result_str) + log.info("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/docs/examples/plugins/builtin_sampling_diagnostics.py b/docs/examples/plugins/builtin_sampling_diagnostics.py new file mode 100644 index 000000000..ed6eab68c --- /dev/null +++ b/docs/examples/plugins/builtin_sampling_diagnostics.py @@ -0,0 +1,105 @@ +# pytest: ollama, e2e +"""Built-in sampling diagnostics plugin example. + +This example demonstrates the sampling hooks from mellea.plugins.builtin_debug, +which trace the sampling strategy pipeline with iteration tracking, validation +results, repair events, and success/failure analysis. 
+ +The plugin logs: +- Strategy initialization with loop budget and requirements count +- Each iteration with validation pass/fail status +- Detailed validation results per requirement +- Repair events when triggered +- Final sampling result with success/failure reason + +Run: + uv run python docs/examples/plugins/builtin_sampling_diagnostics.py + +Watch the logs to see: + [🎯 SAMPLING-START] strategy=... | loop_budget=... | requirements=... + [✅ SAMPLING-ITER 1] SUCCESS: ... validations passed + [❌ SAMPLING-ITER 2] FAILED: ... validations passed + [🔧 REPAIR-TRIGGERED] at iteration ... + [🎉 SAMPLING-END] SUCCESS in ... iterations +""" + +import logging + +import mellea +from mellea.core import Requirement +from mellea.plugins import register +from mellea.plugins.builtin_debug.sampling import ( + log_sampling_iteration, + log_sampling_loop_end, + log_sampling_loop_start, + log_sampling_repair, +) +from mellea.stdlib.requirements import req, simple_validate +from mellea.stdlib.sampling import RepairTemplateStrategy + +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +log = logging.getLogger(__name__) + +# Enable sampling diagnostics +register( + [ + log_sampling_loop_start, + log_sampling_iteration, + log_sampling_repair, + log_sampling_loop_end, + ] +) + + +def is_lowercase_only(text: str) -> bool: + """Validation: text must be all lowercase.""" + return text == text.lower() + + +def has_thank_you(text: str) -> bool: + """Validation: text must contain 'thank you'.""" + return "thank you" in text.lower() + + +requirements: list[Requirement | str] = [ + req("Start with a greeting"), + req( + "Use only lowercase letters (no capitals)", + validation_fn=simple_validate(is_lowercase_only), + ), + req("Include the phrase 'thank you'", validation_fn=simple_validate(has_thank_you)), +] + + +def main(): + """Example: Use sampling diagnostics to debug repair strategies.""" + log.info("=" * 70) + log.info("Sampling Diagnostics Plugin Example") + 
log.info("=" * 70) + log.info("") + log.info("Watch the logs for sampling lifecycle events:") + log.info(" [🎯 SAMPLING-START] - strategy begins") + log.info(" [✅/❌ SAMPLING-ITER] - per-iteration validation results") + log.info(" [🔧 REPAIR-TRIGGERED] - repair invoked (RepairTemplateStrategy only)") + log.info(" [🎉/💥 SAMPLING-END] - final result") + log.info("") + + with mellea.start_session() as m: + log.info("Generating text with strict requirements...") + log.info("") + + result = m.instruct( + "Write a thank you note", + requirements=requirements, + strategy=RepairTemplateStrategy(loop_budget=3), + ) + + log.info("") + log.info("=" * 70) + log.info("Final result:") + log.info(str(result)[:300] + ("..." if len(str(result)) > 300 else "")) + log.info("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/docs/examples/plugins/builtin_validation_failures.py b/docs/examples/plugins/builtin_validation_failures.py new file mode 100644 index 000000000..a714b439e --- /dev/null +++ b/docs/examples/plugins/builtin_validation_failures.py @@ -0,0 +1,91 @@ +# pytest: ollama, e2e +"""Validation tracing example showing real failures. + +This example uses instruct with immediate validation (no repair strategy), +so we'll see real validation failures in the logs. 
+ +Run: + uv run python docs/examples/plugins/builtin_validation_failures.py + +Watch the logs to see: + [🔍 VALIDATION-PRE-CHECK] requirements setup + [❌ VALIDATION-POST-CHECK] with MIXED RESULTS showing: + ✓ Passed requirements (debug level) + ❌ Failed requirements (info level) with reasons +""" + +import logging + +import mellea +from mellea.core import Requirement +from mellea.plugins import register +from mellea.plugins.builtin_debug.validation import ( + log_validation_post_check, + log_validation_pre_check, +) +from mellea.stdlib.requirements import req, simple_validate + +logging.basicConfig(level=logging.DEBUG, format="%(levelname)s: %(message)s") +log = logging.getLogger(__name__) + +# Enable validation tracing +register([log_validation_pre_check, log_validation_post_check]) + + +def requires_hello(text: str) -> bool: + """Validation: must contain the word 'hello'.""" + return "hello" in text.lower() + + +def no_exclamation(text: str) -> bool: + """Validation: no exclamation marks allowed.""" + return "!" not in text + + +def is_lowercase_only(text: str) -> bool: + """Validation: all lowercase.""" + return text == text.lower() + + +def is_short(text: str) -> bool: + """Validation: must be 20 characters or less.""" + return len(text) <= 20 + + +requirements: list[Requirement | str] = [ + req("Must contain the word 'hello'", validation_fn=simple_validate(requires_hello)), + req("No exclamation marks allowed", validation_fn=simple_validate(no_exclamation)), + req("All lowercase", validation_fn=simple_validate(is_lowercase_only)), + req("Must be 20 characters or less", validation_fn=simple_validate(is_short)), +] + + +def main(): + """Example that will show validation failures.""" + log.info("=" * 70) + log.info("Validation Failures Example") + log.info("=" * 70) + log.info("") + log.info("Requirements (with potential for failures):") + for i, req_obj in enumerate(requirements, 1): + req_desc = getattr(req_obj, "description", str(req_obj)) + log.info(f" {i}. 
{req_desc}") + log.info("") + + with mellea.start_session() as m: + log.info("Test: Generate casual greeting (likely to fail some requirements)") + log.info("-" * 70) + log.info("") + + # Use immediate validation (no repair) to see failures + result = m.instruct( + "Say a casual greeting with punctuation", requirements=requirements + ) + log.info("") + log.info(f"Generated text: {result}") + log.info("") + log.info("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/docs/examples/plugins/builtin_validation_strict.py b/docs/examples/plugins/builtin_validation_strict.py new file mode 100644 index 000000000..6dec56fff --- /dev/null +++ b/docs/examples/plugins/builtin_validation_strict.py @@ -0,0 +1,95 @@ +# pytest: ollama, e2e +"""Strict validation tracing example with intentional failures. + +This example demonstrates validation tracing with STRICT requirements designed +to trigger failures so you can see the full validation output including: +- Failed requirement details +- Failure reasons +- Pass/fail counts + +Run: + uv run python docs/examples/plugins/builtin_validation_strict.py + +Watch the logs to see validation failures: + [🔍 VALIDATION-PRE-CHECK] requirements=... | target=... + [❌ VALIDATION-POST-CHECK] MIXED RESULTS: ... passed, ... 
failed + ✓ Passed requirement + ❌ Failed requirement + └─ reason why it failed +""" + +import logging + +import mellea +from mellea.core import Requirement +from mellea.plugins import register +from mellea.plugins.builtin_debug.validation import ( + log_validation_post_check, + log_validation_pre_check, +) +from mellea.stdlib.requirements import req, simple_validate + +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +log = logging.getLogger(__name__) + +# Enable validation tracing +register([log_validation_pre_check, log_validation_post_check]) + + +def is_single_word(text: str) -> bool: + """Validation: response must be exactly one word.""" + return len(text.strip().split()) == 1 + + +def is_all_caps(text: str) -> bool: + """Validation: all letters must be uppercase.""" + return text == text.upper() and text.isalpha() + + +def is_very_short(text: str) -> bool: + """Validation: must be 5 characters or less.""" + return len(text) <= 5 + + +requirements: list[Requirement | str] = [ + req( + "Response must be exactly one word", + validation_fn=simple_validate(is_single_word), + ), + req("All letters must be UPPERCASE", validation_fn=simple_validate(is_all_caps)), + req( + "Response must be 5 characters or less", + validation_fn=simple_validate(is_very_short), + ), +] + + +def main(): + """Example with strict requirements that will fail.""" + log.info("=" * 70) + log.info("Strict Validation Tracing Example") + log.info("=" * 70) + log.info("") + log.info("Intentionally strict requirements to demonstrate validation failures:") + for i, req_obj in enumerate(requirements, 1): + req_desc = getattr(req_obj, "description", str(req_obj)) + log.info(f" {i}. 
{req_desc}") + log.info("") + + with mellea.start_session() as m: + log.info( + "Generating multi-word response (will fail 'single word' requirement):" + ) + log.info("-" * 70) + + result = m.instruct("Say hello world", requirements=requirements) + log.info(f"Result: {result}") + log.info("") + + log.info("=" * 70) + log.info("Validation tracing complete!") + log.info("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/docs/examples/plugins/builtin_validation_tracing.py b/docs/examples/plugins/builtin_validation_tracing.py new file mode 100644 index 000000000..870d0ed14 --- /dev/null +++ b/docs/examples/plugins/builtin_validation_tracing.py @@ -0,0 +1,127 @@ +# pytest: ollama, e2e +"""Validation tracing plugin example. + +This example demonstrates the validation debug plugins from mellea.plugins.builtin_debug, +which trace requirement validation with pre-check setup and detailed per-requirement +results including pass/fail status, reasons, and scores. + +The plugin logs: +- Pre-check: requirements about to be validated, target being checked +- Per-requirement: pass/fail status with reasons for failures +- Post-check: aggregate pass/fail counts and summary + +Run: + uv run python docs/examples/plugins/builtin_validation_tracing.py + +Watch the logs to see: + [🔍 VALIDATION-PRE-CHECK] requirements=... | target=... + [❌ VALIDATION-POST-CHECK] MIXED RESULTS: ... passed, ... 
failed + ✓ Passed requirement + ❌ Failed requirement + └─ reason why it failed +""" + +import logging + +import mellea +from mellea.core import Requirement +from mellea.plugins import register +from mellea.plugins.builtin_debug.validation import ( + log_validation_post_check, + log_validation_pre_check, +) +from mellea.stdlib.requirements import check, req, simple_validate + +logging.basicConfig( + level=logging.DEBUG, # DEBUG to see passed requirements too + format="%(levelname)s: %(message)s", +) +log = logging.getLogger(__name__) + +# Enable validation tracing +register([log_validation_pre_check, log_validation_post_check]) + + +def is_lowercase_only(text: str) -> bool: + """Validation: text must be all lowercase.""" + return text == text.lower() + + +def has_required_phrase(text: str) -> bool: + """Validation: text must contain 'thank you'.""" + return "thank you" in text.lower() + + +def has_proper_length(text: str) -> bool: + """Validation: text should be between 10 and 500 characters.""" + return 10 <= len(text) <= 500 + + +requirements: list[Requirement | str] = [ + req( + "Use only lowercase letters (no capitals)", + validation_fn=simple_validate(is_lowercase_only), + ), + req( + "Include the phrase 'thank you'", + validation_fn=simple_validate(has_required_phrase), + ), + req( + "Text length between 10 and 500 characters", + validation_fn=simple_validate(has_proper_length), + ), + check("Response should be helpful and polite"), +] + + +def main(): + """Example: Use validation tracing to debug requirement checking.""" + log.info("=" * 70) + log.info("Validation Tracing Plugin Example") + log.info("=" * 70) + log.info("") + log.info("Watch the logs for validation lifecycle events:") + log.info(" [🔍 VALIDATION-PRE-CHECK] - setup phase before validation") + log.info(" [❌ VALIDATION-POST-CHECK] - results after validation") + log.info("") + log.info("Requirements being validated:") + for i, req_obj in enumerate(requirements, 1): + req_desc = getattr(req_obj, 
"description", str(req_obj)) + log.info(f" {i}. {req_desc}") + log.info("") + + with mellea.start_session() as m: + log.info("Test 1: Good response (should pass most requirements)") + log.info("-" * 70) + + result1 = m.instruct( + "Say thank you", + requirements=requirements[:3], # Use first 3 requirements only + ) + log.info("") + log.info(f"Result:\n{result1}") + log.info("") + + log.info("Test 2: Uppercase response (should fail lowercase requirement)") + log.info("-" * 70) + + result2 = m.instruct("Say THANK YOU in all caps", requirements=requirements[:3]) + log.info("") + log.info(f"Result:\n{result2}") + log.info("") + + log.info("Test 3: Short response (should fail length requirement)") + log.info("-" * 70) + + result3 = m.instruct("Say hi", requirements=requirements[:3]) + log.info("") + log.info(f"Result:\n{result3}") + log.info("") + + log.info("=" * 70) + log.info("Validation tracing complete!") + log.info("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/mellea/plugins/builtin_debug/__init__.py b/mellea/plugins/builtin_debug/__init__.py new file mode 100644 index 000000000..73db88899 --- /dev/null +++ b/mellea/plugins/builtin_debug/__init__.py @@ -0,0 +1,55 @@ +"""Built-in debug plugins for Mellea. 
+ +Provides pre-built plugins for common debugging tasks: +- Generation pipeline tracing (requests, responses, latency, tokens) +- Sampling strategy diagnostics (iterations, validation, repair, results) + +Examples: + Enable generation tracing: + + from mellea.plugins.builtin_debug.generation import ( + log_generation_pre_call, + log_generation_post_call, + ) + from mellea.plugins import register + + register([log_generation_pre_call, log_generation_post_call]) + + Enable sampling diagnostics: + + from mellea.plugins.builtin_debug.sampling import ( + log_sampling_loop_start, + log_sampling_iteration, + log_sampling_repair, + log_sampling_loop_end, + ) + + register([ + log_sampling_loop_start, + log_sampling_iteration, + log_sampling_repair, + log_sampling_loop_end, + ]) +""" + +from __future__ import annotations + +from .generation import log_generation_post_call, log_generation_pre_call +from .sampling import ( + log_sampling_iteration, + log_sampling_loop_end, + log_sampling_loop_start, + log_sampling_repair, +) +from .validation import log_validation_post_check, log_validation_pre_check + +__all__ = [ + "log_generation_post_call", + "log_generation_pre_call", + "log_sampling_iteration", + "log_sampling_loop_end", + "log_sampling_loop_start", + "log_sampling_repair", + "log_validation_post_check", + "log_validation_pre_check", +] diff --git a/mellea/plugins/builtin_debug/generation.py b/mellea/plugins/builtin_debug/generation.py new file mode 100644 index 000000000..9e0a1bcac --- /dev/null +++ b/mellea/plugins/builtin_debug/generation.py @@ -0,0 +1,178 @@ +"""Built-in debug plugin for generation pipeline (pre-call and post-call). + +Provides tracing for all LLM backend calls. Use for debugging model invocations, +tracking latency, and understanding request/response flow. 
+ +Examples: + Enable generation tracing: + + from mellea.plugins.builtin_debug import GenerationTracingPlugin + from mellea import start_session + from mellea.plugins import register + + register([GenerationTracingPlugin()]) + + with start_session() as m: + result = m.instruct("...") # Tracing fires automatically +""" + +from __future__ import annotations + +import logging + +from mellea.plugins import HookType, hook + +logger = logging.getLogger(__name__) + + +def _get_prompt_preview(payload) -> str: + """Extract and shorten prompt for logging.""" + action = getattr(payload, "action", None) + if not action: + return "(no action)" + + text = None + + # Try to get description from Instruction components + if hasattr(action, "_description"): + text = action._description or None + + # Try format_for_llm() for structured output + if not text and hasattr(action, "format_for_llm"): + try: + formatted = action.format_for_llm() + # Extract args if available + if hasattr(formatted, "args"): + desc = formatted.args.get("description", "") + if desc: + text = str(desc)[:200] + except Exception: + pass + + # Fallback to string representation + if not text: + text = str(action)[:200] + + text = str(text).replace("\n", " ").replace(" ", " ").strip() + if len(text) > 100: + text = text[:97] + "..." + return text + + +def _get_response_preview(payload) -> str: + """Extract and shorten response for logging.""" + try: + model_output = getattr(payload, "model_output", None) + if not model_output: + return "(no output)" + + value = getattr(model_output, "value", None) + if not value: + return "(no value)" + + text = str(value)[:200] + text = text.replace("\n", " ").replace(" ", " ").strip() + if len(text) > 100: + text = text[:97] + "..." 
+ return text + except Exception: + return "(error reading response)" + + +def _get_token_usage(payload) -> str: + """Extract token usage from payload.""" + try: + model_output = getattr(payload, "model_output", None) + if not model_output: + return "unknown" + + gen = getattr(model_output, "generation", None) + if not gen: + return "unknown" + + usage = getattr(gen, "usage", {}) + if not usage: + return "unknown" + + total = usage.get("total_tokens", "?") + prompt = usage.get("prompt_tokens", "?") + completion = usage.get("completion_tokens", "?") + return f"({prompt}+{completion}={total})" + except Exception: + return "unknown" + + +@hook(HookType.GENERATION_PRE_CALL) +async def log_generation_pre_call(payload, ctx): + """Log request details before calling the LLM.""" + model = getattr(payload, "backend", None) + model_id = model.model_id if model else "unknown" + gen_id = payload.generation_id or "no-id" + + # Extract all data from the action + action = getattr(payload, "action", None) + requirements = [] + repair_text = "" + + if action and hasattr(action, "format_for_llm"): + try: + fmt = action.format_for_llm() + if hasattr(fmt, "args"): + requirements = fmt.args.get("requirements", []) + repair_text = fmt.args.get("repair", "") + except Exception: + pass + + # Log main request info + prompt_preview = _get_prompt_preview(payload) + logger.info( + f"[📤 GEN-PRE-CALL gen_id={gen_id}] model={model_id} | prompt={prompt_preview}" + ) + + # Log requirements if present + if requirements: + logger.debug(f" requirements ({len(requirements)}):") + for i, req in enumerate(requirements, 1): + req_desc = getattr(req, "description", str(req)) + logger.debug(f" {i}. {req_desc}") + + # Log repair feedback if present (indicates a repair attempt) + if repair_text: + logger.info(" [⭐ REPAIR ATTEMPT] Repair feedback provided:") + # Show first 300 chars of repair text + repair_preview = repair_text[:300].replace("\n", " ") + if len(repair_text) > 300: + repair_preview += "..." 
+ logger.info(f" {repair_preview}") + + +@hook(HookType.GENERATION_POST_CALL) +async def log_generation_post_call(payload, ctx): + """Log response details after LLM returns.""" + model_output = getattr(payload, "model_output", None) + model_id = "unknown" + if model_output: + gen = getattr(model_output, "generation", None) + if gen: + model_id = getattr(gen, "model", "unknown") + + gen_id = payload.generation_id or "no-id" + latency_ms = payload.latency_ms or 0 + + response_preview = _get_response_preview(payload) + tokens = _get_token_usage(payload) + + logger.info( + f"[📥 GEN-POST-CALL gen_id={gen_id}] " + f"model={model_id} | latency={latency_ms:.0f}ms | " + f"tokens={tokens} | response={response_preview}" + ) + + +# Export as a name for convenience +class GenerationTracingPlugin: + """Marker class for GenerationTracingPlugin. + + The actual hooks are registered via @hook decorators above. + Use this for reference or type hinting if needed. + """ diff --git a/mellea/plugins/builtin_debug/sampling.py b/mellea/plugins/builtin_debug/sampling.py new file mode 100644 index 000000000..92102d024 --- /dev/null +++ b/mellea/plugins/builtin_debug/sampling.py @@ -0,0 +1,132 @@ +"""Built-in debug plugin for sampling pipeline. + +Provides tracing for sampling strategies including iteration tracking, validation +results, repair events, and success/failure analysis. 
+ +Examples: + Enable sampling tracing: + + from mellea.plugins.builtin_debug.sampling import ( + log_sampling_loop_start, + log_sampling_iteration, + log_sampling_repair, + log_sampling_loop_end, + ) + from mellea.plugins import register + + register([ + log_sampling_loop_start, + log_sampling_iteration, + log_sampling_repair, + log_sampling_loop_end, + ]) + + with start_session() as m: + result = m.instruct("...", strategy=SamplingStrategy(...)) +""" + +from __future__ import annotations + +import logging + +from mellea.plugins import HookType, hook + +logger = logging.getLogger(__name__) + + +@hook(HookType.SAMPLING_LOOP_START) +async def log_sampling_loop_start(payload, ctx): + """Log sampling strategy initialization.""" + strategy = payload.strategy_name + budget = payload.loop_budget + num_reqs = len(payload.requirements) + + logger.info( + f"[🎯 SAMPLING-START] strategy={strategy} | loop_budget={budget} | " + f"requirements={num_reqs}" + ) + + if payload.requirements: + for i, req in enumerate(payload.requirements, 1): + req_desc = getattr(req, "description", str(req)) + logger.debug(f" {i}. 
{req_desc}") + + +@hook(HookType.SAMPLING_ITERATION) +async def log_sampling_iteration(payload, ctx): + """Log validation results for each sampling attempt.""" + iteration = payload.iteration + passed = payload.valid_count + total = payload.total_count + + if payload.all_validations_passed: + logger.info( + f"[✅ SAMPLING-ITER {iteration}] SUCCESS: {passed}/{total} validations passed" + ) + else: + logger.info( + f"[❌ SAMPLING-ITER {iteration}] FAILED: {passed}/{total} validations passed" + ) + + if payload.validation_results: + for req_obj, result in payload.validation_results: + req_desc = getattr(req_obj, "description", str(req_obj)) + status = "✓" if result.as_bool() else "❌" + + if result.as_bool(): + logger.debug(f" {status} {req_desc}") + else: + logger.info(f" {status} {req_desc}") + # Show detailed reason only if informative + reason = getattr(result, "reason", None) + if reason and reason not in ("yes", "no"): + logger.info(f" └─ {reason}") + + +@hook(HookType.SAMPLING_REPAIR) +async def log_sampling_repair(payload, ctx): + """Log when repair is triggered (RepairTemplateStrategy only).""" + iteration = payload.repair_iteration + repair_type = payload.repair_type + + logger.info(f"\n[🔧 REPAIR-TRIGGERED] at iteration {iteration}") + logger.info(f" repair_type={repair_type}") + logger.info(" failed_validations:") + + for req_obj, result in payload.failed_validations: + if not result.as_bool(): + req_desc = getattr(req_obj, "description", str(req_obj)) + logger.info(f" • {req_desc}") + + +@hook(HookType.SAMPLING_LOOP_END) +async def log_sampling_loop_end(payload, ctx): + """Log sampling completion and overall results.""" + strategy = payload.strategy_name + iterations = payload.iterations_used + success = payload.success + failure_reason = payload.failure_reason + + if success: + logger.info( + f"\n[🎉 SAMPLING-END] SUCCESS in {iterations} iteration(s) using {strategy}" + ) + else: + logger.info( + f"\n[💥 SAMPLING-END] FAILED after {iterations} iteration(s): " 
+ f"{failure_reason}" + ) + + # Summary statistics + total_results = len(payload.all_results) + logger.info(f" total_attempts={total_results}") + + # Show best attempt statistics + if payload.all_validations: + best_valid_count = 0 + for validation_list in payload.all_validations: + valid_count = sum(1 for _, result in validation_list if result.as_bool()) + best_valid_count = max(best_valid_count, valid_count) + + total_reqs = len(payload.all_validations[0]) if payload.all_validations else 0 + logger.info(f" best_validation_score={best_valid_count}/{total_reqs}") diff --git a/mellea/plugins/builtin_debug/validation.py b/mellea/plugins/builtin_debug/validation.py new file mode 100644 index 000000000..aa9938b35 --- /dev/null +++ b/mellea/plugins/builtin_debug/validation.py @@ -0,0 +1,91 @@ +"""Built-in debug plugin for validation pipeline. + +Provides tracing for requirement validation including pre-check setup, post-check +results, and detailed requirement-by-requirement analysis. + +Examples: + Enable validation tracing: + + from mellea.plugins.builtin_debug.validation import ( + log_validation_pre_check, + log_validation_post_check, + ) + from mellea.plugins import register + + register([ + log_validation_pre_check, + log_validation_post_check, + ]) + + with start_session() as m: + result = m.instruct("...", requirements=[...]) +""" + +from __future__ import annotations + +import logging + +from mellea.plugins import HookType, hook + +logger = logging.getLogger(__name__) + + +@hook(HookType.VALIDATION_PRE_CHECK) +async def log_validation_pre_check(payload, ctx): + """Log validation setup before requirements are checked.""" + num_reqs = len(payload.requirements) + target_type = type(payload.target).__name__ if payload.target else "None" + + logger.info( + f"[🔍 VALIDATION-PRE-CHECK] requirements={num_reqs} | target={target_type}" + ) + + if payload.requirements: + logger.debug(" Requirements to validate:") + for i, req in enumerate(payload.requirements, 1): + 
req_desc = getattr(req, "description", str(req)) + req_type = type(req).__name__ + logger.debug(f" {i}. [{req_type}] {req_desc}") + + +@hook(HookType.VALIDATION_POST_CHECK) +async def log_validation_post_check(payload, ctx): + """Log validation results after requirements are checked.""" + passed = payload.passed_count + failed = payload.failed_count + total = len(payload.requirements) + all_passed = payload.all_validations_passed + + if all_passed: + logger.info( + f"[✅ VALIDATION-POST-CHECK] ALL PASSED: {passed}/{total} requirements" + ) + else: + logger.info( + f"[❌ VALIDATION-POST-CHECK] MIXED RESULTS: {passed}/{total} passed, " + f"{failed}/{total} failed" + ) + + # Log detailed results per requirement + if payload.results: + for i, (req, result) in enumerate( + zip(payload.requirements, payload.results), 1 + ): + req_desc = getattr(req, "description", str(req)) + is_passed = result.as_bool() + status = "✓" if is_passed else "❌" + + if is_passed: + logger.debug(f" {status} {req_desc}") + else: + logger.info(f" {status} {req_desc}") + + # Show reason if available and informative + reason = getattr(result, "reason", None) + if reason and reason not in ("yes", "no", ""): + logger.info(f" └─ {reason}") + + # Show score if available + score = getattr(result, "score", None) + if score is not None: + logger.debug(f" └─ score: {score:.2f}") From 3a0e64aa117ca1ac23084528f7605141fb9d6e28 Mon Sep 17 00:00:00 2001 From: Akihiko Kuroda Date: Wed, 10 Jun 2026 17:02:23 -0400 Subject: [PATCH 2/4] add debug plugin collections Signed-off-by: Akihiko Kuroda --- mellea/plugins/builtin_debug/generation.py | 14 +++++++++-- mellea/plugins/builtin_debug/sampling.py | 28 ++++++++++++++++++---- mellea/plugins/builtin_debug/validation.py | 14 +++++++++-- 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/mellea/plugins/builtin_debug/generation.py b/mellea/plugins/builtin_debug/generation.py index 9e0a1bcac..bd9e6afde 100644 --- a/mellea/plugins/builtin_debug/generation.py +++ 
b/mellea/plugins/builtin_debug/generation.py @@ -104,7 +104,12 @@ def _get_token_usage(payload) -> str: @hook(HookType.GENERATION_PRE_CALL) async def log_generation_pre_call(payload, ctx): - """Log request details before calling the LLM.""" + """Log request details before calling the LLM. + + Args: + payload: GenerationPreCallPayload containing backend, action, generation_id. + ctx: Plugin context for hook execution. + """ model = getattr(payload, "backend", None) model_id = model.model_id if model else "unknown" gen_id = payload.generation_id or "no-id" @@ -148,7 +153,12 @@ async def log_generation_pre_call(payload, ctx): @hook(HookType.GENERATION_POST_CALL) async def log_generation_post_call(payload, ctx): - """Log response details after LLM returns.""" + """Log response details after LLM returns. + + Args: + payload: GenerationPostCallPayload containing model_output, generation_id, latency_ms. + ctx: Plugin context for hook execution. + """ model_output = getattr(payload, "model_output", None) model_id = "unknown" if model_output: diff --git a/mellea/plugins/builtin_debug/sampling.py b/mellea/plugins/builtin_debug/sampling.py index 92102d024..3cd3c43c7 100644 --- a/mellea/plugins/builtin_debug/sampling.py +++ b/mellea/plugins/builtin_debug/sampling.py @@ -36,7 +36,12 @@ @hook(HookType.SAMPLING_LOOP_START) async def log_sampling_loop_start(payload, ctx): - """Log sampling strategy initialization.""" + """Log sampling strategy initialization with budget and requirement count. + + Args: + payload: SamplingLoopStartPayload with strategy_name, loop_budget, requirements. + ctx: Plugin context for hook execution. 
+ """ strategy = payload.strategy_name budget = payload.loop_budget num_reqs = len(payload.requirements) @@ -54,7 +59,12 @@ async def log_sampling_loop_start(payload, ctx): @hook(HookType.SAMPLING_ITERATION) async def log_sampling_iteration(payload, ctx): - """Log validation results for each sampling attempt.""" + """Log validation results for each sampling attempt. + + Args: + payload: SamplingIterationPayload with iteration, valid_count, validation_results. + ctx: Plugin context for hook execution. + """ iteration = payload.iteration passed = payload.valid_count total = payload.total_count @@ -85,7 +95,12 @@ async def log_sampling_iteration(payload, ctx): @hook(HookType.SAMPLING_REPAIR) async def log_sampling_repair(payload, ctx): - """Log when repair is triggered (RepairTemplateStrategy only).""" + """Log when repair is triggered during sampling iterations. + + Args: + payload: SamplingRepairPayload with repair_iteration, repair_type, failed_validations. + ctx: Plugin context for hook execution. + """ iteration = payload.repair_iteration repair_type = payload.repair_type @@ -101,7 +116,12 @@ async def log_sampling_repair(payload, ctx): @hook(HookType.SAMPLING_LOOP_END) async def log_sampling_loop_end(payload, ctx): - """Log sampling completion and overall results.""" + """Log sampling completion with success status and attempt statistics. + + Args: + payload: SamplingLoopEndPayload with success, iterations_used, all_results, all_validations. + ctx: Plugin context for hook execution. 
+ """ strategy = payload.strategy_name iterations = payload.iterations_used success = payload.success diff --git a/mellea/plugins/builtin_debug/validation.py b/mellea/plugins/builtin_debug/validation.py index aa9938b35..2a497d0cc 100644 --- a/mellea/plugins/builtin_debug/validation.py +++ b/mellea/plugins/builtin_debug/validation.py @@ -32,7 +32,12 @@ @hook(HookType.VALIDATION_PRE_CHECK) async def log_validation_pre_check(payload, ctx): - """Log validation setup before requirements are checked.""" + """Log validation setup before requirements are checked. + + Args: + payload: ValidationPreCheckPayload containing requirements and target. + ctx: Plugin context for hook execution. + """ num_reqs = len(payload.requirements) target_type = type(payload.target).__name__ if payload.target else "None" @@ -50,7 +55,12 @@ async def log_validation_pre_check(payload, ctx): @hook(HookType.VALIDATION_POST_CHECK) async def log_validation_post_check(payload, ctx): - """Log validation results after requirements are checked.""" + """Log validation results after requirements are checked. + + Args: + payload: ValidationPostCheckPayload with passed_count, failed_count, results. + ctx: Plugin context for hook execution. 
+ """ passed = payload.passed_count failed = payload.failed_count total = len(payload.requirements) From d1ab9a32f651301760195e2425dc4948467cd95d Mon Sep 17 00:00:00 2001 From: Akihiko Kuroda Date: Wed, 10 Jun 2026 18:05:51 -0400 Subject: [PATCH 3/4] add debug plugin collections Signed-off-by: Akihiko Kuroda --- mellea/plugins/builtin_debug/generation.py | 13 ++++++++++-- mellea/plugins/builtin_debug/sampling.py | 23 ++++++++++++++++++---- mellea/plugins/builtin_debug/validation.py | 13 ++++++++++-- 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/mellea/plugins/builtin_debug/generation.py b/mellea/plugins/builtin_debug/generation.py index bd9e6afde..8f7d5086a 100644 --- a/mellea/plugins/builtin_debug/generation.py +++ b/mellea/plugins/builtin_debug/generation.py @@ -19,8 +19,13 @@ from __future__ import annotations import logging +from typing import Any from mellea.plugins import HookType, hook +from mellea.plugins.hooks.generation import ( + GenerationPostCallPayload, + GenerationPreCallPayload, +) logger = logging.getLogger(__name__) @@ -103,7 +108,9 @@ def _get_token_usage(payload) -> str: @hook(HookType.GENERATION_PRE_CALL) -async def log_generation_pre_call(payload, ctx): +async def log_generation_pre_call( + payload: GenerationPreCallPayload, ctx: dict[str, Any] +) -> None: """Log request details before calling the LLM. Args: @@ -152,7 +159,9 @@ async def log_generation_pre_call(payload, ctx): @hook(HookType.GENERATION_POST_CALL) -async def log_generation_post_call(payload, ctx): +async def log_generation_post_call( + payload: GenerationPostCallPayload, ctx: dict[str, Any] +) -> None: """Log response details after LLM returns. 
Args: diff --git a/mellea/plugins/builtin_debug/sampling.py b/mellea/plugins/builtin_debug/sampling.py index 3cd3c43c7..918bd4762 100644 --- a/mellea/plugins/builtin_debug/sampling.py +++ b/mellea/plugins/builtin_debug/sampling.py @@ -28,14 +28,23 @@ from __future__ import annotations import logging +from typing import Any from mellea.plugins import HookType, hook +from mellea.plugins.hooks.sampling import ( + SamplingIterationPayload, + SamplingLoopEndPayload, + SamplingLoopStartPayload, + SamplingRepairPayload, +) logger = logging.getLogger(__name__) @hook(HookType.SAMPLING_LOOP_START) -async def log_sampling_loop_start(payload, ctx): +async def log_sampling_loop_start( + payload: SamplingLoopStartPayload, ctx: dict[str, Any] +) -> None: """Log sampling strategy initialization with budget and requirement count. Args: @@ -58,7 +67,9 @@ async def log_sampling_loop_start(payload, ctx): @hook(HookType.SAMPLING_ITERATION) -async def log_sampling_iteration(payload, ctx): +async def log_sampling_iteration( + payload: SamplingIterationPayload, ctx: dict[str, Any] +) -> None: """Log validation results for each sampling attempt. Args: @@ -94,7 +105,9 @@ async def log_sampling_iteration(payload, ctx): @hook(HookType.SAMPLING_REPAIR) -async def log_sampling_repair(payload, ctx): +async def log_sampling_repair( + payload: SamplingRepairPayload, ctx: dict[str, Any] +) -> None: """Log when repair is triggered during sampling iterations. Args: @@ -115,7 +128,9 @@ async def log_sampling_repair(payload, ctx): @hook(HookType.SAMPLING_LOOP_END) -async def log_sampling_loop_end(payload, ctx): +async def log_sampling_loop_end( + payload: SamplingLoopEndPayload, ctx: dict[str, Any] +) -> None: """Log sampling completion with success status and attempt statistics. 
Args: diff --git a/mellea/plugins/builtin_debug/validation.py b/mellea/plugins/builtin_debug/validation.py index 2a497d0cc..6e5b0fd5c 100644 --- a/mellea/plugins/builtin_debug/validation.py +++ b/mellea/plugins/builtin_debug/validation.py @@ -24,14 +24,21 @@ from __future__ import annotations import logging +from typing import Any from mellea.plugins import HookType, hook +from mellea.plugins.hooks.validation import ( + ValidationPostCheckPayload, + ValidationPreCheckPayload, +) logger = logging.getLogger(__name__) @hook(HookType.VALIDATION_PRE_CHECK) -async def log_validation_pre_check(payload, ctx): +async def log_validation_pre_check( + payload: ValidationPreCheckPayload, ctx: dict[str, Any] +) -> None: """Log validation setup before requirements are checked. Args: @@ -54,7 +61,9 @@ async def log_validation_pre_check(payload, ctx): @hook(HookType.VALIDATION_POST_CHECK) -async def log_validation_post_check(payload, ctx): +async def log_validation_post_check( + payload: ValidationPostCheckPayload, ctx: dict[str, Any] +) -> None: """Log validation results after requirements are checked. Args: From e8dea4f70eb70be987f7b8560f40a91262c44cbb Mon Sep 17 00:00:00 2001 From: Akihiko Kuroda Date: Wed, 10 Jun 2026 18:18:41 -0400 Subject: [PATCH 4/4] add debug plugin collections Signed-off-by: Akihiko Kuroda --- docs/docs/how-to/debug-with-plugins.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/docs/how-to/debug-with-plugins.md b/docs/docs/how-to/debug-with-plugins.md index 8a63f45fa..1c94faa78 100644 --- a/docs/docs/how-to/debug-with-plugins.md +++ b/docs/docs/how-to/debug-with-plugins.md @@ -4,8 +4,7 @@ description: "Use built-in debug plugins to trace generation, validation, and sa # diataxis: how-to --- -**Prerequisites:** [The Requirements System](../concepts/requirements-system), -[Sampling Strategies](../concepts/sampling-strategies), `pip install mellea`. 
+**Prerequisites:** [The Requirements System](../concepts/requirements-system), `pip install mellea`. Mellea's plugin system provides debug hooks that trace the full lifecycle of generation, validation, and sampling. Use these plugins to understand: