From 3be20c10236f5f0f58a53e40ea928b0b89500cf0 Mon Sep 17 00:00:00 2001 From: Mattias Lundell Date: Mon, 5 May 2025 16:11:48 +0200 Subject: [PATCH] feat: add automatic tracing of evaluations --- src/opperai/_opper.py | 90 +++++++++++++++++----------- src/opperai/evaluations/decorator.py | 4 +- src/opperai/spans/async_spans.py | 6 ++ src/opperai/types/spans.py | 1 + 4 files changed, 65 insertions(+), 36 deletions(-) diff --git a/src/opperai/_opper.py b/src/opperai/_opper.py index 91a75b2..f7f2f30 100644 --- a/src/opperai/_opper.py +++ b/src/opperai/_opper.py @@ -1,4 +1,5 @@ import inspect +import json from typing import Any, Dict, List, Optional from opperai.embeddings.async_embeddings import AsyncEmbeddings @@ -12,6 +13,7 @@ from opperai.spans.spans import Spans from ._client import AsyncClient, Client +from .core.utils import prepare_input DEFAULT_API_URL = "https://api.opper.ai" DEFAULT_TIMEOUT = 120 @@ -93,43 +95,63 @@ async def _evaluate( # Store all metrics all_metrics = {} + opper = AsyncOpper() # Get span for saving metrics - span = AsyncOpper().spans.get_span(span_id=span_id) + span = opper.spans.get_span(span_id=span_id) # Run each evaluator and process the metrics - for evaluator_result in evaluators: - # Check if result is a coroutine and await it if needed - if inspect.iscoroutine(evaluator_result): - metrics = await evaluator_result - elif callable(evaluator_result): - metrics = evaluator_result() - # Check if the callable returned a coroutine - if inspect.iscoroutine(metrics): - metrics = await metrics - else: - metrics = evaluator_result - - # Ensure we have a list of metrics - if not isinstance(metrics, list): - metrics = [metrics] - - # Validate that each metric has a dimension - for i, metric in enumerate(metrics): - if not metric.dimension: - raise ValueError(f"Metric at index {i} must have a dimension") - - # Save metrics directly to span - for metric in metrics: - await span.save_metric( - dimension=metric.dimension, - value=metric.value or 0.0, - comment=metric.comment or "", - ) - - # Add metrics to flat list - if "metrics" not in all_metrics: - all_metrics["metrics"] = [] - all_metrics["metrics"].extend(metrics) + for evaluator_result, kwargs in evaluators: + async with opper.spans.start( + name=evaluator_result.__name__, + parent_span_id=span.uuid, + type="evaluation", + ) as evaluation_span: + # Check if result is a coroutine and await it if needed + if inspect.iscoroutine(evaluator_result): + print("evaluator_result is a coroutine") + metrics = await evaluator_result + elif callable(evaluator_result): + print("evaluator_result is a callable") + metrics = evaluator_result() + # Check if the callable returned a coroutine + if inspect.iscoroutine(metrics): + print("metrics is a coroutine") + metrics = await metrics + else: + print("evaluator_result is not a coroutine or callable") + metrics = evaluator_result + + # Ensure we have a list of metrics + if not isinstance(metrics, list): + metrics = [metrics] + + # Validate that each metric has a dimension + for i, metric in enumerate(metrics): + if not metric.dimension: + raise ValueError(f"Metric at index {i} must have a dimension") + + # Save metrics directly to span + for metric in metrics: + await span.save_metric( + dimension=metric.dimension, + value=metric.value or 0.0, + comment=metric.comment or "", + ) + + # Add metrics to flat list + if "metrics" not in all_metrics: + all_metrics["metrics"] = [] + all_metrics["metrics"].extend(metrics) + + if isinstance(kwargs, str): + await evaluation_span.update( + input=kwargs, output=json.dumps(prepare_input(metrics)) + ) + else: + await evaluation_span.update( + input=json.dumps(prepare_input(kwargs)), + output=json.dumps(prepare_input(metrics)), + ) # Create the final evaluation return Evaluation(metrics=all_metrics) diff --git a/src/opperai/evaluations/decorator.py b/src/opperai/evaluations/decorator.py index e42b6f3..d149c46 100644 --- a/src/opperai/evaluations/decorator.py +++ b/src/opperai/evaluations/decorator.py @@ -22,7 +22,7 @@ def decorator(f): def evaluator_func(**kwargs): # Merge default kwargs with provided kwargs combined_kwargs = {**decorator_kwargs, **kwargs} - return f(**combined_kwargs) + return f(**combined_kwargs), kwargs # Copy the original function name and docstring evaluator_func.__name__ = f.__name__ @@ -33,7 +33,7 @@ def evaluator_func(**kwargs): # Called without parameters: @evaluator def evaluator_func(**kwargs): - return func(**kwargs) + return func(**kwargs), kwargs # Copy the original function name and docstring evaluator_func.__name__ = func.__name__ diff --git a/src/opperai/spans/async_spans.py b/src/opperai/spans/async_spans.py index 297afd5..5fa92ef 100644 --- a/src/opperai/spans/async_spans.py +++ b/src/opperai/spans/async_spans.py @@ -98,12 +98,14 @@ async def start( input: str = None, meta: dict = None, parent_span_id: str = None, + type: str = None, ) -> AsyncIterator[AsyncSpan]: span = await self.start_span( name=name, input=input, meta=meta, parent_span_id=parent_span_id, + type=type, ) try: yield span @@ -117,6 +119,7 @@ async def start_span( meta: dict = None, start_time: datetime = None, parent_span_id: str = None, + type: str = None, ) -> AsyncSpan: span, token = await self._create_span( name=name, @@ -124,6 +127,7 @@ async def start_span( meta=meta, parent_span_id=parent_span_id, start_time=start_time, + type=type, ) async def end_span(end_time: datetime = None): @@ -145,6 +149,7 @@ async def _create_span( meta: dict, parent_span_id: str, start_time: datetime, + type: str, ): parent_span_id = parent_span_id if parent_span_id else _current_span_id.get() span_model = await self._client.spans.create( @@ -154,6 +159,7 @@ async def _create_span( start_time=start_time if start_time else datetime.now(timezone.utc), parent_uuid=parent_span_id, meta=meta, + type=type, ) ) span = AsyncSpan(self._client, str(span_model.uuid)) diff --git a/src/opperai/types/spans.py b/src/opperai/types/spans.py index d8ab856..ba04db5 100644 --- a/src/opperai/types/spans.py +++ b/src/opperai/types/spans.py @@ -36,6 +36,7 @@ class Span(BaseModel): evaluations: Optional[dict] = None score: Optional[int] = None generation: Optional[GenerationIn] = None + type: Optional[str] = None class SpanMetric(BaseModel):