diff --git a/eval_protocol/adapters/langfuse.py b/eval_protocol/adapters/langfuse.py index ede6d9fe..9a15da88 100644 --- a/eval_protocol/adapters/langfuse.py +++ b/eval_protocol/adapters/langfuse.py @@ -379,22 +379,32 @@ def _create_input_metadata(self, trace: Any, observations: List[Any]) -> InputMe Returns: InputMetadata object """ - # Extract completion parameters from observations + # Extract completion parameters from trace input first, then observations completion_params = {} - # Look for model parameters in observations - for obs in observations: - if hasattr(obs, "model") and obs.model: - completion_params["model"] = obs.model - if hasattr(obs, "model_parameters") and obs.model_parameters: - params = obs.model_parameters - if "temperature" in params: - completion_params["temperature"] = params["temperature"] - if "max_tokens" in params: - completion_params["max_tokens"] = params["max_tokens"] - if "top_p" in params: - completion_params["top_p"] = params["top_p"] - break + # First check trace input for evaluation test completion_params + if hasattr(trace, "input") and trace.input: + if isinstance(trace.input, dict): + kwargs = trace.input.get("kwargs", {}) + if "completion_params" in kwargs: + trace_completion_params = kwargs["completion_params"] + if trace_completion_params and isinstance(trace_completion_params, dict): + completion_params.update(trace_completion_params) + + # Fallback: Look for model parameters in observations if not found in trace input + if not completion_params: + for obs in observations: + if hasattr(obs, "model") and obs.model: + completion_params["model"] = obs.model + if hasattr(obs, "model_parameters") and obs.model_parameters: + params = obs.model_parameters + if "temperature" in params: + completion_params["temperature"] = params["temperature"] + if "max_tokens" in params: + completion_params["max_tokens"] = params["max_tokens"] + if "top_p" in params: + completion_params["top_p"] = params["top_p"] + break # Create dataset info from trace metadata dataset_info = { diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index e9dd37e4..cf2c4a77 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -271,9 +271,11 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo passed=None, ) for row in data: - row.input_metadata.completion_params = ( - completion_params if completion_params is not None else {} - ) + # Only set completion_params if they don't already exist + if not row.input_metadata.completion_params: + row.input_metadata.completion_params = ( + completion_params if completion_params is not None else {} + ) # Add mode to session_data if row.input_metadata.session_data is None: row.input_metadata.session_data = {} diff --git a/tests/chinook/langfuse/generate_traces.py b/tests/chinook/langfuse/generate_traces.py index 16e52dba..e2d7d011 100644 --- a/tests/chinook/langfuse/generate_traces.py +++ b/tests/chinook/langfuse/generate_traces.py @@ -63,6 +63,9 @@ async def test_complex_query_0(row: EvaluationRow) -> EvaluationRow: """ Complex queries - PydanticAI automatically creates rich Langfuse traces. """ + if langfuse_client: + langfuse_client.update_current_trace(tags=["chinook_sql"]) + return row @@ -92,6 +95,9 @@ async def test_complex_query_1(row: EvaluationRow) -> EvaluationRow: """ Complex queries - PydanticAI automatically creates rich Langfuse traces. """ + if langfuse_client: + langfuse_client.update_current_trace(tags=["chinook_sql"]) + return row @@ -121,6 +127,9 @@ async def test_complex_query_2(row: EvaluationRow) -> EvaluationRow: """ Complex queries - PydanticAI automatically creates rich Langfuse traces. """ + if langfuse_client: + langfuse_client.update_current_trace(tags=["chinook_sql"]) + return row @@ -150,6 +159,9 @@ async def test_complex_query_3(row: EvaluationRow) -> EvaluationRow: """ Complex queries - PydanticAI automatically creates rich Langfuse traces. """ + if langfuse_client: + langfuse_client.update_current_trace(tags=["chinook_sql"]) + return row @@ -179,6 +191,9 @@ async def test_complex_query_4(row: EvaluationRow) -> EvaluationRow: """ Complex queries - PydanticAI automatically creates rich Langfuse traces. """ + if langfuse_client: + langfuse_client.update_current_trace(tags=["chinook_sql"]) + return row @@ -208,4 +223,7 @@ async def test_complex_query_5(row: EvaluationRow) -> EvaluationRow: """ Complex queries - PydanticAI automatically creates rich Langfuse traces. """ + if langfuse_client: + langfuse_client.update_current_trace(tags=["chinook_sql"]) + return row diff --git a/tests/chinook/langfuse/test_langfuse_chinook.py b/tests/chinook/langfuse/test_langfuse_chinook.py index 2aaf7f16..66a873ad 100644 --- a/tests/chinook/langfuse/test_langfuse_chinook.py +++ b/tests/chinook/langfuse/test_langfuse_chinook.py @@ -45,7 +45,9 @@ class Response(BaseModel): reason: str -def fetch_langfuse_traces_as_evaluation_rows(hours_back: int = 168) -> List[EvaluationRow]: +def fetch_langfuse_traces_as_evaluation_rows( + hours_back: int = 168, tags: List[str] = ["chinook_sql"] +) -> List[EvaluationRow]: try: from eval_protocol.adapters.langfuse import create_langfuse_adapter @@ -59,7 +61,7 @@ def fetch_langfuse_traces_as_evaluation_rows(hours_back: int = 168) -> List[Eval from_timestamp = now - timedelta(hours=hours_back) return adapter.get_evaluation_rows( - limit=20, from_timestamp=from_timestamp, to_timestamp=now, include_tool_calls=True + limit=20, from_timestamp=from_timestamp, to_timestamp=now, include_tool_calls=True, tags=tags ) except Exception as e: