Skip to content

Commit d563336

Browse files
authored
Fix completion params getting overwritten (#162)
* Fix completion params getting overwritten
* Add tags to traces so we know which models to compare against each other during evaluation
1 parent 8e8eef4 commit d563336

File tree

4 files changed

+51
-19
lines changed

4 files changed

+51
-19
lines changed

eval_protocol/adapters/langfuse.py

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -379,22 +379,32 @@ def _create_input_metadata(self, trace: Any, observations: List[Any]) -> InputMe
379379
Returns:
380380
InputMetadata object
381381
"""
382-
# Extract completion parameters from observations
382+
# Extract completion parameters from trace input first, then observations
383383
completion_params = {}
384384

385-
# Look for model parameters in observations
386-
for obs in observations:
387-
if hasattr(obs, "model") and obs.model:
388-
completion_params["model"] = obs.model
389-
if hasattr(obs, "model_parameters") and obs.model_parameters:
390-
params = obs.model_parameters
391-
if "temperature" in params:
392-
completion_params["temperature"] = params["temperature"]
393-
if "max_tokens" in params:
394-
completion_params["max_tokens"] = params["max_tokens"]
395-
if "top_p" in params:
396-
completion_params["top_p"] = params["top_p"]
397-
break
385+
# First check trace input for evaluation test completion_params
386+
if hasattr(trace, "input") and trace.input:
387+
if isinstance(trace.input, dict):
388+
kwargs = trace.input.get("kwargs", {})
389+
if "completion_params" in kwargs:
390+
trace_completion_params = kwargs["completion_params"]
391+
if trace_completion_params and isinstance(trace_completion_params, dict):
392+
completion_params.update(trace_completion_params)
393+
394+
# Fallback: Look for model parameters in observations if not found in trace input
395+
if not completion_params:
396+
for obs in observations:
397+
if hasattr(obs, "model") and obs.model:
398+
completion_params["model"] = obs.model
399+
if hasattr(obs, "model_parameters") and obs.model_parameters:
400+
params = obs.model_parameters
401+
if "temperature" in params:
402+
completion_params["temperature"] = params["temperature"]
403+
if "max_tokens" in params:
404+
completion_params["max_tokens"] = params["max_tokens"]
405+
if "top_p" in params:
406+
completion_params["top_p"] = params["top_p"]
407+
break
398408

399409
# Create dataset info from trace metadata
400410
dataset_info = {

eval_protocol/pytest/evaluation_test.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -271,9 +271,11 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo
271271
passed=None,
272272
)
273273
for row in data:
274-
row.input_metadata.completion_params = (
275-
completion_params if completion_params is not None else {}
276-
)
274+
# Only set completion_params if they don't already exist
275+
if not row.input_metadata.completion_params:
276+
row.input_metadata.completion_params = (
277+
completion_params if completion_params is not None else {}
278+
)
277279
# Add mode to session_data
278280
if row.input_metadata.session_data is None:
279281
row.input_metadata.session_data = {}

tests/chinook/langfuse/generate_traces.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ async def test_complex_query_0(row: EvaluationRow) -> EvaluationRow:
6363
"""
6464
Complex queries - PydanticAI automatically creates rich Langfuse traces.
6565
"""
66+
if langfuse_client:
67+
langfuse_client.update_current_trace(tags=["chinook_sql"])
68+
6669
return row
6770

6871

@@ -92,6 +95,9 @@ async def test_complex_query_1(row: EvaluationRow) -> EvaluationRow:
9295
"""
9396
Complex queries - PydanticAI automatically creates rich Langfuse traces.
9497
"""
98+
if langfuse_client:
99+
langfuse_client.update_current_trace(tags=["chinook_sql"])
100+
95101
return row
96102

97103

@@ -121,6 +127,9 @@ async def test_complex_query_2(row: EvaluationRow) -> EvaluationRow:
121127
"""
122128
Complex queries - PydanticAI automatically creates rich Langfuse traces.
123129
"""
130+
if langfuse_client:
131+
langfuse_client.update_current_trace(tags=["chinook_sql"])
132+
124133
return row
125134

126135

@@ -150,6 +159,9 @@ async def test_complex_query_3(row: EvaluationRow) -> EvaluationRow:
150159
"""
151160
Complex queries - PydanticAI automatically creates rich Langfuse traces.
152161
"""
162+
if langfuse_client:
163+
langfuse_client.update_current_trace(tags=["chinook_sql"])
164+
153165
return row
154166

155167

@@ -179,6 +191,9 @@ async def test_complex_query_4(row: EvaluationRow) -> EvaluationRow:
179191
"""
180192
Complex queries - PydanticAI automatically creates rich Langfuse traces.
181193
"""
194+
if langfuse_client:
195+
langfuse_client.update_current_trace(tags=["chinook_sql"])
196+
182197
return row
183198

184199

@@ -208,4 +223,7 @@ async def test_complex_query_5(row: EvaluationRow) -> EvaluationRow:
208223
"""
209224
Complex queries - PydanticAI automatically creates rich Langfuse traces.
210225
"""
226+
if langfuse_client:
227+
langfuse_client.update_current_trace(tags=["chinook_sql"])
228+
211229
return row

tests/chinook/langfuse/test_langfuse_chinook.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,9 @@ class Response(BaseModel):
4545
reason: str
4646

4747

48-
def fetch_langfuse_traces_as_evaluation_rows(hours_back: int = 168) -> List[EvaluationRow]:
48+
def fetch_langfuse_traces_as_evaluation_rows(
49+
hours_back: int = 168, tags: List[str] = ["chinook_sql"]
50+
) -> List[EvaluationRow]:
4951
try:
5052
from eval_protocol.adapters.langfuse import create_langfuse_adapter
5153

@@ -59,7 +61,7 @@ def fetch_langfuse_traces_as_evaluation_rows(hours_back: int = 168) -> List[Eval
5961
from_timestamp = now - timedelta(hours=hours_back)
6062

6163
return adapter.get_evaluation_rows(
62-
limit=20, from_timestamp=from_timestamp, to_timestamp=now, include_tool_calls=True
64+
limit=20, from_timestamp=from_timestamp, to_timestamp=now, include_tool_calls=True, tags=tags
6365
)
6466

6567
except Exception as e:

0 commit comments

Comments
 (0)