Add tags so we know which models to compare against each other when doing evaluation

xzrderek · xzrderek · commit 0ba11df19f2b · 2025-09-05T15:05:46.000-07:00
diff --git a/tests/chinook/langfuse/generate_traces.py b/tests/chinook/langfuse/generate_traces.py
@@ -63,6 +63,9 @@ async def test_complex_query_0(row: EvaluationRow) -> EvaluationRow:
     """
     Complex queries - PydanticAI automatically creates rich Langfuse traces.
     """
+    if langfuse_client:
+        langfuse_client.update_current_trace(tags=["chinook_sql"])
+
     return row
 
 
@@ -92,6 +95,9 @@ async def test_complex_query_1(row: EvaluationRow) -> EvaluationRow:
     """
     Complex queries - PydanticAI automatically creates rich Langfuse traces.
     """
+    if langfuse_client:
+        langfuse_client.update_current_trace(tags=["chinook_sql"])
+
     return row
 
 
@@ -121,6 +127,9 @@ async def test_complex_query_2(row: EvaluationRow) -> EvaluationRow:
     """
     Complex queries - PydanticAI automatically creates rich Langfuse traces.
     """
+    if langfuse_client:
+        langfuse_client.update_current_trace(tags=["chinook_sql"])
+
     return row
 
 
@@ -150,6 +159,9 @@ async def test_complex_query_3(row: EvaluationRow) -> EvaluationRow:
     """
     Complex queries - PydanticAI automatically creates rich Langfuse traces.
     """
+    if langfuse_client:
+        langfuse_client.update_current_trace(tags=["chinook_sql"])
+
     return row
 
 
@@ -179,6 +191,9 @@ async def test_complex_query_4(row: EvaluationRow) -> EvaluationRow:
     """
     Complex queries - PydanticAI automatically creates rich Langfuse traces.
     """
+    if langfuse_client:
+        langfuse_client.update_current_trace(tags=["chinook_sql"])
+
     return row
 
 
@@ -208,4 +223,7 @@ async def test_complex_query_5(row: EvaluationRow) -> EvaluationRow:
     """
     Complex queries - PydanticAI automatically creates rich Langfuse traces.
     """
+    if langfuse_client:
+        langfuse_client.update_current_trace(tags=["chinook_sql"])
+
     return row
diff --git a/tests/chinook/langfuse/test_langfuse_chinook.py b/tests/chinook/langfuse/test_langfuse_chinook.py
@@ -45,7 +45,9 @@ class Response(BaseModel):
     reason: str
 
 
-def fetch_langfuse_traces_as_evaluation_rows(hours_back: int = 168) -> List[EvaluationRow]:
+def fetch_langfuse_traces_as_evaluation_rows(
+    hours_back: int = 168, tags: List[str] = ["chinook_sql"]
+) -> List[EvaluationRow]:
     try:
         from eval_protocol.adapters.langfuse import create_langfuse_adapter
 
@@ -59,7 +61,7 @@ def fetch_langfuse_traces_as_evaluation_rows(hours_back: int = 168) -> List[Eval
         from_timestamp = now - timedelta(hours=hours_back)
 
         return adapter.get_evaluation_rows(
-            limit=20, from_timestamp=from_timestamp, to_timestamp=now, include_tool_calls=True
+            limit=20, from_timestamp=from_timestamp, to_timestamp=now, include_tool_calls=True, tags=tags
         )
 
     except Exception as e: