1313from eval_protocol .pytest import evaluation_test
1414from eval_protocol .pytest .remote_rollout_processor import RemoteRolloutProcessor
1515from eval_protocol .adapters .langfuse import create_langfuse_adapter
16+ from eval_protocol .quickstart .utils import filter_longest_conversation
1617
17- INVOCATION_ID = ""
18- ASSERTION_EXECUTED = False
18+ ROLLOUT_IDS = set ()
1919
2020
2121@pytest .fixture (autouse = True )
22- def check_assertion_executed ():
23- """Ensure the test actually executed the Langfuse validation """
24- global ASSERTION_EXECUTED
25- ASSERTION_EXECUTED = False # Reset before test
22+ def check_rollout_coverage ():
23+ """Ensure we processed all expected rollout_ids """
24+ global ROLLOUT_IDS
25+ ROLLOUT_IDS . clear ()
2626 yield
27- # After test completes, verify the assertion was executed
28- assert ASSERTION_EXECUTED , (
29- "Test passed but never validated Langfuse data - check if output_data_loader returned empty results"
27+
28+ # Verify we've seen the expected number of rollout_ids after test is done
29+ expected_rollout_count = 3
30+ assert len (ROLLOUT_IDS ) == expected_rollout_count , (
31+ f"Expected to see { expected_rollout_count } rollout_ids, but only saw { len (ROLLOUT_IDS )} : { ROLLOUT_IDS } "
3032 )
3133
3234
33- def fetch_trajectories ( invocation_id : str ) -> List [EvaluationRow ]:
34- global INVOCATION_ID # This is just to verify the invocation_id is set correctly in the test
35- INVOCATION_ID = invocation_id
35+ def fetch_langfuse_traces ( rollout_id : str ) -> List [EvaluationRow ]:
36+ global ROLLOUT_IDS # Track all rollout_ids we've seen
37+ ROLLOUT_IDS . add ( rollout_id )
3638
3739 adapter = create_langfuse_adapter ()
38- return adapter .get_evaluation_rows (tags = [f"invocation_id: { invocation_id } " ])
40+ return adapter .get_evaluation_rows (tags = [f"rollout_id: { rollout_id } " ])
3941
4042
41- def create_output_data_loader (invocation_id : str ) -> DynamicDataLoader :
42- return DynamicDataLoader (generators = [lambda : fetch_trajectories (invocation_id )])
43+ def langfuse_output_data_loader (rollout_id : str ) -> DynamicDataLoader :
44+ return DynamicDataLoader (
45+ generators = [lambda : fetch_langfuse_traces (rollout_id )], preprocess_fn = filter_longest_conversation
46+ )
4347
4448
4549def _start_remote_server ():
@@ -87,7 +91,7 @@ def remote_langfuse_data_generator() -> List[EvaluationRow]:
8791
8892 # Minimal single-user-turn message to trigger a response
8993 row = EvaluationRow (messages = [Message (role = "user" , content = "Hello there! Please say hi back." )])
90- return [row ]
94+ return [row , row , row ]
9195
9296
9397@pytest .mark .skipif (os .environ .get ("CI" ) == "true" , reason = "Only run this test locally (skipped in CI)" )
@@ -100,7 +104,7 @@ def remote_langfuse_data_generator() -> List[EvaluationRow]:
100104 remote_base_url = "http://127.0.0.1:7077" ,
101105 num_turns = 2 ,
102106 timeout_seconds = 30 ,
103- output_data_loader = create_output_data_loader ,
107+ output_data_loader = langfuse_output_data_loader ,
104108 ),
105109)
106110async def test_remote_rollout_and_fetch_langfuse (row : EvaluationRow ) -> EvaluationRow :
@@ -110,13 +114,9 @@ async def test_remote_rollout_and_fetch_langfuse(row: EvaluationRow) -> Evaluati
110114 - trigger remote rollout via RemoteRolloutProcessor (calls init/status)
111115 - fetch traces from Langfuse filtered by metadata via output_data_loader; FAIL if none found
112116 """
113- global ASSERTION_EXECUTED
114-
115- # Sanity check: row should have an invocation_id since it came from Langfuse via output_data_loader
116117 assert row .messages [0 ].content == "Hello there! Please say hi back." , "Row should have correct message content"
117- assert row .execution_metadata .invocation_id == INVOCATION_ID , "Row should have correct invocation_id set"
118-
119- ASSERTION_EXECUTED = True
120- print (f"✅ Successfully received row from Langfuse with invocation_id: { row .execution_metadata .invocation_id } " )
118+ assert row .execution_metadata .rollout_id in ROLLOUT_IDS , (
119+ f"Row rollout_id { row .execution_metadata .rollout_id } should be in tracked rollout_ids: { ROLLOUT_IDS } "
120+ )
121121
122122 return row
0 commit comments