@@ -47,11 +47,14 @@ async def aha_judge(
4747 model_a_answer = str (row .ground_truth )
4848 model_b_answer = serialize_message (row .messages [- 1 ])
4949
50- client = AsyncOpenAI (api_key = judge_config .get ("api_key" ), base_url = judge_config .get ("base_url" ))
51-
52- # Run two judgment rounds in sequence (A vs B, then B vs A)
53- result1 = await run_single_judgment (question_text , model_a_answer , model_b_answer , row .tools , judge_config , client )
54- result2 = await run_single_judgment (question_text , model_b_answer , model_a_answer , row .tools , judge_config , client )
50+ async with AsyncOpenAI (api_key = judge_config .get ("api_key" ), base_url = judge_config .get ("base_url" )) as client :
51+ # Run two judgment rounds in sequence (A vs B, then B vs A)
52+ result1 = await run_single_judgment (
53+ question_text , model_a_answer , model_b_answer , row .tools , judge_config , client
54+ )
55+ result2 = await run_single_judgment (
56+ question_text , model_b_answer , model_a_answer , row .tools , judge_config , client
57+ )
5558
5659 if not result1 or not result2 or not result1 .get ("score" ) or not result2 .get ("score" ):
5760 # If either judgment failed, mark as invalid (don't include in distribution)
0 commit comments