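Update: use AsyncOpenAI as an async context manager in aha_judge; raise Langfuse quickstart limit/sample_size (1 -> 711/50) and max_concurrent_evaluations (2 -> 16)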

xzrderek · xzrderek · commit c952929cdb68 · 2025-09-22T00:28:33.000-07:00
diff --git a/eval_protocol/quickstart/llm_judge.py b/eval_protocol/quickstart/llm_judge.py
@@ -47,11 +47,14 @@ async def aha_judge(
     model_a_answer = str(row.ground_truth)
     model_b_answer = serialize_message(row.messages[-1])
 
-    client = AsyncOpenAI(api_key=judge_config.get("api_key"), base_url=judge_config.get("base_url"))
-
-    # Run two judgment rounds in sequence (A vs B, then B vs A)
-    result1 = await run_single_judgment(question_text, model_a_answer, model_b_answer, row.tools, judge_config, client)
-    result2 = await run_single_judgment(question_text, model_b_answer, model_a_answer, row.tools, judge_config, client)
+    async with AsyncOpenAI(api_key=judge_config.get("api_key"), base_url=judge_config.get("base_url")) as client:
+        # Run two judgment rounds in sequence (A vs B, then B vs A)
+        result1 = await run_single_judgment(
+            question_text, model_a_answer, model_b_answer, row.tools, judge_config, client
+        )
+        result2 = await run_single_judgment(
+            question_text, model_b_answer, model_a_answer, row.tools, judge_config, client
+        )
 
     if not result1 or not result2 or not result1.get("score") or not result2.get("score"):
         # If either judgment failed, mark as invalid (don't include in distribution)
diff --git a/eval_protocol/quickstart/llm_judge_langfuse.py b/eval_protocol/quickstart/llm_judge_langfuse.py
@@ -22,8 +22,8 @@
 adapter = create_langfuse_adapter()
 input_rows = adapter.get_evaluation_rows(
     to_timestamp=datetime(2025, 9, 12, 0, 11, 18),
-    limit=1,
-    sample_size=1,
+    limit=711,
+    sample_size=50,
     sleep_between_gets=3.0,
     max_retries=5,
 )
@@ -52,7 +52,7 @@
     rollout_processor=SingleTurnRolloutProcessor(),
     preprocess_fn=multi_turn_assistant_to_ground_truth,
     max_concurrent_rollouts=64,
-    max_concurrent_evaluations=2,
+    max_concurrent_evaluations=16,
 )
 async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
     return await aha_judge(row)