Skip to content

Commit c952929

Browse files
committed
update
1 parent 2ab0b5c commit c952929

File tree

2 files changed

+11
-8
lines changed

2 files changed

+11
-8
lines changed

eval_protocol/quickstart/llm_judge.py

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -47,11 +47,14 @@ async def aha_judge(
47 47
model_a_answer = str(row.ground_truth)
48 48
model_b_answer = serialize_message(row.messages[-1])
49 49

50-
client = AsyncOpenAI(api_key=judge_config.get("api_key"), base_url=judge_config.get("base_url"))
51-
52-
# Run two judgment rounds in sequence (A vs B, then B vs A)
53-
result1 = await run_single_judgment(question_text, model_a_answer, model_b_answer, row.tools, judge_config, client)
54-
result2 = await run_single_judgment(question_text, model_b_answer, model_a_answer, row.tools, judge_config, client)
50+
async with AsyncOpenAI(api_key=judge_config.get("api_key"), base_url=judge_config.get("base_url")) as client:
51+
# Run two judgment rounds in sequence (A vs B, then B vs A)
52+
result1 = await run_single_judgment(
53+
question_text, model_a_answer, model_b_answer, row.tools, judge_config, client
54+
)
55+
result2 = await run_single_judgment(
56+
question_text, model_b_answer, model_a_answer, row.tools, judge_config, client
57+
)
55 58

56 59
if not result1 or not result2 or not result1.get("score") or not result2.get("score"):
57 60
# If either judgment failed, mark as invalid (don't include in distribution)

eval_protocol/quickstart/llm_judge_langfuse.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -22,8 +22,8 @@
22 22
adapter = create_langfuse_adapter()
23 23
input_rows = adapter.get_evaluation_rows(
24 24
to_timestamp=datetime(2025, 9, 12, 0, 11, 18),
25-
limit=1,
26-
sample_size=1,
25+
limit=711,
26+
sample_size=50,
27 27
sleep_between_gets=3.0,
28 28
max_retries=5,
29 29
)
@@ -52,7 +52,7 @@
52 52
rollout_processor=SingleTurnRolloutProcessor(),
53 53
preprocess_fn=multi_turn_assistant_to_ground_truth,
54 54
max_concurrent_rollouts=64,
55-
max_concurrent_evaluations=2,
55+
max_concurrent_evaluations=16,
56 56
)
57 57
async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
58 58
return await aha_judge(row)

0 commit comments

Comments
 (0)