Skip to content

Commit 824e0db

Browse files
committed
big run with kimi judge
1 parent 21fdb2b commit 824e0db

File tree

2 files changed

+12
-27
lines changed

2 files changed

+12
-27
lines changed

eval_protocol/quickstart/llm_judge.py

Lines changed: 4 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -21,31 +21,14 @@
2121
)
2222
import asyncio
2323
from openai import AsyncOpenAI
24-
import litellm
25-
26-
27-
@pytest.mark.asyncio
28-
@pytest.mark.parametrize(
29-
"model",
30-
[
31-
"gpt-4o",
32-
# "gpt-4o-mini",
33-
# "gpt-3.5-turbo",
34-
"fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
35-
"fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
36-
],
37-
)
38-
async def test_multiple_models(model):
39-
response = await litellm.acompletion(model=model, messages=[{"role": "user", "content": "Hello"}])
40-
assert response.choices[0].message.content # pyright: ignore[reportAttributeAccessIssue]
4124

4225

4326
@pytest.mark.asyncio
4427
@evaluation_test(
4528
input_rows=[
4629
fetch_langfuse_traces_as_evaluation_rows(
4730
hours_back=24,
48-
limit=1,
31+
limit=40,
4932
page_size=10,
5033
sleep_between_gets=3.0,
5134
max_retries=5,
@@ -55,7 +38,7 @@ async def test_multiple_models(model):
5538
# {
5639
# "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
5740
# },
58-
# {"model": "gpt-4.1"},
41+
{"model": "gpt-4.1"},
5942
{
6043
"max_tokens": 131000,
6144
"extra_body": {"reasoning_effort": "medium"},
@@ -66,14 +49,9 @@ async def test_multiple_models(model):
6649
"extra_body": {"reasoning_effort": "low"},
6750
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
6851
},
69-
{
70-
"max_tokens": 131000,
71-
"extra_body": {"reasoning_effort": "low"},
72-
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
73-
},
7452
],
7553
rollout_processor=SingleTurnRolloutProcessor(),
76-
# preprocess_fn=split_multi_turn_rows,
54+
preprocess_fn=split_multi_turn_rows,
7755
max_concurrent_rollouts=64,
7856
mode="all",
7957
)
@@ -95,8 +73,7 @@ async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
9573
Same rows with updated evaluation_result containing scores and judgments
9674
"""
9775

98-
# judge_name = "gemini-2.5-pro" # Edit to which judge you'd like to use. Configs are in utils.py.
99-
judge_name = "gpt-4.1"
76+
judge_name = "kimi-k2-instruct-0905" # Edit to which judge you'd like to use. Configs are in utils.py.
10077

10178
if not rows:
10279
print("❌ No evaluation rows provided")

eval_protocol/quickstart/utils.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,14 @@
4949
"base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
5050
"max_concurrency": 16,
5151
},
52+
"kimi-k2-instruct-0905": {
53+
"model": "kimi-k2-instruct-0905",
54+
"temperature": 0.6, # Kimi recommended temperature
55+
"max_tokens": 131000,
56+
"api_key": os.getenv("FIREWORKS_API_KEY"),
57+
"base_url": "https://api.fireworks.ai/inference/v1",
58+
"max_concurrency": 64,
59+
},
5260
}
5361

5462
# Mapping from Arena-Hard-Auto judgment labels to numerical scores

0 commit comments

Comments
 (0)