2121)
2222import asyncio
2323from openai import AsyncOpenAI
24- import litellm
25-
26-
27- @pytest .mark .asyncio
28- @pytest .mark .parametrize (
29- "model" ,
30- [
31- "gpt-4o" ,
32- # "gpt-4o-mini",
33- # "gpt-3.5-turbo",
34- "fireworks_ai/accounts/fireworks/models/gpt-oss-120b" ,
35- "fireworks_ai/accounts/fireworks/models/gpt-oss-20b" ,
36- ],
37- )
38- async def test_multiple_models (model ):
39- response = await litellm .acompletion (model = model , messages = [{"role" : "user" , "content" : "Hello" }])
40- assert response .choices [0 ].message .content # pyright: ignore[reportAttributeAccessIssue]
4124
4225
4326@pytest .mark .asyncio
4427@evaluation_test (
4528 input_rows = [
4629 fetch_langfuse_traces_as_evaluation_rows (
4730 hours_back = 24 ,
48- limit = 1 ,
31+ limit = 40 ,
4932 page_size = 10 ,
5033 sleep_between_gets = 3.0 ,
5134 max_retries = 5 ,
@@ -55,7 +38,7 @@ async def test_multiple_models(model):
5538 # {
5639 # "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
5740 # },
58- # {"model": "gpt-4.1"},
41+ {"model" : "gpt-4.1" },
5942 {
6043 "max_tokens" : 131000 ,
6144 "extra_body" : {"reasoning_effort" : "medium" },
@@ -66,14 +49,9 @@ async def test_multiple_models(model):
6649 "extra_body" : {"reasoning_effort" : "low" },
6750 "model" : "fireworks_ai/accounts/fireworks/models/gpt-oss-20b" ,
6851 },
69- {
70- "max_tokens" : 131000 ,
71- "extra_body" : {"reasoning_effort" : "low" },
72- "model" : "fireworks_ai/accounts/fireworks/models/gpt-oss-120b" ,
73- },
7452 ],
7553 rollout_processor = SingleTurnRolloutProcessor (),
76- # preprocess_fn=split_multi_turn_rows,
54+ preprocess_fn = split_multi_turn_rows ,
7755 max_concurrent_rollouts = 64 ,
7856 mode = "all" ,
7957)
@@ -95,8 +73,7 @@ async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
9573 Same rows with updated evaluation_result containing scores and judgments
9674 """
9775
98- # judge_name = "gemini-2.5-pro" # Edit to which judge you'd like to use. Configs are in utils.py.
99- judge_name = "gpt-4.1"
76+ judge_name = "kimi-k2-instruct-0905" # Edit to which judge you'd like to use. Configs are in utils.py.
10077
10178 if not rows :
10279 print ("❌ No evaluation rows provided" )
0 commit comments