Skip to content

Commit 1b53571

Browse files
authored
switch to 0905 (#352)
* switch to 0905
* update
1 parent 19bdd2e commit 1b53571

13 files changed

+35
-15
lines changed

eval_protocol/benchmarks/test_frozen_lake.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,11 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
3939
input_dataset=["tests/pytest/data/frozen_lake_dataset.jsonl"],
4040
dataset_adapter=frozen_lake_to_evaluation_row,
4141
completion_params=[
42-
{"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
42+
{
43+
"temperature": 0.0,
44+
"max_tokens": 4096,
45+
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
46+
}
4347
],
4448
rollout_processor=MCPGymRolloutProcessor(),
4549
passed_threshold=0.66,

tests/chinook/pydantic/test_pydantic_chinook.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent:
3232
"completion_params",
3333
[
3434
{
35-
"model": "accounts/fireworks/models/kimi-k2-instruct",
35+
"model": "accounts/fireworks/models/kimi-k2-instruct-0905",
3636
"provider": "fireworks",
3737
},
3838
{
@@ -82,7 +82,7 @@ async def test_simple_query(row: EvaluationRow) -> EvaluationRow:
8282
)
8383
else:
8484
model = OpenAIChatModel(
85-
"accounts/fireworks/models/kimi-k2-instruct",
85+
"accounts/fireworks/models/kimi-k2-instruct-0905",
8686
provider="fireworks",
8787
)
8888

tests/pytest/test_apps_coding.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,11 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
2727
input_dataset=["tests/pytest/data/apps_sample_dataset.jsonl"],
2828
dataset_adapter=apps_dataset_to_evaluation_row,
2929
completion_params=[
30-
{"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
30+
{
31+
"temperature": 0.0,
32+
"max_tokens": 4096,
33+
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
34+
}
3135
],
3236
passed_threshold=0.33,
3337
rollout_processor=SingleTurnRolloutProcessor(),

tests/pytest/test_basic_coding.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,11 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
2929
input_dataset=["tests/pytest/data/basic_coding_dataset.jsonl"],
3030
dataset_adapter=coding_dataset_to_evaluation_row,
3131
completion_params=[
32-
{"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
32+
{
33+
"temperature": 0.0,
34+
"max_tokens": 4096,
35+
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
36+
}
3337
],
3438
passed_threshold=0.8,
3539
rollout_processor=SingleTurnRolloutProcessor(),

tests/pytest/test_frozen_lake.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,11 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
3939
input_dataset=["tests/pytest/data/frozen_lake_dataset.jsonl"],
4040
dataset_adapter=frozen_lake_to_evaluation_row,
4141
completion_params=[
42-
{"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
42+
{
43+
"temperature": 0.0,
44+
"max_tokens": 4096,
45+
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
46+
}
4347
],
4448
rollout_processor=MCPGymRolloutProcessor(),
4549
passed_threshold=0.66,

tests/pytest/test_hallucination.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test
1717

1818
# Configure the judge model for LiteLLM
19-
JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"
19+
JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"
2020

2121

2222
def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
@@ -35,7 +35,11 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
3535
input_dataset=["tests/pytest/data/halueval_sample_dataset.jsonl"],
3636
dataset_adapter=hallucination_dataset_adapter,
3737
completion_params=[
38-
{"temperature": 0.0, "max_tokens": 512, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
38+
{
39+
"temperature": 0.0,
40+
"max_tokens": 512,
41+
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
42+
}
3943
],
4044
rollout_processor=SingleTurnRolloutProcessor(),
4145
passed_threshold=0.33,

tests/pytest/test_openenv_browsergym_basic.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -65,7 +65,7 @@ def test_openenv_browsergym_basic():
6565
"model": os.getenv(
6666
"OPENENV_TEST_MODEL",
6767
# Default to a Fireworks public model id used elsewhere in tests; requires FIREWORKS_API_KEY
68-
"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",
68+
"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
6969
),
7070
"temperature": 0.0,
7171
"max_tokens": 16,

tests/pytest/test_openenv_browsergym_eval.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -233,7 +233,7 @@ def action_parser(response_text: str):
233233
{
234234
"temperature": 0.0,
235235
"max_tokens": 512,
236-
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",
236+
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
237237
}
238238
],
239239
# Keep concurrency and steps low for a quick health-check

tests/pytest/test_openenv_echo_hub.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,7 @@ def action_parser(response_text: str):
7676
"temperature": 0.0,
7777
"max_tokens": 16,
7878
# Any working model with your API key; match other tests' default
79-
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",
79+
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
8080
}
8181
],
8282
num_runs=1,

tests/pytest/test_openenv_textarena_docker.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -94,7 +94,7 @@ def action_parser(response_text: str):
9494
"temperature": 0.7,
9595
"max_tokens": 32,
9696
# Any working model with your API key
97-
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",
97+
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
9898
}
9999
],
100100
num_runs=1,

0 commit comments

Comments
 (0)