Switch test and benchmark model ids from kimi-k2-instruct to kimi-k2-instruct-0905 (#352)

xzrderek · web-flow · commit 1b5357126bf7 · 2025-11-26T20:53:06.000-08:00
* switch to kimi-k2-instruct-0905

* update
diff --git a/eval_protocol/benchmarks/test_frozen_lake.py b/eval_protocol/benchmarks/test_frozen_lake.py
@@ -39,7 +39,11 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
     input_dataset=["tests/pytest/data/frozen_lake_dataset.jsonl"],
     dataset_adapter=frozen_lake_to_evaluation_row,
     completion_params=[
-        {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
+        {
+            "temperature": 0.0,
+            "max_tokens": 4096,
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+        }
     ],
     rollout_processor=MCPGymRolloutProcessor(),
     passed_threshold=0.66,
diff --git a/tests/chinook/pydantic/test_pydantic_chinook.py b/tests/chinook/pydantic/test_pydantic_chinook.py
@@ -32,7 +32,7 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent:
     "completion_params",
     [
         {
-            "model": "accounts/fireworks/models/kimi-k2-instruct",
+            "model": "accounts/fireworks/models/kimi-k2-instruct-0905",
             "provider": "fireworks",
         },
         {
@@ -82,7 +82,7 @@ async def test_simple_query(row: EvaluationRow) -> EvaluationRow:
         )
     else:
         model = OpenAIChatModel(
-            "accounts/fireworks/models/kimi-k2-instruct",
+            "accounts/fireworks/models/kimi-k2-instruct-0905",
             provider="fireworks",
         )
 
diff --git a/tests/pytest/test_apps_coding.py b/tests/pytest/test_apps_coding.py
@@ -27,7 +27,11 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
     input_dataset=["tests/pytest/data/apps_sample_dataset.jsonl"],
     dataset_adapter=apps_dataset_to_evaluation_row,
     completion_params=[
-        {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
+        {
+            "temperature": 0.0,
+            "max_tokens": 4096,
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+        }
     ],
     passed_threshold=0.33,
     rollout_processor=SingleTurnRolloutProcessor(),
diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py
@@ -29,7 +29,11 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
     input_dataset=["tests/pytest/data/basic_coding_dataset.jsonl"],
     dataset_adapter=coding_dataset_to_evaluation_row,
     completion_params=[
-        {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
+        {
+            "temperature": 0.0,
+            "max_tokens": 4096,
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+        }
     ],
     passed_threshold=0.8,
     rollout_processor=SingleTurnRolloutProcessor(),
diff --git a/tests/pytest/test_frozen_lake.py b/tests/pytest/test_frozen_lake.py
@@ -39,7 +39,11 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
     input_dataset=["tests/pytest/data/frozen_lake_dataset.jsonl"],
     dataset_adapter=frozen_lake_to_evaluation_row,
     completion_params=[
-        {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
+        {
+            "temperature": 0.0,
+            "max_tokens": 4096,
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+        }
     ],
     rollout_processor=MCPGymRolloutProcessor(),
     passed_threshold=0.66,
diff --git a/tests/pytest/test_hallucination.py b/tests/pytest/test_hallucination.py
@@ -16,7 +16,7 @@
 from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test
 
 # Configure the judge model for LiteLLM
-JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"
+JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"
 
 
 def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
@@ -35,7 +35,11 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
     input_dataset=["tests/pytest/data/halueval_sample_dataset.jsonl"],
     dataset_adapter=hallucination_dataset_adapter,
     completion_params=[
-        {"temperature": 0.0, "max_tokens": 512, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
+        {
+            "temperature": 0.0,
+            "max_tokens": 512,
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+        }
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     passed_threshold=0.33,
diff --git a/tests/pytest/test_openenv_browsergym_basic.py b/tests/pytest/test_openenv_browsergym_basic.py
@@ -65,7 +65,7 @@ def test_openenv_browsergym_basic():
         "model": os.getenv(
             "OPENENV_TEST_MODEL",
             # Default to a Fireworks public model id used elsewhere in tests; requires FIREWORKS_API_KEY
-            "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",
+            "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
         ),
         "temperature": 0.0,
         "max_tokens": 16,
diff --git a/tests/pytest/test_openenv_browsergym_eval.py b/tests/pytest/test_openenv_browsergym_eval.py
@@ -233,7 +233,7 @@ def action_parser(response_text: str):
         {
             "temperature": 0.0,
             "max_tokens": 512,
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
         }
     ],
     # Keep concurrency and steps low for a quick health-check
diff --git a/tests/pytest/test_openenv_echo_hub.py b/tests/pytest/test_openenv_echo_hub.py
@@ -76,7 +76,7 @@ def action_parser(response_text: str):
             "temperature": 0.0,
             "max_tokens": 16,
             # Any working model with your API key; match other tests' default
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
         }
     ],
     num_runs=1,
diff --git a/tests/pytest/test_openenv_textarena_docker.py b/tests/pytest/test_openenv_textarena_docker.py
@@ -94,7 +94,7 @@ def action_parser(response_text: str):
             "temperature": 0.7,
             "max_tokens": 32,
             # Any working model with your API key
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
         }
     ],
     num_runs=1,
diff --git a/tests/pytest/test_pytest_default_agent_rollout_processor.py b/tests/pytest/test_pytest_default_agent_rollout_processor.py
@@ -19,7 +19,7 @@
         ]
     ],
     rollout_processor=AgentRolloutProcessor(),
-    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}],
+    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"}],
     mode="all",
 )
 def test_pytest_default_agent_rollout_processor(rows: List[EvaluationRow]) -> List[EvaluationRow]:
diff --git a/tests/pytest/test_pytest_klavis_mcp.py b/tests/pytest/test_pytest_klavis_mcp.py
@@ -16,7 +16,7 @@ class ResponseFormat(BaseModel):
 @evaluation_test(
     input_dataset=["tests/pytest/datasets/gmail_inbox.jsonl"],
     rollout_processor=AgentRolloutProcessor(),
-    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}],
+    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"}],
     mode="pointwise",
     mcp_config_path="tests/pytest/mcp_configurations/klavis_strata_mcp.json",
 )
diff --git a/tests/pytest/test_pytest_mcp_url.py b/tests/pytest/test_pytest_mcp_url.py
@@ -21,7 +21,7 @@
         ]
     ],
     rollout_processor=AgentRolloutProcessor(),
-    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}],
+    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"}],
     mode="pointwise",
     mcp_config_path="tests/pytest/mcp_configurations/docs_mcp_config.json",
 )

Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,7 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent:`
`32`	`32`	`"completion_params",`
`33`	`33`	`[`
`34`	`34`	`{`
`35`		`- "model": "accounts/fireworks/models/kimi-k2-instruct",`
	`35`	`+ "model": "accounts/fireworks/models/kimi-k2-instruct-0905",`
`36`	`36`	`"provider": "fireworks",`
`37`	`37`	`},`
`38`	`38`	`{`
`@@ -82,7 +82,7 @@ async def test_simple_query(row: EvaluationRow) -> EvaluationRow:`
`82`	`82`	`)`
`83`	`83`	`else:`
`84`	`84`	`model = OpenAIChatModel(`
`85`		`- "accounts/fireworks/models/kimi-k2-instruct",`
	`85`	`+ "accounts/fireworks/models/kimi-k2-instruct-0905",`
`86`	`86`	`provider="fireworks",`
`87`	`87`	`)`
`88`	`88`
Original file line number	Diff line number	Diff line change
`@@ -233,7 +233,7 @@ def action_parser(response_text: str):`
`233`	`233`	`{`
`234`	`234`	`"temperature": 0.0,`
`235`	`235`	`"max_tokens": 512,`
`236`		`- "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",`
	`236`	`+ "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",`
`237`	`237`	`}`
`238`	`238`	`],`
`239`	`239`	`# Keep concurrency and steps low for a quick health-check`
Original file line number	Diff line number	Diff line change
`@@ -76,7 +76,7 @@ def action_parser(response_text: str):`
`76`	`76`	`"temperature": 0.0,`
`77`	`77`	`"max_tokens": 16,`
`78`	`78`	`# Any working model with your API key; match other tests' default`
`79`		`- "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",`
	`79`	`+ "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",`
`80`	`80`	`}`
`81`	`81`	`],`
`82`	`82`	`num_runs=1,`
Original file line number	Diff line number	Diff line change
`@@ -94,7 +94,7 @@ def action_parser(response_text: str):`
`94`	`94`	`"temperature": 0.7,`
`95`	`95`	`"max_tokens": 32,`
`96`	`96`	`# Any working model with your API key`
`97`		`- "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",`
	`97`	`+ "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",`
`98`	`98`	`}`
`99`	`99`	`],`
`100`	`100`	`num_runs=1,`