eval-protocol · SunnySoldier357 · Apr 24, 2026 · Apr 24, 2026 · Apr 24, 2026 · Apr 24, 2026
diff --git a/eval_protocol/benchmarks/test_frozen_lake.py b/eval_protocol/benchmarks/test_frozen_lake.py
@@ -42,7 +42,7 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
         {
             "temperature": 0.0,
             "max_tokens": 4096,
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
         }
     ],
     rollout_processor=MCPGymRolloutProcessor(),

diff --git a/eval_protocol/quickstart/aha_judge/llm_judge.py b/eval_protocol/quickstart/aha_judge/llm_judge.py
@@ -17,7 +17,7 @@
 
 
 async def aha_judge(
-    row: EvaluationRow, judge_name: str = "kimi-k2-instruct-0905", adapter: Optional[BaseAdapter] = None
+    row: EvaluationRow, judge_name: str = "kimi-k2p5", adapter: Optional[BaseAdapter] = None
 ) -> EvaluationRow:
     """
     LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons for a single row.

diff --git a/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py b/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py
@@ -50,7 +50,7 @@ def openai_responses_data_generator():
             "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
         },
         {
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
         },
     ],
 )

diff --git a/eval_protocol/quickstart/aha_judge/utils.py b/eval_protocol/quickstart/aha_judge/utils.py
@@ -49,8 +49,8 @@
         "api_key": os.getenv("GEMINI_API_KEY"),
         "base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
     },
-    "kimi-k2-instruct-0905": {
-        "model": "accounts/fireworks/models/kimi-k2-instruct-0905",
+    "kimi-k2p5": {
+        "model": "accounts/fireworks/models/kimi-k2p5",
         "temperature": 0.6,  # Kimi recommended temperature
         "max_tokens": 131000,
         "api_key": os.getenv("FIREWORKS_API_KEY"),

diff --git a/eval_protocol/quickstart/llm_judge.py b/eval_protocol/quickstart/llm_judge.py
@@ -17,7 +17,7 @@
 
 
 async def aha_judge(
-    row: EvaluationRow, judge_name: str = "kimi-k2-instruct-0905", adapter: Optional[BaseAdapter] = None
+    row: EvaluationRow, judge_name: str = "kimi-k2p5", adapter: Optional[BaseAdapter] = None
 ) -> EvaluationRow:
     """
     LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons for a single row.

diff --git a/eval_protocol/quickstart/utils.py b/eval_protocol/quickstart/utils.py
@@ -51,8 +51,8 @@
         "api_key": os.getenv("GEMINI_API_KEY"),
         "base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
     },
-    "kimi-k2-instruct-0905": {
-        "model": "accounts/fireworks/models/kimi-k2-instruct-0905",
+    "kimi-k2p5": {
+        "model": "accounts/fireworks/models/kimi-k2p5",
         "temperature": 0.6,  # Kimi recommended temperature
         "max_tokens": 131000,
         "api_key": os.getenv("FIREWORKS_API_KEY"),

diff --git a/eval_protocol/training/gepa_utils.py b/eval_protocol/training/gepa_utils.py
@@ -40,7 +40,7 @@
     },
     # Fireworks models
     "kimi-k2": {
-        "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+        "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
         "temperature": 0.6,
         "max_tokens": 131000,
     },
@@ -68,7 +68,7 @@ def build_reflection_lm(reflection_lm_name: str) -> LM:
 
     Args:
         reflection_lm_name: One of the predefined configs ("gpt-5", "gpt-4o",
-                           "claude-sonnet", "kimi-k2-instruct-0905")
+                           "claude-sonnet", "kimi-k2")
                            OR a raw LiteLLM model string (e.g., "openai/gpt-4o")
 
     Returns:

diff --git a/tests/chinook/pydantic/test_pydantic_chinook.py b/tests/chinook/pydantic/test_pydantic_chinook.py
@@ -38,8 +38,9 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent:
     "completion_params",
     [
         {
-            "model": "accounts/fireworks/models/kimi-k2-instruct-0905",
+            "model": "accounts/fireworks/models/kimi-k2p5",
             "provider": "fireworks",
+            "reasoning_effort": "none",
         },
         {
             "model": "gpt-5",
@@ -88,7 +89,7 @@ async def test_simple_query(row: EvaluationRow) -> EvaluationRow:
         )
     else:
         model = OpenAIChatModel(
-            "accounts/fireworks/models/kimi-k2-instruct-0905",
+            "accounts/fireworks/models/kimi-k2p5",
             provider="fireworks",
         )
 

diff --git a/tests/chinook/pydantic/test_pydantic_complex_queries.py b/tests/chinook/pydantic/test_pydantic_complex_queries.py
@@ -48,7 +48,7 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent:
         #     "provider": "fireworks",
         # },
         # {
-        #     "model": "accounts/fireworks/models/kimi-k2-instruct-0905",
+        #     "model": "accounts/fireworks/models/kimi-k2p5",
         #     "provider": "fireworks",
         # },
         {"model": "gpt-5"},

diff --git a/tests/pytest/data/basic_coding_dataset.jsonl b/tests/pytest/data/basic_coding_dataset.jsonl
diff --git a/tests/pytest/data/halueval_sample_dataset.jsonl b/tests/pytest/data/halueval_sample_dataset.jsonl
diff --git a/tests/pytest/test_apps_coding.py b/tests/pytest/test_apps_coding.py
@@ -30,7 +30,8 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
         {
             "temperature": 0.0,
             "max_tokens": 4096,
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
+            "reasoning_effort": "none",
         }
     ],
     passed_threshold=0.33,

diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py
diff --git a/tests/pytest/test_frozen_lake.py b/tests/pytest/test_frozen_lake.py
@@ -42,7 +42,8 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
         {
             "temperature": 0.0,
             "max_tokens": 4096,
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
+            "reasoning_effort": "none",
         }
     ],
     rollout_processor=MCPGymRolloutProcessor(),

diff --git a/tests/pytest/test_hallucination.py b/tests/pytest/test_hallucination.py
diff --git a/tests/pytest/test_openenv_browsergym_basic.py b/tests/pytest/test_openenv_browsergym_basic.py
@@ -65,10 +65,11 @@ def test_openenv_browsergym_basic():
         "model": os.getenv(
             "OPENENV_TEST_MODEL",
             # Default to a Fireworks public model id used elsewhere in tests; requires FIREWORKS_API_KEY
-            "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+            "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
         ),
         "temperature": 0.0,
         "max_tokens": 16,
+        "reasoning_effort": "none",
     }
 
     # Limit to a single step to keep the test fast and robust

diff --git a/tests/pytest/test_openenv_browsergym_eval.py b/tests/pytest/test_openenv_browsergym_eval.py
@@ -233,7 +233,8 @@ def action_parser(response_text: str):
         {
             "temperature": 0.0,
             "max_tokens": 512,
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
+            "reasoning_effort": "none",
         }
     ],
     # Keep concurrency and steps low for a quick health-check

diff --git a/tests/pytest/test_openenv_echo_hub.py b/tests/pytest/test_openenv_echo_hub.py
@@ -76,7 +76,8 @@ def action_parser(response_text: str):
             "temperature": 0.0,
             "max_tokens": 16,
             # Any working model with your API key; match other tests' default
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
+            "reasoning_effort": "none",
         }
     ],
     num_runs=1,

diff --git a/tests/pytest/test_openenv_textarena_docker.py b/tests/pytest/test_openenv_textarena_docker.py
@@ -94,7 +94,8 @@ def action_parser(response_text: str):
             "temperature": 0.7,
             "max_tokens": 32,
             # Any working model with your API key
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
+            "reasoning_effort": "none",
         }
     ],
     num_runs=1,
-Original file line number
+Diff line change
@@ -50,7 +50,7 @@ def openai_responses_data_generator():
                 "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
             },
             {
-                "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+                "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
             },
         ],
     )
@@ Expand Down @@