diff --git a/eval_protocol/benchmarks/test_frozen_lake.py b/eval_protocol/benchmarks/test_frozen_lake.py
index f0b7ef55..af7c007d 100644
--- a/eval_protocol/benchmarks/test_frozen_lake.py
+++ b/eval_protocol/benchmarks/test_frozen_lake.py
@@ -42,7 +42,7 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
         {
             "temperature": 0.0,
             "max_tokens": 4096,
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
         }
     ],
     rollout_processor=MCPGymRolloutProcessor(),
diff --git a/eval_protocol/quickstart/aha_judge/llm_judge.py b/eval_protocol/quickstart/aha_judge/llm_judge.py
index b8e0a212..0400ff58 100644
--- a/eval_protocol/quickstart/aha_judge/llm_judge.py
+++ b/eval_protocol/quickstart/aha_judge/llm_judge.py
@@ -17,7 +17,7 @@ async def aha_judge(
-    row: EvaluationRow, judge_name: str = "kimi-k2-instruct-0905", adapter: Optional[BaseAdapter] = None
+    row: EvaluationRow, judge_name: str = "kimi-k2p5", adapter: Optional[BaseAdapter] = None
 ) -> EvaluationRow:
     """
     LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons for a single row.
diff --git a/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py b/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py
index b7aa5110..fe37944d 100644
--- a/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py
+++ b/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py
@@ -50,7 +50,7 @@ def openai_responses_data_generator():
             "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
         },
         {
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
         },
     ],
 )
diff --git a/eval_protocol/quickstart/aha_judge/utils.py b/eval_protocol/quickstart/aha_judge/utils.py
index 0ebc8432..d5e75e03 100644
--- a/eval_protocol/quickstart/aha_judge/utils.py
+++ b/eval_protocol/quickstart/aha_judge/utils.py
@@ -49,8 +49,8 @@
         "api_key": os.getenv("GEMINI_API_KEY"),
         "base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
     },
-    "kimi-k2-instruct-0905": {
-        "model": "accounts/fireworks/models/kimi-k2-instruct-0905",
+    "kimi-k2p5": {
+        "model": "accounts/fireworks/models/kimi-k2p5",
         "temperature": 0.6,  # Kimi recommended temperature
         "max_tokens": 131000,
         "api_key": os.getenv("FIREWORKS_API_KEY"),
diff --git a/eval_protocol/quickstart/llm_judge.py b/eval_protocol/quickstart/llm_judge.py
index a5225857..d7b51ffa 100644
--- a/eval_protocol/quickstart/llm_judge.py
+++ b/eval_protocol/quickstart/llm_judge.py
@@ -17,7 +17,7 @@ async def aha_judge(
-    row: EvaluationRow, judge_name: str = "kimi-k2-instruct-0905", adapter: Optional[BaseAdapter] = None
+    row: EvaluationRow, judge_name: str = "kimi-k2p5", adapter: Optional[BaseAdapter] = None
 ) -> EvaluationRow:
     """
     LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons for a single row.
diff --git a/eval_protocol/quickstart/utils.py b/eval_protocol/quickstart/utils.py
index 36685425..98f61c02 100644
--- a/eval_protocol/quickstart/utils.py
+++ b/eval_protocol/quickstart/utils.py
@@ -51,8 +51,8 @@
         "api_key": os.getenv("GEMINI_API_KEY"),
         "base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
     },
-    "kimi-k2-instruct-0905": {
-        "model": "accounts/fireworks/models/kimi-k2-instruct-0905",
+    "kimi-k2p5": {
+        "model": "accounts/fireworks/models/kimi-k2p5",
         "temperature": 0.6,  # Kimi recommended temperature
         "max_tokens": 131000,
         "api_key": os.getenv("FIREWORKS_API_KEY"),
diff --git a/eval_protocol/training/gepa_utils.py b/eval_protocol/training/gepa_utils.py
index 15f30681..72bb5bad 100644
--- a/eval_protocol/training/gepa_utils.py
+++ b/eval_protocol/training/gepa_utils.py
@@ -40,7 +40,7 @@
     },
     # Fireworks models
     "kimi-k2": {
-        "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+        "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
         "temperature": 0.6,
         "max_tokens": 131000,
     },
@@ -68,7 +68,7 @@ def build_reflection_lm(reflection_lm_name: str) -> LM:
     Args:
         reflection_lm_name: One of the predefined configs ("gpt-5", "gpt-4o",
-            "claude-sonnet", "kimi-k2-instruct-0905")
+            "claude-sonnet", "kimi-k2p5")
             OR a raw LiteLLM model string (e.g., "openai/gpt-4o")

     Returns:
diff --git a/tests/chinook/pydantic/test_pydantic_chinook.py b/tests/chinook/pydantic/test_pydantic_chinook.py
index 94f3bee0..04e7a578 100644
--- a/tests/chinook/pydantic/test_pydantic_chinook.py
+++ b/tests/chinook/pydantic/test_pydantic_chinook.py
@@ -38,8 +38,9 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent:
     "completion_params",
     [
         {
-            "model": "accounts/fireworks/models/kimi-k2-instruct-0905",
+            "model": "accounts/fireworks/models/kimi-k2p5",
             "provider": "fireworks",
+            "reasoning_effort": "none",
         },
         {
             "model": "gpt-5",
@@ -88,7 +89,7 @@ async def test_simple_query(row: EvaluationRow) -> EvaluationRow:
         )
     else:
         model = OpenAIChatModel(
-            "accounts/fireworks/models/kimi-k2-instruct-0905",
+            "accounts/fireworks/models/kimi-k2p5",
             provider="fireworks",
         )
diff --git a/tests/chinook/pydantic/test_pydantic_complex_queries.py b/tests/chinook/pydantic/test_pydantic_complex_queries.py
index 583c90df..fb857e40 100644
--- a/tests/chinook/pydantic/test_pydantic_complex_queries.py
+++ b/tests/chinook/pydantic/test_pydantic_complex_queries.py
@@ -48,7 +48,7 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent:
         #     "provider": "fireworks",
         # },
         # {
-        #     "model": "accounts/fireworks/models/kimi-k2-instruct-0905",
+        #     "model": "accounts/fireworks/models/kimi-k2p5",
         #     "provider": "fireworks",
         # },
         {"model": "gpt-5"},
diff --git a/tests/pytest/data/basic_coding_dataset.jsonl b/tests/pytest/data/basic_coding_dataset.jsonl
deleted file mode 100644
index fc25abcd..00000000
--- a/tests/pytest/data/basic_coding_dataset.jsonl
+++ /dev/null
@@ -1,10 +0,0 @@
-{"prompt": "Write a Python function `add_one` that takes an integer and returns the integer incremented by 1.", "input": "5", "expected_output": "6"}
-{"prompt": "Write a Python function `add_one` that takes an integer and returns the integer incremented by 1.", "input": "-2", "expected_output": "-1"}
-{"prompt": "Write a Python function `add_one` that takes an integer and returns the integer incremented by 1.", "input": "0", "expected_output": "1"}
-{"prompt": "Write a Python function `multiply_by_two` that takes an integer and returns the integer multiplied by 2.", "input": "3", "expected_output": "6"}
-{"prompt": "Write a Python function `multiply_by_two` that takes an integer and returns the integer multiplied by 2.", "input": "-4", "expected_output": "-8"}
-{"prompt": "Write a Python function `multiply_by_two` that takes an integer and returns the integer multiplied by 2.", "input": "0", "expected_output": "0"}
-{"prompt": "Write a Python function `multiply_by_two` that takes an integer and returns the integer multiplied by 2.", "input": "10", "expected_output": "20"}
-{"prompt": "Write a Python function `get_length` that takes a list and returns its length.", "input": "[1, 2, 3]", "expected_output": "3"}
-{"prompt": "Write a Python function `get_length` that takes a list and returns its length.", "input": "[]", "expected_output": "0"}
-{"prompt": "Write a Python function `get_length` that takes a list and returns its length.", "input": "['a', 'b', 'c', 'd']", "expected_output": "4"}
diff --git a/tests/pytest/data/halueval_sample_dataset.jsonl b/tests/pytest/data/halueval_sample_dataset.jsonl
deleted file mode 100644
index e7671ac9..00000000
--- a/tests/pytest/data/halueval_sample_dataset.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-{"knowledge": " It is a hygroscopic solid that is highly soluble in water and slightly soluble in alcohol.Ethanol, also called alcohol, ethyl alcohol, and drinking alcohol, is a compound and simple alcohol with the chemical formula C2H5OH .", "question": "Cadmium Chloride is slightly soluble in this chemical, it is also called what?", "right_answer": "alcohol", "hallucinated_answer": "water with a hint of alcohol"}
-{"knowledge": "The Great Outdoors is a 1988 American comedy film directed by Howard Deutch, and written and produced by John Hughes. It stars Dan Aykroyd, John Candy, Stephanie Faracy and Annette Bening in her film debut.Annette Carol Bening (born May 29, 1958) is an American actress. She is a four-time Academy Award nominee; for \"The Grifters\" (1990), \"American Beauty\" (1999), \"Being Julia\" (2004) and \"The Kids Are All Right\" (2010). In 2006, she received a star on the Hollywood Walk of Fame.", "question": "The 1988 American comedy film, The Great Outdoors, starred a four-time Academy Award nominee, who received a star on the Hollywood Walk of Fame in what year?", "right_answer": "2006", "hallucinated_answer": "Annette Bening received her Hollywood star in 1988."}
-{"knowledge": " Her self-titled debut studio album was released on 2 June 2017.\"New Rules\" is a song by English singer Dua Lipa from her eponymous debut studio album (2017).", "question": "Dua Lipa, an English singer, songwriter and model, the album spawned the number-one single \"New Rules\" is a song by English singer Dua Lipa from her eponymous debut studio album, released in what year?", "right_answer": "2017", "hallucinated_answer": "The album was released in 2018."}
diff --git a/tests/pytest/test_apps_coding.py b/tests/pytest/test_apps_coding.py
index d7157ff1..1b3ea40a 100644
--- a/tests/pytest/test_apps_coding.py
+++ b/tests/pytest/test_apps_coding.py
@@ -30,7 +30,8 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
         {
             "temperature": 0.0,
             "max_tokens": 4096,
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
+            "reasoning_effort": "none",
         }
     ],
     passed_threshold=0.33,
diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py
deleted file mode 100644
index 3b3ce560..00000000
--- a/tests/pytest/test_basic_coding.py
+++ /dev/null
@@ -1,93 +0,0 @@
-"""
-Pytest test for coding code evaluation using the evaluation_test decorator.
-
-This test demonstrates how to evaluate code correctness by executing Python code locally
-and comparing the output against expected results in a pointwise manner.
-"""
-
-from typing import Any, Dict, List
-
-from eval_protocol.models import EvaluateResult, EvaluationRow, Message
-from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test
-from eval_protocol.rewards.code_execution import execute_python_code, extract_code_blocks
-
-
-def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
-    """
-    Convert entries from coding dataset to EvaluationRow objects.
-    """
-    return [
-        EvaluationRow(
-            messages=[Message(role="user", content=f"{row['prompt']} Input: {row['input']}")],
-            ground_truth=row["expected_output"],
-        )
-        for row in data
-    ]
-
-
-@evaluation_test(
-    input_dataset=["tests/pytest/data/basic_coding_dataset.jsonl"],
-    dataset_adapter=coding_dataset_to_evaluation_row,
-    completion_params=[
-        {
-            "temperature": 0.0,
-            "max_tokens": 4096,
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
-        }
-    ],
-    passed_threshold=0.8,
-    rollout_processor=SingleTurnRolloutProcessor(),
-    num_runs=1,
-    mode="pointwise",
-)
-def test_coding_code_evaluation(row: EvaluationRow) -> EvaluationRow:
-    """
-    Evaluation function that tests code correctness by executing it locally.
-
-    This function:
-    1. Extracts Python code from the assistant's response
-    2. Executes the code locally with timeout=10
-    3. Compares the output to ground_truth
-    4. Returns a score of 1.0 if output matches, 0.0 otherwise
-
-    Args:
-        row: EvaluationRow containing the conversation messages and expected_output in ground_truth
-
-    Returns:
-        EvaluationRow with the evaluation result
-    """
-    # Check if we have an assistant response
-    if len(row.messages) < 2 or row.messages[-1].role != "assistant":
-        row.evaluation_result = EvaluateResult(score=0.0, reason="No assistant response found")
-        return row
-
-    assistant_content = row.messages[-1].content or ""
-    expected_output = (row.ground_truth or "").strip()
-
-    # Extract Python code blocks
-    code_blocks = extract_code_blocks(assistant_content, language="python")
-    if not code_blocks:
-        row.evaluation_result = EvaluateResult(score=0.0, reason="No Python code block found")
-        return row
-
-    code = code_blocks[0]["code"]
-
-    # Execute the code locally
-    execution_result = execute_python_code(code, timeout=10)
-
-    if not execution_result.get("success", False):
-        error_msg = execution_result.get("error", "Code execution failed")
-        row.evaluation_result = EvaluateResult(score=0.0, reason=f"Execution error: {error_msg}")
-        return row
-
-    # Compare output with expected
-    actual_output = (execution_result.get("output", "") or "").strip()
-
-    if actual_output == expected_output:
-        row.evaluation_result = EvaluateResult(score=1.0, reason=f"✅ Output matches: '{actual_output}'")
-    else:
-        row.evaluation_result = EvaluateResult(
-            score=0.0, reason=f"❌ Expected: '{expected_output}', Got: '{actual_output}'"
-        )
-
-    return row
diff --git a/tests/pytest/test_frozen_lake.py b/tests/pytest/test_frozen_lake.py
index 2a053425..9f5b6946 100644
--- a/tests/pytest/test_frozen_lake.py
+++ b/tests/pytest/test_frozen_lake.py
@@ -42,7 +42,8 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
         {
             "temperature": 0.0,
             "max_tokens": 4096,
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
+            "reasoning_effort": "none",
         }
     ],
     rollout_processor=MCPGymRolloutProcessor(),
diff --git a/tests/pytest/test_hallucination.py b/tests/pytest/test_hallucination.py
deleted file mode 100644
index 0003a88e..00000000
--- a/tests/pytest/test_hallucination.py
+++ /dev/null
@@ -1,119 +0,0 @@
-"""
-Hallucination detection test using LLM-as-judge.
-
-This test demonstrates how to detect factual inaccuracies in model responses
-by comparing them against provided knowledge using an LLM judge, similar to
-tau's evaluate_nl_assertions approach.
-"""
-
-import json
-from typing import Any, Dict, List
-import pytest
-
-import litellm
-
-from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
-from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test
-
-# Configure the judge model for LiteLLM
-JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"
-
-
-def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
-    """Convert HaluEval dataset to EvaluationRow objects."""
-    return [
-        EvaluationRow(
-            messages=[Message(role="user", content=f"Knowledge: {item['knowledge']}\n\nQuestion: {item['question']}")],
-            ground_truth=item["right_answer"],
-        )
-        for item in data
-    ]
-
-
-@pytest.mark.asyncio
-@evaluation_test(
-    input_dataset=["tests/pytest/data/halueval_sample_dataset.jsonl"],
-    dataset_adapter=hallucination_dataset_adapter,
-    completion_params=[
-        {
-            "temperature": 0.0,
-            "max_tokens": 512,
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
-        }
-    ],
-    rollout_processor=SingleTurnRolloutProcessor(),
-    passed_threshold=0.33,
-    num_runs=1,
-    mode="pointwise",
-)
-async def test_hallucination_detection(row: EvaluationRow) -> EvaluationRow:
-    """
-    Test for response correctness using LLM-as-judge.
-    """
-    messages = row.messages
-    assistant_response = messages[-1].content
-
-    if not assistant_response:
-        return EvaluateResult(score=0.0, reason="❌ No assistant response found")
-
-    correct_answer = row.ground_truth
-
-    system_prompt = """
-    TASK
-    - You will be given an assistant's response and the correct answer.
-    - Your job is to evaluate whether the assistant's response is factually consistent with the correct answer.
-    - Grade whether the assistant got it right or wrong.
-
-    FORMAT
-    - Your response should be a JSON object with the following fields:
-    - `reasoning`: a short explanation for your classification
-    - `is_correct`: `true` if the assistant's response matches the correct answer, `false` otherwise
-
-    Example response structure:
-    {
-        "reasoning": "",
-        "is_correct":
-    }
-    """
-
-    user_prompt = f"""
-    assistant_response:
-    {assistant_response}
-
-    correct_answer:
-    {correct_answer}
-    """
-
-    try:
-        response = await litellm.acompletion(
-            model=JUDGE_MODEL,
-            messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}],
-            temperature=0.1,
-            max_tokens=500,
-        )
-
-        result_data = json.loads(response.choices[0].message.content)
-        is_correct = result_data.get("is_correct", False)
-        reasoning = result_data.get("reasoning", "Could not parse reasoning")
-
-    except Exception as e:
-        # Fallback if parsing fails
-        is_correct = False
-        reasoning = f"Evaluation failed: {str(e)}"
-
-    score = 1.0 if is_correct else 0.0
-
-    if is_correct:
-        assessment = "✅ Response is correct"
-    else:
-        assessment = "❌ Response is incorrect"
-
-    reason = f"{assessment}\nReasoning: {reasoning}"
-
-    row.evaluation_result = EvaluateResult(
-        score=score,
-        reason=reason,
-        metrics={"llm_judge": MetricResult(score=score, reason=reasoning, is_score_valid=True)},
-    )
-
-    return row
diff --git a/tests/pytest/test_openenv_browsergym_basic.py b/tests/pytest/test_openenv_browsergym_basic.py
index 0f52a7ad..3640a8c5 100644
--- a/tests/pytest/test_openenv_browsergym_basic.py
+++ b/tests/pytest/test_openenv_browsergym_basic.py
@@ -65,10 +65,11 @@ def test_openenv_browsergym_basic():
         "model": os.getenv(
             "OPENENV_TEST_MODEL",
             # Default to a Fireworks public model id used elsewhere in tests; requires FIREWORKS_API_KEY
-            "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+            "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
         ),
         "temperature": 0.0,
         "max_tokens": 16,
+        "reasoning_effort": "none",
     }

     # Limit to a single step to keep the test fast and robust
diff --git a/tests/pytest/test_openenv_browsergym_eval.py b/tests/pytest/test_openenv_browsergym_eval.py
index 251608fa..2621297f 100644
--- a/tests/pytest/test_openenv_browsergym_eval.py
+++ b/tests/pytest/test_openenv_browsergym_eval.py
@@ -233,7 +233,8 @@ def action_parser(response_text: str):
         {
             "temperature": 0.0,
             "max_tokens": 512,
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
+            "reasoning_effort": "none",
         }
     ],
     # Keep concurrency and steps low for a quick health-check
diff --git a/tests/pytest/test_openenv_echo_hub.py b/tests/pytest/test_openenv_echo_hub.py
index 3b8bed01..78d78764 100644
--- a/tests/pytest/test_openenv_echo_hub.py
+++ b/tests/pytest/test_openenv_echo_hub.py
@@ -76,7 +76,8 @@ def action_parser(response_text: str):
             "temperature": 0.0,
             "max_tokens": 16,
             # Any working model with your API key; match other tests' default
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
+            "reasoning_effort": "none",
         }
     ],
     num_runs=1,
diff --git a/tests/pytest/test_openenv_textarena_docker.py b/tests/pytest/test_openenv_textarena_docker.py
index 6e9da0c5..16f41e2c 100644
--- a/tests/pytest/test_openenv_textarena_docker.py
+++ b/tests/pytest/test_openenv_textarena_docker.py
@@ -94,7 +94,8 @@ def action_parser(response_text: str):
             "temperature": 0.7,
             "max_tokens": 32,
             # Any working model with your API key
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
+            "reasoning_effort": "none",
         }
     ],
     num_runs=1,
diff --git a/tests/pytest/test_pytest_default_agent_rollout_processor.py b/tests/pytest/test_pytest_default_agent_rollout_processor.py
index 9ed2b3dc..0ddec778 100644
--- a/tests/pytest/test_pytest_default_agent_rollout_processor.py
+++ b/tests/pytest/test_pytest_default_agent_rollout_processor.py
@@ -19,7 +19,7 @@
         ]
     ],
     rollout_processor=AgentRolloutProcessor(),
-    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"}],
+    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", "reasoning_effort": "none"}],
     mode="all",
 )
 def test_pytest_default_agent_rollout_processor(rows: List[EvaluationRow]) -> List[EvaluationRow]:
diff --git a/tests/pytest/test_pytest_klavis_mcp.py b/tests/pytest/test_pytest_klavis_mcp.py
index 9bcb3e97..73b13220 100644
--- a/tests/pytest/test_pytest_klavis_mcp.py
+++ b/tests/pytest/test_pytest_klavis_mcp.py
@@ -22,7 +22,7 @@ class ResponseFormat(BaseModel):
 @evaluation_test(
     input_dataset=["tests/pytest/datasets/klavis_mcp_test.jsonl"],
     rollout_processor=AgentRolloutProcessor(),
-    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-thinking"}],
+    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5"}],
     mode="pointwise",
     mcp_config_path="tests/pytest/mcp_configurations/klavis_strata_mcp.json",
 )
@@ -34,7 +34,7 @@ async def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow:
         api_key=os.environ["FIREWORKS_API_KEY"], base_url="https://api.fireworks.ai/inference/v1"
     ) as client:
         response = await client.chat.completions.create(
-            model="accounts/fireworks/models/kimi-k2-thinking",
+            model="accounts/fireworks/models/kimi-k2p5",
             messages=[
                 {
                     "role": "system",
diff --git a/tests/pytest/test_pytest_klavis_sandbox.py b/tests/pytest/test_pytest_klavis_sandbox.py
index 7ae84bc3..f5619251 100644
--- a/tests/pytest/test_pytest_klavis_sandbox.py
+++ b/tests/pytest/test_pytest_klavis_sandbox.py
@@ -58,7 +58,7 @@ def klavis_gmail_sandbox_dataset_adapter(rows: list[dict]) -> list[EvaluationRow
     rollout_processor=KlavisSandboxRolloutProcessor(
         server_name="gmail",
     ),
-    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-thinking"}],
+    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5"}],
     mode="pointwise",
     dataset_adapter=klavis_gmail_sandbox_dataset_adapter,
 )
@@ -110,7 +110,7 @@ async def test_pytest_gmail_sandbox(row: EvaluationRow) -> EvaluationRow:

     try:
         response = await client.chat.completions.create(
-            model="accounts/fireworks/models/kimi-k2-thinking",
+            model="accounts/fireworks/models/kimi-k2p5",
             messages=[
                 {
                     "role": "system",
diff --git a/tests/pytest/test_pytest_mcp_url.py b/tests/pytest/test_pytest_mcp_url.py
index c0d35b87..deeb5d45 100644
--- a/tests/pytest/test_pytest_mcp_url.py
+++ b/tests/pytest/test_pytest_mcp_url.py
@@ -21,7 +21,7 @@
         ]
     ],
    rollout_processor=AgentRolloutProcessor(),
-    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"}],
+    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", "reasoning_effort": "none"}],
     mode="pointwise",
     mcp_config_path="tests/pytest/mcp_configurations/docs_mcp_config.json",
 )