From 5af3ca80efc30c3815423f703e2de75396828210 Mon Sep 17 00:00:00 2001 From: Sandeep Singh Date: Fri, 24 Apr 2026 12:57:06 -0700 Subject: [PATCH 1/5] use kimi k2.5 --- eval_protocol/benchmarks/test_frozen_lake.py | 2 +- eval_protocol/quickstart/aha_judge/llm_judge.py | 2 +- .../quickstart/aha_judge/llm_judge_openai_responses.py | 2 +- eval_protocol/quickstart/aha_judge/utils.py | 4 ++-- eval_protocol/quickstart/llm_judge.py | 2 +- eval_protocol/quickstart/utils.py | 4 ++-- eval_protocol/training/gepa_utils.py | 4 ++-- tests/chinook/pydantic/test_pydantic_chinook.py | 4 ++-- tests/chinook/pydantic/test_pydantic_complex_queries.py | 2 +- tests/pytest/test_apps_coding.py | 2 +- tests/pytest/test_basic_coding.py | 2 +- tests/pytest/test_frozen_lake.py | 2 +- tests/pytest/test_hallucination.py | 4 ++-- tests/pytest/test_openenv_browsergym_basic.py | 2 +- tests/pytest/test_openenv_browsergym_eval.py | 2 +- tests/pytest/test_openenv_echo_hub.py | 2 +- tests/pytest/test_openenv_textarena_docker.py | 2 +- tests/pytest/test_pytest_default_agent_rollout_processor.py | 2 +- tests/pytest/test_pytest_mcp_url.py | 2 +- 19 files changed, 24 insertions(+), 24 deletions(-) diff --git a/eval_protocol/benchmarks/test_frozen_lake.py b/eval_protocol/benchmarks/test_frozen_lake.py index f0b7ef55..af7c007d 100644 --- a/eval_protocol/benchmarks/test_frozen_lake.py +++ b/eval_protocol/benchmarks/test_frozen_lake.py @@ -42,7 +42,7 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation { "temperature": 0.0, "max_tokens": 4096, - "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", } ], rollout_processor=MCPGymRolloutProcessor(), diff --git a/eval_protocol/quickstart/aha_judge/llm_judge.py b/eval_protocol/quickstart/aha_judge/llm_judge.py index b8e0a212..0400ff58 100644 --- a/eval_protocol/quickstart/aha_judge/llm_judge.py +++ b/eval_protocol/quickstart/aha_judge/llm_judge.py @@ -17,7 +17,7 @@ async def aha_judge( - row: EvaluationRow, judge_name: str = "kimi-k2-instruct-0905", adapter: Optional[BaseAdapter] = None + row: EvaluationRow, judge_name: str = "kimi-k2p5", adapter: Optional[BaseAdapter] = None ) -> EvaluationRow: """ LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons for a single row. diff --git a/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py b/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py index b7aa5110..fe37944d 100644 --- a/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +++ b/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py @@ -50,7 +50,7 @@ def openai_responses_data_generator(): "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", }, { - "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", }, ], ) diff --git a/eval_protocol/quickstart/aha_judge/utils.py b/eval_protocol/quickstart/aha_judge/utils.py index 0ebc8432..d5e75e03 100644 --- a/eval_protocol/quickstart/aha_judge/utils.py +++ b/eval_protocol/quickstart/aha_judge/utils.py @@ -49,8 +49,8 @@ "api_key": os.getenv("GEMINI_API_KEY"), "base_url": "https://generativelanguage.googleapis.com/v1beta/openai/", }, - "kimi-k2-instruct-0905": { - "model": "accounts/fireworks/models/kimi-k2-instruct-0905", + "kimi-k2p5": { + "model": "accounts/fireworks/models/kimi-k2p5", "temperature": 0.6, # Kimi recommended temperature "max_tokens": 131000, "api_key": os.getenv("FIREWORKS_API_KEY"), diff --git a/eval_protocol/quickstart/llm_judge.py b/eval_protocol/quickstart/llm_judge.py index a5225857..d7b51ffa 100644 --- a/eval_protocol/quickstart/llm_judge.py +++ b/eval_protocol/quickstart/llm_judge.py @@ -17,7 +17,7 @@ async def aha_judge( - row: EvaluationRow, judge_name: str = "kimi-k2-instruct-0905", adapter: Optional[BaseAdapter] = None + row: EvaluationRow, judge_name: str = "kimi-k2p5", adapter: Optional[BaseAdapter] = None ) -> EvaluationRow: """ LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons for a single row. diff --git a/eval_protocol/quickstart/utils.py b/eval_protocol/quickstart/utils.py index 36685425..98f61c02 100644 --- a/eval_protocol/quickstart/utils.py +++ b/eval_protocol/quickstart/utils.py @@ -51,8 +51,8 @@ "api_key": os.getenv("GEMINI_API_KEY"), "base_url": "https://generativelanguage.googleapis.com/v1beta/openai/", }, - "kimi-k2-instruct-0905": { - "model": "accounts/fireworks/models/kimi-k2-instruct-0905", + "kimi-k2p5": { + "model": "accounts/fireworks/models/kimi-k2p5", "temperature": 0.6, # Kimi recommended temperature "max_tokens": 131000, "api_key": os.getenv("FIREWORKS_API_KEY"), diff --git a/eval_protocol/training/gepa_utils.py b/eval_protocol/training/gepa_utils.py index 15f30681..72bb5bad 100644 --- a/eval_protocol/training/gepa_utils.py +++ b/eval_protocol/training/gepa_utils.py @@ -40,7 +40,7 @@ }, # Fireworks models "kimi-k2": { - "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", "temperature": 0.6, "max_tokens": 131000, }, @@ -68,7 +68,7 @@ def build_reflection_lm(reflection_lm_name: str) -> LM: Args: reflection_lm_name: One of the predefined configs ("gpt-5", "gpt-4o", - "claude-sonnet", "kimi-k2-instruct-0905") + "claude-sonnet", "kimi-k2p5") OR a raw LiteLLM model string (e.g., "openai/gpt-4o") Returns: diff --git a/tests/chinook/pydantic/test_pydantic_chinook.py b/tests/chinook/pydantic/test_pydantic_chinook.py index 94f3bee0..6eebd803 100644 --- a/tests/chinook/pydantic/test_pydantic_chinook.py +++ b/tests/chinook/pydantic/test_pydantic_chinook.py @@ -38,7 +38,7 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent: "completion_params", [ { - "model": "accounts/fireworks/models/kimi-k2-instruct-0905", + "model": "accounts/fireworks/models/kimi-k2p5", "provider": "fireworks", }, { @@ -88,7 +88,7 @@ async def test_simple_query(row: EvaluationRow) -> EvaluationRow: ) else: model = OpenAIChatModel( - "accounts/fireworks/models/kimi-k2-instruct-0905", + "accounts/fireworks/models/kimi-k2p5", provider="fireworks", ) diff --git a/tests/chinook/pydantic/test_pydantic_complex_queries.py b/tests/chinook/pydantic/test_pydantic_complex_queries.py index 583c90df..fb857e40 100644 --- a/tests/chinook/pydantic/test_pydantic_complex_queries.py +++ b/tests/chinook/pydantic/test_pydantic_complex_queries.py @@ -48,7 +48,7 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent: # "provider": "fireworks", # }, # { - # "model": "accounts/fireworks/models/kimi-k2-instruct-0905", + # "model": "accounts/fireworks/models/kimi-k2p5", # "provider": "fireworks", # }, {"model": "gpt-5"}, diff --git a/tests/pytest/test_apps_coding.py b/tests/pytest/test_apps_coding.py index d7157ff1..3fd63b63 100644 --- a/tests/pytest/test_apps_coding.py +++ b/tests/pytest/test_apps_coding.py @@ -30,7 +30,7 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio { "temperature": 0.0, "max_tokens": 4096, - "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", } ], passed_threshold=0.33, diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py index 3b3ce560..ec9782e2 100644 --- a/tests/pytest/test_basic_coding.py +++ b/tests/pytest/test_basic_coding.py @@ -32,7 +32,7 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat { "temperature": 0.0, "max_tokens": 4096, - "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", } ], passed_threshold=0.8, diff --git a/tests/pytest/test_frozen_lake.py b/tests/pytest/test_frozen_lake.py index 2a053425..fa1faafc 100644 --- a/tests/pytest/test_frozen_lake.py +++ b/tests/pytest/test_frozen_lake.py @@ -42,7 +42,7 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation { "temperature": 0.0, "max_tokens": 4096, - "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", } ], rollout_processor=MCPGymRolloutProcessor(), diff --git a/tests/pytest/test_hallucination.py b/tests/pytest/test_hallucination.py index 0003a88e..6ca93dab 100644 --- a/tests/pytest/test_hallucination.py +++ b/tests/pytest/test_hallucination.py @@ -16,7 +16,7 @@ from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test # Configure the judge model for LiteLLM -JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905" +JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2p5" def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[EvaluationRow]: @@ -38,7 +38,7 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation { "temperature": 0.0, "max_tokens": 512, - "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", } ], rollout_processor=SingleTurnRolloutProcessor(), diff --git a/tests/pytest/test_openenv_browsergym_basic.py b/tests/pytest/test_openenv_browsergym_basic.py index 0f52a7ad..7c2ee8b6 100644 --- a/tests/pytest/test_openenv_browsergym_basic.py +++ b/tests/pytest/test_openenv_browsergym_basic.py @@ -65,7 +65,7 @@ def test_openenv_browsergym_basic(): "model": os.getenv( "OPENENV_TEST_MODEL", # Default to a Fireworks public model id used elsewhere in tests; requires FIREWORKS_API_KEY - "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", + "fireworks_ai/accounts/fireworks/models/kimi-k2p5", ), "temperature": 0.0, "max_tokens": 16, diff --git a/tests/pytest/test_openenv_browsergym_eval.py b/tests/pytest/test_openenv_browsergym_eval.py index 251608fa..bdd9b7b1 100644 --- a/tests/pytest/test_openenv_browsergym_eval.py +++ b/tests/pytest/test_openenv_browsergym_eval.py @@ -233,7 +233,7 @@ def action_parser(response_text: str): { "temperature": 0.0, "max_tokens": 512, - "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", } ], # Keep concurrency and steps low for a quick health-check diff --git a/tests/pytest/test_openenv_echo_hub.py b/tests/pytest/test_openenv_echo_hub.py index 3b8bed01..b97e1ac9 100644 --- a/tests/pytest/test_openenv_echo_hub.py +++ b/tests/pytest/test_openenv_echo_hub.py @@ -76,7 +76,7 @@ def action_parser(response_text: str): "temperature": 0.0, "max_tokens": 16, # Any working model with your API key; match other tests' default - "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", } ], num_runs=1, diff --git a/tests/pytest/test_openenv_textarena_docker.py b/tests/pytest/test_openenv_textarena_docker.py index 6e9da0c5..758a1887 100644 --- a/tests/pytest/test_openenv_textarena_docker.py +++ b/tests/pytest/test_openenv_textarena_docker.py @@ -94,7 +94,7 @@ def action_parser(response_text: str): "temperature": 0.7, "max_tokens": 32, # Any working model with your API key - "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", } ], num_runs=1, diff --git a/tests/pytest/test_pytest_default_agent_rollout_processor.py b/tests/pytest/test_pytest_default_agent_rollout_processor.py index 9ed2b3dc..26cec8e8 100644 --- a/tests/pytest/test_pytest_default_agent_rollout_processor.py +++ b/tests/pytest/test_pytest_default_agent_rollout_processor.py @@ -19,7 +19,7 @@ ] ], rollout_processor=AgentRolloutProcessor(), - completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"}], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5"}], mode="all", ) def test_pytest_default_agent_rollout_processor(rows: List[EvaluationRow]) -> List[EvaluationRow]: diff --git a/tests/pytest/test_pytest_mcp_url.py b/tests/pytest/test_pytest_mcp_url.py index c0d35b87..fc82abec 100644 --- a/tests/pytest/test_pytest_mcp_url.py +++ b/tests/pytest/test_pytest_mcp_url.py @@ -21,7 +21,7 @@ ] ], rollout_processor=AgentRolloutProcessor(), - completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"}], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5"}], mode="pointwise", mcp_config_path="tests/pytest/mcp_configurations/docs_mcp_config.json", ) From 14563f6e37cc6e175c8d37a5f8b722e81464eab1 Mon Sep 17 00:00:00 2001 From: Sandeep Singh Date: Fri, 24 Apr 2026 13:01:56 -0700 Subject: [PATCH 2/5] turn reasoning off --- tests/chinook/pydantic/test_pydantic_chinook.py | 1 + tests/pytest/test_apps_coding.py | 1 + tests/pytest/test_basic_coding.py | 1 + tests/pytest/test_frozen_lake.py | 1 + tests/pytest/test_hallucination.py | 1 + tests/pytest/test_openenv_browsergym_basic.py | 1 + tests/pytest/test_openenv_browsergym_eval.py | 1 + tests/pytest/test_openenv_echo_hub.py | 1 + tests/pytest/test_openenv_textarena_docker.py | 1 + tests/pytest/test_pytest_default_agent_rollout_processor.py | 2 +- tests/pytest/test_pytest_mcp_url.py | 2 +- 11 files changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/chinook/pydantic/test_pydantic_chinook.py b/tests/chinook/pydantic/test_pydantic_chinook.py index 6eebd803..04e7a578 100644 --- a/tests/chinook/pydantic/test_pydantic_chinook.py +++ b/tests/chinook/pydantic/test_pydantic_chinook.py @@ -40,6 +40,7 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent: { "model": "accounts/fireworks/models/kimi-k2p5", "provider": "fireworks", + "reasoning_effort": "none", }, { "model": "gpt-5", diff --git a/tests/pytest/test_apps_coding.py b/tests/pytest/test_apps_coding.py index 3fd63b63..1b3ea40a 100644 --- a/tests/pytest/test_apps_coding.py +++ b/tests/pytest/test_apps_coding.py @@ -31,6 +31,7 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio "temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", + "reasoning_effort": "none", } ], passed_threshold=0.33, diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py index ec9782e2..4deae092 100644 --- a/tests/pytest/test_basic_coding.py +++ b/tests/pytest/test_basic_coding.py @@ -33,6 +33,7 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat "temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", + "reasoning_effort": "none", } ], passed_threshold=0.8, diff --git a/tests/pytest/test_frozen_lake.py b/tests/pytest/test_frozen_lake.py index fa1faafc..9f5b6946 100644 --- a/tests/pytest/test_frozen_lake.py +++ b/tests/pytest/test_frozen_lake.py @@ -43,6 +43,7 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation "temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", + "reasoning_effort": "none", } ], rollout_processor=MCPGymRolloutProcessor(), diff --git a/tests/pytest/test_hallucination.py b/tests/pytest/test_hallucination.py index 6ca93dab..22ad6fb5 100644 --- a/tests/pytest/test_hallucination.py +++ b/tests/pytest/test_hallucination.py @@ -39,6 +39,7 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation "temperature": 0.0, "max_tokens": 512, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", + "reasoning_effort": "none", } ], rollout_processor=SingleTurnRolloutProcessor(), diff --git a/tests/pytest/test_openenv_browsergym_basic.py b/tests/pytest/test_openenv_browsergym_basic.py index 7c2ee8b6..3640a8c5 100644 --- a/tests/pytest/test_openenv_browsergym_basic.py +++ b/tests/pytest/test_openenv_browsergym_basic.py @@ -69,6 +69,7 @@ def test_openenv_browsergym_basic(): ), "temperature": 0.0, "max_tokens": 16, + "reasoning_effort": "none", } # Limit to a single step to keep the test fast and robust diff --git a/tests/pytest/test_openenv_browsergym_eval.py b/tests/pytest/test_openenv_browsergym_eval.py index bdd9b7b1..2621297f 100644 --- a/tests/pytest/test_openenv_browsergym_eval.py +++ b/tests/pytest/test_openenv_browsergym_eval.py @@ -234,6 +234,7 @@ def action_parser(response_text: str): "temperature": 0.0, "max_tokens": 512, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", + "reasoning_effort": "none", } ], # Keep concurrency and steps low for a quick health-check diff --git a/tests/pytest/test_openenv_echo_hub.py b/tests/pytest/test_openenv_echo_hub.py index b97e1ac9..78d78764 100644 --- a/tests/pytest/test_openenv_echo_hub.py +++ b/tests/pytest/test_openenv_echo_hub.py @@ -77,6 +77,7 @@ def action_parser(response_text: str): "max_tokens": 16, # Any working model with your API key; match other tests' default "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", + "reasoning_effort": "none", } ], num_runs=1, diff --git a/tests/pytest/test_openenv_textarena_docker.py b/tests/pytest/test_openenv_textarena_docker.py index 758a1887..16f41e2c 100644 --- a/tests/pytest/test_openenv_textarena_docker.py +++ b/tests/pytest/test_openenv_textarena_docker.py @@ -95,6 +95,7 @@ def action_parser(response_text: str): "max_tokens": 32, # Any working model with your API key "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", + "reasoning_effort": "none", } ], num_runs=1, diff --git a/tests/pytest/test_pytest_default_agent_rollout_processor.py b/tests/pytest/test_pytest_default_agent_rollout_processor.py index 26cec8e8..0ddec778 100644 --- a/tests/pytest/test_pytest_default_agent_rollout_processor.py +++ b/tests/pytest/test_pytest_default_agent_rollout_processor.py @@ -19,7 +19,7 @@ ] ], rollout_processor=AgentRolloutProcessor(), - completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5"}], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", "reasoning_effort": "none"}], mode="all", ) def test_pytest_default_agent_rollout_processor(rows: List[EvaluationRow]) -> List[EvaluationRow]: diff --git a/tests/pytest/test_pytest_mcp_url.py b/tests/pytest/test_pytest_mcp_url.py index fc82abec..deeb5d45 100644 --- a/tests/pytest/test_pytest_mcp_url.py +++ b/tests/pytest/test_pytest_mcp_url.py @@ -21,7 +21,7 @@ ] ], rollout_processor=AgentRolloutProcessor(), - completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5"}], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", "reasoning_effort": "none"}], mode="pointwise", mcp_config_path="tests/pytest/mcp_configurations/docs_mcp_config.json", ) From 72599172695d235f52a5fa355e094e7efd70e109 Mon Sep 17 00:00:00 2001 From: Sandeep Singh Date: Fri, 24 Apr 2026 13:14:55 -0700 Subject: [PATCH 3/5] turn on reasoning --- tests/pytest/test_basic_coding.py | 1 - tests/pytest/test_hallucination.py | 1 - 2 files changed, 2 deletions(-) diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py index 4deae092..ec9782e2 100644 --- a/tests/pytest/test_basic_coding.py +++ b/tests/pytest/test_basic_coding.py @@ -33,7 +33,6 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat "temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", - "reasoning_effort": "none", } ], passed_threshold=0.8, diff --git a/tests/pytest/test_hallucination.py b/tests/pytest/test_hallucination.py index 22ad6fb5..6ca93dab 100644 --- a/tests/pytest/test_hallucination.py +++ b/tests/pytest/test_hallucination.py @@ -39,7 +39,6 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation "temperature": 0.0, "max_tokens": 512, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", - "reasoning_effort": "none", } ], rollout_processor=SingleTurnRolloutProcessor(), From 27944e8d23d77f8f04670abcc2c956e51164134c Mon Sep 17 00:00:00 2001 From: Sandeep Singh Date: Fri, 24 Apr 2026 13:34:37 -0700 Subject: [PATCH 4/5] delete them --- tests/pytest/data/basic_coding_dataset.jsonl | 10 -- .../pytest/data/halueval_sample_dataset.jsonl | 3 - tests/pytest/test_basic_coding.py | 93 -------------- tests/pytest/test_hallucination.py | 119 ------------------ 4 files changed, 225 deletions(-) delete mode 100644 tests/pytest/data/basic_coding_dataset.jsonl delete mode 100644 tests/pytest/data/halueval_sample_dataset.jsonl delete mode 100644 tests/pytest/test_basic_coding.py delete mode 100644 tests/pytest/test_hallucination.py diff --git a/tests/pytest/data/basic_coding_dataset.jsonl b/tests/pytest/data/basic_coding_dataset.jsonl deleted file mode 100644 index fc25abcd..00000000 --- a/tests/pytest/data/basic_coding_dataset.jsonl +++ /dev/null @@ -1,10 +0,0 @@ -{"prompt": "Write a Python function `add_one` that takes an integer and returns the integer incremented by 1.", "input": "5", "expected_output": "6"} -{"prompt": "Write a Python function `add_one` that takes an integer and returns the integer incremented by 1.", "input": "-2", "expected_output": "-1"} -{"prompt": "Write a Python function `add_one` that takes an integer and returns the integer incremented by 1.", "input": "0", "expected_output": "1"} -{"prompt": "Write a Python function `multiply_by_two` that takes an integer and returns the integer multiplied by 2.", "input": "3", "expected_output": "6"} -{"prompt": "Write a Python function `multiply_by_two` that takes an integer and returns the integer multiplied by 2.", "input": "-4", "expected_output": "-8"} -{"prompt": "Write a Python function `multiply_by_two` that takes an integer and returns the integer multiplied by 2.", "input": "0", "expected_output": "0"} -{"prompt": "Write a Python function `multiply_by_two` that takes an integer and returns the integer multiplied by 2.", "input": "10", "expected_output": "20"} -{"prompt": "Write a Python function `get_length` that takes a list and returns its length.", "input": "[1, 2, 3]", "expected_output": "3"} -{"prompt": "Write a Python function `get_length` that takes a list and returns its length.", "input": "[]", "expected_output": "0"} -{"prompt": "Write a Python function `get_length` that takes a list and returns its length.", "input": "['a', 'b', 'c', 'd']", "expected_output": "4"} diff --git a/tests/pytest/data/halueval_sample_dataset.jsonl b/tests/pytest/data/halueval_sample_dataset.jsonl deleted file mode 100644 index e7671ac9..00000000 --- a/tests/pytest/data/halueval_sample_dataset.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -{"knowledge": " It is a hygroscopic solid that is highly soluble in water and slightly soluble in alcohol.Ethanol, also called alcohol, ethyl alcohol, and drinking alcohol, is a compound and simple alcohol with the chemical formula C2H5OH .", "question": "Cadmium Chloride is slightly soluble in this chemical, it is also called what?", "right_answer": "alcohol", "hallucinated_answer": "water with a hint of alcohol"} -{"knowledge": "The Great Outdoors is a 1988 American comedy film directed by Howard Deutch, and written and produced by John Hughes. It stars Dan Aykroyd, John Candy, Stephanie Faracy and Annette Bening in her film debut.Annette Carol Bening (born May 29, 1958) is an American actress. She is a four-time Academy Award nominee; for \"The Grifters\" (1990), \"American Beauty\" (1999), \"Being Julia\" (2004) and \"The Kids Are All Right\" (2010). In 2006, she received a star on the Hollywood Walk of Fame.", "question": "The 1988 American comedy film, The Great Outdoors, starred a four-time Academy Award nominee, who received a star on the Hollywood Walk of Fame in what year?", "right_answer": "2006", "hallucinated_answer": "Annette Bening received her Hollywood star in 1988."} -{"knowledge": " Her self-titled debut studio album was released on 2 June 2017.\"New Rules\" is a song by English singer Dua Lipa from her eponymous debut studio album (2017).", "question": "Dua Lipa, an English singer, songwriter and model, the album spawned the number-one single \"New Rules\" is a song by English singer Dua Lipa from her eponymous debut studio album, released in what year?", "right_answer": "2017", "hallucinated_answer": "The album was released in 2018."} diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py deleted file mode 100644 index ec9782e2..00000000 --- a/tests/pytest/test_basic_coding.py +++ /dev/null @@ -1,93 +0,0 @@ -""" -Pytest test for coding code evaluation using the evaluation_test decorator. - -This test demonstrates how to evaluate code correctness by executing Python code locally -and comparing the output against expected results in a pointwise manner. -""" - -from typing import Any, Dict, List - -from eval_protocol.models import EvaluateResult, EvaluationRow, Message -from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test -from eval_protocol.rewards.code_execution import execute_python_code, extract_code_blocks - - -def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]: - """ - Convert entries from coding dataset to EvaluationRow objects. - """ - return [ - EvaluationRow( - messages=[Message(role="user", content=f"{row['prompt']} Input: {row['input']}")], - ground_truth=row["expected_output"], - ) - for row in data - ] - - -@evaluation_test( - input_dataset=["tests/pytest/data/basic_coding_dataset.jsonl"], - dataset_adapter=coding_dataset_to_evaluation_row, - completion_params=[ - { - "temperature": 0.0, - "max_tokens": 4096, - "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", - } - ], - passed_threshold=0.8, - rollout_processor=SingleTurnRolloutProcessor(), - num_runs=1, - mode="pointwise", -) -def test_coding_code_evaluation(row: EvaluationRow) -> EvaluationRow: - """ - Evaluation function that tests code correctness by executing it locally. - - This function: - 1. Extracts Python code from the assistant's response - 2. Executes the code locally with timeout=10 - 3. Compares the output to ground_truth - 4. Returns a score of 1.0 if output matches, 0.0 otherwise - - Args: - row: EvaluationRow containing the conversation messages and expected_output in ground_truth - - Returns: - EvaluationRow with the evaluation result - """ - # Check if we have an assistant response - if len(row.messages) < 2 or row.messages[-1].role != "assistant": - row.evaluation_result = EvaluateResult(score=0.0, reason="No assistant response found") - return row - - assistant_content = row.messages[-1].content or "" - expected_output = (row.ground_truth or "").strip() - - # Extract Python code blocks - code_blocks = extract_code_blocks(assistant_content, language="python") - if not code_blocks: - row.evaluation_result = EvaluateResult(score=0.0, reason="No Python code block found") - return row - - code = code_blocks[0]["code"] - - # Execute the code locally - execution_result = execute_python_code(code, timeout=10) - - if not execution_result.get("success", False): - error_msg = execution_result.get("error", "Code execution failed") - row.evaluation_result = EvaluateResult(score=0.0, reason=f"Execution error: {error_msg}") - return row - - # Compare output with expected - actual_output = (execution_result.get("output", "") or "").strip() - - if actual_output == expected_output: - row.evaluation_result = EvaluateResult(score=1.0, reason=f"✅ Output matches: '{actual_output}'") - else: - row.evaluation_result = EvaluateResult( - score=0.0, reason=f"❌ Expected: '{expected_output}', Got: '{actual_output}'" - ) - - return row diff --git a/tests/pytest/test_hallucination.py b/tests/pytest/test_hallucination.py deleted file mode 100644 index 6ca93dab..00000000 --- a/tests/pytest/test_hallucination.py +++ /dev/null @@ -1,119 +0,0 @@ -""" -Hallucination detection test using LLM-as-judge. - -This test demonstrates how to detect factual inaccuracies in model responses -by comparing them against provided knowledge using an LLM judge, similar to -tau's evaluate_nl_assertions approach. -""" - -import json -from typing import Any, Dict, List -import pytest - -import litellm - -from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult -from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test - -# Configure the judge model for LiteLLM -JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2p5" - - -def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[EvaluationRow]: - """Convert HaluEval dataset to EvaluationRow objects.""" - return [ - EvaluationRow( - messages=[Message(role="user", content=f"Knowledge: {item['knowledge']}\n\nQuestion: {item['question']}")], - ground_truth=item["right_answer"], - ) - for item in data - ] - - -@pytest.mark.asyncio -@evaluation_test( - input_dataset=["tests/pytest/data/halueval_sample_dataset.jsonl"], - dataset_adapter=hallucination_dataset_adapter, - completion_params=[ - { - "temperature": 0.0, - "max_tokens": 512, - "model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5", - } - ], - rollout_processor=SingleTurnRolloutProcessor(), - passed_threshold=0.33, - num_runs=1, - mode="pointwise", -) -async def test_hallucination_detection(row: EvaluationRow) -> EvaluationRow: - """ - Test for response correctness using LLM-as-judge. - """ - messages = row.messages - assistant_response = messages[-1].content - - if not assistant_response: - return EvaluateResult(score=0.0, reason="❌ No assistant response found") - - correct_answer = row.ground_truth - - system_prompt = """ - TASK - - You will be given an assistant's response and the correct answer. - - Your job is to evaluate whether the assistant's response is factually consistent with the correct answer. - - Grade whether the assistant got it right or wrong. - - FORMAT - - Your response should be a JSON object with the following fields: - - `reasoning`: a short explanation for your classification - - `is_correct`: `true` if the assistant's response matches the correct answer, `false` otherwise - - Example response structure: - { - "reasoning": "", - "is_correct": - } - """ - - user_prompt = f""" - assistant_response: - {assistant_response} - - correct_answer: - {correct_answer} - """ - - try: - response = await litellm.acompletion( - model=JUDGE_MODEL, - messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}], - temperature=0.1, - max_tokens=500, - ) - - result_data = json.loads(response.choices[0].message.content) - is_correct = result_data.get("is_correct", False) - reasoning = result_data.get("reasoning", "Could not parse reasoning") - - except Exception as e: - # Fallback if parsing fails - is_correct = False - reasoning = f"Evaluation failed: {str(e)}" - - score = 1.0 if is_correct else 0.0 - - if is_correct: - assessment = "✅ Response is correct" - else: - assessment = "❌ Response is incorrect" - - reason = f"{assessment}\nReasoning: {reasoning}" - - row.evaluation_result = EvaluateResult( - score=score, - reason=reason, - metrics={"llm_judge": MetricResult(score=score, reason=reasoning, is_score_valid=True)}, - ) - - return row From c2d5fa3d86f12c3604981c82111a39599e5e1857 Mon Sep 17 00:00:00 2001 From: Sandeep Singh Date: Fri, 24 Apr 2026 13:53:20 -0700 Subject: [PATCH 5/5] update model --- tests/pytest/test_pytest_klavis_mcp.py | 4 ++-- tests/pytest/test_pytest_klavis_sandbox.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/pytest/test_pytest_klavis_mcp.py b/tests/pytest/test_pytest_klavis_mcp.py index 9bcb3e97..73b13220 100644 --- a/tests/pytest/test_pytest_klavis_mcp.py +++ b/tests/pytest/test_pytest_klavis_mcp.py @@ -22,7 +22,7 @@ class ResponseFormat(BaseModel): @evaluation_test( input_dataset=["tests/pytest/datasets/klavis_mcp_test.jsonl"], rollout_processor=AgentRolloutProcessor(), - completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-thinking"}], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5"}], mode="pointwise", mcp_config_path="tests/pytest/mcp_configurations/klavis_strata_mcp.json", ) @@ -34,7 +34,7 @@ async def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow: api_key=os.environ["FIREWORKS_API_KEY"], base_url="https://api.fireworks.ai/inference/v1" ) as client: response = await client.chat.completions.create( - model="accounts/fireworks/models/kimi-k2-thinking", + model="accounts/fireworks/models/kimi-k2p5", messages=[ { "role": "system", diff --git a/tests/pytest/test_pytest_klavis_sandbox.py b/tests/pytest/test_pytest_klavis_sandbox.py index 7ae84bc3..f5619251 100644 --- a/tests/pytest/test_pytest_klavis_sandbox.py +++ b/tests/pytest/test_pytest_klavis_sandbox.py @@ -58,7 +58,7 @@ def klavis_gmail_sandbox_dataset_adapter(rows: list[dict]) -> list[EvaluationRow rollout_processor=KlavisSandboxRolloutProcessor( server_name="gmail", ), - completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-thinking"}], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5"}], mode="pointwise", dataset_adapter=klavis_gmail_sandbox_dataset_adapter, ) @@ -110,7 +110,7 @@ async def test_pytest_gmail_sandbox(row: EvaluationRow) -> EvaluationRow: try: response = await client.chat.completions.create( - model="accounts/fireworks/models/kimi-k2-thinking", + model="accounts/fireworks/models/kimi-k2p5", messages=[ { "role": "system",