Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion eval_protocol/benchmarks/test_frozen_lake.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
{
"temperature": 0.0,
"max_tokens": 4096,
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
Comment thread
SunnySoldier357 marked this conversation as resolved.
}
],
rollout_processor=MCPGymRolloutProcessor(),
Expand Down
2 changes: 1 addition & 1 deletion eval_protocol/quickstart/aha_judge/llm_judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@


async def aha_judge(
row: EvaluationRow, judge_name: str = "kimi-k2-instruct-0905", adapter: Optional[BaseAdapter] = None
row: EvaluationRow, judge_name: str = "kimi-k2p5", adapter: Optional[BaseAdapter] = None
) -> EvaluationRow:
"""
LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons for a single row.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def openai_responses_data_generator():
"model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
},
{
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
},
],
)
Expand Down
4 changes: 2 additions & 2 deletions eval_protocol/quickstart/aha_judge/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@
"api_key": os.getenv("GEMINI_API_KEY"),
"base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
},
"kimi-k2-instruct-0905": {
"model": "accounts/fireworks/models/kimi-k2-instruct-0905",
"kimi-k2p5": {
"model": "accounts/fireworks/models/kimi-k2p5",
"temperature": 0.6, # Kimi recommended temperature
"max_tokens": 131000,
"api_key": os.getenv("FIREWORKS_API_KEY"),
Expand Down
2 changes: 1 addition & 1 deletion eval_protocol/quickstart/llm_judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@


async def aha_judge(
row: EvaluationRow, judge_name: str = "kimi-k2-instruct-0905", adapter: Optional[BaseAdapter] = None
row: EvaluationRow, judge_name: str = "kimi-k2p5", adapter: Optional[BaseAdapter] = None
) -> EvaluationRow:
"""
LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons for a single row.
Expand Down
4 changes: 2 additions & 2 deletions eval_protocol/quickstart/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@
"api_key": os.getenv("GEMINI_API_KEY"),
"base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
},
"kimi-k2-instruct-0905": {
"model": "accounts/fireworks/models/kimi-k2-instruct-0905",
"kimi-k2p5": {
"model": "accounts/fireworks/models/kimi-k2p5",
"temperature": 0.6, # Kimi recommended temperature
"max_tokens": 131000,
"api_key": os.getenv("FIREWORKS_API_KEY"),
Expand Down
4 changes: 2 additions & 2 deletions eval_protocol/training/gepa_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
},
# Fireworks models
"kimi-k2": {
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
"temperature": 0.6,
"max_tokens": 131000,
},
Expand Down Expand Up @@ -68,7 +68,7 @@ def build_reflection_lm(reflection_lm_name: str) -> LM:

Args:
reflection_lm_name: One of the predefined configs ("gpt-5", "gpt-4o",
"claude-sonnet", "kimi-k2-instruct-0905")
"claude-sonnet", "kimi-k2")
OR a raw LiteLLM model string (e.g., "openai/gpt-4o")

Returns:
Expand Down
5 changes: 3 additions & 2 deletions tests/chinook/pydantic/test_pydantic_chinook.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,9 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent:
"completion_params",
[
{
"model": "accounts/fireworks/models/kimi-k2-instruct-0905",
"model": "accounts/fireworks/models/kimi-k2p5",
"provider": "fireworks",
"reasoning_effort": "none",
},
{
"model": "gpt-5",
Expand Down Expand Up @@ -88,7 +89,7 @@ async def test_simple_query(row: EvaluationRow) -> EvaluationRow:
)
else:
model = OpenAIChatModel(
"accounts/fireworks/models/kimi-k2-instruct-0905",
"accounts/fireworks/models/kimi-k2p5",
provider="fireworks",
)

Expand Down
2 changes: 1 addition & 1 deletion tests/chinook/pydantic/test_pydantic_complex_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent:
# "provider": "fireworks",
# },
# {
# "model": "accounts/fireworks/models/kimi-k2-instruct-0905",
# "model": "accounts/fireworks/models/kimi-k2p5",
# "provider": "fireworks",
# },
{"model": "gpt-5"},
Expand Down
10 changes: 0 additions & 10 deletions tests/pytest/data/basic_coding_dataset.jsonl

This file was deleted.

3 changes: 0 additions & 3 deletions tests/pytest/data/halueval_sample_dataset.jsonl

This file was deleted.

3 changes: 2 additions & 1 deletion tests/pytest/test_apps_coding.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
{
"temperature": 0.0,
"max_tokens": 4096,
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
"reasoning_effort": "none",
}
],
passed_threshold=0.33,
Expand Down
93 changes: 0 additions & 93 deletions tests/pytest/test_basic_coding.py

This file was deleted.

3 changes: 2 additions & 1 deletion tests/pytest/test_frozen_lake.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
{
"temperature": 0.0,
"max_tokens": 4096,
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
"reasoning_effort": "none",
}
],
rollout_processor=MCPGymRolloutProcessor(),
Expand Down
119 changes: 0 additions & 119 deletions tests/pytest/test_hallucination.py

This file was deleted.

3 changes: 2 additions & 1 deletion tests/pytest/test_openenv_browsergym_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,11 @@ def test_openenv_browsergym_basic():
"model": os.getenv(
"OPENENV_TEST_MODEL",
# Default to a Fireworks public model id used elsewhere in tests; requires FIREWORKS_API_KEY
"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
"fireworks_ai/accounts/fireworks/models/kimi-k2p5",
),
"temperature": 0.0,
"max_tokens": 16,
"reasoning_effort": "none",
Comment thread
SunnySoldier357 marked this conversation as resolved.
}

# Limit to a single step to keep the test fast and robust
Expand Down
3 changes: 2 additions & 1 deletion tests/pytest/test_openenv_browsergym_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,8 @@ def action_parser(response_text: str):
{
"temperature": 0.0,
"max_tokens": 512,
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
"reasoning_effort": "none",
}
],
# Keep concurrency and steps low for a quick health-check
Expand Down
3 changes: 2 additions & 1 deletion tests/pytest/test_openenv_echo_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ def action_parser(response_text: str):
"temperature": 0.0,
"max_tokens": 16,
# Any working model with your API key; match other tests' default
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
"reasoning_effort": "none",
}
],
num_runs=1,
Expand Down
3 changes: 2 additions & 1 deletion tests/pytest/test_openenv_textarena_docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ def action_parser(response_text: str):
"temperature": 0.7,
"max_tokens": 32,
# Any working model with your API key
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2p5",
"reasoning_effort": "none",
}
],
num_runs=1,
Expand Down
Loading
Loading