diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d291b6c0..901b9fad 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -48,13 +48,9 @@ jobs: - name: Ruff lint run: uv run ruff check . - - name: Type check with pyright + - name: Run pre-commit (format, lint, type check) run: | - # 'set +e' disables immediate exit on error so we can capture and report errors but exit 0 - # Note: We currently suppress pyright failures to allow CI to pass while we iteratively fix all type issues. - # Once all type errors are resolved, we will remove this suppression and enforce strict type checking. - set +e - uv run basedpyright || true + uv run pre-commit run --all-files test-core: name: Core Tests (Python ${{ matrix.python-version }}) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2dce407d..72e9817c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,3 +31,4 @@ repos: NODE_OPTIONS: "--max-old-space-size=4096" # Only check Python files in the main package to reduce memory usage files: ^eval_protocol/.*\.py$ + additional_dependencies: ["pre-commit>=3.7.0"] diff --git a/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py b/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py index 03a61be4..b279dffd 100644 --- a/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +++ b/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py @@ -747,7 +747,7 @@ def tau2_airline_eval( elif role == "user": trajectory_objects.append(UserMessage(role=role, content=content)) elif role == "tool": - tool_id = msg.tool_call_id or "" + tool_id = msg.tool_call_id if isinstance(msg.tool_call_id, str) else "" trajectory_objects.append(ToolMessage(id=tool_id, role=role, content=content, requestor="assistant")) reward = 1.0 diff --git a/eval_protocol/rewards/function_calling.py b/eval_protocol/rewards/function_calling.py index 14e8bfe8..b87483b4 100644 --- a/eval_protocol/rewards/function_calling.py +++ b/eval_protocol/rewards/function_calling.py @@ -2,7 +2,7 @@ import os import re import warnings -from typing import Any, Dict, List, Optional, Set, Tuple, Union +from typing import Any, Dict, List, Optional, Set, Tuple, Union, Callable, cast # Import OpenAI at module level for mocking in tests try: @@ -451,7 +451,8 @@ def schema_jaccard_reward( DeprecationWarning, stacklevel=2, ) - return exact_tool_match_reward(messages=messages, ground_truth=ground_truth, **kwargs) + _exact_tool_match: Callable[..., EvaluateResult] = cast(Callable[..., EvaluateResult], exact_tool_match_reward) + return _exact_tool_match(messages=messages, ground_truth=ground_truth, **kwargs) @reward_function @@ -493,7 +494,8 @@ def llm_judge_reward( DeprecationWarning, stacklevel=2, ) - return exact_tool_match_reward(messages=messages, ground_truth=ground_truth, **kwargs) + _exact_tool_match: Callable[..., EvaluateResult] = cast(Callable[..., EvaluateResult], exact_tool_match_reward) + return _exact_tool_match(messages=messages, ground_truth=ground_truth, **kwargs) @reward_function @@ -537,7 +539,8 @@ def composite_function_call_reward( DeprecationWarning, stacklevel=2, ) - return exact_tool_match_reward(messages=messages, ground_truth=ground_truth, **kwargs) + _exact_tool_match: Callable[..., EvaluateResult] = cast(Callable[..., EvaluateResult], exact_tool_match_reward) + return _exact_tool_match(messages=messages, ground_truth=ground_truth, **kwargs) # JSON schema reward functions have been moved to json_schema.py module diff --git a/eval_protocol/rewards/lean_prover.py b/eval_protocol/rewards/lean_prover.py index 45b23bf5..85c1ca85 100644 --- a/eval_protocol/rewards/lean_prover.py +++ b/eval_protocol/rewards/lean_prover.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional from eval_protocol.models import EvaluateResult, Message, MetricResult -from eval_protocol.reward_function import reward_function +from eval_protocol.typed_interface import reward_function @reward_function