eval-protocol · benjibc · Sep 3, 2025 · Sep 3, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -48,13 +48,9 @@ jobs:
       - name: Ruff lint
         run: uv run ruff check .
 
-      - name: Type check with pyright
+      - name: Run pre-commit (format, lint, type check)
         run: |
-          # 'set +e' disables immediate exit on error so we can capture and report errors but exit 0
-          # Note: We currently suppress pyright failures to allow CI to pass while we iteratively fix all type issues.
-          # Once all type errors are resolved, we will remove this suppression and enforce strict type checking.
-          set +e
-          uv run basedpyright || true
+          uv run pre-commit run --all-files
 
   test-core:
     name: Core Tests (Python ${{ matrix.python-version }})

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -31,3 +31,4 @@ repos:
             NODE_OPTIONS: "--max-old-space-size=4096"
         # Only check Python files in the main package to reduce memory usage
         files: ^eval_protocol/.*\.py$
+        additional_dependencies: ["pre-commit>=3.7.0"]  # per-hook key: same level as `files`
diff --git a/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py b/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py
@@ -747,7 +747,7 @@ def tau2_airline_eval(
         elif role == "user":
             trajectory_objects.append(UserMessage(role=role, content=content))
         elif role == "tool":
-            tool_id = msg.tool_call_id or ""
+            tool_id = msg.tool_call_id if isinstance(msg.tool_call_id, str) else ""
             trajectory_objects.append(ToolMessage(id=tool_id, role=role, content=content, requestor="assistant"))
 
     reward = 1.0

diff --git a/eval_protocol/rewards/function_calling.py b/eval_protocol/rewards/function_calling.py
@@ -2,7 +2,7 @@
 import os
 import re
 import warnings
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Union, Callable, cast
 
 # Import OpenAI at module level for mocking in tests
 try:
@@ -451,7 +451,8 @@ def schema_jaccard_reward(
         DeprecationWarning,
         stacklevel=2,
     )
-    return exact_tool_match_reward(messages=messages, ground_truth=ground_truth, **kwargs)
+    _exact_tool_match: Callable[..., EvaluateResult] = cast(Callable[..., EvaluateResult], exact_tool_match_reward)
+    return _exact_tool_match(messages=messages, ground_truth=ground_truth, **kwargs)
 
 
 @reward_function
@@ -493,7 +494,8 @@ def llm_judge_reward(
         DeprecationWarning,
         stacklevel=2,
     )
-    return exact_tool_match_reward(messages=messages, ground_truth=ground_truth, **kwargs)
+    _exact_tool_match: Callable[..., EvaluateResult] = cast(Callable[..., EvaluateResult], exact_tool_match_reward)
+    return _exact_tool_match(messages=messages, ground_truth=ground_truth, **kwargs)
 
 
 @reward_function
@@ -537,7 +539,8 @@ def composite_function_call_reward(
         DeprecationWarning,
         stacklevel=2,
     )
-    return exact_tool_match_reward(messages=messages, ground_truth=ground_truth, **kwargs)
+    _exact_tool_match: Callable[..., EvaluateResult] = cast(Callable[..., EvaluateResult], exact_tool_match_reward)
+    return _exact_tool_match(messages=messages, ground_truth=ground_truth, **kwargs)
 
 
 # JSON schema reward functions have been moved to json_schema.py module
diff --git a/eval_protocol/rewards/lean_prover.py b/eval_protocol/rewards/lean_prover.py
@@ -3,7 +3,7 @@
 from typing import Any, Dict, List, Optional
 
 from eval_protocol.models import EvaluateResult, Message, MetricResult
-from eval_protocol.reward_function import reward_function
+from eval_protocol.typed_interface import reward_function
 
 
 @reward_function