Skip to content

Commit e551be6

Browse files
author
Dylan Huang
committed
Merge branch 'main' into multi-agent-rollout-processor-changes-to-api
# Conflicts: # .vscode/settings.json # eval_protocol/pytest/default_pydantic_ai_rollout_processor.py # pyproject.toml # tests/pytest/test_pydantic_multi_agent.py
2 parents 5aac93b + d563336 commit e551be6

File tree

137 files changed

+4216
-2816
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

137 files changed

+4216
-2816
lines changed

.github/workflows/ci.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,12 @@ jobs:
4949
run: uv run ruff check .
5050

5151
- name: Type check with pyright
52-
run: uv run pyright
52+
run: |
53+
# 'set +e' disables immediate exit on error so we can capture and report errors but exit 0
54+
# Note: We currently suppress pyright failures to allow CI to pass while we iteratively fix all type issues.
55+
# Once all type errors are resolved, we will remove this suppression and enforce strict type checking.
56+
set +e
57+
uv run basedpyright || true
5358
5459
test-core:
5560
name: Core Tests (Python ${{ matrix.python-version }})

.github/workflows/e2e-smoke-test.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,3 +186,17 @@ jobs:
186186
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
187187
echo " - Within acceptable range: 36%-60%"
188188
fi
189+
190+
- name: Send failure notification to Slack
191+
uses: act10ns/slack@v1
192+
if: failure()
193+
with:
194+
status: failure
195+
message: |
196+
E2E Smoke Test failed
197+
Success Rate: ${{ steps.run_test.outputs.success_rate || 'Unknown' }}
198+
Expected: 36%-60% to pass
199+
Test Exit Code: ${{ steps.run_test.outputs.test_exit_code || 'Unknown' }}
200+
Job: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
201+
env:
202+
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
experiment_results/
2+
13
# Byte-compiled / optimized / DLL files
24
__pycache__/
35
*.py[cod]

.pre-commit-config.yaml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,12 @@ repos:
2222
- id: ruff-format
2323
- id: ruff
2424
args: ["--fix"]
25-
26-
- repo: https://github.com/RobertCraigie/pyright-python
27-
rev: v1.1.403
25+
- repo: https://github.com/DetachHead/basedpyright-pre-commit-mirror
26+
rev: 1.31.3
2827
hooks:
29-
- id: pyright
28+
- id: basedpyright
29+
args: ["--level", "error"]
30+
env:
31+
NODE_OPTIONS: "--max-old-space-size=4096"
32+
# Only check Python files in the main package to reduce memory usage
33+
files: ^eval_protocol/.*\.py$

.vscode/extensions.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"recommendations": [
3+
"anysphere.cursorpyright",
4+
"ms-python.python",
5+
"ms-python.debugpy"
6+
]
7+
}

.vscode/settings.json

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
"python.testing.autoTestDiscoverOnSaveEnabled": true,
66
"python.defaultInterpreterPath": "./.venv/bin/python",
77
"python.testing.cwd": "${workspaceFolder}",
8-
"python.analysis.typeCheckingMode": "strict",
9-
"python.analysis.diagnosticMode": "workspace"
8+
"cursorpyright.analysis.diagnosticMode": "openFilesOnly",
9+
"editor.defaultFormatter": "charliermarsh.ruff",
10+
"editor.formatOnSave": true,
11+
"[python]": {
12+
"editor.defaultFormatter": "charliermarsh.ruff"
13+
}
1014
}

development/notes/pytest_integration_proposal.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ def frozen_lake_rollout_processor(row: EvaluationRow, model: str, input_params:
149149
"""
150150
env_url = env_urls[0] if env_urls else None
151151
# ep.rollout handles the core interaction loop with the game environment.
152-
trajectories = ep.rollout(row, model, input_params, env_url)
152+
trajectories = await ep.rollout(row, model, input_params, env_url)
153153
return [t.to_evaluation_row() for t in trajectories]
154154

155155
@evaluation_test(

eval_protocol/_version.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,9 @@ def run_command(
121121
if verbose:
122122
print("unable to find command, tried %s" % (commands,))
123123
return None, None
124-
stdout = process.communicate()[0].strip().decode()
124+
stdout_bytes = process.communicate()[0]
125+
stdout_raw = stdout_bytes.decode() if isinstance(stdout_bytes, (bytes, bytearray)) else stdout_bytes
126+
stdout = str(stdout_raw).strip()
125127
if process.returncode != 0:
126128
if verbose:
127129
print("unable to run %s (error)" % dispcmd)

eval_protocol/adapters/bigquery.py

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,34 +7,36 @@
77
from __future__ import annotations
88

99
import logging
10-
from typing import TYPE_CHECKING, Any, Callable, Dict, Iterator, List, Optional, Union
10+
from typing import TYPE_CHECKING, Any, Callable, Dict, Iterator, List, Optional, Union, cast, TypeAlias
1111

1212
from eval_protocol.models import CompletionParams, EvaluationRow, InputMetadata, Message
1313

1414
logger = logging.getLogger(__name__)
1515

1616
try:
17+
# Import at runtime if available
1718
from google.auth.exceptions import DefaultCredentialsError
18-
from google.cloud import bigquery
19+
from google.cloud import bigquery as _bigquery_runtime # type: ignore
1920
from google.cloud.exceptions import Forbidden, NotFound
2021
from google.oauth2 import service_account
2122

2223
BIGQUERY_AVAILABLE = True
2324
except ImportError:
25+
# Provide fallbacks for type checking/runtime when package is missing
26+
DefaultCredentialsError = Exception # type: ignore[assignment]
27+
Forbidden = Exception # type: ignore[assignment]
28+
NotFound = Exception # type: ignore[assignment]
29+
service_account: Any
30+
service_account = None
31+
_bigquery_runtime = None # type: ignore[assignment]
2432
BIGQUERY_AVAILABLE = False
2533
# Optional dependency: avoid noisy warnings during import
2634
logger.debug("Google Cloud BigQuery not installed. Optional feature disabled.")
2735

28-
# Avoid importing BigQuery types at runtime for annotations when not installed
29-
if TYPE_CHECKING:
30-
from google.cloud import bigquery as _bigquery_type
31-
32-
QueryParameterType = Union[
33-
_bigquery_type.ScalarQueryParameter,
34-
_bigquery_type.ArrayQueryParameter,
35-
]
36-
else:
37-
QueryParameterType = Any
36+
# Simple type aliases to avoid importing optional google types under pyright
37+
QueryParameterType: TypeAlias = Any
38+
BigQueryClient: TypeAlias = Any
39+
QueryJobConfig: TypeAlias = Any
3840

3941
# Type alias for transformation function
4042
TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]]
@@ -98,7 +100,13 @@ def __init__(
98100
client_args["location"] = location
99101

100102
client_args.update(client_kwargs)
101-
self.client = bigquery.Client(**client_args)
103+
# Use runtime alias to avoid basedpyright import symbol error when lib is missing
104+
if _bigquery_runtime is None:
105+
raise ImportError(
106+
"google-cloud-bigquery is not installed. Install with: pip install 'eval-protocol[bigquery]'"
107+
)
108+
# Avoid strict typing on optional dependency
109+
self.client = _bigquery_runtime.Client(**client_args) # type: ignore[no-untyped-call, assignment]
102110

103111
except DefaultCredentialsError as e:
104112
logger.error("Failed to authenticate with BigQuery: %s", e)
@@ -139,7 +147,9 @@ def get_evaluation_rows(
139147
"""
140148
try:
141149
# Configure query job
142-
job_config = bigquery.QueryJobConfig()
150+
if _bigquery_runtime is None:
151+
raise RuntimeError("BigQuery runtime not available")
152+
job_config = _bigquery_runtime.QueryJobConfig() # type: ignore[no-untyped-call]
143153
if query_params:
144154
job_config.query_parameters = query_params
145155
if self.location:
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
from __future__ import annotations
2+
3+
import os
4+
from typing import Any, Dict, List, Optional
5+
6+
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, ToolMessage
7+
8+
from eval_protocol.models import Message
9+
10+
11+
def _dbg_enabled() -> bool:
12+
return os.getenv("EP_DEBUG_SERIALIZATION", "0").strip() == "1"
13+
14+
15+
def _dbg_print(*args):
16+
if _dbg_enabled():
17+
try:
18+
print(*args)
19+
except Exception:
20+
pass
21+
22+
23+
def serialize_lc_message_to_ep(msg: BaseMessage) -> Message:
24+
_dbg_print(
25+
"[EP-Ser] Input LC msg:",
26+
type(msg).__name__,
27+
{
28+
"has_additional_kwargs": isinstance(getattr(msg, "additional_kwargs", None), dict),
29+
"content_type": type(getattr(msg, "content", None)).__name__,
30+
},
31+
)
32+
33+
if isinstance(msg, HumanMessage):
34+
ep_msg = Message(role="user", content=str(msg.content))
35+
_dbg_print("[EP-Ser] -> EP Message:", {"role": ep_msg.role, "len": len(ep_msg.content or "")})
36+
return ep_msg
37+
38+
if isinstance(msg, AIMessage):
39+
content = ""
40+
if isinstance(msg.content, str):
41+
content = msg.content
42+
elif isinstance(msg.content, list):
43+
parts: List[str] = []
44+
for item in msg.content:
45+
if isinstance(item, dict):
46+
if item.get("type") == "text":
47+
parts.append(str(item.get("text", "")))
48+
elif isinstance(item, str):
49+
parts.append(item)
50+
content = "\n".join(parts)
51+
52+
tool_calls_payload: Optional[List[Dict[str, Any]]] = None
53+
54+
def _normalize_tool_calls(tc_list: List[Any]) -> List[Dict[str, Any]]:
55+
mapped: List[Dict[str, Any]] = []
56+
for call in tc_list:
57+
if not isinstance(call, dict):
58+
continue
59+
try:
60+
call_id = call.get("id") or "toolcall_0"
61+
if isinstance(call.get("function"), dict):
62+
fn = call["function"]
63+
fn_name = fn.get("name") or call.get("name") or "tool"
64+
fn_args = fn.get("arguments")
65+
else:
66+
fn_name = call.get("name") or "tool"
67+
fn_args = call.get("arguments") if call.get("arguments") is not None else call.get("args")
68+
if not isinstance(fn_args, str):
69+
import json as _json
70+
71+
fn_args = _json.dumps(fn_args or {}, ensure_ascii=False)
72+
mapped.append(
73+
{
74+
"id": call_id,
75+
"type": "function",
76+
"function": {"name": fn_name, "arguments": fn_args},
77+
}
78+
)
79+
except Exception:
80+
continue
81+
return mapped
82+
83+
ak = getattr(msg, "additional_kwargs", None)
84+
if isinstance(ak, dict):
85+
tc = ak.get("tool_calls")
86+
if isinstance(tc, list) and tc:
87+
mapped = _normalize_tool_calls(tc)
88+
if mapped:
89+
tool_calls_payload = mapped
90+
91+
if tool_calls_payload is None:
92+
raw_attr_tc = getattr(msg, "tool_calls", None)
93+
if isinstance(raw_attr_tc, list) and raw_attr_tc:
94+
mapped = _normalize_tool_calls(raw_attr_tc)
95+
if mapped:
96+
tool_calls_payload = mapped
97+
98+
# Extract reasoning/thinking parts into reasoning_content
99+
reasoning_content = None
100+
if isinstance(msg.content, list):
101+
collected = [
102+
it.get("thinking", "") for it in msg.content if isinstance(it, dict) and it.get("type") == "thinking"
103+
]
104+
if collected:
105+
reasoning_content = "\n\n".join([s for s in collected if s]) or None
106+
107+
# Message.tool_calls expects List[ChatCompletionMessageToolCall] | None.
108+
# We pass through Dicts at runtime but avoid type error by casting.
109+
ep_msg = Message(
110+
role="assistant",
111+
content=content,
112+
tool_calls=tool_calls_payload, # type: ignore[arg-type]
113+
reasoning_content=reasoning_content,
114+
)
115+
_dbg_print(
116+
"[EP-Ser] -> EP Message:",
117+
{
118+
"role": ep_msg.role,
119+
"content_len": len(ep_msg.content or ""),
120+
"tool_calls": len(ep_msg.tool_calls or []) if isinstance(ep_msg.tool_calls, list) else 0,
121+
},
122+
)
123+
return ep_msg
124+
125+
if isinstance(msg, ToolMessage):
126+
tool_name = msg.name or "tool"
127+
status = msg.status or "success"
128+
content = str(msg.content)
129+
tool_call_id = getattr(msg, "tool_call_id", None)
130+
ep_msg = Message(
131+
role="tool",
132+
name=tool_name,
133+
tool_call_id=tool_call_id,
134+
content=f'<{tool_name} status="{status}">\n{content}\n</{tool_name}>',
135+
)
136+
_dbg_print(
137+
"[EP-Ser] -> EP Message:", {"role": ep_msg.role, "name": ep_msg.name, "has_id": bool(ep_msg.tool_call_id)}
138+
)
139+
return ep_msg
140+
141+
ep_msg = Message(role=getattr(msg, "type", "assistant"), content=str(getattr(msg, "content", "")))
142+
_dbg_print("[EP-Ser] -> EP Message (fallback):", {"role": ep_msg.role, "len": len(ep_msg.content or "")})
143+
return ep_msg

0 commit comments

Comments
 (0)