Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ MCPMark provides a reproducible, extensible benchmark for researchers and engine

## News

- 📌 **21 Jan** — Pinned MCP server versions for reproducible benchmarks: GitHub MCP Server `v0.15.0` (switched to Docker for version control) and Notion MCP Server `@1.9.1` (Notion has released 2.0, but it has many bugs and is not recommended). See [#246](https://github.com/eval-sys/mcpmark/pull/246).
- 🔥 **13 Dec** — Added auto-compaction support (`--compaction-token`) to summarize long conversations and avoid context overflow during evaluation ([#236](https://github.com/eval-sys/mcpmark/pull/236)).
- 🏅 **02 Dec** — Evaluated `gemini-3-pro-preview` (thinking: low): **Pass@1 50.6%** ± 2.3% — so close to `gpt-5-high` (51.6%)! Also `deepseek-v3.2-thinking` 36.8% and `deepseek-v3.2-chat` 29.7%
- 🔥 **02 Dec** — Obfuscate GitHub @mentions to prevent notification spam during evaluation ([#229](https://github.com/eval-sys/mcpmark/pull/229))
Expand Down
3 changes: 2 additions & 1 deletion src/agents/base_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@ class BaseMCPAgent(ABC):
"playwright_webarena",
"postgres",
"insforge",
"github",
]
HTTP_SERVICES = ["github", "supabase"]
HTTP_SERVICES = ["supabase"]
DEFAULT_TIMEOUT = 600
COMPACTION_DISABLED_TOKEN = 999_999_999

Expand Down
30 changes: 16 additions & 14 deletions src/agents/mcpmark_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -1113,7 +1113,7 @@ def _create_stdio_server(self) -> MCPStdioServer:

return MCPStdioServer(
command="npx",
args=["-y", "@notionhq/notion-mcp-server"],
args=["-y", "@notionhq/notion-mcp-server@1.9.1"],
env={
"OPENAPI_MCP_HEADERS": (
'{"Authorization": "Bearer ' + notion_key + '", '
Expand Down Expand Up @@ -1194,25 +1194,27 @@ def _create_stdio_server(self) -> MCPStdioServer:
},
)

else:
raise ValueError(f"Unsupported stdio service: {self.mcp_service}")

def _create_http_server(self) -> MCPHttpServer:
"""Create HTTP-based MCP server."""
if self.mcp_service == "github":
elif self.mcp_service == "github":
github_token = self.service_config.get("github_token")
if not github_token:
raise ValueError("GitHub token required")

return MCPHttpServer(
url="https://api.githubcopilot.com/mcp/",
headers={
"Authorization": f"Bearer {github_token}",
"User-Agent": "MCPMark/1.0",
},
return MCPStdioServer(
command="docker",
args=[
"run", "-i", "--rm",
"-e", "GITHUB_PERSONAL_ACCESS_TOKEN",
"ghcr.io/github/github-mcp-server:v0.15.0",
],
env={"GITHUB_PERSONAL_ACCESS_TOKEN": github_token},
)

elif self.mcp_service == "supabase":
else:
raise ValueError(f"Unsupported stdio service: {self.mcp_service}")

def _create_http_server(self) -> MCPHttpServer:
"""Create HTTP-based MCP server."""
if self.mcp_service == "supabase":
# Use built-in MCP server from Supabase CLI
api_url = self.service_config.get("api_url", "http://localhost:54321")
api_key = self.service_config.get("api_key", "")
Expand Down
Loading