From 65b25a413b6bed175530e30ebc25ba61731d21ea Mon Sep 17 00:00:00 2001
From: zjwu0522 <zijian.wu@u.nus.edu>
Date: Wed, 21 Jan 2026 09:39:13 +0000
Subject: [PATCH 1/2] Pin MCP server versions for reproducible benchmarks

MCP servers are constantly evolving, and version changes can affect
benchmark results. To ensure reproducibility, we pin to specific versions:

- GitHub MCP Server: v0.15.0 (93 tools)
  - Switched from HTTP remote API to Docker-based STDIO for version control
  - Remote API at api.githubcopilot.com doesn't support version pinning

- Notion MCP Server: @notionhq/notion-mcp-server@1.9.1
  - Already pinned, no change needed

Also includes improvements to NotionStateManager:
- Add browser instance reuse within session for better performance
- Add source hub orphan cleanup to prevent duplicate page accumulation
- Add UI-based recovery for duplicate page detection
- Improve error handling and logging

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 src/agents/base_agent.py                      |   3 +-
 src/agents/mcpmark_agent.py                   |  30 +-
 .../notion/notion_state_manager.py            | 328 +++++++++++++++---
 3 files changed, 295 insertions(+), 66 deletions(-)

diff --git a/src/agents/base_agent.py b/src/agents/base_agent.py
index 4db4ccee..c49b4cda 100644
--- a/src/agents/base_agent.py
+++ b/src/agents/base_agent.py
@@ -26,8 +26,9 @@ class BaseMCPAgent(ABC):
         "playwright_webarena",
         "postgres",
         "insforge",
+        "github",
     ]
-    HTTP_SERVICES = ["github", "supabase"]
+    HTTP_SERVICES = ["supabase"]
     DEFAULT_TIMEOUT = 600
     COMPACTION_DISABLED_TOKEN = 999_999_999
 
diff --git a/src/agents/mcpmark_agent.py b/src/agents/mcpmark_agent.py
index ad176958..6f072c5c 100644
--- a/src/agents/mcpmark_agent.py
+++ b/src/agents/mcpmark_agent.py
@@ -1113,7 +1113,7 @@ def _create_stdio_server(self) -> MCPStdioServer:
 
             return MCPStdioServer(
                 command="npx",
-                args=["-y", "@notionhq/notion-mcp-server"],
+                args=["-y", "@notionhq/notion-mcp-server@1.9.1"],
                 env={
                     "OPENAPI_MCP_HEADERS": (
                         '{"Authorization": "Bearer ' + notion_key + '", '
@@ -1194,25 +1194,27 @@ def _create_stdio_server(self) -> MCPStdioServer:
                 },
             )
 
-        else:
-            raise ValueError(f"Unsupported stdio service: {self.mcp_service}")
-
-    def _create_http_server(self) -> MCPHttpServer:
-        """Create HTTP-based MCP server."""
-        if self.mcp_service == "github":
+        elif self.mcp_service == "github":
             github_token = self.service_config.get("github_token")
             if not github_token:
                 raise ValueError("GitHub token required")
 
-            return MCPHttpServer(
-                url="https://api.githubcopilot.com/mcp/",
-                headers={
-                    "Authorization": f"Bearer {github_token}",
-                    "User-Agent": "MCPMark/1.0",
-                },
+            return MCPStdioServer(
+                command="docker",
+                args=[
+                    "run", "-i", "--rm",
+                    "-e", "GITHUB_PERSONAL_ACCESS_TOKEN",
+                    "ghcr.io/github/github-mcp-server:v0.15.0",
+                ],
+                env={"GITHUB_PERSONAL_ACCESS_TOKEN": github_token},
             )
 
-        elif self.mcp_service == "supabase":
+        else:
+            raise ValueError(f"Unsupported stdio service: {self.mcp_service}")
+
+    def _create_http_server(self) -> MCPHttpServer:
+        """Create HTTP-based MCP server."""
+        if self.mcp_service == "supabase":
             # Use built-in MCP server from Supabase CLI
             api_url = self.service_config.get("api_url", "http://localhost:54321")
             api_key = self.service_config.get("api_key", "")
diff --git a/src/mcp_services/notion/notion_state_manager.py b/src/mcp_services/notion/notion_state_manager.py
index 8e985863..0d6b4eee 100644
--- a/src/mcp_services/notion/notion_state_manager.py
+++ b/src/mcp_services/notion/notion_state_manager.py
@@ -8,12 +8,14 @@
 
 import time
 from pathlib import Path
-from typing import Optional, Tuple, Dict, Any
+from typing import Optional, Tuple, Dict, Any, Set
 
 from notion_client import Client
 from playwright.sync_api import (
     Browser,
+    BrowserContext,
     Page,
+    Playwright,
     TimeoutError as PlaywrightTimeoutError,
     sync_playwright,
 )
@@ -27,6 +29,9 @@
 # Initialize logger
 logger = get_logger(__name__)
 
+# Pattern to match orphan pages with "(n)" suffix, e.g., "Title (1)", "Title (2)"
+ORPHAN_PAGE_PATTERN = re.compile(r".+\s+\(\d+\)$")
+
 # Selectors for Notion UI elements
 PAGE_MENU_BUTTON_SELECTOR = '[data-testid="more-button"], div.notion-topbar-more-button, [aria-label="More"], button[aria-label="More"]'
 DUPLICATE_MENU_ITEM_SELECTOR = 'text="Duplicate"'
@@ -90,6 +95,11 @@ def __init__(
         self._eval_parent_page_id: Optional[str] = None
         self._source_hub_page_id: Optional[str] = None
 
+        # Browser instance management for reuse within session
+        self._playwright: Optional[Playwright] = None
+        self._browser: Optional[Browser] = None
+        self._context: Optional[BrowserContext] = None
+
         # Validate initialization
         if not self.source_notion_client or not self.eval_notion_client:
             raise ValueError(
@@ -146,6 +156,64 @@ def _cleanup_eval_hub_orphans(self) -> None:
             logger.warning("Orphan cleanup failed (non-critical, continuing): %s", e)
             # Don't raise exception - allow execution to continue
 
+    def _cleanup_source_hub_orphans(self, exclude_page_ids: Optional[Set[str]] = None) -> int:
+        """Clean up all orphan pages in source hub matching 'xxx (n)' pattern.
+
+        Args:
+            exclude_page_ids: Page IDs to exclude from cleanup (e.g., pages currently being operated on)
+
+        Returns:
+            Number of pages archived
+        """
+        exclude_page_ids = exclude_page_ids or set()
+        source_hub_id = self._ensure_source_hub_page_id()
+        if not source_hub_id:
+            return 0
+
+        orphan_count = 0
+        next_cursor = None
+
+        try:
+            while True:
+                kwargs: Dict[str, Any] = {"block_id": source_hub_id}
+                if next_cursor:
+                    kwargs["start_cursor"] = next_cursor
+
+                children = self.source_notion_client.blocks.children.list(**kwargs)
+
+                for child in children.get("results", []):
+                    if child.get("type") != "child_page":
+                        continue
+
+                    child_id = child.get("id")
+                    if child_id in exclude_page_ids:
+                        continue
+
+                    child_title = (child.get("child_page", {}) or {}).get("title", "").strip()
+
+                    # Match "xxx (n)" pattern where n is any digit(s)
+                    if ORPHAN_PAGE_PATTERN.match(child_title):
+                        try:
+                            self.source_notion_client.pages.update(
+                                page_id=child_id, archived=True
+                            )
+                            orphan_count += 1
+                            logger.info("| ✓ Archived source hub orphan: %s (%s)", child_title, child_id)
+                        except Exception as e:
+                            logger.warning("| ✗ Failed to archive orphan %s: %s", child_id, e)
+
+                if not children.get("has_more"):
+                    break
+                next_cursor = children.get("next_cursor")
+
+            if orphan_count > 0:
+                logger.info("| ✓ Cleaned up %d orphan page(s) from source hub", orphan_count)
+
+        except Exception as e:
+            logger.warning("Source hub orphan cleanup failed (non-critical, continuing): %s", e)
+
+        return orphan_count
+
     def _ensure_eval_parent_page_id(self) -> Optional[str]:
         """Resolve and cache the evaluation hub parent page ID."""
         if self._eval_parent_page_id:
@@ -283,6 +351,9 @@ def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]:
         # Clean up any orphan pages in eval hub before creating new state
         self._cleanup_eval_hub_orphans()
 
+        # Clean up orphan pages in source hub before duplication
+        self._cleanup_source_hub_orphans()
+
         try:
             initial_state_title = self._category_to_initial_state_title(task.category_id)
             initial_state_info = self._find_initial_state_by_title(initial_state_title)
@@ -417,6 +488,141 @@ def _rename_initial_state_via_api(
     # Playwright helpers
     # ------------------------------------------------------------------
 
+    def _ensure_browser(self) -> Tuple[Browser, BrowserContext]:
+        """Ensure browser instance is available, reusing existing or creating new.
+
+        Returns:
+            Tuple of (Browser, BrowserContext)
+        """
+        if self._playwright is None:
+            self._playwright = sync_playwright().start()
+
+        if self._browser is None:
+            browser_type = getattr(self._playwright, self.browser_name)
+            self._browser = browser_type.launch(headless=self.headless)
+
+        if self._context is None:
+            self._context = self._browser.new_context(
+                storage_state=str(self.state_file),
+                locale="en-US",
+            )
+
+        return self._browser, self._context
+
+    def close(self) -> None:
+        """Clean up browser resources. Should be called when session ends."""
+        if self._context:
+            try:
+                # Save storage state before closing
+                self._context.storage_state(path=str(self.state_file))
+                self._context.close()
+            except Exception:
+                pass
+            self._context = None
+
+        if self._browser:
+            try:
+                self._browser.close()
+            except Exception:
+                pass
+            self._browser = None
+
+        if self._playwright:
+            try:
+                self._playwright.stop()
+            except Exception:
+                pass
+            self._playwright = None
+
+    def _recover_duplicate_via_ui(
+        self,
+        page: Page,
+        original_title: str,
+        *,
+        timeout: int = 30_000,
+    ) -> Optional[str]:
+        """Recover duplicate page URL by navigating via UI when API-based recovery fails.
+
+        This method navigates to the source hub and locates the duplicate page
+        (e.g., "Title (1)") in the Notion sidebar, then clicks on it to obtain
+        the URL directly from the browser.
+
+        Args:
+            page: The Playwright page instance
+            original_title: The original page title (without suffix)
+            timeout: Timeout for UI operations in milliseconds
+
+        Returns:
+            The URL of the duplicate page if found, None otherwise
+        """
+        try:
+            source_hub_id = self._ensure_source_hub_page_id()
+            if not source_hub_id:
+                logger.warning("| ✗ Cannot resolve source hub for UI-based recovery")
+                return None
+
+            # Build URL to navigate to source hub
+            # Format: https://www.notion.so/<hub-id>
+            clean_hub_id = source_hub_id.replace("-", "")
+            source_hub_url = f"https://www.notion.so/{clean_hub_id}"
+
+            logger.info("| ○ Navigating to source hub for UI-based recovery...")
+            page.goto(source_hub_url, wait_until="domcontentloaded", timeout=60_000)
+            time.sleep(3)  # Allow page to settle
+
+            # Look for page title with "(n)" suffix pattern in sidebar or page content
+            # The duplicate will be named "Original Title (1)" or similar
+            duplicate_pattern = re.compile(rf"^{re.escape(original_title)}\s*\(\d+\)$")
+
+            # Try to find the duplicate page in the page list/sidebar
+            # Notion uses different selectors for page links, try common patterns
+            page_link_selectors = [
+                f'a:has-text("{original_title} (1)")',
+                f'div[data-block-id]:has-text("{original_title} (1)")',
+                f'[role="treeitem"]:has-text("{original_title} (1)")',
+            ]
+
+            for selector in page_link_selectors:
+                try:
+                    locator = page.locator(selector).first
+                    if locator.is_visible(timeout=5000):
+                        logger.info("| ○ Found duplicate page in UI, clicking...")
+                        locator.click()
+                        page.wait_for_load_state("domcontentloaded", timeout=timeout)
+                        time.sleep(3)
+                        recovered_url = page.url
+                        logger.info("| ✓ Recovered duplicate URL via UI: %s", recovered_url)
+                        return recovered_url
+                except Exception:
+                    continue
+
+            # If specific selectors didn't work, try a broader search
+            try:
+                # Look for any visible text matching the pattern and click it
+                all_text_elements = page.locator(f'text="{original_title} ("')
+                count = all_text_elements.count()
+                if count > 0:
+                    for i in range(count):
+                        element = all_text_elements.nth(i)
+                        text_content = element.text_content() or ""
+                        if duplicate_pattern.match(text_content.strip()):
+                            logger.info("| ○ Found duplicate via text search, clicking...")
+                            element.click()
+                            page.wait_for_load_state("domcontentloaded", timeout=timeout)
+                            time.sleep(3)
+                            recovered_url = page.url
+                            logger.info("| ✓ Recovered duplicate URL via UI text search: %s", recovered_url)
+                            return recovered_url
+            except Exception as e:
+                logger.debug("| ✗ Broad text search failed: %s", e)
+
+            logger.warning("| ✗ Could not locate duplicate '%s (n)' in UI", original_title)
+            return None
+
+        except Exception as e:
+            logger.warning("| ✗ UI-based recovery failed: %s", e)
+            return None
+
     # =========================================================================
     # Playwright Automation Methods
     # =========================================================================
@@ -700,7 +906,7 @@ def _duplicate_current_initial_state(
                                         "| ○ Navigating directly to latest '%s' duplicate via children list...",
                                         target_title,
                                     )
-                                    page.goto(fallback_url, wait_until="load", timeout=60_000)
+                                    page.goto(fallback_url, wait_until="domcontentloaded", timeout=120_000)
                                     time.sleep(5)
                                     duplicated_url = page.url
                                     break
@@ -716,19 +922,32 @@ def _duplicate_current_initial_state(
 
                     # Re-validate after attempted recovery
                     if not self._is_valid_duplicate_url(original_url, duplicated_url):
-                        logger.error(
-                            "| ✗ Could not locate a valid '%s' duplicate after recovery attempt.\n|  Original: %s\n|  Observed: %s",
-                            target_title,
-                            original_url,
-                            duplicated_url,
-                        )
-                        # Attempt to clean up stray duplicate before propagating error.
-                        self._cleanup_orphan_duplicate(
-                            original_initial_state_id, original_initial_state_title
+                        # API-based recovery failed, try UI-based recovery as last resort
+                        logger.warning(
+                            "| ✗ API-based recovery failed. Trying UI-based recovery..."
                         )
-                        raise RuntimeError(
-                            "Duplicate URL pattern mismatch – duplication likely failed"
+                        ui_recovered_url = self._recover_duplicate_via_ui(
+                            page,
+                            original_initial_state_title,
+                            timeout=wait_timeout,
                         )
+                        if ui_recovered_url and self._is_valid_duplicate_url(original_url, ui_recovered_url):
+                            duplicated_url = ui_recovered_url
+                            logger.info("| ✓ UI-based recovery successful")
+                        else:
+                            logger.error(
+                                "| ✗ Could not locate a valid '%s' duplicate after all recovery attempts.\n|  Original: %s\n|  Observed: %s",
+                                target_title,
+                                original_url,
+                                duplicated_url,
+                            )
+                            # Attempt to clean up stray duplicate before propagating error.
+                            self._cleanup_orphan_duplicate(
+                                original_initial_state_id, original_initial_state_title
+                            )
+                            raise RuntimeError(
+                                "Duplicate URL pattern mismatch – duplication likely failed"
+                            )
                 except Exception as search_exc:
                     logger.error(
                         "| ✗ Failed during recovery search for '%s': %s",
@@ -804,8 +1023,8 @@ def _cleanup_orphan_duplicate(
                 )
                 return False
 
-            # Only consider exactly one copy "Title (1)".
-            title_regex = re.compile(rf"^{re.escape(initial_state_title)}\s*\(1\)$")
+            # Match any numbered duplicate "Title (n)" where n is any digit(s)
+            title_regex = re.compile(rf"^{re.escape(initial_state_title)}\s*\(\d+\)$")
 
             archived_any = False
             next_cursor = None
@@ -869,44 +1088,43 @@ def _duplicate_initial_state_for_task(
         last_exc = None
         for attempt in range(max_retries + 1):
             wait_timeout = initial_wait_ms * (attempt + 1)
+            page = None
             try:
-                with sync_playwright() as p:
-                    browser_type = getattr(p, self.browser_name)
-                    browser: Browser = browser_type.launch(headless=self.headless)
-                    context = browser.new_context(storage_state=str(self.state_file))
-                    page = context.new_page()
-
-                    logger.info("| ○ Navigating to initial state for %s...", category)
-                    # Start timing from the moment we begin navigating to the initial state page.
-                    start_time = time.time()
-                    page.goto(initial_state_url, wait_until="load", timeout=60_000)
-                    context.storage_state(path=str(self.state_file))
-
-                    initial_state_id = self._extract_initial_state_id_from_url(
-                        initial_state_url
-                    )
-                    initial_state_title = self._category_to_initial_state_title(
-                        category
-                    )
+                # Reuse browser instance within session
+                _, context = self._ensure_browser()
+                page = context.new_page()
+
+                logger.info("| ○ Navigating to initial state for %s...", category)
+                # Start timing from the moment we begin navigating to the initial state page.
+                start_time = time.time()
+                page.goto(initial_state_url, wait_until="domcontentloaded", timeout=120_000)
+                context.storage_state(path=str(self.state_file))
+
+                initial_state_id = self._extract_initial_state_id_from_url(
+                    initial_state_url
+                )
+                initial_state_title = self._category_to_initial_state_title(
+                    category
+                )
 
-                    duplicated_id = self._duplicate_current_initial_state(
-                        page,
-                        new_title=initial_state_title,  # Use original initial state name without (1) suffix
-                        original_initial_state_id=initial_state_id,
-                        original_initial_state_title=initial_state_title,
-                        wait_timeout=wait_timeout,
-                    )
-                    duplicated_url = page.url
-                    # Validate URL pattern again at this higher level (should already be validated inside).
-                    context.storage_state(path=str(self.state_file))
-                    # Log how long the whole duplication (navigate → duplicate) took.
-                    elapsed = time.time() - start_time
-                    logger.info(
-                        "| ✓ Initial state duplicated successfully in %.2f seconds (task: %s).",
-                        elapsed,
-                        task_name,
-                    )
-                    return duplicated_url, duplicated_id
+                duplicated_id = self._duplicate_current_initial_state(
+                    page,
+                    new_title=initial_state_title,  # Use original initial state name without (1) suffix
+                    original_initial_state_id=initial_state_id,
+                    original_initial_state_title=initial_state_title,
+                    wait_timeout=wait_timeout,
+                )
+                duplicated_url = page.url
+                # Validate URL pattern again at this higher level (should already be validated inside).
+                context.storage_state(path=str(self.state_file))
+                # Log how long the whole duplication (navigate → duplicate) took.
+                elapsed = time.time() - start_time
+                logger.info(
+                    "| ✓ Initial state duplicated successfully in %.2f seconds (task: %s).",
+                    elapsed,
+                    task_name,
+                )
+                return duplicated_url, duplicated_id
             except Exception as e:
                 # No additional cleanup here—handled inside _duplicate_current_template.
                 last_exc = e
@@ -917,6 +1135,13 @@ def _duplicate_initial_state_for_task(
                         e,
                     )
                 time.sleep(120 * attempt + 120)
+            finally:
+                # Close the page to prevent accumulation within reused context
+                if page:
+                    try:
+                        page.close()
+                    except Exception:
+                        pass
 
         raise RuntimeError(
             f"Initial state duplication failed for task '{task_name}' after {max_retries + 1} attempts: {last_exc}"
@@ -939,3 +1164,4 @@ def get_service_config_for_agent(self) -> dict:
             service_config["notion_key"] = config["eval_api_key"]
 
         return service_config
+

From eb51fbbb8b99036adbaf845a1dfddc32d0bd2848 Mon Sep 17 00:00:00 2001
From: zjwu0522 <zijian.wu@u.nus.edu>
Date: Wed, 21 Jan 2026 09:40:19 +0000
Subject: [PATCH 2/2] docs: Add news entry for MCP server version pinning

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 89e3eede..0dc65c8a 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@ MCPMark provides a reproducible, extensible benchmark for researchers and engine
 
 ## News
 
+- 📌 **21 Jan** — Pinned MCP server versions for reproducible benchmarks: GitHub MCP Server `v0.15.0` (switched to Docker for version control), Notion MCP Server `@1.9.1` (Notion released 2.0 but it has many bugs, not recommended). See [#246](https://github.com/eval-sys/mcpmark/pull/246).
 - 🔥 **13 Dec** — Added auto-compaction support (`--compaction-token`) to summarize long conversations and avoid context overflow during evaluation ([#236](https://github.com/eval-sys/mcpmark/pull/236])).
 - 🏅 **02 Dec** — Evaluated `gemini-3-pro-preview` (thinking: low): **Pass@1 50.6%** ± 2.3% — so close to `gpt-5-high` (51.6%)! Also `deepseek-v3.2-thinking` 36.8% and `deepseek-v3.2-chat` 29.7%
 - 🔥 **02 Dec** — Obfuscate GitHub @mentions to prevent notification spam during evaluation ([#229](https://github.com/eval-sys/mcpmark/pull/229))