From 1817e80d25fa62708715e114b9b89af67b59bcbb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9Czjwu0522=E2=80=9D?= <zijian.wu@u.nus.edu>
Date: Tue, 2 Dec 2025 16:25:22 +0000
Subject: [PATCH 1/5] =?UTF-8?q?=E2=9C=A8=20feat:=20obfuscate=20GitHub=20@?=
 =?UTF-8?q?=20mentions=20to=20prevent=20notification=20spam?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add _obfuscate_mentions() method that replaces @username with @username_XXXX
  (random suffix) to prevent notifications to real GitHub users
- Add safety check that prevents importing templates to public repositories
- Apply obfuscation to issue bodies, PR bodies, and all comment content
- Add README notice explaining the privacy requirement and changes

This addresses feedback from original GitHub authors who were inadvertently
notified when evaluation repositories were created as public.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 README.md                                     | 16 +++++
 .../github/github_state_manager.py            | 69 +++++++++++++++++--
 2 files changed, 80 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 68ba69aa..2473a2a9 100644
--- a/README.md
+++ b/README.md
@@ -152,6 +152,22 @@ Tip: MCPMark supports **auto-resume**. When re-running, only unfinished tasks wi
 
 You can also follow [Quickstart](docs/quickstart.md) for the shortest end-to-end path.
 
+### Important Notice: GitHub Repository Privacy
+
+> **Please ensure your evaluation repositories are set to PRIVATE.**
+
+GitHub state templates are now automatically downloaded from our CDN during evaluation — no manual download is required. However, because these templates contain issues and pull requests from real open-source repositories, the recreation process includes `@username` mentions of the original authors.
+
+**We have received feedback from original GitHub authors who were inadvertently notified** when evaluation repositories were created as public. To be a responsible member of the open-source community, we urge all users to:
+
+1. **Always keep evaluation repositories private** during the evaluation process.
+2. **Upgrade to the latest version**, which appends a random suffix to every `@username` mention (e.g., `@user` becomes `@user_x7k2`) and adds a safety check that refuses to import templates into public repositories.
+3. **If you are using an older version of MCPMark**, please either:
+   - Pull the latest code immediately, or
+   - Manually ensure all GitHub evaluation repositories are set to private.
+
+Thank you for helping us maintain a respectful relationship with the open-source community.
+
 ---
 
 ## Results and metrics
diff --git a/src/mcp_services/github/github_state_manager.py b/src/mcp_services/github/github_state_manager.py
index 516cbaaf..ae07f0a9 100644
--- a/src/mcp_services/github/github_state_manager.py
+++ b/src/mcp_services/github/github_state_manager.py
@@ -253,6 +253,21 @@ def _push_repo(
         html_url = resp.json()["html_url"]
         logger.info("| [import] Target repository created: %s", html_url)
 
+        # Safety check: Prevent importing to public repositories
+        # Public repos would send @ mention notifications to real users, causing spam
+        if not private:
+            error_msg = (
+                "ERROR: Cannot import template to a public repository.\n\n"
+                "Reason: The template contains @ mentions of real GitHub users from the original\n"
+                "repository. Importing to a public repository would send notifications to these\n"
+                "users, which is disruptive and inappropriate.\n\n"
+                "Solution: Set private=True when calling _import_template_repo()."
+            )
+            logger.error(error_msg)
+            # Clean up the created repo before raising
+            self._delete_repository(owner, repo_name)
+            raise RuntimeError(error_msg)
+
         # Immediately disable GitHub Actions for ALL repositories to prevent any accidental triggers
         # We'll re-enable it later only for mcpmark-cicd
         logger.info(
@@ -317,7 +332,7 @@ def _create_comment(issue_number: int, body: str):
         def _create_issue(item: dict) -> Optional[int]:
             data = {
                 "title": item["title"],
-                "body": item.get("body", ""),
+                "body": self._obfuscate_mentions(item.get("body", "")),
                 "labels": item.get("labels", []),
             }
             r = self._request_with_retry(
@@ -337,7 +352,7 @@ def _create_issue(item: dict) -> Optional[int]:
             return new_no
 
         def _create_pull(pr_itm: dict) -> Optional[int]:
-            body = pr_itm.get("body", "")
+            body = self._obfuscate_mentions(pr_itm.get("body", ""))
             if pr_itm.get("is_from_fork", False):
                 fork_note = f"\n\n---\n_This PR was originally from a fork: **{pr_itm.get('fork_owner')}/{pr_itm.get('fork_repo')}** (branch: `{pr_itm['head']}`)_"
                 body = body + fork_note if body else fork_note[2:]
@@ -366,7 +381,10 @@ def _create_pull(pr_itm: dict) -> Optional[int]:
                 created_issues += 1
                 for c in itm.get("comments", []):
                     _create_comment(
-                        new_no, f"*Original author: @{c['user']}*\n\n{c['body']}"
+                        new_no,
+                        self._obfuscate_mentions(
+                            f"*Original author: @{c['user']}*\n\n{c['body']}"
+                        ),
                     )
         logger.info(
             "| [phase] Created %d out of %d issues", created_issues, len(issues_data)
@@ -382,12 +400,17 @@ def _create_pull(pr_itm: dict) -> Optional[int]:
                 created_prs += 1
                 for c in pr.get("comments", []):
                     _create_comment(
-                        new_pr_no, f"*Original author: @{c['user']}*\n\n{c['body']}"
+                        new_pr_no,
+                        self._obfuscate_mentions(
+                            f"*Original author: @{c['user']}*\n\n{c['body']}"
+                        ),
                     )
                 for rc in pr.get("review_comments", []):
                     _create_comment(
                         new_pr_no,
-                        f"*Original author: @{rc['user']}* (review)\n\n{rc['body']}",
+                        self._obfuscate_mentions(
+                            f"*Original author: @{rc['user']}* (review)\n\n{rc['body']}"
+                        ),
                     )
             else:
                 skipped_prs += 1
@@ -523,6 +546,42 @@ def _delete_repository(self, owner: str, repo_name: str):
         else:
             logger.info(f"| Successfully deleted repository {owner}/{repo_name}")
 
+    def _obfuscate_mentions(self, text: str) -> str:
+        """
+        Obfuscate @ mentions to prevent notifications to real users.
+
+        Replaces @username with @username_XXXX (random suffix) to ensure the mentioned
+        user does not exist on GitHub. This prevents notification spam when importing
+        templates that contain @ mentions from original repositories.
+
+        Args:
+            text: The text content that may contain @ mentions
+
+        Returns:
+            Text with obfuscated @ mentions
+        """
+        import re
+        import random
+        import string
+
+        if not text:
+            return text
+
+        # Matches @ plus a login made of alphanumerics and inner hyphens that never starts
+        # or ends with a hyphen (no length cap is enforced by this pattern). The negative
+        # lookbehind (?<![a-zA-Z0-9]) skips an @ preceded by an alphanumeric, e.g. user@example.com
+        pattern = r"(?<![a-zA-Z0-9])@([a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?)"
+
+        def replace_mention(match):
+            username = match.group(1)
+            # New random 4-char suffix for every match, so repeated mentions of one user differ
+            suffix = "".join(
+                random.choices(string.ascii_lowercase + string.digits, k=4)
+            )
+            return f"@{username}_{suffix}"
+
+        return re.sub(pattern, replace_mention, text)
+
     # ---------------------------------------------------------------------
     # Helper utilities (organisation vs user)
     # ---------------------------------------------------------------------

From 3eda2e91b5d1849ff7dc7e86033464de85a99d0c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9Czjwu0522=E2=80=9D?= <zijian.wu@u.nus.edu>
Date: Wed, 3 Dec 2025 15:21:18 +0000
Subject: [PATCH 2/5] =?UTF-8?q?=F0=9F=93=9D=20docs:=20add=20News=20section?=
 =?UTF-8?q?=20to=20README?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 README.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/README.md b/README.md
index 2473a2a9..c8f655ac 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,18 @@ MCPMark provides a reproducible, extensible benchmark for researchers and engine
 
 [![MCPMark](https://github.com/user-attachments/assets/dfc06a41-e387-45e3-bc98-db7097ffa3dc)](https://mcpmark.ai)
 
+## News
+
+- **02/Dec/2025** - Evaluated `gemini-3-pro-preview` (thinking: low): Pass@1 50.6% ± 2.3%, Pass@4 67.7%, Pass^4 31.5% - so close to `gpt-5-high` (51.6%)! Also `deepseek-v3.2-thinking` Pass@1 36.8% ± 1.8%, Pass@4 51.2%, Pass^4 21.3% and `deepseek-v3.2-chat` Pass@1 29.7% ± 1.5%, Pass@4 46.5%, Pass^4 13.4%
+- **02/Dec/2025** - Obfuscate GitHub @mentions to prevent notification spam during evaluation ([#229](https://github.com/eval-sys/mcpmark/pull/229))
+- **01/Dec/2025** - DeepSeek v3.2 release uses MCPMark! Kudos to the DeepSeek team on securing the best open-source model with a significant performance gain. [X Post](https://x.com/deepseek_ai/status/1995452650557763728) | [Technical Report](https://cas-bridge.xethub.hf.co/xet-bridge-us/692cfec93b25b81d09307b94/2d0aa38511b9df084d12a00fe04a96595496af772cb766c516c4e6aee1e21246?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20251203%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251203T145756Z&X-Amz-Expires=3600&X-Amz-Signature=31d39c39a42319dba189c1164f8a8bff69e4211b7520b75b7f3d4013a23b3022&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=634c72e6fe1bfa967d6c2b5c&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27paper.pdf%3B+filename%3D%22paper.pdf%22%3B&response-content-type=application%2Fpdf&x-id=GetObject&Expires=1764777476&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc2NDc3NzQ3Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82OTJjZmVjOTNiMjViODFkMDkzMDdiOTQvMmQwYWEzODUxMWI5ZGYwODRkMTJhMDBmZTA0YTk2NTk1NDk2YWY3NzJjYjc2NmM1MTZjNGU2YWVlMWUyMTI0NioifV19&Signature=HxFnQM7j%7EnuD9Qr81qqbXkunCc4nLLmTHv-5EosJu8EqlQ3VRyBibLNz0ur1d9h2SFp1Lvji3tNOQSWZsW%7EMS6wbmN5E4jjgbcXR40oxG4nhcq8Hy5jnHlEcQ9GyV9B0HTeXmQJ32AjkEDymEl9iVISRzEzwiu9J8wQL659QHSU5v81eexEk7LTfETikOdKCUQJy0uNqdDb3N%7Elfegq6XrxuZU5UawtlJYV57g1afkLln0ZYxqkYSEqxRdGwIAbfd1Te2Yi60I%7ELEB3qok4LM2%7E4gBWDBaB%7ESN902sbutiQYuvk6V5tFlSVq3MHaRJfJBCMTZiNtb5JAHKZSyVlGuw__&Key-Pair-Id=K2L8F4GPSG1IFC)
+- **17/Nov/2025** - Added 50 easy tasks (10 per MCP server) for smaller (<100B) open-source models ([#225](https://github.com/eval-sys/mcpmark/pull/225))
+- **31/Oct/2025** - Community PR from insforge: better MCP servers achieve better results with fewer tokens! ([#214](https://github.com/eval-sys/mcpmark/pull/214))
+- **13/Oct/2025** - Added ReAct agent support. PRs for new agent scaffolds welcome! ([#209](https://github.com/eval-sys/mcpmark/pull/209))
+- **10/Sep/2025** - `qwen-3-coder-plus` is the best open-source model! Kudos to Qwen team. [X Post](https://x.com/Alibaba_Qwen/status/1965457023438651532)
+
+---
+
 ## What you can do with MCPMark
 
 - **Evaluate real tool usage** across multiple MCP services: `Notion`, `GitHub`, `Filesystem`, `Postgres`, `Playwright`.

From 1291e91083ab2188ded465b85bde69957e99aeee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9Czjwu0522=E2=80=9D?= <zijian.wu@u.nus.edu>
Date: Wed, 3 Dec 2025 16:10:39 +0000
Subject: [PATCH 3/5] Add Gemini 3 reasoning-effort warning and beautify README
 News section
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 README.md                | 14 +++++++-------
 src/agents/base_agent.py | 13 +++++++++++++
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index c8f655ac..b2f615ea 100644
--- a/README.md
+++ b/README.md
@@ -18,13 +18,13 @@ MCPMark provides a reproducible, extensible benchmark for researchers and engine
 
 ## News
 
-- **02/Dec/2025** - Evaluated `gemini-3-pro-preview` (thinking: low): Pass@1 50.6% ± 2.3%, Pass@4 67.7%, Pass^4 31.5% - so close to `gpt-5-high` (51.6%)! Also `deepseek-v3.2-thinking` Pass@1 36.8% ± 1.8%, Pass@4 51.2%, Pass^4 21.3% and `deepseek-v3.2-chat` Pass@1 29.7% ± 1.5%, Pass@4 46.5%, Pass^4 13.4%
-- **02/Dec/2025** - Obfuscate GitHub @mentions to prevent notification spam during evaluation ([#229](https://github.com/eval-sys/mcpmark/pull/229))
-- **01/Dec/2025** - DeepSeek v3.2 release uses MCPMark! Kudos to the DeepSeek team on securing the best open-source model with a significant performance gain. [X Post](https://x.com/deepseek_ai/status/1995452650557763728) | [Technical Report](https://cas-bridge.xethub.hf.co/xet-bridge-us/692cfec93b25b81d09307b94/2d0aa38511b9df084d12a00fe04a96595496af772cb766c516c4e6aee1e21246?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20251203%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251203T145756Z&X-Amz-Expires=3600&X-Amz-Signature=31d39c39a42319dba189c1164f8a8bff69e4211b7520b75b7f3d4013a23b3022&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=634c72e6fe1bfa967d6c2b5c&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27paper.pdf%3B+filename%3D%22paper.pdf%22%3B&response-content-type=application%2Fpdf&x-id=GetObject&Expires=1764777476&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc2NDc3NzQ3Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82OTJjZmVjOTNiMjViODFkMDkzMDdiOTQvMmQwYWEzODUxMWI5ZGYwODRkMTJhMDBmZTA0YTk2NTk1NDk2YWY3NzJjYjc2NmM1MTZjNGU2YWVlMWUyMTI0NioifV19&Signature=HxFnQM7j%7EnuD9Qr81qqbXkunCc4nLLmTHv-5EosJu8EqlQ3VRyBibLNz0ur1d9h2SFp1Lvji3tNOQSWZsW%7EMS6wbmN5E4jjgbcXR40oxG4nhcq8Hy5jnHlEcQ9GyV9B0HTeXmQJ32AjkEDymEl9iVISRzEzwiu9J8wQL659QHSU5v81eexEk7LTfETikOdKCUQJy0uNqdDb3N%7Elfegq6XrxuZU5UawtlJYV57g1afkLln0ZYxqkYSEqxRdGwIAbfd1Te2Yi60I%7ELEB3qok4LM2%7E4gBWDBaB%7ESN902sbutiQYuvk6V5tFlSVq3MHaRJfJBCMTZiNtb5JAHKZSyVlGuw__&Key-Pair-Id=K2L8F4GPSG1IFC)
-- **17/Nov/2025** - Added 50 easy tasks (10 per MCP server) for smaller (<100B) open-source models ([#225](https://github.com/eval-sys/mcpmark/pull/225))
-- **31/Oct/2025** - Community PR from insforge: better MCP servers achieve better results with fewer tokens! ([#214](https://github.com/eval-sys/mcpmark/pull/214))
-- **13/Oct/2025** - Added ReAct agent support. PRs for new agent scaffolds welcome! ([#209](https://github.com/eval-sys/mcpmark/pull/209))
-- **10/Sep/2025** - `qwen-3-coder-plus` is the best open-source model! Kudos to Qwen team. [X Post](https://x.com/Alibaba_Qwen/status/1965457023438651532)
+- 🏅 **02 Dec** — Evaluated `gemini-3-pro-preview` (thinking: low): **Pass@1 50.6%** ± 2.3% — so close to `gpt-5-high` (51.6%)! Also `deepseek-v3.2-thinking` 36.8% and `deepseek-v3.2-chat` 29.7%
+- 🔥 **02 Dec** — Obfuscate GitHub @mentions to prevent notification spam during evaluation ([#229](https://github.com/eval-sys/mcpmark/pull/229))
+- 🤝 **01 Dec** — DeepSeek v3.2 uses MCPMark! Kudos on securing the best open-source model. [X Post](https://x.com/deepseek_ai/status/1995452650557763728) | [Technical Report](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf)
+- 🔥 **17 Nov** — Added 50 easy tasks (10 per MCP server) for smaller open-source models ([#225](https://github.com/eval-sys/mcpmark/pull/225))
+- 🤝 **31 Oct** — Community PR from insforge: better MCP servers achieve better results with fewer tokens! ([#214](https://github.com/eval-sys/mcpmark/pull/214))
+- 🔥 **13 Oct** — Added ReAct agent support. PRs for new agent scaffolds welcome! ([#209](https://github.com/eval-sys/mcpmark/pull/209))
+- 🤝 **10 Sep** — `qwen-3-coder-plus` is the best open-source model! Kudos to Qwen team. [X Post](https://x.com/Alibaba_Qwen/status/1965457023438651532)
 
 ---
 
diff --git a/src/agents/base_agent.py b/src/agents/base_agent.py
index 661a1994..faf0420b 100644
--- a/src/agents/base_agent.py
+++ b/src/agents/base_agent.py
@@ -66,6 +66,14 @@ def __init__(
             self.litellm_input_model_name,
         )
 
+        # Warn if Gemini 3 model uses unsupported reasoning_effort value
+        if self._is_gemini_3_model() and self.reasoning_effort not in ["default", "low", "high"]:
+            logger.warning(
+                "Gemini 3 models only support reasoning_effort 'low' or 'high', "
+                "got '%s'. LiteLLM may map this to the nearest supported value.",
+                self.reasoning_effort,
+            )
+
     def __repr__(self) -> str:  # pragma: no cover - debug helper
         return (
             f"{self.__class__.__name__}(service='{self.mcp_service}', "
@@ -418,6 +426,11 @@ def _is_gemini_model(self) -> bool:
         model_lower = self.litellm_input_model_name.lower()
         return "gemini" in model_lower or "bison" in model_lower
 
+    def _is_gemini_3_model(self) -> bool:
+        """Check if this is a Gemini 3 series model."""
+        model_lower = self.litellm_input_model_name.lower()
+        return "gemini-3" in model_lower or "gemini/gemini-3" in model_lower  # NOTE: second test is implied by the first
+
     def _simplify_schema_for_gemini(self, schema: Optional[Dict[str, Any]]) -> Dict[str, Any]:
         if not isinstance(schema, dict):
             return schema or {}

From 0722ede53fa51d42308cbbae088d45c4df5bf462 Mon Sep 17 00:00:00 2001
From: Zijian Wu <50308536+zjwu0522@users.noreply.github.com>
Date: Thu, 4 Dec 2025 00:17:09 +0800
Subject: [PATCH 4/5] hot fix

Updated the link to the DeepSeek v3.2 technical report.
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b2f615ea..eecef943 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ MCPMark provides a reproducible, extensible benchmark for researchers and engine
 
 - 🏅 **02 Dec** — Evaluated `gemini-3-pro-preview` (thinking: low): **Pass@1 50.6%** ± 2.3% — so close to `gpt-5-high` (51.6%)! Also `deepseek-v3.2-thinking` 36.8% and `deepseek-v3.2-chat` 29.7%
 - 🔥 **02 Dec** — Obfuscate GitHub @mentions to prevent notification spam during evaluation ([#229](https://github.com/eval-sys/mcpmark/pull/229))
-- 🤝 **01 Dec** — DeepSeek v3.2 uses MCPMark! Kudos on securing the best open-source model. [X Post](https://x.com/deepseek_ai/status/1995452650557763728) | [Technical Report](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf)
+- 🤝 **01 Dec** — DeepSeek v3.2 uses MCPMark! Kudos on securing the best open-source model. [X Post](https://x.com/deepseek_ai/status/1995452650557763728) | [Technical Report](https://huggingface.co/deepseek-ai/DeepSeek-V3.2/resolve/main/assets/paper.pdf)
 - 🔥 **17 Nov** — Added 50 easy tasks (10 per MCP server) for smaller open-source models ([#225](https://github.com/eval-sys/mcpmark/pull/225))
 - 🤝 **31 Oct** — Community PR from insforge: better MCP servers achieve better results with fewer tokens! ([#214](https://github.com/eval-sys/mcpmark/pull/214))
 - 🔥 **13 Oct** — Added ReAct agent support. PRs for new agent scaffolds welcome! ([#209](https://github.com/eval-sys/mcpmark/pull/209))

From de3cfa1aaa3b35a696e6fd1ae516384bebbaaebb Mon Sep 17 00:00:00 2001
From: Zijian Wu <50308536+zjwu0522@users.noreply.github.com>
Date: Thu, 4 Dec 2025 00:18:04 +0800
Subject: [PATCH 5/5] hot fix: use medal emoji for DeepSeek and Qwen news entries

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index eecef943..4dc26fc9 100644
--- a/README.md
+++ b/README.md
@@ -20,11 +20,11 @@ MCPMark provides a reproducible, extensible benchmark for researchers and engine
 
 - 🏅 **02 Dec** — Evaluated `gemini-3-pro-preview` (thinking: low): **Pass@1 50.6%** ± 2.3% — so close to `gpt-5-high` (51.6%)! Also `deepseek-v3.2-thinking` 36.8% and `deepseek-v3.2-chat` 29.7%
 - 🔥 **02 Dec** — Obfuscate GitHub @mentions to prevent notification spam during evaluation ([#229](https://github.com/eval-sys/mcpmark/pull/229))
-- 🤝 **01 Dec** — DeepSeek v3.2 uses MCPMark! Kudos on securing the best open-source model. [X Post](https://x.com/deepseek_ai/status/1995452650557763728) | [Technical Report](https://huggingface.co/deepseek-ai/DeepSeek-V3.2/resolve/main/assets/paper.pdf)
+- 🏅 **01 Dec** — DeepSeek v3.2 uses MCPMark! Kudos on securing the best open-source model. [X Post](https://x.com/deepseek_ai/status/1995452650557763728) | [Technical Report](https://huggingface.co/deepseek-ai/DeepSeek-V3.2/resolve/main/assets/paper.pdf)
 - 🔥 **17 Nov** — Added 50 easy tasks (10 per MCP server) for smaller open-source models ([#225](https://github.com/eval-sys/mcpmark/pull/225))
 - 🤝 **31 Oct** — Community PR from insforge: better MCP servers achieve better results with fewer tokens! ([#214](https://github.com/eval-sys/mcpmark/pull/214))
 - 🔥 **13 Oct** — Added ReAct agent support. PRs for new agent scaffolds welcome! ([#209](https://github.com/eval-sys/mcpmark/pull/209))
-- 🤝 **10 Sep** — `qwen-3-coder-plus` is the best open-source model! Kudos to Qwen team. [X Post](https://x.com/Alibaba_Qwen/status/1965457023438651532)
+- 🏅 **10 Sep** — `qwen-3-coder-plus` is the best open-source model! Kudos to Qwen team. [X Post](https://x.com/Alibaba_Qwen/status/1965457023438651532)
 
 ---