aws-samples · melanie531 · Mar 18, 2026 · Mar 18, 2026
diff --git a/skill_eval/agent_runner.py b/skill_eval/agent_runner.py
@@ -140,10 +140,21 @@ def _build_cmd_with_skill(self, prompt: str, skill_path: str) -> list[str]:
         skill_content = self._read_skill_content(skill_path)
         cmd = [
             self.CLI_NAME, "-p", prompt,
-            "--allowedTools", "Read", "Glob", "Grep", "Bash", "Write", "Edit", "Skill",
+            "--allowedTools", "Read", "Glob", "Grep", "Bash", "Write", "Edit",
         ]
         if skill_content:
-            cmd.extend(["--append-system-prompt", skill_content])
+            # Append a workspace hint so the agent knows skill resources
+            # (scripts/, references/, assets/) are available in the working
+            # directory.  Without this, the agent may not attempt to execute
+            # scripts even though they've been copied to the workspace.
+            workspace_hint = (
+                "\n\n---\n"
+                "Note: The skill's scripts/, references/, and assets/ "
+                "directories are available in your current working directory. "
+                "You can execute scripts directly, e.g. "
+                "`python3 scripts/example.py`."
+            )
+            cmd.extend(["--append-system-prompt", skill_content + workspace_hint])
         return cmd
 
     def _build_cmd_without_skill(self, prompt: str) -> list[str]:

diff --git a/skill_eval/functional.py b/skill_eval/functional.py
@@ -235,33 +235,53 @@ def _execute_eval_pair(
     if runner is None:
         runner = get_runner("claude")
 
-    # Set up workspace with eval files
-    with tempfile.TemporaryDirectory(prefix="skill-eval-") as tmpdir:
-        workspace = Path(tmpdir)
-
-        # Copy any referenced files into workspace
-        for rel_file in eval_case.files:
-            src = evals_dir / rel_file
-            dst = workspace / Path(rel_file).name
-            if src.is_file():
-                dst.parent.mkdir(parents=True, exist_ok=True)
-                shutil.copy2(src, dst)
+    # Set up separate workspaces for with-skill and without-skill runs.
+    # Using independent workspaces prevents contamination — without-skill
+    # should not see skill scripts/SKILL.md that were copied for with-skill.
+    with tempfile.TemporaryDirectory(prefix="skill-eval-with-") as with_tmpdir, \
+         tempfile.TemporaryDirectory(prefix="skill-eval-without-") as without_tmpdir:
+        with_workspace = Path(with_tmpdir)
+        without_workspace = Path(without_tmpdir)
+
+        # Copy eval case files into both workspaces
+        for ws in (with_workspace, without_workspace):
+            for rel_file in eval_case.files:
+                src = evals_dir / rel_file
+                dst = ws / Path(rel_file).name
+                if src.is_file():
+                    dst.parent.mkdir(parents=True, exist_ok=True)
+                    shutil.copy2(src, dst)
+
+        # Copy skill resource directories into with-skill workspace only,
+        # so the agent can access scripts/, references/, and assets/ as
+        # described in SKILL.md.
+        if skill_path:
+            _skill = Path(skill_path)
+            for subdir in ("scripts", "references", "assets"):
+                src_dir = _skill / subdir
+                if src_dir.is_dir():
+                    dst_dir = with_workspace / subdir
+                    shutil.copytree(src_dir, dst_dir, dirs_exist_ok=True)
+            # Also copy SKILL.md so the agent can read it if needed
+            _skill_md = _skill / "SKILL.md"
+            if _skill_md.is_file():
+                shutil.copy2(_skill_md, with_workspace / "SKILL.md")
 
         # Run WITH skill
         with_stdout, with_stderr, with_rc, with_elapsed = runner.run_prompt(
             eval_case.prompt,
             skill_path=str(skill_path),
-            workspace_dir=str(workspace),
+            workspace_dir=str(with_workspace),
             timeout=timeout,
             output_format="stream-json",
         )
         with_parsed = runner.parse_output(with_stdout)
 
-        # Run WITHOUT skill (fresh workspace)
+        # Run WITHOUT skill (clean workspace — no skill files)
         without_stdout, without_stderr, without_rc, without_elapsed = runner.run_prompt(
             eval_case.prompt,
             skill_path=None,
-            workspace_dir=str(workspace),
+            workspace_dir=str(without_workspace),
             timeout=timeout,
             output_format="stream-json",
         )

diff --git a/skill_eval/trigger.py b/skill_eval/trigger.py
@@ -298,9 +298,34 @@ def _detect_skill_trigger_from_parsed(parsed: dict, skill_path: Path) -> bool:
 
     # Also check text output for skill activation markers
     text = parsed.get("text", "")
-    if f"skill:{skill_name}" in text.lower() or f"using {skill_name}" in text.lower():
+    text_lower = text.lower()
+    skill_name_lower = skill_name.lower()
+
+    # Direct skill references
+    if f"skill:{skill_name_lower}" in text_lower or f"using {skill_name_lower}" in text_lower:
         return True
 
+    # When skill is injected via --append-system-prompt, the agent may
+    # reference the skill by name or mention its scripts/rules without
+    # explicitly "activating" it through a tool call.  Detect broader
+    # references to the skill name in the output text.
+    # Use word-boundary matching to avoid false positives on partial names.
+    import re as _re
+    # Match skill name as a standalone term (hyphenated names are common)
+    skill_word_pattern = _re.compile(
+        r'\b' + _re.escape(skill_name_lower) + r'\b',
+        _re.IGNORECASE,
+    )
+    if skill_word_pattern.search(text):
+        return True
+
+    # Also detect references to skill scripts in the text, using path-level
+    # matching (scripts/{filename}) to avoid false positives on common names
+    # like "check.py" or "main.py".
+    for script_file in script_files:
+        if f"scripts/{script_file.lower()}" in text_lower:
+            return True
+
     return False
 
 

diff --git a/tests/test_agent_runner.py b/tests/test_agent_runner.py
@@ -377,20 +377,24 @@ def test_build_cmd_with_skill_uses_append_system_prompt(self, tmp_path):
         cmd = runner._build_cmd_with_skill("test prompt", str(skill_dir))
         assert "--append-system-prompt" in cmd
         assert "--plugin-dir" not in cmd
-        assert "skill content" in cmd
+        # The injected content should include a workspace hint
+        idx = cmd.index("--append-system-prompt")
+        injected = cmd[idx + 1]
+        assert "skill content" in injected
+        assert "scripts/" in injected
         assert "claude" == cmd[0]
         assert "-p" in cmd
         assert "test prompt" in cmd
 
-    def test_build_cmd_with_skill_includes_skill_tool(self, tmp_path):
-        """With-skill command should include the Skill tool."""
+    def test_build_cmd_with_skill_no_skill_tool(self, tmp_path):
+        """With-skill command should NOT include the Skill tool (system prompt injection)."""
         runner = ClaudeRunner()
         skill_dir = tmp_path / "skill"
         skill_dir.mkdir()
         (skill_dir / "SKILL.md").write_text("content")
 
         cmd = runner._build_cmd_with_skill("prompt", str(skill_dir))
-        assert "Skill" in cmd
+        assert "Skill" not in cmd
 
     def test_build_cmd_without_skill(self):
         """_build_cmd_without_skill should not include --append-system-prompt or Skill."""

diff --git a/tests/test_functional.py b/tests/test_functional.py
@@ -484,6 +484,46 @@ def test_execute_eval_pair_copies_files_to_workspace(self):
         assert first_kwargs.get("skill_path") is not None
         assert second_kwargs.get("skill_path") is None
 
+    def test_execute_eval_pair_copies_skill_resources_to_workspace(self):
+        """Skill scripts/ and references/ should be in with-skill workspace only."""
+        stream = _make_stream_json("name age")
+        runner = self._make_mock_runner()
+        captured_workspaces = []
+
+        def capture_run_prompt(prompt, **kwargs):
+            wd = kwargs.get("workspace_dir")
+            if wd:
+                from pathlib import Path as _P
+                workspace = _P(wd)
+                captured_workspaces.append({
+                    "skill_path": kwargs.get("skill_path"),
+                    "has_scripts": (workspace / "scripts").is_dir(),
+                    "has_skill_md": (workspace / "SKILL.md").is_file(),
+                })
+            return (stream, "", 0, 1.0)
+
+        runner.run_prompt.side_effect = capture_run_prompt
+
+        _execute_eval_pair(
+            self._eval_case(),
+            FIXTURES / "good-skill",
+            FIXTURES / "good-skill" / "evals",
+            run_index=0,
+            timeout=30,
+            runner=runner,
+        )
+        # with-skill workspace should have scripts/ and SKILL.md
+        assert len(captured_workspaces) == 2
+        with_skill_ws = captured_workspaces[0]
+        assert with_skill_ws["skill_path"] is not None
+        assert with_skill_ws["has_scripts"] is True
+        assert with_skill_ws["has_skill_md"] is True
+        # without-skill workspace should NOT have scripts/ or SKILL.md
+        without_skill_ws = captured_workspaces[1]
+        assert without_skill_ws["skill_path"] is None
+        assert without_skill_ws["has_scripts"] is False
+        assert without_skill_ws["has_skill_md"] is False
+
 
 class TestClassifyCostEfficiency:
     """Test Pareto cost-efficiency classification."""

diff --git a/tests/test_trigger.py b/tests/test_trigger.py
@@ -435,6 +435,80 @@ def test_multiple_tool_calls_one_triggers(self, tmp_path):
         assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is True
 
 
+class TestTextTriggerDetection:
+    """Test skill name and script reference detection in text output."""
+
+    def test_detect_skill_name_in_text(self):
+        """Should detect skill name as a standalone word in text output."""
+        parsed = {
+            "tool_calls": [],
+            "text": "I'll use the acme-compliance checker to validate this document.",
+        }
+        assert _detect_skill_trigger_from_parsed(
+            parsed, Path("/tmp/acme-compliance")
+        ) is True
+
+    def test_detect_skill_name_word_boundary(self):
+        """Should not match partial words containing the skill name."""
+        parsed = {
+            "tool_calls": [],
+            "text": "This is about non-acme-compliance-related matters.",
+        }
+        # "acme-compliance" still appears as a word here, so it matches
+        assert _detect_skill_trigger_from_parsed(
+            parsed, Path("/tmp/acme-compliance")
+        ) is True
+
+    def test_no_partial_match(self):
+        """Should not match when skill name is only a substring."""
+        parsed = {
+            "tool_calls": [],
+            "text": "The weather is nice today.",
+        }
+        assert _detect_skill_trigger_from_parsed(
+            parsed, Path("/tmp/weather-forecast")
+        ) is False
+
+    def test_detect_script_file_in_text(self, tmp_path):
+        """Should detect references to skill scripts using path-level matching."""
+        skill_dir = tmp_path / "my-skill"
+        scripts_dir = skill_dir / "scripts"
+        scripts_dir.mkdir(parents=True)
+        (scripts_dir / "check.py").write_text("#!/usr/bin/env python3\npass")
+
+        parsed = {
+            "tool_calls": [],
+            "text": "You should run scripts/check.py to validate the document.",
+        }
+        assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is True
+
+    def test_no_detect_bare_script_name(self, tmp_path):
+        """Should NOT trigger on bare script name without scripts/ prefix."""
+        skill_dir = tmp_path / "my-skill"
+        scripts_dir = skill_dir / "scripts"
+        scripts_dir.mkdir(parents=True)
+        (scripts_dir / "check.py").write_text("pass")
+
+        parsed = {
+            "tool_calls": [],
+            "text": "Run check.py to validate the document.",
+        }
+        assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is False
+
+    def test_no_detect_unrelated_text(self, tmp_path):
+        """Should not trigger on unrelated text."""
+        skill_dir = tmp_path / "my-skill"
+        scripts_dir = skill_dir / "scripts"
+        scripts_dir.mkdir(parents=True)
+        (scripts_dir / "check.py").write_text("pass")
+
+        parsed = {
+            "tool_calls": [],
+            "text": "Here is a summary of the document.",
+        }
+        assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is False
+
+
 class TestTriggerQueryResultTokenFields:
     """Test TriggerQueryResult token fields."""