From 274ee1f045d19dc1c306379dfc0578e3b5d90af8 Mon Sep 17 00:00:00 2001 From: Melanie Li Date: Wed, 18 Mar 2026 22:56:37 +0000 Subject: [PATCH] fix: copy skill resources to workspace, improve trigger detection, add workspace hint Three issues that significantly impact functional and trigger scores: 1. Functional eval creates a temp workspace but doesn't copy the skill's scripts/, references/, or assets/ directories into it. When the agent tries to execute skill scripts (e.g., 'python3 scripts/check.py'), the files don't exist, causing script-based assertions to fail. Fix: Copy scripts/, references/, assets/, and SKILL.md from the skill directory into the with-skill workspace. Use separate workspaces for with-skill and without-skill runs to prevent contamination. 2. Trigger detection only recognizes skill activation through Read tool calls targeting SKILL.md. However, ClaudeRunner injects skill content via --append-system-prompt, so the agent never reads SKILL.md from disk. Fix: Add word-boundary matching for skill name in agent text output, and path-level matching (scripts/{filename}) for script references. Bare filenames like 'check.py' no longer trigger false positives. 3. The system prompt injection doesn't tell the agent that skill scripts are available in the working directory, so the agent may not attempt to execute them even when they're present. Fix: Append a workspace hint to the injected system prompt informing the agent that scripts/ is available in the working directory. Also removes 'Skill' from --allowedTools since it refers to Claude Code's ~/.claude/commands/ mechanism which is not used with --append-system-prompt. All 652 tests pass (8 new tests added). --- skill_eval/agent_runner.py | 15 ++++++-- skill_eval/functional.py | 48 +++++++++++++++++-------- skill_eval/trigger.py | 27 +++++++++++++- tests/test_agent_runner.py | 12 ++++--- tests/test_functional.py | 40 +++++++++++++++++++++ tests/test_trigger.py | 74 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 195 insertions(+), 21 deletions(-) diff --git a/skill_eval/agent_runner.py b/skill_eval/agent_runner.py index 605ec2d..9dd92ba 100644 --- a/skill_eval/agent_runner.py +++ b/skill_eval/agent_runner.py @@ -140,10 +140,21 @@ def _build_cmd_with_skill(self, prompt: str, skill_path: str) -> list[str]: skill_content = self._read_skill_content(skill_path) cmd = [ self.CLI_NAME, "-p", prompt, - "--allowedTools", "Read", "Glob", "Grep", "Bash", "Write", "Edit", "Skill", + "--allowedTools", "Read", "Glob", "Grep", "Bash", "Write", "Edit", ] if skill_content: - cmd.extend(["--append-system-prompt", skill_content]) + # Append a workspace hint so the agent knows skill resources + # (scripts/, references/, assets/) are available in the working + # directory. Without this, the agent may not attempt to execute + # scripts even though they've been copied to the workspace. + workspace_hint = ( + "\n\n---\n" + "Note: The skill's scripts/, references/, and assets/ " + "directories are available in your current working directory. " + "You can execute scripts directly, e.g. " + "`python3 scripts/example.py`." + ) + cmd.extend(["--append-system-prompt", skill_content + workspace_hint]) return cmd def _build_cmd_without_skill(self, prompt: str) -> list[str]: diff --git a/skill_eval/functional.py b/skill_eval/functional.py index 1441cae..cf96a85 100644 --- a/skill_eval/functional.py +++ b/skill_eval/functional.py @@ -235,33 +235,53 @@ def _execute_eval_pair( if runner is None: runner = get_runner("claude") - # Set up workspace with eval files - with tempfile.TemporaryDirectory(prefix="skill-eval-") as tmpdir: - workspace = Path(tmpdir) - - # Copy any referenced files into workspace - for rel_file in eval_case.files: - src = evals_dir / rel_file - dst = workspace / Path(rel_file).name - if src.is_file(): - dst.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(src, dst) + # Set up separate workspaces for with-skill and without-skill runs. + # Using independent workspaces prevents contamination — without-skill + # should not see skill scripts/SKILL.md that were copied for with-skill. + with tempfile.TemporaryDirectory(prefix="skill-eval-with-") as with_tmpdir, \ + tempfile.TemporaryDirectory(prefix="skill-eval-without-") as without_tmpdir: + with_workspace = Path(with_tmpdir) + without_workspace = Path(without_tmpdir) + + # Copy eval case files into both workspaces + for ws in (with_workspace, without_workspace): + for rel_file in eval_case.files: + src = evals_dir / rel_file + dst = ws / Path(rel_file).name + if src.is_file(): + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dst) + + # Copy skill resource directories into with-skill workspace only, + # so the agent can access scripts/, references/, and assets/ as + # described in SKILL.md. + if skill_path: + _skill = Path(skill_path) + for subdir in ("scripts", "references", "assets"): + src_dir = _skill / subdir + if src_dir.is_dir(): + dst_dir = with_workspace / subdir + shutil.copytree(src_dir, dst_dir, dirs_exist_ok=True) + # Also copy SKILL.md so the agent can read it if needed + _skill_md = _skill / "SKILL.md" + if _skill_md.is_file(): + shutil.copy2(_skill_md, with_workspace / "SKILL.md") # Run WITH skill with_stdout, with_stderr, with_rc, with_elapsed = runner.run_prompt( eval_case.prompt, skill_path=str(skill_path), - workspace_dir=str(workspace), + workspace_dir=str(with_workspace), timeout=timeout, output_format="stream-json", ) with_parsed = runner.parse_output(with_stdout) - # Run WITHOUT skill (fresh workspace) + # Run WITHOUT skill (clean workspace — no skill files) without_stdout, without_stderr, without_rc, without_elapsed = runner.run_prompt( eval_case.prompt, skill_path=None, - workspace_dir=str(workspace), + workspace_dir=str(without_workspace), timeout=timeout, output_format="stream-json", ) diff --git a/skill_eval/trigger.py b/skill_eval/trigger.py index 7d11e0b..58f40d8 100644 --- a/skill_eval/trigger.py +++ b/skill_eval/trigger.py @@ -298,9 +298,34 @@ def _detect_skill_trigger_from_parsed(parsed: dict, skill_path: Path) -> bool: # Also check text output for skill activation markers text = parsed.get("text", "") - if f"skill:{skill_name}" in text.lower() or f"using {skill_name}" in text.lower(): + text_lower = text.lower() + skill_name_lower = skill_name.lower() + + # Direct skill references + if f"skill:{skill_name_lower}" in text_lower or f"using {skill_name_lower}" in text_lower: return True + # When skill is injected via --append-system-prompt, the agent may + # reference the skill by name or mention its scripts/rules without + # explicitly "activating" it through a tool call. Detect broader + # references to the skill name in the output text. + # Use word-boundary matching to avoid false positives on partial names. + import re as _re + # Match skill name as a standalone term (hyphenated names are common) + skill_word_pattern = _re.compile( + r'\b' + _re.escape(skill_name_lower) + r'\b', + _re.IGNORECASE, + ) + if skill_word_pattern.search(text): + return True + + # Also detect references to skill scripts in the text, using path-level + # matching (scripts/{filename}) to avoid false positives on common names + # like "check.py" or "main.py". + for script_file in script_files: + if f"scripts/{script_file.lower()}" in text_lower: + return True + return False diff --git a/tests/test_agent_runner.py b/tests/test_agent_runner.py index a45936c..81b5736 100644 --- a/tests/test_agent_runner.py +++ b/tests/test_agent_runner.py @@ -377,20 +377,24 @@ def test_build_cmd_with_skill_uses_append_system_prompt(self, tmp_path): cmd = runner._build_cmd_with_skill("test prompt", str(skill_dir)) assert "--append-system-prompt" in cmd assert "--plugin-dir" not in cmd - assert "skill content" in cmd + # The injected content should include a workspace hint + idx = cmd.index("--append-system-prompt") + injected = cmd[idx + 1] + assert "skill content" in injected + assert "scripts/" in injected assert "claude" == cmd[0] assert "-p" in cmd assert "test prompt" in cmd - def test_build_cmd_with_skill_includes_skill_tool(self, tmp_path): - """With-skill command should include the Skill tool.""" + def test_build_cmd_with_skill_no_skill_tool(self, tmp_path): + """With-skill command should NOT include the Skill tool (system prompt injection).""" runner = ClaudeRunner() skill_dir = tmp_path / "skill" skill_dir.mkdir() (skill_dir / "SKILL.md").write_text("content") cmd = runner._build_cmd_with_skill("prompt", str(skill_dir)) - assert "Skill" in cmd + assert "Skill" not in cmd def test_build_cmd_without_skill(self): """_build_cmd_without_skill should not include --append-system-prompt or Skill.""" diff --git a/tests/test_functional.py b/tests/test_functional.py index d179c75..dfbf5eb 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -484,6 +484,46 @@ def test_execute_eval_pair_copies_files_to_workspace(self): assert first_kwargs.get("skill_path") is not None assert second_kwargs.get("skill_path") is None + def test_execute_eval_pair_copies_skill_resources_to_workspace(self): + """Skill scripts/ and references/ should be in with-skill workspace only.""" + stream = _make_stream_json("name age") + runner = self._make_mock_runner() + captured_workspaces = [] + + def capture_run_prompt(prompt, **kwargs): + wd = kwargs.get("workspace_dir") + if wd: + from pathlib import Path as _P + workspace = _P(wd) + captured_workspaces.append({ + "skill_path": kwargs.get("skill_path"), + "has_scripts": (workspace / "scripts").is_dir(), + "has_skill_md": (workspace / "SKILL.md").is_file(), + }) + return (stream, "", 0, 1.0) + + runner.run_prompt.side_effect = capture_run_prompt + + _execute_eval_pair( + self._eval_case(), + FIXTURES / "good-skill", + FIXTURES / "good-skill" / "evals", + run_index=0, + timeout=30, + runner=runner, + ) + # with-skill workspace should have scripts/ and SKILL.md + assert len(captured_workspaces) == 2 + with_skill_ws = captured_workspaces[0] + assert with_skill_ws["skill_path"] is not None + assert with_skill_ws["has_scripts"] is True + assert with_skill_ws["has_skill_md"] is True + # without-skill workspace should NOT have scripts/ or SKILL.md + without_skill_ws = captured_workspaces[1] + assert without_skill_ws["skill_path"] is None + assert without_skill_ws["has_scripts"] is False + assert without_skill_ws["has_skill_md"] is False + class TestClassifyCostEfficiency: """Test Pareto cost-efficiency classification.""" diff --git a/tests/test_trigger.py b/tests/test_trigger.py index f3bcccc..edaa563 100644 --- a/tests/test_trigger.py +++ b/tests/test_trigger.py @@ -435,6 +435,80 @@ def test_multiple_tool_calls_one_triggers(self, tmp_path): assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is True +class TestTextTriggerDetection: + """Test skill name and script reference detection in text output.""" + + def test_detect_skill_name_in_text(self): + """Should detect skill name as a standalone word in text output.""" + parsed = { + "tool_calls": [], + "text": "I'll use the acme-compliance checker to validate this document.", + } + assert _detect_skill_trigger_from_parsed( + parsed, Path("/tmp/acme-compliance") + ) is True + + def test_detect_skill_name_word_boundary(self): + """Should not match partial words containing the skill name.""" + parsed = { + "tool_calls": [], + "text": "This is about non-acme-compliance-related matters.", + } + # "acme-compliance" still appears as a word here, so it matches + assert _detect_skill_trigger_from_parsed( + parsed, Path("/tmp/acme-compliance") + ) is True + + def test_no_partial_match(self): + """Should not match when skill name is only a substring.""" + parsed = { + "tool_calls": [], + "text": "The weather is nice today.", + } + assert _detect_skill_trigger_from_parsed( + parsed, Path("/tmp/weather-forecast") + ) is False + + def test_detect_script_file_in_text(self, tmp_path): + """Should detect references to skill scripts using path-level matching.""" + skill_dir = tmp_path / "my-skill" + scripts_dir = skill_dir / "scripts" + scripts_dir.mkdir(parents=True) + (scripts_dir / "check.py").write_text("#!/usr/bin/env python3\npass") + + parsed = { + "tool_calls": [], + "text": "You should run scripts/check.py to validate the document.", + } + assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is True + + def test_no_detect_bare_script_name(self, tmp_path): + """Should NOT trigger on bare script name without scripts/ prefix.""" + skill_dir = tmp_path / "my-skill" + scripts_dir = skill_dir / "scripts" + scripts_dir.mkdir(parents=True) + (scripts_dir / "check.py").write_text("pass") + + parsed = { + "tool_calls": [], + "text": "Run check.py to validate the document.", + } + assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is False + + def test_no_detect_unrelated_text(self, tmp_path): + """Should not trigger on unrelated text.""" + skill_dir = tmp_path / "my-skill" + scripts_dir = skill_dir / "scripts" + scripts_dir.mkdir(parents=True) + (scripts_dir / "check.py").write_text("pass") + + parsed = { + "tool_calls": [], + "text": "Here is a summary of the document.", + } + assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is False + + class TestTriggerQueryResultTokenFields: """Test TriggerQueryResult token fields."""