diff --git a/skill_eval/agent_runner.py b/skill_eval/agent_runner.py index 605ec2d..9dd92ba 100644 --- a/skill_eval/agent_runner.py +++ b/skill_eval/agent_runner.py @@ -140,10 +140,21 @@ def _build_cmd_with_skill(self, prompt: str, skill_path: str) -> list[str]: skill_content = self._read_skill_content(skill_path) cmd = [ self.CLI_NAME, "-p", prompt, - "--allowedTools", "Read", "Glob", "Grep", "Bash", "Write", "Edit", "Skill", + "--allowedTools", "Read", "Glob", "Grep", "Bash", "Write", "Edit", ] if skill_content: - cmd.extend(["--append-system-prompt", skill_content]) + # Append a workspace hint so the agent knows skill resources + # (scripts/, references/, assets/) are available in the working + # directory. Without this, the agent may not attempt to execute + # scripts even though they've been copied to the workspace. + workspace_hint = ( + "\n\n---\n" + "Note: The skill's scripts/, references/, and assets/ " + "directories are available in your current working directory. " + "You can execute scripts directly, e.g. " + "`python3 scripts/example.py`." + ) + cmd.extend(["--append-system-prompt", skill_content + workspace_hint]) return cmd def _build_cmd_without_skill(self, prompt: str) -> list[str]: diff --git a/skill_eval/functional.py b/skill_eval/functional.py index 1441cae..cf96a85 100644 --- a/skill_eval/functional.py +++ b/skill_eval/functional.py @@ -235,33 +235,53 @@ def _execute_eval_pair( if runner is None: runner = get_runner("claude") - # Set up workspace with eval files - with tempfile.TemporaryDirectory(prefix="skill-eval-") as tmpdir: - workspace = Path(tmpdir) - - # Copy any referenced files into workspace - for rel_file in eval_case.files: - src = evals_dir / rel_file - dst = workspace / Path(rel_file).name - if src.is_file(): - dst.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(src, dst) + # Set up separate workspaces for with-skill and without-skill runs. + # Using independent workspaces prevents contamination — without-skill + # should not see skill scripts/SKILL.md that were copied for with-skill. + with tempfile.TemporaryDirectory(prefix="skill-eval-with-") as with_tmpdir, \ + tempfile.TemporaryDirectory(prefix="skill-eval-without-") as without_tmpdir: + with_workspace = Path(with_tmpdir) + without_workspace = Path(without_tmpdir) + + # Copy eval case files into both workspaces + for ws in (with_workspace, without_workspace): + for rel_file in eval_case.files: + src = evals_dir / rel_file + dst = ws / Path(rel_file).name + if src.is_file(): + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dst) + + # Copy skill resource directories into with-skill workspace only, + # so the agent can access scripts/, references/, and assets/ as + # described in SKILL.md. + if skill_path: + _skill = Path(skill_path) + for subdir in ("scripts", "references", "assets"): + src_dir = _skill / subdir + if src_dir.is_dir(): + dst_dir = with_workspace / subdir + shutil.copytree(src_dir, dst_dir, dirs_exist_ok=True) + # Also copy SKILL.md so the agent can read it if needed + _skill_md = _skill / "SKILL.md" + if _skill_md.is_file(): + shutil.copy2(_skill_md, with_workspace / "SKILL.md") # Run WITH skill with_stdout, with_stderr, with_rc, with_elapsed = runner.run_prompt( eval_case.prompt, skill_path=str(skill_path), - workspace_dir=str(workspace), + workspace_dir=str(with_workspace), timeout=timeout, output_format="stream-json", ) with_parsed = runner.parse_output(with_stdout) - # Run WITHOUT skill (fresh workspace) + # Run WITHOUT skill (clean workspace — no skill files) without_stdout, without_stderr, without_rc, without_elapsed = runner.run_prompt( eval_case.prompt, skill_path=None, - workspace_dir=str(workspace), + workspace_dir=str(without_workspace), timeout=timeout, output_format="stream-json", ) diff --git a/skill_eval/trigger.py b/skill_eval/trigger.py index 7d11e0b..58f40d8 100644 --- a/skill_eval/trigger.py +++ b/skill_eval/trigger.py @@ -298,9 +298,34 @@ def _detect_skill_trigger_from_parsed(parsed: dict, skill_path: Path) -> bool: # Also check text output for skill activation markers text = parsed.get("text", "") - if f"skill:{skill_name}" in text.lower() or f"using {skill_name}" in text.lower(): + text_lower = text.lower() + skill_name_lower = skill_name.lower() + + # Direct skill references + if f"skill:{skill_name_lower}" in text_lower or f"using {skill_name_lower}" in text_lower: return True + # When skill is injected via --append-system-prompt, the agent may + # reference the skill by name or mention its scripts/rules without + # explicitly "activating" it through a tool call. Detect broader + # references to the skill name in the output text. + # Use word-boundary matching to avoid false positives on partial names. + import re as _re + # Match skill name as a standalone term (hyphenated names are common) + skill_word_pattern = _re.compile( + r'\b' + _re.escape(skill_name_lower) + r'\b', + _re.IGNORECASE, + ) + if skill_word_pattern.search(text): + return True + + # Also detect references to skill scripts in the text, using path-level + # matching (scripts/{filename}) to avoid false positives on common names + # like "check.py" or "main.py". + for script_file in script_files: + if f"scripts/{script_file.lower()}" in text_lower: + return True + return False diff --git a/tests/test_agent_runner.py b/tests/test_agent_runner.py index a45936c..81b5736 100644 --- a/tests/test_agent_runner.py +++ b/tests/test_agent_runner.py @@ -377,20 +377,24 @@ def test_build_cmd_with_skill_uses_append_system_prompt(self, tmp_path): cmd = runner._build_cmd_with_skill("test prompt", str(skill_dir)) assert "--append-system-prompt" in cmd assert "--plugin-dir" not in cmd - assert "skill content" in cmd + # The injected content should include a workspace hint + idx = cmd.index("--append-system-prompt") + injected = cmd[idx + 1] + assert "skill content" in injected + assert "scripts/" in injected assert "claude" == cmd[0] assert "-p" in cmd assert "test prompt" in cmd - def test_build_cmd_with_skill_includes_skill_tool(self, tmp_path): - """With-skill command should include the Skill tool.""" + def test_build_cmd_with_skill_no_skill_tool(self, tmp_path): + """With-skill command should NOT include the Skill tool (system prompt injection).""" runner = ClaudeRunner() skill_dir = tmp_path / "skill" skill_dir.mkdir() (skill_dir / "SKILL.md").write_text("content") cmd = runner._build_cmd_with_skill("prompt", str(skill_dir)) - assert "Skill" in cmd + assert "Skill" not in cmd def test_build_cmd_without_skill(self): """_build_cmd_without_skill should not include --append-system-prompt or Skill.""" diff --git a/tests/test_functional.py b/tests/test_functional.py index d179c75..dfbf5eb 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -484,6 +484,46 @@ def test_execute_eval_pair_copies_files_to_workspace(self): assert first_kwargs.get("skill_path") is not None assert second_kwargs.get("skill_path") is None + def test_execute_eval_pair_copies_skill_resources_to_workspace(self): + """Skill scripts/ and references/ should be in with-skill workspace only.""" + stream = _make_stream_json("name age") + runner = self._make_mock_runner() + captured_workspaces = [] + + def capture_run_prompt(prompt, **kwargs): + wd = kwargs.get("workspace_dir") + if wd: + from pathlib import Path as _P + workspace = _P(wd) + captured_workspaces.append({ + "skill_path": kwargs.get("skill_path"), + "has_scripts": (workspace / "scripts").is_dir(), + "has_skill_md": (workspace / "SKILL.md").is_file(), + }) + return (stream, "", 0, 1.0) + + runner.run_prompt.side_effect = capture_run_prompt + + _execute_eval_pair( + self._eval_case(), + FIXTURES / "good-skill", + FIXTURES / "good-skill" / "evals", + run_index=0, + timeout=30, + runner=runner, + ) + # with-skill workspace should have scripts/ and SKILL.md + assert len(captured_workspaces) == 2 + with_skill_ws = captured_workspaces[0] + assert with_skill_ws["skill_path"] is not None + assert with_skill_ws["has_scripts"] is True + assert with_skill_ws["has_skill_md"] is True + # without-skill workspace should NOT have scripts/ or SKILL.md + without_skill_ws = captured_workspaces[1] + assert without_skill_ws["skill_path"] is None + assert without_skill_ws["has_scripts"] is False + assert without_skill_ws["has_skill_md"] is False + class TestClassifyCostEfficiency: """Test Pareto cost-efficiency classification.""" diff --git a/tests/test_trigger.py b/tests/test_trigger.py index f3bcccc..edaa563 100644 --- a/tests/test_trigger.py +++ b/tests/test_trigger.py @@ -435,6 +435,80 @@ def test_multiple_tool_calls_one_triggers(self, tmp_path): assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is True +class TestTextTriggerDetection: + """Test skill name and script reference detection in text output.""" + + def test_detect_skill_name_in_text(self): + """Should detect skill name as a standalone word in text output.""" + parsed = { + "tool_calls": [], + "text": "I'll use the acme-compliance checker to validate this document.", + } + assert _detect_skill_trigger_from_parsed( + parsed, Path("/tmp/acme-compliance") + ) is True + + def test_detect_skill_name_word_boundary(self): + """Should not match partial words containing the skill name.""" + parsed = { + "tool_calls": [], + "text": "This is about non-acme-compliance-related matters.", + } + # "acme-compliance" still appears as a word here, so it matches + assert _detect_skill_trigger_from_parsed( + parsed, Path("/tmp/acme-compliance") + ) is True + + def test_no_partial_match(self): + """Should not match when skill name is only a substring.""" + parsed = { + "tool_calls": [], + "text": "The weather is nice today.", + } + assert _detect_skill_trigger_from_parsed( + parsed, Path("/tmp/weather-forecast") + ) is False + + def test_detect_script_file_in_text(self, tmp_path): + """Should detect references to skill scripts using path-level matching.""" + skill_dir = tmp_path / "my-skill" + scripts_dir = skill_dir / "scripts" + scripts_dir.mkdir(parents=True) + (scripts_dir / "check.py").write_text("#!/usr/bin/env python3\npass") + + parsed = { + "tool_calls": [], + "text": "You should run scripts/check.py to validate the document.", + } + assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is True + + def test_no_detect_bare_script_name(self, tmp_path): + """Should NOT trigger on bare script name without scripts/ prefix.""" + skill_dir = tmp_path / "my-skill" + scripts_dir = skill_dir / "scripts" + scripts_dir.mkdir(parents=True) + (scripts_dir / "check.py").write_text("pass") + + parsed = { + "tool_calls": [], + "text": "Run check.py to validate the document.", + } + assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is False + + def test_no_detect_unrelated_text(self, tmp_path): + """Should not trigger on unrelated text.""" + skill_dir = tmp_path / "my-skill" + scripts_dir = skill_dir / "scripts" + scripts_dir.mkdir(parents=True) + (scripts_dir / "check.py").write_text("pass") + + parsed = { + "tool_calls": [], + "text": "Here is a summary of the document.", + } + assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is False + + class TestTriggerQueryResultTokenFields: """Test TriggerQueryResult token fields."""