Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions skill_eval/agent_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,21 @@ def _build_cmd_with_skill(self, prompt: str, skill_path: str) -> list[str]:
skill_content = self._read_skill_content(skill_path)
cmd = [
self.CLI_NAME, "-p", prompt,
"--allowedTools", "Read", "Glob", "Grep", "Bash", "Write", "Edit", "Skill",
"--allowedTools", "Read", "Glob", "Grep", "Bash", "Write", "Edit",
]
if skill_content:
cmd.extend(["--append-system-prompt", skill_content])
# Append a workspace hint so the agent knows skill resources
# (scripts/, references/, assets/) are available in the working
# directory. Without this, the agent may not attempt to execute
# scripts even though they've been copied to the workspace.
workspace_hint = (
"\n\n---\n"
"Note: The skill's scripts/, references/, and assets/ "
"directories are available in your current working directory. "
"You can execute scripts directly, e.g. "
"`python3 scripts/example.py`."
)
cmd.extend(["--append-system-prompt", skill_content + workspace_hint])
return cmd

def _build_cmd_without_skill(self, prompt: str) -> list[str]:
Expand Down
48 changes: 34 additions & 14 deletions skill_eval/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,33 +235,53 @@ def _execute_eval_pair(
if runner is None:
runner = get_runner("claude")

# Set up workspace with eval files
with tempfile.TemporaryDirectory(prefix="skill-eval-") as tmpdir:
workspace = Path(tmpdir)

# Copy any referenced files into workspace
for rel_file in eval_case.files:
src = evals_dir / rel_file
dst = workspace / Path(rel_file).name
if src.is_file():
dst.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(src, dst)
# Set up separate workspaces for with-skill and without-skill runs.
# Using independent workspaces prevents contamination — without-skill
# should not see skill scripts/SKILL.md that were copied for with-skill.
with tempfile.TemporaryDirectory(prefix="skill-eval-with-") as with_tmpdir, \
tempfile.TemporaryDirectory(prefix="skill-eval-without-") as without_tmpdir:
with_workspace = Path(with_tmpdir)
without_workspace = Path(without_tmpdir)

# Copy eval case files into both workspaces
for ws in (with_workspace, without_workspace):
for rel_file in eval_case.files:
src = evals_dir / rel_file
dst = ws / Path(rel_file).name
if src.is_file():
dst.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(src, dst)

# Copy skill resource directories into with-skill workspace only,
# so the agent can access scripts/, references/, and assets/ as
# described in SKILL.md.
if skill_path:
_skill = Path(skill_path)
for subdir in ("scripts", "references", "assets"):
src_dir = _skill / subdir
if src_dir.is_dir():
dst_dir = with_workspace / subdir
shutil.copytree(src_dir, dst_dir, dirs_exist_ok=True)
# Also copy SKILL.md so the agent can read it if needed
_skill_md = _skill / "SKILL.md"
if _skill_md.is_file():
shutil.copy2(_skill_md, with_workspace / "SKILL.md")

# Run WITH skill
with_stdout, with_stderr, with_rc, with_elapsed = runner.run_prompt(
eval_case.prompt,
skill_path=str(skill_path),
workspace_dir=str(workspace),
workspace_dir=str(with_workspace),
timeout=timeout,
output_format="stream-json",
)
with_parsed = runner.parse_output(with_stdout)

# Run WITHOUT skill (fresh workspace)
# Run WITHOUT skill (clean workspace — no skill files)
without_stdout, without_stderr, without_rc, without_elapsed = runner.run_prompt(
eval_case.prompt,
skill_path=None,
workspace_dir=str(workspace),
workspace_dir=str(without_workspace),
timeout=timeout,
output_format="stream-json",
)
Expand Down
27 changes: 26 additions & 1 deletion skill_eval/trigger.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,9 +298,34 @@ def _detect_skill_trigger_from_parsed(parsed: dict, skill_path: Path) -> bool:

# Also check text output for skill activation markers
text = parsed.get("text", "")
if f"skill:{skill_name}" in text.lower() or f"using {skill_name}" in text.lower():
text_lower = text.lower()
skill_name_lower = skill_name.lower()

# Direct skill references
if f"skill:{skill_name_lower}" in text_lower or f"using {skill_name_lower}" in text_lower:
return True

# When skill is injected via --append-system-prompt, the agent may
# reference the skill by name or mention its scripts/rules without
# explicitly "activating" it through a tool call. Detect broader
# references to the skill name in the output text.
# Use word-boundary matching to avoid false positives on partial names.
import re as _re
# Match skill name as a standalone term (hyphenated names are common)
skill_word_pattern = _re.compile(
r'\b' + _re.escape(skill_name_lower) + r'\b',
_re.IGNORECASE,
)
if skill_word_pattern.search(text):
return True

# Also detect references to skill scripts in the text, using path-level
# matching (scripts/{filename}) to avoid false positives on common names
# like "check.py" or "main.py".
for script_file in script_files:
if f"scripts/{script_file.lower()}" in text_lower:
return True

return False


Expand Down
12 changes: 8 additions & 4 deletions tests/test_agent_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,20 +377,24 @@ def test_build_cmd_with_skill_uses_append_system_prompt(self, tmp_path):
cmd = runner._build_cmd_with_skill("test prompt", str(skill_dir))
assert "--append-system-prompt" in cmd
assert "--plugin-dir" not in cmd
assert "skill content" in cmd
# The injected content should include a workspace hint
idx = cmd.index("--append-system-prompt")
injected = cmd[idx + 1]
assert "skill content" in injected
assert "scripts/" in injected
assert "claude" == cmd[0]
assert "-p" in cmd
assert "test prompt" in cmd

def test_build_cmd_with_skill_includes_skill_tool(self, tmp_path):
"""With-skill command should include the Skill tool."""
def test_build_cmd_with_skill_no_skill_tool(self, tmp_path):
"""With-skill command should NOT include the Skill tool (system prompt injection)."""
runner = ClaudeRunner()
skill_dir = tmp_path / "skill"
skill_dir.mkdir()
(skill_dir / "SKILL.md").write_text("content")

cmd = runner._build_cmd_with_skill("prompt", str(skill_dir))
assert "Skill" in cmd
assert "Skill" not in cmd

def test_build_cmd_without_skill(self):
"""_build_cmd_without_skill should not include --append-system-prompt or Skill."""
Expand Down
40 changes: 40 additions & 0 deletions tests/test_functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,46 @@ def test_execute_eval_pair_copies_files_to_workspace(self):
assert first_kwargs.get("skill_path") is not None
assert second_kwargs.get("skill_path") is None

def test_execute_eval_pair_copies_skill_resources_to_workspace(self):
"""Skill scripts/ and references/ should be in with-skill workspace only."""
stream = _make_stream_json("name age")
runner = self._make_mock_runner()
captured_workspaces = []

def capture_run_prompt(prompt, **kwargs):
wd = kwargs.get("workspace_dir")
if wd:
from pathlib import Path as _P
workspace = _P(wd)
captured_workspaces.append({
"skill_path": kwargs.get("skill_path"),
"has_scripts": (workspace / "scripts").is_dir(),
"has_skill_md": (workspace / "SKILL.md").is_file(),
})
return (stream, "", 0, 1.0)

runner.run_prompt.side_effect = capture_run_prompt

_execute_eval_pair(
self._eval_case(),
FIXTURES / "good-skill",
FIXTURES / "good-skill" / "evals",
run_index=0,
timeout=30,
runner=runner,
)
# with-skill workspace should have scripts/ and SKILL.md
assert len(captured_workspaces) == 2
with_skill_ws = captured_workspaces[0]
assert with_skill_ws["skill_path"] is not None
assert with_skill_ws["has_scripts"] is True
assert with_skill_ws["has_skill_md"] is True
# without-skill workspace should NOT have scripts/ or SKILL.md
without_skill_ws = captured_workspaces[1]
assert without_skill_ws["skill_path"] is None
assert without_skill_ws["has_scripts"] is False
assert without_skill_ws["has_skill_md"] is False


class TestClassifyCostEfficiency:
"""Test Pareto cost-efficiency classification."""
Expand Down
74 changes: 74 additions & 0 deletions tests/test_trigger.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,80 @@ def test_multiple_tool_calls_one_triggers(self, tmp_path):
assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is True


class TestTextTriggerDetection:
"""Test skill name and script reference detection in text output."""

def test_detect_skill_name_in_text(self):
"""Should detect skill name as a standalone word in text output."""
parsed = {
"tool_calls": [],
"text": "I'll use the acme-compliance checker to validate this document.",
}
assert _detect_skill_trigger_from_parsed(
parsed, Path("/tmp/acme-compliance")
) is True

def test_detect_skill_name_word_boundary(self):
"""Should not match partial words containing the skill name."""
parsed = {
"tool_calls": [],
"text": "This is about non-acme-compliance-related matters.",
}
# "acme-compliance" still appears as a word here, so it matches
assert _detect_skill_trigger_from_parsed(
parsed, Path("/tmp/acme-compliance")
) is True

def test_no_partial_match(self):
"""Should not match when skill name is only a substring."""
parsed = {
"tool_calls": [],
"text": "The weather is nice today.",
}
assert _detect_skill_trigger_from_parsed(
parsed, Path("/tmp/weather-forecast")
) is False

def test_detect_script_file_in_text(self, tmp_path):
"""Should detect references to skill scripts using path-level matching."""
skill_dir = tmp_path / "my-skill"
scripts_dir = skill_dir / "scripts"
scripts_dir.mkdir(parents=True)
(scripts_dir / "check.py").write_text("#!/usr/bin/env python3\npass")

parsed = {
"tool_calls": [],
"text": "You should run scripts/check.py to validate the document.",
}
assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is True

def test_no_detect_bare_script_name(self, tmp_path):
"""Should NOT trigger on bare script name without scripts/ prefix."""
skill_dir = tmp_path / "my-skill"
scripts_dir = skill_dir / "scripts"
scripts_dir.mkdir(parents=True)
(scripts_dir / "check.py").write_text("pass")

parsed = {
"tool_calls": [],
"text": "Run check.py to validate the document.",
}
assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is False

def test_no_detect_unrelated_text(self, tmp_path):
"""Should not trigger on unrelated text."""
skill_dir = tmp_path / "my-skill"
scripts_dir = skill_dir / "scripts"
scripts_dir.mkdir(parents=True)
(scripts_dir / "check.py").write_text("pass")

parsed = {
"tool_calls": [],
"text": "Here is a summary of the document.",
}
assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is False


class TestTriggerQueryResultTokenFields:
"""Test TriggerQueryResult token fields."""

Expand Down
Loading