Skip to content

Commit 274ee1f

Browse files
author
Melanie Li
committed
fix: copy skill resources to workspace, improve trigger detection, add workspace hint
Three issues that significantly impact functional and trigger scores: 1. Functional eval creates a temp workspace but doesn't copy the skill's scripts/, references/, or assets/ directories into it. When the agent tries to execute skill scripts (e.g., 'python3 scripts/check.py'), the files don't exist, causing script-based assertions to fail. Fix: Copy scripts/, references/, assets/, and SKILL.md from the skill directory into the with-skill workspace. Use separate workspaces for with-skill and without-skill runs to prevent contamination. 2. Trigger detection only recognizes skill activation through Read tool calls targeting SKILL.md. However, ClaudeRunner injects skill content via --append-system-prompt, so the agent never reads SKILL.md from disk. Fix: Add word-boundary matching for skill name in agent text output, and path-level matching (scripts/{filename}) for script references. Bare filenames like 'check.py' no longer trigger false positives. 3. The system prompt injection doesn't tell the agent that skill scripts are available in the working directory, so the agent may not attempt to execute them even when they're present. Fix: Append a workspace hint to the injected system prompt informing the agent that scripts/ is available in the working directory. Also removes 'Skill' from --allowedTools since it refers to Claude Code's ~/.claude/commands/ mechanism which is not used with --append-system-prompt. All 652 tests pass (8 new tests added).
1 parent ddb15c6 commit 274ee1f

6 files changed

Lines changed: 195 additions & 21 deletions

File tree

skill_eval/agent_runner.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,10 +140,21 @@ def _build_cmd_with_skill(self, prompt: str, skill_path: str) -> list[str]:
140140
skill_content = self._read_skill_content(skill_path)
141141
cmd = [
142142
self.CLI_NAME, "-p", prompt,
143-
"--allowedTools", "Read", "Glob", "Grep", "Bash", "Write", "Edit", "Skill",
143+
"--allowedTools", "Read", "Glob", "Grep", "Bash", "Write", "Edit",
144144
]
145145
if skill_content:
146-
cmd.extend(["--append-system-prompt", skill_content])
146+
# Append a workspace hint so the agent knows skill resources
147+
# (scripts/, references/, assets/) are available in the working
148+
# directory. Without this, the agent may not attempt to execute
149+
# scripts even though they've been copied to the workspace.
150+
workspace_hint = (
151+
"\n\n---\n"
152+
"Note: The skill's scripts/, references/, and assets/ "
153+
"directories are available in your current working directory. "
154+
"You can execute scripts directly, e.g. "
155+
"`python3 scripts/example.py`."
156+
)
157+
cmd.extend(["--append-system-prompt", skill_content + workspace_hint])
147158
return cmd
148159

149160
def _build_cmd_without_skill(self, prompt: str) -> list[str]:

skill_eval/functional.py

Lines changed: 34 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -235,33 +235,53 @@ def _execute_eval_pair(
235235
if runner is None:
236236
runner = get_runner("claude")
237237

238-
# Set up workspace with eval files
239-
with tempfile.TemporaryDirectory(prefix="skill-eval-") as tmpdir:
240-
workspace = Path(tmpdir)
241-
242-
# Copy any referenced files into workspace
243-
for rel_file in eval_case.files:
244-
src = evals_dir / rel_file
245-
dst = workspace / Path(rel_file).name
246-
if src.is_file():
247-
dst.parent.mkdir(parents=True, exist_ok=True)
248-
shutil.copy2(src, dst)
238+
# Set up separate workspaces for with-skill and without-skill runs.
239+
# Using independent workspaces prevents contamination — without-skill
240+
# should not see skill scripts/SKILL.md that were copied for with-skill.
241+
with tempfile.TemporaryDirectory(prefix="skill-eval-with-") as with_tmpdir, \
242+
tempfile.TemporaryDirectory(prefix="skill-eval-without-") as without_tmpdir:
243+
with_workspace = Path(with_tmpdir)
244+
without_workspace = Path(without_tmpdir)
245+
246+
# Copy eval case files into both workspaces
247+
for ws in (with_workspace, without_workspace):
248+
for rel_file in eval_case.files:
249+
src = evals_dir / rel_file
250+
dst = ws / Path(rel_file).name
251+
if src.is_file():
252+
dst.parent.mkdir(parents=True, exist_ok=True)
253+
shutil.copy2(src, dst)
254+
255+
# Copy skill resource directories into with-skill workspace only,
256+
# so the agent can access scripts/, references/, and assets/ as
257+
# described in SKILL.md.
258+
if skill_path:
259+
_skill = Path(skill_path)
260+
for subdir in ("scripts", "references", "assets"):
261+
src_dir = _skill / subdir
262+
if src_dir.is_dir():
263+
dst_dir = with_workspace / subdir
264+
shutil.copytree(src_dir, dst_dir, dirs_exist_ok=True)
265+
# Also copy SKILL.md so the agent can read it if needed
266+
_skill_md = _skill / "SKILL.md"
267+
if _skill_md.is_file():
268+
shutil.copy2(_skill_md, with_workspace / "SKILL.md")
249269

250270
# Run WITH skill
251271
with_stdout, with_stderr, with_rc, with_elapsed = runner.run_prompt(
252272
eval_case.prompt,
253273
skill_path=str(skill_path),
254-
workspace_dir=str(workspace),
274+
workspace_dir=str(with_workspace),
255275
timeout=timeout,
256276
output_format="stream-json",
257277
)
258278
with_parsed = runner.parse_output(with_stdout)
259279

260-
# Run WITHOUT skill (fresh workspace)
280+
# Run WITHOUT skill (clean workspace — no skill files)
261281
without_stdout, without_stderr, without_rc, without_elapsed = runner.run_prompt(
262282
eval_case.prompt,
263283
skill_path=None,
264-
workspace_dir=str(workspace),
284+
workspace_dir=str(without_workspace),
265285
timeout=timeout,
266286
output_format="stream-json",
267287
)

skill_eval/trigger.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -298,9 +298,34 @@ def _detect_skill_trigger_from_parsed(parsed: dict, skill_path: Path) -> bool:
298298

299299
# Also check text output for skill activation markers
300300
text = parsed.get("text", "")
301-
if f"skill:{skill_name}" in text.lower() or f"using {skill_name}" in text.lower():
301+
text_lower = text.lower()
302+
skill_name_lower = skill_name.lower()
303+
304+
# Direct skill references
305+
if f"skill:{skill_name_lower}" in text_lower or f"using {skill_name_lower}" in text_lower:
302306
return True
303307

308+
# When skill is injected via --append-system-prompt, the agent may
309+
# reference the skill by name or mention its scripts/rules without
310+
# explicitly "activating" it through a tool call. Detect broader
311+
# references to the skill name in the output text.
312+
# Use word-boundary matching to avoid false positives on partial names.
313+
import re as _re
314+
# Match skill name as a standalone term (hyphenated names are common)
315+
skill_word_pattern = _re.compile(
316+
r'\b' + _re.escape(skill_name_lower) + r'\b',
317+
_re.IGNORECASE,
318+
)
319+
if skill_word_pattern.search(text):
320+
return True
321+
322+
# Also detect references to skill scripts in the text, using path-level
323+
# matching (scripts/{filename}) to avoid false positives on common names
324+
# like "check.py" or "main.py".
325+
for script_file in script_files:
326+
if f"scripts/{script_file.lower()}" in text_lower:
327+
return True
328+
304329
return False
305330

306331

tests/test_agent_runner.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -377,20 +377,24 @@ def test_build_cmd_with_skill_uses_append_system_prompt(self, tmp_path):
377377
cmd = runner._build_cmd_with_skill("test prompt", str(skill_dir))
378378
assert "--append-system-prompt" in cmd
379379
assert "--plugin-dir" not in cmd
380-
assert "skill content" in cmd
380+
# The injected content should include a workspace hint
381+
idx = cmd.index("--append-system-prompt")
382+
injected = cmd[idx + 1]
383+
assert "skill content" in injected
384+
assert "scripts/" in injected
381385
assert "claude" == cmd[0]
382386
assert "-p" in cmd
383387
assert "test prompt" in cmd
384388

385-
def test_build_cmd_with_skill_includes_skill_tool(self, tmp_path):
386-
"""With-skill command should include the Skill tool."""
389+
def test_build_cmd_with_skill_no_skill_tool(self, tmp_path):
390+
"""With-skill command should NOT include the Skill tool (system prompt injection)."""
387391
runner = ClaudeRunner()
388392
skill_dir = tmp_path / "skill"
389393
skill_dir.mkdir()
390394
(skill_dir / "SKILL.md").write_text("content")
391395

392396
cmd = runner._build_cmd_with_skill("prompt", str(skill_dir))
393-
assert "Skill" in cmd
397+
assert "Skill" not in cmd
394398

395399
def test_build_cmd_without_skill(self):
396400
"""_build_cmd_without_skill should not include --append-system-prompt or Skill."""

tests/test_functional.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,6 +484,46 @@ def test_execute_eval_pair_copies_files_to_workspace(self):
484484
assert first_kwargs.get("skill_path") is not None
485485
assert second_kwargs.get("skill_path") is None
486486

487+
def test_execute_eval_pair_copies_skill_resources_to_workspace(self):
488+
"""Skill scripts/ and references/ should be in with-skill workspace only."""
489+
stream = _make_stream_json("name age")
490+
runner = self._make_mock_runner()
491+
captured_workspaces = []
492+
493+
def capture_run_prompt(prompt, **kwargs):
494+
wd = kwargs.get("workspace_dir")
495+
if wd:
496+
from pathlib import Path as _P
497+
workspace = _P(wd)
498+
captured_workspaces.append({
499+
"skill_path": kwargs.get("skill_path"),
500+
"has_scripts": (workspace / "scripts").is_dir(),
501+
"has_skill_md": (workspace / "SKILL.md").is_file(),
502+
})
503+
return (stream, "", 0, 1.0)
504+
505+
runner.run_prompt.side_effect = capture_run_prompt
506+
507+
_execute_eval_pair(
508+
self._eval_case(),
509+
FIXTURES / "good-skill",
510+
FIXTURES / "good-skill" / "evals",
511+
run_index=0,
512+
timeout=30,
513+
runner=runner,
514+
)
515+
# with-skill workspace should have scripts/ and SKILL.md
516+
assert len(captured_workspaces) == 2
517+
with_skill_ws = captured_workspaces[0]
518+
assert with_skill_ws["skill_path"] is not None
519+
assert with_skill_ws["has_scripts"] is True
520+
assert with_skill_ws["has_skill_md"] is True
521+
# without-skill workspace should NOT have scripts/ or SKILL.md
522+
without_skill_ws = captured_workspaces[1]
523+
assert without_skill_ws["skill_path"] is None
524+
assert without_skill_ws["has_scripts"] is False
525+
assert without_skill_ws["has_skill_md"] is False
526+
487527

488528
class TestClassifyCostEfficiency:
489529
"""Test Pareto cost-efficiency classification."""

tests/test_trigger.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,80 @@ def test_multiple_tool_calls_one_triggers(self, tmp_path):
435435
assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is True
436436

437437

438+
class TestTextTriggerDetection:
439+
"""Test skill name and script reference detection in text output."""
440+
441+
def test_detect_skill_name_in_text(self):
442+
"""Should detect skill name as a standalone word in text output."""
443+
parsed = {
444+
"tool_calls": [],
445+
"text": "I'll use the acme-compliance checker to validate this document.",
446+
}
447+
assert _detect_skill_trigger_from_parsed(
448+
parsed, Path("/tmp/acme-compliance")
449+
) is True
450+
451+
def test_detect_skill_name_word_boundary(self):
452+
"""Should not match partial words containing the skill name."""
453+
parsed = {
454+
"tool_calls": [],
455+
"text": "This is about non-acme-compliance-related matters.",
456+
}
457+
# "acme-compliance" still appears as a word here, so it matches
458+
assert _detect_skill_trigger_from_parsed(
459+
parsed, Path("/tmp/acme-compliance")
460+
) is True
461+
462+
def test_no_partial_match(self):
463+
"""Should not match when skill name is only a substring."""
464+
parsed = {
465+
"tool_calls": [],
466+
"text": "The weather is nice today.",
467+
}
468+
assert _detect_skill_trigger_from_parsed(
469+
parsed, Path("/tmp/weather-forecast")
470+
) is False
471+
472+
def test_detect_script_file_in_text(self, tmp_path):
473+
"""Should detect references to skill scripts using path-level matching."""
474+
skill_dir = tmp_path / "my-skill"
475+
scripts_dir = skill_dir / "scripts"
476+
scripts_dir.mkdir(parents=True)
477+
(scripts_dir / "check.py").write_text("#!/usr/bin/env python3\npass")
478+
479+
parsed = {
480+
"tool_calls": [],
481+
"text": "You should run scripts/check.py to validate the document.",
482+
}
483+
assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is True
484+
485+
def test_no_detect_bare_script_name(self, tmp_path):
486+
"""Should NOT trigger on bare script name without scripts/ prefix."""
487+
skill_dir = tmp_path / "my-skill"
488+
scripts_dir = skill_dir / "scripts"
489+
scripts_dir.mkdir(parents=True)
490+
(scripts_dir / "check.py").write_text("pass")
491+
492+
parsed = {
493+
"tool_calls": [],
494+
"text": "Run check.py to validate the document.",
495+
}
496+
assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is False
497+
498+
def test_no_detect_unrelated_text(self, tmp_path):
499+
"""Should not trigger on unrelated text."""
500+
skill_dir = tmp_path / "my-skill"
501+
scripts_dir = skill_dir / "scripts"
502+
scripts_dir.mkdir(parents=True)
503+
(scripts_dir / "check.py").write_text("pass")
504+
505+
parsed = {
506+
"tool_calls": [],
507+
"text": "Here is a summary of the document.",
508+
}
509+
assert _detect_skill_trigger_from_parsed(parsed, skill_dir) is False
510+
511+
438512
class TestTriggerQueryResultTokenFields:
439513
"""Test TriggerQueryResult token fields."""
440514

0 commit comments

Comments
 (0)