Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions skill_eval/audit/structure_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
Checks:
- SKILL.md exists and has valid YAML frontmatter
- name field: required, 1-64 chars, lowercase+hyphens, matches directory name
- name field: must not contain reserved words ('anthropic', 'claude')
- description field: required, 1-1024 chars, non-empty
- description field: must not contain XML tags
- description field: should use third person (not first/second person)
- Optional fields: license, compatibility (max 500 chars), metadata, allowed-tools
- Directory structure conventions
- Progressive disclosure (SKILL.md size, reference file sizes)
Expand Down Expand Up @@ -270,6 +273,22 @@ def check_structure(skill_path: str | Path) -> tuple[list[Finding], Optional[dic
fix=f"Either rename the directory to '{name}/' or change the name field to '{dir_name}'.",
))

# Check name does not contain reserved words (agentskills.io spec)
_RESERVED_WORDS = ["anthropic", "claude"]
for reserved in _RESERVED_WORDS:
if reserved in name:
findings.append(Finding(
code="STR-018",
severity=Severity.WARNING,
category=Category.STRUCTURE,
title=f"Name contains reserved word '{reserved}'",
detail=f"The name '{name}' contains '{reserved}', which is reserved per the agentskills.io specification. "
f"Names must not contain 'anthropic' or 'claude'.",
file_path=str(skill_md),
fix=f"Remove '{reserved}' from the skill name. Use a descriptive name for what the skill does instead.",
))
break # One finding is enough

# --- Check 4: description field ---
desc = frontmatter.get("description")
if not desc:
Expand Down Expand Up @@ -306,6 +325,45 @@ def check_structure(skill_path: str | Path) -> tuple[list[Finding], Optional[dic
fix="Include both what the skill does and specific trigger contexts/phrases.",
))

# Check description does not contain XML tags (agentskills.io spec)
if re.search(r"<[a-zA-Z][^>]*>", desc):
findings.append(Finding(
code="STR-019",
severity=Severity.WARNING,
category=Category.STRUCTURE,
title="Description contains XML tags",
detail="The description field must not contain XML tags per the agentskills.io specification. "
"XML tags in the description can interfere with system prompt injection.",
file_path=str(skill_md),
fix="Remove all XML/HTML tags from the description. Use plain text only.",
))

# Check description uses third person (Anthropic best practice)
# Flag first-person ("I can", "I will", "I help") and second-person ("You can", "You will")
_FIRST_PERSON_RE = re.compile(
r"\b(I can|I will|I help|I am|I\'m|I process|I analyze|I generate|I create|I extract|I manage)\b",
re.IGNORECASE,
)
_SECOND_PERSON_RE = re.compile(
r"\b(You can|You will|You should|You may)\b",
re.IGNORECASE,
)
fp_match = _FIRST_PERSON_RE.search(desc)
sp_match = _SECOND_PERSON_RE.search(desc)
if fp_match or sp_match:
matched = fp_match.group(0) if fp_match else sp_match.group(0)
findings.append(Finding(
code="STR-020",
severity=Severity.INFO,
category=Category.STRUCTURE,
title="Description should use third person",
detail=f"Found '{matched}' in description. Per Anthropic best practices, descriptions should use "
f"third person (e.g., 'Processes Excel files') instead of first person ('I can help') "
f"or second person ('You can use this'). Inconsistent point-of-view can cause discovery problems.",
file_path=str(skill_md),
fix="Rewrite the description in third person: 'Analyzes data...' instead of 'I analyze data...'.",
))

# --- Check 5: compatibility field ---
compat = frontmatter.get("compatibility")
if compat and isinstance(compat, str) and len(compat) > _MAX_COMPAT_LEN:
Expand Down
106 changes: 106 additions & 0 deletions tests/test_structure_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,109 @@ def test_real_skill_weather(self):
assert fm["name"] == "weather"
critical = [f for f in findings if f.severity == Severity.CRITICAL]
assert len(critical) == 0


class TestBestPracticeChecks:
"""Test Anthropic best practice checks (STR-018, STR-019, STR-020)."""

def _make_skill(self, tmp_path, name="test-skill", description="A test skill that does useful things for testing purposes."):
"""Helper: create a minimal valid skill directory with given name/description."""
skill_dir = tmp_path / name
skill_dir.mkdir()
skill_md = skill_dir / "SKILL.md"
skill_md.write_text(f"---\nname: {name}\ndescription: \"{description}\"\n---\n\n# Test\n\nInstructions here.\n")
return skill_dir

# --- STR-018: Reserved words in name ---

def test_name_with_claude_reserved_word(self, tmp_path):
skill_dir = self._make_skill(tmp_path, name="claude-helper", description="Helps with code review and analysis tasks.")
findings, fm, _ = check_structure(skill_dir)
codes = [f.code for f in findings]
assert "STR-018" in codes, "Should flag 'claude' in skill name"

def test_name_with_anthropic_reserved_word(self, tmp_path):
skill_dir = self._make_skill(tmp_path, name="anthropic-tools", description="Provides various analysis tools for development.")
findings, fm, _ = check_structure(skill_dir)
codes = [f.code for f in findings]
assert "STR-018" in codes, "Should flag 'anthropic' in skill name"

def test_name_without_reserved_words(self, tmp_path):
skill_dir = self._make_skill(tmp_path, name="code-review", description="Reviews code for bugs, security issues, and best practices.")
findings, fm, _ = check_structure(skill_dir)
codes = [f.code for f in findings]
assert "STR-018" not in codes, "Should not flag normal skill name"

# --- STR-019: XML tags in description ---

def test_description_with_xml_tags(self, tmp_path):
skill_dir = self._make_skill(
tmp_path,
name="xml-test",
description="Processes data <important>and generates reports</important>.",
)
findings, fm, _ = check_structure(skill_dir)
codes = [f.code for f in findings]
assert "STR-019" in codes, "Should flag XML tags in description"

def test_description_with_html_tags(self, tmp_path):
skill_dir = self._make_skill(
tmp_path,
name="html-test",
description="<b>Analyzes</b> code for security vulnerabilities and best practices.",
)
findings, fm, _ = check_structure(skill_dir)
codes = [f.code for f in findings]
assert "STR-019" in codes, "Should flag HTML tags in description"

def test_description_without_xml_tags(self, tmp_path):
skill_dir = self._make_skill(
tmp_path,
name="no-xml-test",
description="Analyzes code for security vulnerabilities and best practices.",
)
findings, fm, _ = check_structure(skill_dir)
codes = [f.code for f in findings]
assert "STR-019" not in codes, "Should not flag clean description"

# --- STR-020: Third person description ---

def test_description_first_person(self, tmp_path):
skill_dir = self._make_skill(
tmp_path,
name="first-person-test",
description="I can help you process Excel files and generate reports.",
)
findings, fm, _ = check_structure(skill_dir)
codes = [f.code for f in findings]
assert "STR-020" in codes, "Should flag first-person description"

def test_description_second_person(self, tmp_path):
skill_dir = self._make_skill(
tmp_path,
name="second-person-test",
description="You can use this to process Excel files and generate reports.",
)
findings, fm, _ = check_structure(skill_dir)
codes = [f.code for f in findings]
assert "STR-020" in codes, "Should flag second-person description"

def test_description_third_person(self, tmp_path):
skill_dir = self._make_skill(
tmp_path,
name="third-person-test",
description="Processes Excel files and generates reports. Use when working with spreadsheet data.",
)
findings, fm, _ = check_structure(skill_dir)
codes = [f.code for f in findings]
assert "STR-020" not in codes, "Should not flag third-person description"

def test_description_im_contraction(self, tmp_path):
skill_dir = self._make_skill(
tmp_path,
name="contraction-test",
description="I'm a helpful skill that processes data files and outputs summaries.",
)
findings, fm, _ = check_structure(skill_dir)
codes = [f.code for f in findings]
assert "STR-020" in codes, "Should flag I'm contraction"
Loading