diff --git a/skill_eval/audit/structure_check.py b/skill_eval/audit/structure_check.py index 50f99e9..b57adea 100644 --- a/skill_eval/audit/structure_check.py +++ b/skill_eval/audit/structure_check.py @@ -3,7 +3,10 @@ Checks: - SKILL.md exists and has valid YAML frontmatter - name field: required, 1-64 chars, lowercase+hyphens, matches directory name +- name field: must not contain reserved words ('anthropic', 'claude') - description field: required, 1-1024 chars, non-empty +- description field: must not contain XML tags +- description field: should use third person (not first/second person) - Optional fields: license, compatibility (max 500 chars), metadata, allowed-tools - Directory structure conventions - Progressive disclosure (SKILL.md size, reference file sizes) @@ -270,6 +273,22 @@ def check_structure(skill_path: str | Path) -> tuple[list[Finding], Optional[dic fix=f"Either rename the directory to '{name}/' or change the name field to '{dir_name}'.", )) + # Check name does not contain reserved words (agentskills.io spec) + _RESERVED_WORDS = ["anthropic", "claude"] + for reserved in _RESERVED_WORDS: + if reserved in name: + findings.append(Finding( + code="STR-018", + severity=Severity.WARNING, + category=Category.STRUCTURE, + title=f"Name contains reserved word '{reserved}'", + detail=f"The name '{name}' contains '{reserved}', which is reserved per the agentskills.io specification. " + f"Names must not contain 'anthropic' or 'claude'.", + file_path=str(skill_md), + fix=f"Remove '{reserved}' from the skill name. Use a descriptive name for what the skill does instead.", + )) + break # One finding is enough + # --- Check 4: description field --- desc = frontmatter.get("description") if not desc: @@ -306,6 +325,45 @@ def check_structure(skill_path: str | Path) -> tuple[list[Finding], Optional[dic fix="Include both what the skill does and specific trigger contexts/phrases.", )) + # Check description does not contain XML tags (agentskills.io spec) + if re.search(r"<[a-zA-Z][^>]*>", desc): + findings.append(Finding( + code="STR-019", + severity=Severity.WARNING, + category=Category.STRUCTURE, + title="Description contains XML tags", + detail="The description field must not contain XML tags per the agentskills.io specification. " + "XML tags in the description can interfere with system prompt injection.", + file_path=str(skill_md), + fix="Remove all XML/HTML tags from the description. Use plain text only.", + )) + + # Check description uses third person (Anthropic best practice) + # Flag first-person ("I can", "I will", "I help") and second-person ("You can", "You will") + _FIRST_PERSON_RE = re.compile( + r"\b(I can|I will|I help|I am|I\'m|I process|I analyze|I generate|I create|I extract|I manage)\b", + re.IGNORECASE, + ) + _SECOND_PERSON_RE = re.compile( + r"\b(You can|You will|You should|You may)\b", + re.IGNORECASE, + ) + fp_match = _FIRST_PERSON_RE.search(desc) + sp_match = _SECOND_PERSON_RE.search(desc) + if fp_match or sp_match: + matched = fp_match.group(0) if fp_match else sp_match.group(0) + findings.append(Finding( + code="STR-020", + severity=Severity.INFO, + category=Category.STRUCTURE, + title="Description should use third person", + detail=f"Found '{matched}' in description. Per Anthropic best practices, descriptions should use " + f"third person (e.g., 'Processes Excel files') instead of first person ('I can help') " + f"or second person ('You can use this'). Inconsistent point-of-view can cause discovery problems.", + file_path=str(skill_md), + fix="Rewrite the description in third person: 'Analyzes data...' instead of 'I analyze data...'.", + )) + # --- Check 5: compatibility field --- compat = frontmatter.get("compatibility") if compat and isinstance(compat, str) and len(compat) > _MAX_COMPAT_LEN: diff --git a/tests/test_structure_check.py b/tests/test_structure_check.py index f23ea14..262d12d 100644 --- a/tests/test_structure_check.py +++ b/tests/test_structure_check.py @@ -140,3 +140,109 @@ def test_real_skill_weather(self): assert fm["name"] == "weather" critical = [f for f in findings if f.severity == Severity.CRITICAL] assert len(critical) == 0 + + +class TestBestPracticeChecks: + """Test Anthropic best practice checks (STR-018, STR-019, STR-020).""" + + def _make_skill(self, tmp_path, name="test-skill", description="A test skill that does useful things for testing purposes."): + """Helper: create a minimal valid skill directory with given name/description.""" + skill_dir = tmp_path / name + skill_dir.mkdir() + skill_md = skill_dir / "SKILL.md" + skill_md.write_text(f"---\nname: {name}\ndescription: \"{description}\"\n---\n\n# Test\n\nInstructions here.\n") + return skill_dir + + # --- STR-018: Reserved words in name --- + + def test_name_with_claude_reserved_word(self, tmp_path): + skill_dir = self._make_skill(tmp_path, name="claude-helper", description="Helps with code review and analysis tasks.") + findings, fm, _ = check_structure(skill_dir) + codes = [f.code for f in findings] + assert "STR-018" in codes, "Should flag 'claude' in skill name" + + def test_name_with_anthropic_reserved_word(self, tmp_path): + skill_dir = self._make_skill(tmp_path, name="anthropic-tools", description="Provides various analysis tools for development.") + findings, fm, _ = check_structure(skill_dir) + codes = [f.code for f in findings] + assert "STR-018" in codes, "Should flag 'anthropic' in skill name" + + def test_name_without_reserved_words(self, tmp_path): + skill_dir = self._make_skill(tmp_path, name="code-review", description="Reviews code for bugs, security issues, and best practices.") + findings, fm, _ = check_structure(skill_dir) + codes = [f.code for f in findings] + assert "STR-018" not in codes, "Should not flag normal skill name" + + # --- STR-019: XML tags in description --- + + def test_description_with_xml_tags(self, tmp_path): + skill_dir = self._make_skill( + tmp_path, + name="xml-test", + description="Processes data and generates reports.", + ) + findings, fm, _ = check_structure(skill_dir) + codes = [f.code for f in findings] + assert "STR-019" in codes, "Should flag XML tags in description" + + def test_description_with_html_tags(self, tmp_path): + skill_dir = self._make_skill( + tmp_path, + name="html-test", + description="Analyzes code for security vulnerabilities and best practices.", + ) + findings, fm, _ = check_structure(skill_dir) + codes = [f.code for f in findings] + assert "STR-019" in codes, "Should flag HTML tags in description" + + def test_description_without_xml_tags(self, tmp_path): + skill_dir = self._make_skill( + tmp_path, + name="no-xml-test", + description="Analyzes code for security vulnerabilities and best practices.", + ) + findings, fm, _ = check_structure(skill_dir) + codes = [f.code for f in findings] + assert "STR-019" not in codes, "Should not flag clean description" + + # --- STR-020: Third person description --- + + def test_description_first_person(self, tmp_path): + skill_dir = self._make_skill( + tmp_path, + name="first-person-test", + description="I can help you process Excel files and generate reports.", + ) + findings, fm, _ = check_structure(skill_dir) + codes = [f.code for f in findings] + assert "STR-020" in codes, "Should flag first-person description" + + def test_description_second_person(self, tmp_path): + skill_dir = self._make_skill( + tmp_path, + name="second-person-test", + description="You can use this to process Excel files and generate reports.", + ) + findings, fm, _ = check_structure(skill_dir) + codes = [f.code for f in findings] + assert "STR-020" in codes, "Should flag second-person description" + + def test_description_third_person(self, tmp_path): + skill_dir = self._make_skill( + tmp_path, + name="third-person-test", + description="Processes Excel files and generates reports. Use when working with spreadsheet data.", + ) + findings, fm, _ = check_structure(skill_dir) + codes = [f.code for f in findings] + assert "STR-020" not in codes, "Should not flag third-person description" + + def test_description_im_contraction(self, tmp_path): + skill_dir = self._make_skill( + tmp_path, + name="contraction-test", + description="I'm a helpful skill that processes data files and outputs summaries.", + ) + findings, fm, _ = check_structure(skill_dir) + codes = [f.code for f in findings] + assert "STR-020" in codes, "Should flag I'm contraction"