From 45210e13fe48f056ae8bf2cc720a6285f574024e Mon Sep 17 00:00:00 2001 From: "Jonathan D.A. Jewell" <6759885+hyperpolymath@users.noreply.github.com> Date: Tue, 28 Apr 2026 03:51:16 +0100 Subject: [PATCH] feat(bust): extend schema with alert_remediations + seed 11 entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a sibling array to the existing failure_modes schema for *discovered* (CI/CD/scanner-emitted) failure patterns, alongside the existing *injected* (operational chaos-drill) failure_modes. Changes: - bust.ncl: schema.alert_remediations array (id, title, severity, fingerprint, diagnosis, remediation, verification, preemption, references, last_seen, dismissed/dismissed_until). Backwards-compatible — existing failure_modes entries unchanged; default = [] for repos that don't populate it. - Bustfile.a2ml: 11 seed entries derived from the 2026-04-28 estate-wide alert audit (echidna, airborne-submarine-squadron, robodog-ecm, gossamer, boj-server, hypatia, file-soup, KnotTheory.jl, panll). Covers Scorecard (vulnerabilities, token-permissions, pinned-deps, branch-protection, maintained, binary-artifacts split into build-output vs tracked-asset, code-review), CodeQL (incomplete sanitization), and Dependabot (auto-closed-stale, merge-blocked-on-red-main). - EXTENSION-PROPOSAL-2026-04-28.adoc: full design (motivation, Fix-It rendering format, runner contract, inheritance model, consumer table, adoption path, open questions). 
Each entry carries: - rich human diagnosis (symptom / root cause / why-stuck) - machine-actionable remediation with auto_safe gating - verification check + CI signal to watch - preemption block naming files to add to rsr-template-repo Designed to be consumed by: - humans via `contractile bust diagnose <id>` (Fix-It-style render) - sustainabot for auto-safe remediations across the estate - hypatia for alert-to-fingerprint mapping - robot-repo-automaton for confidence-gated execution - rsr-template adoption test (preemption.template_addition is the source of truth for what should land in the template) Pre-existing drift between the runner schema's failure_modes shape and the existing Bustfile.a2ml content (which uses `scenarios`/`escalation-ladder`/`backup-points` instead) is NOT addressed here — out of scope for this PR. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../contractiles/bust/Bustfile.a2ml | 469 ++++++++++++ .../bust/EXTENSION-PROPOSAL-2026-04-28.adoc | 715 ++++++++++++++++++ .machine_readable/contractiles/bust/bust.ncl | 60 ++ 3 files changed, 1244 insertions(+) create mode 100644 .machine_readable/contractiles/bust/EXTENSION-PROPOSAL-2026-04-28.adoc diff --git a/.machine_readable/contractiles/bust/Bustfile.a2ml b/.machine_readable/contractiles/bust/Bustfile.a2ml index 6afee70..0e2ab1e 100644 --- a/.machine_readable/contractiles/bust/Bustfile.a2ml +++ b/.machine_readable/contractiles/bust/Bustfile.a2ml @@ -25,4 +25,473 @@ Bust { "Meta-repo history on origin/main is the durable backup for pointer state", "Local backup tags (backup/pre--) retained on risky rewrites" ] + + // alert_remediations — discovered (CI/CD/scanner-emitted) failure patterns. + // Sibling array to scenarios; consumed by `contractile bust diagnose <id>` and + // `contractile bust remediate <id>`. Seeded 2026-04-28 from estate-wide audit. + // See EXTENSION-PROPOSAL-2026-04-28.adoc for schema, rendering, and adoption path. 
+ alert_remediations: [ + { + id: "scorecard.vulnerabilities.rustsec" + title: "Cargo dependency has a known RUSTSEC advisory" + severity: "high" + emitted_by: ["scorecard", "cargo-audit", "cargo-deny", "osv-scanner"] + fingerprint: { + source: "scorecard" + rule_id: "VulnerabilitiesID" + message_regex: "RUSTSEC-\\d{4}-\\d{4}" + } + diagnosis: { + short: "A Cargo dependency is on a version with an open RUSTSEC advisory." + root_cause: "Direct or transitive Cargo dep on a version flagged by RUSTSEC. Three classes: (a) vuln with fix available, (b) unmaintained crate with no fix, (c) yanked version." + why_humans_stuck: "Scorecard reports but no auto-fixer subscribes; cargo-audit not in template; dependabot-automerge skips HIGH severity." + class: "dep_advisory" + } + remediation: { + preferred_strategy: "auto_pr" + human_steps: [ + "cargo audit # confirm advisory + class" + "cargo update -p (class a) OR edit Cargo.toml to swap (class b) OR --precise (class c)" + "cargo test" + "git checkout -b fix/rustsec-" + "git commit -am 'deps: bust RUSTSEC- ()'" + "gh pr create --fill && gh pr merge --auto --squash" + ] + machine_runner: "reposystem/.machine_readable/contractiles/bust/runners/cargo-rustsec.sh" + auto_safe: true + needs_review_when: ["class == 'b' (no drop-in replacement)", "major-version bump required"] + } + verification: { + post_fix_check: "cargo audit --deny warnings" + expected_state: "no advisories" + ci_check_to_watch: "Scorecard / VulnerabilitiesID" + } + preemption: { + template_addition: [ + ".github/workflows/cargo-audit.yml" + ".github/workflows/cargo-deny-check.yml" + ] + template_config: [ + ".machine_readable/compliance/rust/deny.toml: unmaintained = 'deny'" + ] + prevents_recurrence: true + back_ratchet_target: "rust_only" + } + references: ["https://rustsec.org/advisories/", "https://github.com/EmbarkStudios/cargo-deny"] + last_seen: "2026-04-28" + } + + { + id: "scorecard.token-permissions.no-toplevel" + title: "Workflow lacks top-level 
GITHUB_TOKEN permissions" + severity: "high" + emitted_by: ["scorecard"] + fingerprint: { + source: "scorecard" + rule_id: "TokenPermissionsID" + message_regex: "no topLevel permission defined" + } + diagnosis: { + short: "A workflow YAML has no top-level permissions block, so jobs run with default broad token scope." + root_cause: "Default GITHUB_TOKEN permissions are write-all when no permissions: is declared. Scorecard wants read-all at minimum at workflow level." + why_humans_stuck: "Scorecard fires per-workflow per-commit. Template enforces this for new repos; older repos predate the rule." + class: "permission_misconfig" + } + remediation: { + preferred_strategy: "config_change" + human_steps: [ + "Open every .github/workflows/*.yml lacking top-level permissions" + "Add `permissions: read-all` directly under `name:` (or before `on:`)" + "Per-job override only where explicitly needed (e.g. `pages: write`)" + "Commit + push" + ] + machine_runner: "reposystem/.machine_readable/contractiles/bust/runners/inject-toplevel-permissions.sh" + auto_safe: true + } + verification: { + post_fix_check: "yq '.permissions' .github/workflows/*.yml | grep -v null | wc -l # all should be non-null" + expected_state: "every workflow has a top-level permissions key" + ci_check_to_watch: "Scorecard / TokenPermissionsID" + } + preemption: { + template_addition: [".github/workflows/workflow-linter.yml (already in template)"] + template_config: ["All template workflows already carry permissions: read-all"] + prevents_recurrence: true + back_ratchet_target: "all_repos" + } + references: ["https://github.com/ossf/scorecard/blob/main/docs/checks.md#token-permissions"] + last_seen: "2026-04-28" // airborne-submarine-squadron #11 + } + + { + id: "scorecard.pinned-dependencies.action-not-pinned" + title: "GitHub Action is referenced by tag/branch instead of SHA" + severity: "medium" + emitted_by: ["scorecard"] + fingerprint: { + source: "scorecard" + rule_id: "PinnedDependenciesID" + 
message_regex: "GitHubAction not pinned by hash" + } + diagnosis: { + short: "Workflow references like `actions/checkout@v6` instead of `actions/checkout@ # v6.0.2`." + root_cause: "Tag/branch refs let action authors silently re-publish the same ref pointing at different code. SHA pin freezes the artefact." + why_humans_stuck: "Many repos predate the SHA-pinning rule. 2026-04-17 estate-wide audit found ~12,000 v6 SHAs but ~2,100 v4 lingering. Template enforces forward; nobody back-ratchets." + class: "config_drift" + } + remediation: { + preferred_strategy: "auto_pr" + human_steps: [ + "For each unpinned action, look up the canonical SHA (CLAUDE.md SHA table)" + "sed-replace `@v` with `@ # v`" + "Verify CI still green" + "Commit + PR" + ] + machine_runner: "reposystem/.machine_readable/contractiles/bust/runners/sha-pin-actions.sh" + auto_safe: true + needs_review_when: ["action not in canonical SHA table", "flathub repos (deliberate v4 pin)"] + } + verification: { + post_fix_check: "grep -E '@v[0-9]' .github/workflows/*.yml # should be empty (or only in comments)" + expected_state: "every action @ ref is a 40-char SHA" + ci_check_to_watch: "Scorecard / PinnedDependenciesID" + } + preemption: { + template_addition: [".github/workflows/workflow-linter.yml"] + template_config: ["CLAUDE.md SHA table is the canonical reference"] + prevents_recurrence: true + back_ratchet_target: "all_repos" + } + references: ["CLAUDE.md - Common GitHub Actions SHA Pins section"] + last_seen: "2026-04-28" + } + + { + id: "scorecard.branch-protection.weak-or-missing" + title: "Branch protection on main is weak or absent" + severity: "high" + emitted_by: ["scorecard"] + fingerprint: { + source: "scorecard" + rule_id: "BranchProtectionID" + message_regex: "(branch protection.*disabled|does not require approvers|no status checks)" + } + diagnosis: { + short: "main lacks: required reviews, status-check requirements, admin enforcement, stale-review dismissal, or codeowners." 
+ root_cause: "GitHub branch protection has many independent toggles; older repos were created before the standard was tightened." + why_humans_stuck: "GitHub Free blocks rulesets on private repos (verified 2026-04-10). Estate-wide ratchet stalled on Wave 3 (5 private repos)." + class: "config_drift" + } + remediation: { + preferred_strategy: "cli_command" + human_steps: [ + "julia ~/security-fixes/enable-branch-protection.jl " + "OR via gh api: PUT /repos/{owner}/{repo}/branches/main/protection" + ] + machine_runner: "~/security-fixes/enable-branch-protection.jl" + auto_safe: true + needs_review_when: ["repo is private and on GitHub Free (will fail)"] + } + verification: { + post_fix_check: "gh api /repos///branches/main/protection" + expected_state: "all required-checks/reviews/admin-enforcement enabled" + ci_check_to_watch: "Scorecard / BranchProtectionID" + } + preemption: { + template_addition: ["enable-branch-protection.jl on every repo create"] + prevents_recurrence: true + back_ratchet_target: "all_repos" + } + references: ["~/security-fixes/enable-branch-protection.jl"] + last_seen: "2026-04-28" // robodog-ecm #11 + } + + { + id: "scorecard.maintained.repo-too-young" + title: "Repo created within last 90 days" + severity: "high" + emitted_by: ["scorecard"] + fingerprint: { + source: "scorecard" + rule_id: "MaintainedID" + message_regex: "created within the last 90 days" + } + diagnosis: { + short: "Cosmetic, time-based alert that resolves itself after 90 days from repo creation." + root_cause: "Scorecard penalises new repos because malicious actors create burner repos. Heuristic, not a real defect." + why_humans_stuck: "There is no fix; alert sits open until time passes. Eats human attention every time it appears in a triage list." 
+ class: "expected_after_repo_creation" + } + remediation: { + preferred_strategy: "manual_only" + human_steps: [ + "Acknowledge: this clears automatically 90 days after repo creation" + "Optional: dismiss in GitHub UI with reason 'won't fix - heuristic'" + ] + auto_safe: false + side_effects: ["dismissal is per-repo manual action"] + } + verification: { + post_fix_check: "true # nothing to verify" + expected_state: "alert age > 90 days OR dismissed" + ci_check_to_watch: "Scorecard / MaintainedID" + } + preemption: { + template_addition: ["scorecard-enforcer.yml configured to ignore MaintainedID for first 90 days"] + prevents_recurrence: true + back_ratchet_target: "all_repos" + } + references: ["https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained"] + last_seen: "2026-04-28" + } + + { + id: "dependabot.auto-closed.dep-no-longer-tracked" + title: "Dependabot auto-closed PR with comment 'no longer being updated'" + severity: "low" + emitted_by: ["dependabot"] + fingerprint: { + source: "dependabot" + rule_id: "auto-close" + message_regex: "no longer being updated by Dependabot" + } + diagnosis: { + short: "Dependabot itself closed a bump PR because the dep is no longer in the manifest tracking set." + root_cause: "Manifest dropped the dep, an ignore rule was added, or the dep was renamed/replaced. Closure is an intentional bot signal, not a rejection of the bump's contents." + why_humans_stuck: "Easy to mistake for 'user rejected this' and bulk-delete the head branch." 
+ class: "dep_advisory" + } + remediation: { + preferred_strategy: "cli_command" + human_steps: [ + "Verify dependabot's close comment matches the regex" + "Delete the head branch: gh api -X DELETE repos///git/refs/heads/" + "Do NOT bulk-delete by closed-PR state alone; always check the close reason" + ] + machine_runner: "reposystem/.machine_readable/contractiles/bust/runners/cleanup-dependabot-stale-branch.sh" + auto_safe: true + } + verification: { + post_fix_check: "gh api repos///branches/ 2>&1 | grep -q 'Branch not found'" + expected_state: "branch absent" + } + preemption: { + template_config: ["dependabot.yml ignore rules tracked alongside Bustfile"] + prevents_recurrence: false + back_ratchet_target: "opt_in" + } + references: [] + last_seen: "2026-04-28" // file-soup PR #2 + } + + { + id: "dependabot.merge-blocked.unrelated-red-checks-on-main" + title: "Dependabot PR can't merge because main is red on unrelated checks" + severity: "medium" + emitted_by: ["dependabot", "github-checks"] + fingerprint: { + source: "github-checks" + rule_id: "BLOCKED-merge-state" + message_regex: "mergeStateStatus.*BLOCKED" + } + diagnosis: { + short: "PR is mergeable per git, but auto-merge is gated on 'all required checks green' and main itself has red checks." + root_cause: "Required-status-check policy treats main's failures as PR failures. One broken workflow on main blocks every Dependabot PR." + why_humans_stuck: "Each Dependabot PR appears 'broken' but the actual broken thing is upstream — red checks rotate among repos and never get attention." + class: "config_drift" + } + remediation: { + preferred_strategy: "manual_only" + human_steps: [ + "List failing checks: gh pr view --json statusCheckRollup" + "Identify which fail on main itself (vs. 
PR-introduced)" + "Fix the upstream broken workflow first" + "Then Dependabot PRs unblock automatically" + ] + auto_safe: false + needs_review_when: ["never auto - upstream cause needs human triage"] + } + verification: { + post_fix_check: "gh pr view --json mergeStateStatus | grep -v BLOCKED" + expected_state: "mergeStateStatus is CLEAN or HAS_HOOKS" + ci_check_to_watch: "all required checks on the affected workflows" + } + preemption: { + template_addition: ["main-health-monitor workflow that opens an issue when main goes red"] + prevents_recurrence: false + back_ratchet_target: "all_repos" + } + references: [] + last_seen: "2026-04-28" // hypatia PR #194 + } + + { + id: "codeql.js.incomplete-multi-character-sanitization" + title: "CodeQL: regex-based HTML sanitizer is incomplete" + severity: "high" + emitted_by: ["codeql"] + fingerprint: { + source: "codeql" + rule_id: "js/incomplete-multi-character-sanitization" + } + diagnosis: { + short: "JS code is using a regex to strip