From 2e443c4d7139f676a8464b0d90251acd93b4fd64 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 19 Jun 2026 14:40:04 -0500 Subject: [PATCH] Fix SDE outage handling for main CI --- .github/workflows/ci.yml | 17 +-- .github/workflows/coverage.yml | 8 +- .github/workflows/release.yml | 70 +++++++++- CHANGELOG.md | 11 +- RELEASING.md | 15 ++- tests/release_publish_invariants.py | 150 +++++++++++++++------ tests/release_signed_release_invariants.sh | 4 +- 7 files changed, 205 insertions(+), 70 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ee4354d..cee4d1e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -358,10 +358,11 @@ jobs: # Pattern adapted from microsoft/DiskANN's CI (also a vector-search crate). # The local setup-intel-sde action owns the fixed Intel downloadmirror build, # SHA256 verification, and x86_64 runner guard. The SHA gate fails closed for - # any archive we extract. Pull requests may soft-skip during Intel mirror - # outages, but push/workflow_dispatch runs fail closed; the release gate only - # accepts the post-merge push workflow result, so a release cannot proceed - # without the SDE probe and AVX-512 tests actually executing on main. + # any archive we extract. Pull requests and push runs may soft-skip during + # Intel mirror outages so external downloadmirror challenges do not hold the + # whole branch red. Manual workflow_dispatch runs remain fail-closed, and the + # tag-triggered release workflow has its own fail-closed AVX-512 proof before + # assets can be staged or published. avx512: name: avx512 (Intel SDE / Sapphire Rapids) runs-on: ubuntu-24.04 @@ -394,11 +395,11 @@ jobs: with: version: ${{ env.SDE_VERSION }} sha256: ${{ env.SDE_SHA256 }} - allow-unavailable: ${{ github.event_name == 'pull_request' }} - - name: note Intel SDE unavailable on PR - if: ${{ github.event_name == 'pull_request' && steps.sde.outputs.sde-available != 'true' }} + allow-unavailable: ${{ github.event_name != 'workflow_dispatch' }} + - name: note Intel SDE unavailable + if: ${{ steps.sde.outputs.sde-available != 'true' }} run: | - echo "::warning::Intel SDE archive unavailable on this pull request; push and release-gated runs fail closed." + echo "::warning::Intel SDE archive unavailable; SDE-dependent CI steps skipped. The release workflow has a separate fail-closed AVX-512 proof." - name: sanity-check AVX-512 detection under SDE if: ${{ steps.sde.outputs.sde-available == 'true' }} env: diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index e68c1f7..f732e13 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -54,11 +54,11 @@ jobs: with: version: ${{ env.SDE_VERSION }} sha256: ${{ env.SDE_SHA256 }} - allow-unavailable: ${{ github.event_name == 'pull_request' }} - - name: note Intel SDE unavailable on PR - if: ${{ github.event_name == 'pull_request' && steps.sde.outputs.sde-available != 'true' }} + allow-unavailable: ${{ github.event_name != 'workflow_dispatch' }} + - name: note Intel SDE unavailable + if: ${{ steps.sde.outputs.sde-available != 'true' }} run: | - echo "::warning::Intel SDE archive unavailable on this pull request; push and release-gated runs fail closed." + echo "::warning::Intel SDE archive unavailable; SDE-backed coverage skipped. The release workflow has a separate fail-closed AVX-512 proof." - name: Install cargo-llvm-cov (pinned) if: ${{ steps.sde.outputs.sde-available == 'true' }} run: cargo install cargo-llvm-cov --version 0.8.7 --locked diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ce0499f..051d574 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -65,6 +65,10 @@ # sets, so nothing is published unless the artifact source is verified; and # `publish-github-release` `needs:` every registry gate, so the Release stays # DRAFT unless all pass. +# `release-avx512` reruns the Intel SDE CPUID probe and AVX-512 tests inside +# this tag workflow and is a hard dependency of core asset staging. Routine +# CI may tolerate Intel mirror outages, but a release cannot publish on skipped +# AVX-512 coverage. # The signed-release graph is pinned in # `tests/release_signed_release_invariants.sh` (run by ci.yml's release-guard # on every push/PR) so a future commit can't silently dismantle it. @@ -191,6 +195,70 @@ jobs: fi done + release-avx512: + name: prove AVX-512 coverage under Intel SDE + needs: [guard, require-ci-green] + if: needs.guard.outputs.ok == 'true' + runs-on: ubuntu-24.04 + permissions: + contents: read + env: + SDE_VERSION: sde-external-10.8.0-2026-03-15-lin + SDE_SHA256: 50b320cd226acef7a491f5b321fc1be3c3c7984f9e27a456e64894b5b0979dd3 + steps: + - name: Harden the runner + uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4 + with: + egress-policy: audit + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + persist-credentials: false + - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable (2026-03-27) + with: + toolchain: stable + - name: Install Intel SDE + id: sde + uses: ./.github/actions/setup-intel-sde + with: + version: ${{ env.SDE_VERSION }} + sha256: ${{ env.SDE_SHA256 }} + allow-unavailable: "false" + - name: Sanity-check AVX-512 detection under SDE + env: + SDE_PATH: ${{ steps.sde.outputs.sde-path }} + run: | + set -euo pipefail + mkdir -p "${RUNNER_TEMP}/sde-probe/src" + cat > "${RUNNER_TEMP}/sde-probe/Cargo.toml" <<'EOF' + [package] + name = "sde-probe" + version = "0.0.0" + edition = "2021" + [[bin]] + name = "sde-probe" + path = "src/main.rs" + EOF + cat > "${RUNNER_TEMP}/sde-probe/src/main.rs" <<'EOF' + fn main() { + let f = is_x86_feature_detected!("avx512f"); + let p = is_x86_feature_detected!("avx512vpopcntdq"); + println!("avx512f={f} avx512vpopcntdq={p}"); + assert!(f, "SDE did not expose avx512f to the guest"); + assert!(p, "SDE did not expose avx512vpopcntdq to the guest"); + } + EOF + cargo build --release --manifest-path "${RUNNER_TEMP}/sde-probe/Cargo.toml" + "${SDE_PATH}" -spr -- \ + "${RUNNER_TEMP}/sde-probe/target/release/sde-probe" + - name: cargo test under SDE (AVX-512 kernels) + env: + CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER: ${{ steps.sde.outputs.sde-path }} -spr -- + ORDVEC_REQUIRE_AVX512: "1" + run: | + set -euo pipefail + cargo test + cargo test --features experimental + notes: name: release notes (git-cliff) + draft Release needs: guard @@ -962,7 +1030,7 @@ jobs: release-assets-draft: name: stage core/Python assets on the DRAFT Release (does NOT un-draft) - needs: [guard, notes, attest, provenance, pypi-canonical-dist, require-ci-green, smoke-linux-aarch64-wheel] + needs: [guard, notes, attest, provenance, pypi-canonical-dist, require-ci-green, release-avx512, smoke-linux-aarch64-wheel] if: needs.guard.outputs.ok == 'true' runs-on: ubuntu-latest permissions: diff --git a/CHANGELOG.md b/CHANGELOG.md index a526c0f..3b07035 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -119,11 +119,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed -- **Made Intel SDE AVX-512 coverage fail closed for release gating.** Pull - requests may emit a visible warning and skip SDE-dependent steps during an - Intel mirror outage, but the push/workflow-dispatch runs used by the release - gate still fail closed; setup must succeed, the AVX-512 CPUID probe must run, - and the SDE-backed test/coverage commands must execute before release. +- **Made Intel SDE AVX-512 coverage fail closed for release publishes.** Pull + requests and main pushes may emit a visible warning and skip SDE-dependent + steps during an Intel mirror outage, but the tag-triggered release workflow + reruns a fail-closed SDE proof before staging release assets; setup must + succeed, the AVX-512 CPUID probe must run, and SDE-backed tests must execute + before publish. - **Closed manifest verifier path-reopen drift.** Verification and SQLite cache-key construction now hash, probe, and validate the canonical path that was checked and recorded, rather than reopening the pre-canonical joined path. diff --git a/RELEASING.md b/RELEASING.md index 3f868c8..683ae87 100644 --- a/RELEASING.md +++ b/RELEASING.md @@ -169,12 +169,15 @@ the OIDC exchange (no risk of a bad publish; just a failed run). `main` HEAD's SHA — which needs a **completed, successful** (not `cancelled`, not in-progress) run of `ci.yml`, `python.yml`, `fuzz.yml`, `codeql.yml`, `actionlint.yml`, and `zizmor.yml`. - - The `ci.yml` AVX-512 job is release-blocking and installs Intel SDE. A - downloadmirror `403` / outage is external infrastructure, but it still means - the SHA is **not releasable** until that same SHA has a successful `ci.yml` - run on `main`. The setup action restores a SHA-verified archive cache when - available; if the cache misses and Intel's download path is unavailable, - wait, rerun, or land a reviewed SDE pin/cache update before tagging. + - Routine `ci.yml` / `coverage.yml` runs may warn and skip SDE-dependent + steps when Intel's downloadmirror challenges GitHub-hosted runners. That + keeps external mirror outages from holding `main` red, but it does **not** + make a release shippable by itself: `release.yml` has a fail-closed + `release-avx512` job that installs Intel SDE, runs the AVX-512 CPUID + probe, and runs the AVX-512 test lane before assets can be staged. + This release proof deliberately avoids writable workflow caches in the + tag workflow; if Intel's download path is unavailable, wait, rerun, or land + a reviewed SDE pin/update before tagging. - Before the final tag, spot-check `.github/actions/setup-intel-sde/action.yml` against Intel's SDE download page: version, Linux archive name, and SHA-256 must match the currently accepted pin. diff --git a/tests/release_publish_invariants.py b/tests/release_publish_invariants.py index 17c6b5d..4bf2637 100644 --- a/tests/release_publish_invariants.py +++ b/tests/release_publish_invariants.py @@ -28,11 +28,10 @@ SDE_ACTION_PATH = os.environ.get( "SDE_ACTION_PATH", ".github/actions/setup-intel-sde/action.yml" ) -PR_ONLY_SDE_ALLOW_UNAVAILABLE = "${{ github.event_name == 'pull_request' }}" +ROUTINE_CI_SDE_ALLOW_UNAVAILABLE = "${{ github.event_name != 'workflow_dispatch' }}" +RELEASE_SDE_ALLOW_UNAVAILABLE = "false" SDE_AVAILABLE_IF = "${{ steps.sde.outputs.sde-available == 'true' }}" -PR_SDE_UNAVAILABLE_IF = ( - "${{ github.event_name == 'pull_request' && steps.sde.outputs.sde-available != 'true' }}" -) +SDE_UNAVAILABLE_NOTICE_IF = "${{ steps.sde.outputs.sde-available != 'true' }}" PYPI_CANONICAL_EXPECTED_ARGS = ( "--expected-wheels 4", "--expected-sdists 1", @@ -1764,7 +1763,16 @@ def check_sde_setup_action(path: str) -> None: fail(f"{path}: Intel SDE outage softening must include {fragment!r}") -def check_sde_cache_job(workflow: dict[str, Any], path: str, job_name: str) -> None: +def check_sde_cache_job( + workflow: dict[str, Any], + path: str, + job_name: str, + *, + expected_allow_unavailable: str, + expected_notice_if: str | None, + require_cache: bool, + require_guarded_sde_steps: bool, +) -> None: jobs = mapping(workflow.get("jobs"), f"{path}: jobs") job = mapping(jobs.get(job_name), f"{path}: jobs.{job_name}") job_env = mapping(job.get("env"), f"{path}: jobs.{job_name}.env") @@ -1787,28 +1795,39 @@ def check_sde_cache_job(workflow: dict[str, Any], path: str, job_name: str) -> N with_map = mapping(step.get("with", {}), f"{path}: {step_label(index, step)} with") setup_steps.append((index, step, with_map)) - if len(cache_steps) != 1: - fail(f"{path}: jobs.{job_name} must restore exactly one Intel SDE archive cache") - _, _, cache_with = cache_steps[0] - key = cache_with.get("key") - expected_key = ( - "intel-sde-${{ runner.os }}-${{ runner.arch }}-" - "${{ env.SDE_VERSION }}-${{ env.SDE_SHA256 }}" - ) - if key != expected_key: - fail( - f"{path}: jobs.{job_name} Intel SDE cache key must be version+sha pinned, " - "not action-file-hash based" + if require_cache: + if len(cache_steps) != 1: + fail(f"{path}: jobs.{job_name} must restore exactly one Intel SDE archive cache") + _, _, cache_with = cache_steps[0] + key = cache_with.get("key") + expected_key = ( + "intel-sde-${{ runner.os }}-${{ runner.arch }}-" + "${{ env.SDE_VERSION }}-${{ env.SDE_SHA256 }}" ) - restore_keys = str(cache_with.get("restore-keys") or "") - expected_restore_key = "intel-sde-${{ runner.os }}-${{ runner.arch }}-" - if expected_restore_key not in {line.strip() for line in restore_keys.splitlines()}: - fail( - f"{path}: jobs.{job_name} Intel SDE cache restore-keys must include " - "the runner OS/arch prefix" - ) - if contains_text(key, "hashFiles") or contains_text(key, "setup-intel-sde/action.yml"): - fail(f"{path}: jobs.{job_name} Intel SDE cache key must not hash the action file") + if key != expected_key: + fail( + f"{path}: jobs.{job_name} Intel SDE cache key must be version+sha pinned, " + "not action-file-hash based" + ) + restore_keys = str(cache_with.get("restore-keys") or "") + expected_restore_key = "intel-sde-${{ runner.os }}-${{ runner.arch }}-" + if expected_restore_key not in {line.strip() for line in restore_keys.splitlines()}: + fail( + f"{path}: jobs.{job_name} Intel SDE cache restore-keys must include " + "the runner OS/arch prefix" + ) + if contains_text(key, "hashFiles") or contains_text(key, "setup-intel-sde/action.yml"): + fail(f"{path}: jobs.{job_name} Intel SDE cache key must not hash the action file") + else: + if cache_steps: + fail(f"{path}: jobs.{job_name} must not restore workflow caches in release context") + for index, step in enumerate(steps): + action = action_name(step) + if action in {"actions/cache", "swatinem/rust-cache"}: + fail( + f"{path}: {step_label(index, step)} must not use workflow caches " + "in the release fail-closed SDE proof" + ) if len(setup_steps) != 1: fail(f"{path}: jobs.{job_name} must use exactly one setup-intel-sde action") @@ -1817,24 +1836,30 @@ def check_sde_cache_job(workflow: dict[str, Any], path: str, job_name: str) -> N fail(f"{path}: jobs.{job_name} setup-intel-sde must receive env.SDE_VERSION") if setup_with.get("sha256") != "${{ env.SDE_SHA256 }}": fail(f"{path}: jobs.{job_name} setup-intel-sde must receive env.SDE_SHA256") - if setup_with.get("allow-unavailable") != PR_ONLY_SDE_ALLOW_UNAVAILABLE: + if setup_with.get("allow-unavailable") != expected_allow_unavailable: fail( - f"{path}: jobs.{job_name} may soften Intel SDE outages only on pull_request; " - "push and workflow_dispatch runs must fail closed" + f"{path}: jobs.{job_name} setup-intel-sde allow-unavailable must be " + f"{expected_allow_unavailable!r}" ) - outage_notice_steps = [] - for index, raw_step in enumerate(steps): - step = mapping(raw_step, f"{path}: jobs.{job_name}.steps[{index}]") - if step.get("if") == PR_SDE_UNAVAILABLE_IF and contains_text( - step.get("run"), "Intel SDE archive unavailable" - ): - outage_notice_steps.append(step) - if len(outage_notice_steps) != 1: - fail( - f"{path}: jobs.{job_name} must emit exactly one PR-only Intel SDE outage notice; " - "release-gated runs must not green-skip AVX-512 coverage" + outage_notice_steps = [ + mapping(raw_step, f"{path}: jobs.{job_name}.steps[{index}]") + for index, raw_step in enumerate(steps) + if contains_text( + mapping(raw_step, f"{path}: jobs.{job_name}.steps[{index}]").get("run"), + "Intel SDE archive unavailable", ) + ] + if expected_notice_if is None: + if outage_notice_steps: + fail(f"{path}: jobs.{job_name} must not contain a soft-skip Intel SDE outage notice") + else: + matching_notices = [step for step in outage_notice_steps if step.get("if") == expected_notice_if] + if len(matching_notices) != 1: + fail( + f"{path}: jobs.{job_name} must emit exactly one Intel SDE outage notice " + f"guarded by {expected_notice_if!r}" + ) sde_guarded_names = { "Install cargo-llvm-cov (pinned)", @@ -1852,17 +1877,54 @@ def check_sde_cache_job(workflow: dict[str, Any], path: str, job_name: str) -> N or contains_nested_text(step.get("env"), "steps.sde.outputs.sde-path") or contains_text(step.get("run"), "SDE_PATH") ): - if step.get("if") != SDE_AVAILABLE_IF: + if require_guarded_sde_steps and step.get("if") != SDE_AVAILABLE_IF: fail( f"{path}: {step_label(index, step)} must run after SDE setup succeeds, " - "and may be skipped only when PR-only SDE setup reports unavailable" + "and may be skipped only when SDE setup reports unavailable" + ) + if not require_guarded_sde_steps and step.get("if") is not None: + fail( + f"{path}: {step_label(index, step)} is in a release fail-closed SDE proof " + "and must not be guarded behind a green-skip condition" ) def check_sde_cache_invariants() -> None: check_sde_setup_action(SDE_ACTION_PATH) - check_sde_cache_job(load_workflow(CI_WORKFLOW_PATH), CI_WORKFLOW_PATH, "avx512") - check_sde_cache_job(load_workflow(COVERAGE_WORKFLOW_PATH), COVERAGE_WORKFLOW_PATH, "coverage") + check_sde_cache_job( + load_workflow(CI_WORKFLOW_PATH), + CI_WORKFLOW_PATH, + "avx512", + expected_allow_unavailable=ROUTINE_CI_SDE_ALLOW_UNAVAILABLE, + expected_notice_if=SDE_UNAVAILABLE_NOTICE_IF, + require_cache=True, + require_guarded_sde_steps=True, + ) + check_sde_cache_job( + load_workflow(COVERAGE_WORKFLOW_PATH), + COVERAGE_WORKFLOW_PATH, + "coverage", + expected_allow_unavailable=ROUTINE_CI_SDE_ALLOW_UNAVAILABLE, + expected_notice_if=SDE_UNAVAILABLE_NOTICE_IF, + require_cache=True, + require_guarded_sde_steps=True, + ) + release_workflow = load_workflow(WORKFLOW_PATH) + check_sde_cache_job( + release_workflow, + WORKFLOW_PATH, + "release-avx512", + expected_allow_unavailable=RELEASE_SDE_ALLOW_UNAVAILABLE, + expected_notice_if=None, + require_cache=False, + require_guarded_sde_steps=False, + ) + jobs = mapping(release_workflow.get("jobs"), f"{WORKFLOW_PATH}: jobs") + draft_job = mapping( + jobs.get("release-assets-draft"), f"{WORKFLOW_PATH}: jobs.release-assets-draft" + ) + if not has_need(draft_job, "release-avx512"): + fail(f"{WORKFLOW_PATH}: release-assets-draft must need release-avx512") def main() -> None: diff --git a/tests/release_signed_release_invariants.sh b/tests/release_signed_release_invariants.sh index 1e9bb79..2994fa7 100755 --- a/tests/release_signed_release_invariants.sh +++ b/tests/release_signed_release_invariants.sh @@ -114,9 +114,9 @@ job_downloads_artifact_to_path() { # ---------------------------------------------------------------------- # (1) release-assets-draft needs attest + provenance + require-ci-green + notes -# + exact linux/aarch64 wheel smoke +# + fail-closed release AVX-512 proof + exact linux/aarch64 wheel smoke # ---------------------------------------------------------------------- -for dep in attest provenance pypi-canonical-dist require-ci-green notes smoke-linux-aarch64-wheel; do +for dep in attest provenance pypi-canonical-dist require-ci-green release-avx512 notes smoke-linux-aarch64-wheel; do job_needs release-assets-draft "$dep" \ || fail "release-assets-draft must \`needs: $dep\` (fail-closed on missing provenance/CI)" done