# .github/workflows/parity-auto.yml
name: Parity Auto Trigger
run-name: "Parity auto-trigger · pytorch/pytorch main"

# Polls completed pytorch/pytorch trunk.yml pushes on main and dispatches
# parity.yml once for each SHA where all CI consumed by the report has finished,
# covering every arch whose test shards actually ran on it.
#
# Arch participation is detected at the workflow_run level: if rocm-mi300
# never ran on a SHA, we don't wait for it or include it. Readiness is then
# evaluated at the *check-run* level because ROCm test shards post check-runs
# independently, and a single failing shard flips the parent workflow_run to
# conclusion=failure while siblings are still executing.
#
# Per SHA the scan does the following:
#   - Detection: work out which ROCm arch workflows actually ran on this SHA.
#   - Gate 1: every test check-run for those ROCm arch workflows, plus every
#     CUDA test check-run consumed by download_testlogs, must be
#     status=completed. We don't want to dispatch mi355 while a mi300 workflow
#     that ran on the same SHA is still creating or running test shards.
#   - Gate 2: every arch whose workflow ran must have at least one matching
#     test check-run; a workflow that ran but has no shards yet is not ready.
#
# We dispatch at most once per SHA with the ready subset of arches, so mi355
# (run as part of trunk) gets a parity report per commit, and mi300/
# mi200 join the same dispatch whenever their periodic workflow
# happens to finish on that SHA.
#
# Failure handling: any GitHub API lookup that fails makes the affected SHA
# (or the whole scan, for the de-duplication lookup) be skipped rather than
# evaluated on partial data, because a dispatch cannot be undone and the next
# scheduled scan retries anyway.

on:
  schedule:
    - cron: '*/10 * * * *'
  pull_request:
    paths:
      - '.github/workflows/parity-auto.yml'
  workflow_dispatch:
    inputs:
      max_commits:
        description: 'How many of the most recent completed upstream trunk.yml pushes on main to scan.'
        required: false
        default: '200'
        type: string
      max_dispatches:
        description: 'Maximum number of ready upstream commits to dispatch in one scan.'
        required: false
        default: '50'
        type: string
      max_age_hours:
        description: 'Skip commits older than this (avoid back-filling ancient SHAs).'
        required: false
        default: '72'
        type: string
      archs:
        description: 'Architectures to consider (comma/space separated).'
        required: false
        default: 'mi355, mi300, mi200'
        type: string
      arch_jobname_regex_map:
        description: 'JSON: arch -> PCRE regex that matches the check-run names of that arch''s ROCm test shards on pytorch/pytorch. An arch is considered "ready" only when every check-run whose name matches has status=completed (so we wait for all test shards, not just workflow completion).'
        required: false
        default: '{"mi355":"rocm.*mi355.*/ test [(](default|distributed|inductor),","mi300":"rocm.*mi300.*/ test [(](default|distributed|inductor),","mi200":"(rocm.*(mi200|mi210).*/ test [(](default|distributed|inductor),|linux-jammy-rocm-py3[.]10 / test [(](default|distributed|inductor),)","navi31":"rocm.*navi31.*/ test [(]default,","nightly":"rocm-nightly.*/ test [(](default|distributed|inductor),"}'
        type: string
      arch_workflow_regex_map:
        description: 'JSON: arch -> PCRE regex that matches workflow file paths for upstream ROCm workflows that mean this arch ran on the SHA. Missing workflows mean the arch is not expected for that commit.'
        required: false
        default: '{"mi355":"(^|/)(trunk|rocm-mi355|periodic-rocm-mi355|inductor-rocm-mi355)[.]yml$","mi300":"(^|/)(rocm-mi300|periodic-rocm-mi300|inductor-rocm-mi300)[.]yml$","mi200":"(^|/)(trunk-rocm-sandbox|rocm-mi200|periodic-rocm-mi200|inductor-rocm-mi200)[.]yml$","navi31":"(^|/)(rocm-navi31|periodic-rocm-navi31|inductor-rocm-navi31)[.]yml$","nightly":"(^|/)rocm-nightly[.]yml$"}'
        type: string
      target_ref:
        description: 'Ref of this repo to dispatch parity.yml against. Leave blank to use this workflow run''s ref.'
        required: false
        default: ''
        type: string
      dry_run:
        description: 'Scan and log, but do not actually dispatch parity.yml.'
        required: false
        default: false
        type: boolean

# actions: write is needed to dispatch parity.yml in this repository.
permissions:
  contents: read
  actions: write

# Scans are serialized (never cancelled) so two overlapping scans cannot both
# decide that the same SHA is still undispatched.
concurrency:
  group: parity-auto-trigger
  cancel-in-progress: false

jobs:
  scan-and-dispatch:
    runs-on: ubuntu-latest
    steps:
      - name: Find ready arches per upstream commit and dispatch parity.yml
        env:
          GH_TOKEN: ${{ github.token }}
          UPSTREAM: pytorch/pytorch
          BRANCH: main
          # `inputs.*` is empty on schedule/pull_request events, so every value
          # repeats its workflow_dispatch default here. Pull requests scan a
          # smaller window and are always dry-run.
          MAX_COMMITS: ${{ github.event_name == 'pull_request' && '20' || inputs.max_commits || '200' }}
          MAX_DISPATCHES: ${{ github.event_name == 'pull_request' && '5' || inputs.max_dispatches || '50' }}
          MAX_AGE_HOURS: ${{ inputs.max_age_hours || '72' }}
          ARCHS_IN: ${{ inputs.archs || 'mi355, mi300, mi200' }}
          ARCH_JOBNAME_REGEX_MAP: ${{ inputs.arch_jobname_regex_map || '{"mi355":"rocm.*mi355.*/ test [(](default|distributed|inductor),","mi300":"rocm.*mi300.*/ test [(](default|distributed|inductor),","mi200":"(rocm.*(mi200|mi210).*/ test [(](default|distributed|inductor),|linux-jammy-rocm-py3[.]10 / test [(](default|distributed|inductor),)","navi31":"rocm.*navi31.*/ test [(]default,","nightly":"rocm-nightly.*/ test [(](default|distributed|inductor),"}' }}
          ARCH_WORKFLOW_REGEX_MAP: ${{ inputs.arch_workflow_regex_map || '{"mi355":"(^|/)(trunk|rocm-mi355|periodic-rocm-mi355|inductor-rocm-mi355)[.]yml$","mi300":"(^|/)(rocm-mi300|periodic-rocm-mi300|inductor-rocm-mi300)[.]yml$","mi200":"(^|/)(trunk-rocm-sandbox|rocm-mi200|periodic-rocm-mi200|inductor-rocm-mi200)[.]yml$","navi31":"(^|/)(rocm-navi31|periodic-rocm-navi31|inductor-rocm-navi31)[.]yml$","nightly":"(^|/)rocm-nightly[.]yml$"}' }}
          TARGET_REF_IN: ${{ inputs.target_ref || '' }}
          DRY_RUN: ${{ github.event_name == 'pull_request' && 'true' || inputs.dry_run || 'false' }}
        run: |
          # GitHub Actions runs this step with `bash -e {0}` by default, so
          # errexit is already on when the script starts. It's too aggressive
          # for the many pipelines here (grep -q returning 1, date -d edge
          # cases, paginated API calls, etc.) and has caused the loop to
          # silently abort after the first "no ready archs" commit. Explicitly
          # turn -e OFF and keep -u + pipefail so undefined-variable bugs still
          # surface. Because -e is off, every command whose failure matters is
          # checked explicitly below.
          set +e
          set -uo pipefail

          # The numeric knobs are free-form strings that feed `[ ... -lt ... ]`
          # and `$(( ... ))`. Reject anything that is not a plain non-negative
          # integer up front; otherwise the comparisons below fail and the
          # scan silently skips everything.
          for VAR in MAX_COMMITS MAX_DISPATCHES MAX_AGE_HOURS; do
            case "${!VAR}" in
              '' | *[!0-9]*)
                echo "::error::$VAR must be a non-negative integer (got '${!VAR}')"
                exit 1
                ;;
            esac
          done
          # Force base 10 so a value such as "08" is not parsed as bad octal.
          MAX_COMMITS=$((10#$MAX_COMMITS))
          MAX_DISPATCHES=$((10#$MAX_DISPATCHES))
          MAX_AGE_HOURS=$((10#$MAX_AGE_HOURS))

          # A malformed map would make every per-arch regex lookup come back
          # empty and every SHA be skipped without an obvious reason.
          for VAR in ARCH_JOBNAME_REGEX_MAP ARCH_WORKFLOW_REGEX_MAP; do
            if ! jq -e 'type == "object"' <<< "${!VAR}" >/dev/null 2>&1; then
              echo "::error::$VAR is not a valid JSON object"
              exit 1
            fi
          done

          # Fetch a paginated GitHub API listing and print it as ONE JSON array.
          #   $1 - API path (including query string)
          #   $2 - jq filter that emits one JSON value per listed item
          # Prints nothing and returns non-zero when the API call fails, so
          # callers can tell "no items" apart from "lookup failed" and never
          # act on a partially paginated result.
          gh_api_array() {
            local path="$1" filter="$2" raw
            raw=$(gh api --paginate "$path" --jq "$filter") || return 1
            jq -s '.' <<< "$raw"
          }

          NOW_EPOCH=$(date -u +%s)
          MAX_AGE_EPOCH=$((NOW_EPOCH - MAX_AGE_HOURS * 3600))
          TARGET_REF="${TARGET_REF_IN:-$GITHUB_REF_NAME}"
          ARCHS=$(echo "$ARCHS_IN" | tr ',' ' ' | xargs)

          echo "Upstream: $UPSTREAM@$BRANCH"
          echo "Target ref: $TARGET_REF"
          echo "Scope archs: $ARCHS"
          echo "Max trunk runs: $MAX_COMMITS"
          echo "Max dispatches: $MAX_DISPATCHES"
          echo "Max age: ${MAX_AGE_HOURS}h"
          echo "Dry run: $DRY_RUN"
          echo "Arch->jobs: $ARCH_JOBNAME_REGEX_MAP"
          echo "Arch->workflows: $ARCH_WORKFLOW_REGEX_MAP"
          echo

          # --- 1. Recent completed upstream trunk pushes -----------------------
          # Use trunk.yml as the candidate source instead of raw main commits.
          # The parity report consumes trunk's CUDA/ROCm jobs, so a completed
          # trunk push is the first point where a SHA can reasonably be ready.
          COMMITS_JSON='[]'
          PAGE=1
          while [ "$(jq 'length' <<< "$COMMITS_JSON")" -lt "$MAX_COMMITS" ]; do
            # A failed page must end the loop: COMMITS_JSON would never grow
            # and the loop would otherwise spin until the job times out.
            if ! PAGE_RUNS=$(gh api \
              "repos/$UPSTREAM/actions/workflows/trunk.yml/runs?branch=$BRANCH&event=push&status=completed&per_page=100&page=$PAGE" \
              --jq '.workflow_runs | map({head_sha, created_at})'); then
              echo "::warning::Failed to list trunk.yml runs (page $PAGE); continuing with the runs fetched so far"
              break
            fi
            # Empty page (or unparsable output) means there is nothing more to read.
            PAGE_LEN=$(jq 'length' <<< "$PAGE_RUNS" 2>/dev/null)
            if [ "${PAGE_LEN:-0}" -eq 0 ]; then
              break
            fi
            # Append the page, keep the first (newest) run per head_sha, and
            # truncate to MAX_COMMITS.
            COMMITS_JSON=$(jq -s --arg max "$MAX_COMMITS" '
              (.[0] + .[1]) as $runs
              | reduce $runs[] as $run ({seen:{}, rows:[]};
                  if .seen[$run.head_sha] then .
                  else .seen[$run.head_sha] = true | .rows += [$run]
                  end
                )
              | .rows[:($max | tonumber)]
            ' <(echo "$COMMITS_JSON") <(echo "$PAGE_RUNS"))
            PAGE=$((PAGE + 1))
          done
          COMMITS=$(echo "$COMMITS_JSON" | jq -r '.[] | "\(.head_sha) \(.created_at)"')

          if [ -z "$COMMITS" ]; then
            echo "::warning::No completed trunk.yml push runs returned from $UPSTREAM@$BRANCH"
            exit 0
          fi

          # --- 2. Already-dispatched SHAs in our repo --------------------------
          # Deduplicate auto-parity-created parity runs without changing
          # parity.yml's own output naming. New auto-dispatched runs are created
          # by github-actions[bot]; keep the old autoparity-* title match so
          # runs created before this workflow stopped passing csv_name still
          # suppress duplicate dispatches.
          SINCE=$(date -u -d "@$MAX_AGE_EPOCH" '+%Y-%m-%dT%H:%M:%SZ')
          if ! EXISTING=$(gh_api_array \
            "repos/$GITHUB_REPOSITORY/actions/workflows/parity.yml/runs?event=workflow_dispatch&created=%3E%3D$SINCE&per_page=100" \
            '.workflow_runs[] | {display_title, actor: .actor.login}'); then
            if [ "$DRY_RUN" = "true" ]; then
              # Nothing is dispatched in dry-run, so a missing list is harmless.
              echo "::warning::Could not list existing parity.yml runs; de-duplication is disabled for this dry run"
              EXISTING='[]'
            else
              # Without this list every ready SHA looks new and would be
              # dispatched again, so refuse to continue.
              echo "::error::Could not list existing parity.yml runs; refusing to dispatch without de-duplication"
              exit 1
            fi
          fi

          # Succeeds when an earlier auto-dispatched parity run already covers
          # the SHA given as $1.
          sha_already_dispatched() {
            local sha="$1"
            echo "$EXISTING" | jq -e --arg sha "$sha" \
              'any(.[]; ((.display_title // "") | contains($sha)) and (((.display_title // "") | startswith("autoparity-")) or (.actor == "github-actions[bot]")))' >/dev/null
          }

          # --- 3. Walk trunk SHAs, dispatch each ready unprocessed SHA ---------
          DISPATCHED_COUNT=0
          DISPATCHED_SUMMARY=""
          FAILED_DISPATCHES=0
          LOOKUP_FAILURES=0
          while IFS=' ' read -r SHA DATE; do
            [ -z "$SHA" ] && continue
            SHORT=$(echo "$SHA" | cut -c1-8)
            COMMIT_EPOCH=$(date -u -d "$DATE" +%s 2>/dev/null || echo 0)

            # Runs are scanned in the order the API returned them, so the
            # first too-old run ends the scan. An unparsable date (epoch 0)
            # is never treated as too old.
            if [ "$COMMIT_EPOCH" -ne 0 ] && [ "$COMMIT_EPOCH" -lt "$MAX_AGE_EPOCH" ]; then
              echo "[$SHORT] $DATE too old (>${MAX_AGE_HOURS}h) - stopping scan"
              break
            fi

            if sha_already_dispatched "$SHA"; then
              echo "[$SHORT] parity report already exists for this SHA - skip"
              continue
            fi

            # First determine which ROCm arch workflows actually ran on this
            # SHA. If a periodic arch workflow never ran, the arch is not
            # expected for the report. If it did run, we must wait for its
            # matching test shards below.
            if ! ALL_WORKFLOW_RUNS=$(gh_api_array \
              "repos/$UPSTREAM/actions/runs?head_sha=$SHA&per_page=100" \
              '.workflow_runs[] | {name,path,status,conclusion}'); then
              echo "::warning::[$SHORT] could not list upstream workflow runs - skip (retried on the next scan)"
              LOOKUP_FAILURES=$((LOOKUP_FAILURES + 1))
              continue
            fi

            RUN_ARCHS=""
            NOT_RUN_NOTES=""
            for ARCH in $ARCHS; do
              WF_REGEX=$(echo "$ARCH_WORKFLOW_REGEX_MAP" | jq -r --arg a "$ARCH" '.[$a] // ""')
              if [ -z "$WF_REGEX" ]; then
                NOT_RUN_NOTES="$NOT_RUN_NOTES $ARCH:no-workflow-regex"
                continue
              fi
              WF_TOTAL=$(echo "$ALL_WORKFLOW_RUNS" | jq --arg rx "$WF_REGEX" \
                'map(select((.path // "") | test($rx))) | length')
              if [ "$WF_TOTAL" -eq 0 ]; then
                NOT_RUN_NOTES="$NOT_RUN_NOTES $ARCH:no-workflow"
              else
                RUN_ARCHS="$RUN_ARCHS $ARCH"
              fi
            done
            RUN_ARCHS=$(echo "$RUN_ARCHS" | xargs)
            NOT_RUN_NOTES=$(echo "$NOT_RUN_NOTES" | xargs)

            if [ -z "$RUN_ARCHS" ]; then
              echo "[$SHORT] $DATE no in-scope ROCm workflows ran on upstream (${NOT_RUN_NOTES:-none}) - skip"
              continue
            fi

            # Pull relevant upstream check-runs for this SHA. Test shards post
            # check-runs independently, and workflow_run conclusion can flip to
            # failure before sibling shards finish. We need per-shard state.
            # A partial listing could look "all completed", so a failed lookup
            # skips the SHA instead of evaluating the gates on it.
            if ! ALL_CHECK_RUNS=$(gh_api_array \
              "repos/$UPSTREAM/commits/$SHA/check-runs?per_page=100" \
              '.check_runs[] | {name,status,conclusion}'); then
              echo "::warning::[$SHORT] could not list upstream check-runs - skip (retried on the next scan)"
              LOOKUP_FAILURES=$((LOOKUP_FAILURES + 1))
              continue
            fi

            # Collect the ROCm test shards of every arch that ran, ignoring
            # the mem_leak_check / rerun_disabled_tests variants.
            CHECK_RUNS='[]'
            for ARCH in $RUN_ARCHS; do
              REGEX=$(echo "$ARCH_JOBNAME_REGEX_MAP" | jq -r --arg a "$ARCH" '.[$a] // ""')
              [ -z "$REGEX" ] && continue
              ARCH_CHECK_RUNS=$(echo "$ALL_CHECK_RUNS" | jq --arg rx "$REGEX" \
                '[.[] | select((.name | test($rx)) and (.name | test("mem_leak_check|rerun_disabled_tests") | not))]')
              CHECK_RUNS=$(jq -s 'add | unique_by(.name)' \
                <(echo "$CHECK_RUNS") \
                <(echo "$ARCH_CHECK_RUNS"))
            done

            CUDA_JOBNAME_REGEX='(linux-jammy-cuda13[.]0-py3[.]10-gcc11 / (test-osdc|test) [(](default|distributed),|unit-test / inductor-test / (test-osdc|test) [(]inductor,)'
            CUDA_CHECK_RUNS=$(echo "$ALL_CHECK_RUNS" | jq --arg rx "$CUDA_JOBNAME_REGEX" \
              '[.[] | select((.name | test($rx)) and (.name | test("mem_leak_check|rerun_disabled_tests") | not))]')

            if [ "$(echo "$CHECK_RUNS" | jq 'length')" -eq 0 ]; then
              echo "[$SHORT] $DATE ROCm workflows ran ($RUN_ARCHS) but no parity check-runs yet - skip"
              continue
            fi

            if [ "$(echo "$CUDA_CHECK_RUNS" | jq 'length')" -eq 0 ]; then
              echo "[$SHORT] $DATE no CUDA parity check-runs yet on upstream - skip"
              continue
            fi

            # Gate 1: require EVERY upstream check-run consumed by the
            # parity report for this SHA to be status=completed (ROCm test
            # shards for arch workflows that ran, plus CUDA default/
            # distributed/inductor tests). Once we dispatch for a SHA the
            # parity report is authored, so dispatching before CUDA or
            # another arch finishes produces partial reports.
            GATE_CHECK_RUNS=$(jq -s 'add' \
              <(echo "$CHECK_RUNS") \
              <(echo "$CUDA_CHECK_RUNS"))
            TOTAL_CR=$(echo "$GATE_CHECK_RUNS" | jq 'length')
            PENDING_CR=$(echo "$GATE_CHECK_RUNS" | jq 'map(select(.status != "completed")) | length')
            if [ "$PENDING_CR" -ne 0 ]; then
              PENDING_SAMPLE=$(echo "$GATE_CHECK_RUNS" | jq -r '
                map(select(.status != "completed"))
                | .[0:3]
                | map(.name)
                | join(", ")')
              echo "[$SHORT] $DATE ${PENDING_CR}/${TOTAL_CR} parity check-runs still pending - skip (e.g. $PENDING_SAMPLE)"
              continue
            fi

            # Gate 2: every arch workflow that ran on this SHA must have
            # matching test shards before we author the one-and-only report
            # for the SHA. Missing arch workflows are not expected; missing
            # shards for a workflow that ran means the workflow is not ready.
            READY=""
            NOT_READY_NOTES=""
            for ARCH in $RUN_ARCHS; do
              REGEX=$(echo "$ARCH_JOBNAME_REGEX_MAP" | jq -r --arg a "$ARCH" '.[$a] // ""')
              if [ -z "$REGEX" ]; then
                NOT_READY_NOTES="$NOT_READY_NOTES $ARCH:no-regex"
                continue
              fi
              TOTAL=$(echo "$CHECK_RUNS" | jq --arg rx "$REGEX" \
                'map(select(.name | test($rx))) | length')
              if [ "$TOTAL" -eq 0 ]; then
                NOT_READY_NOTES="$NOT_READY_NOTES $ARCH:workflow-run-no-shards-yet"
              else
                READY="$READY $ARCH"
              fi
            done
            READY=$(echo "$READY" | xargs)
            NOT_READY_NOTES=$(echo "$NOT_READY_NOTES" | xargs)

            if [ -n "$NOT_READY_NOTES" ]; then
              echo "[$SHORT] $DATE ROCm workflows ran ($RUN_ARCHS) but some test shards are missing - skip (${NOT_READY_NOTES})"
              continue
            fi

            if [ -z "$READY" ]; then
              echo "[$SHORT] $DATE ROCm workflows ran ($RUN_ARCHS) but no in-scope arches are ready"
              continue
            fi

            ARCH_DISPATCH=$(echo "$READY" | sed 's/ /, /g')
            echo "[$SHORT] READY archs: '$(echo "$READY" | tr ' ' ',')' (committed $DATE; not-run: ${NOT_RUN_NOTES:-none})"
            echo "[$SHORT] dispatching for: '$(echo "$READY" | tr ' ' ',')'"

            if [ "$DRY_RUN" = "true" ]; then
              echo "[$SHORT] DRY_RUN=true - not dispatching"
            elif ! gh workflow run parity.yml \
              --repo "$GITHUB_REPOSITORY" \
              --ref "$TARGET_REF" \
              -f sha="$SHA" \
              -f arch="$ARCH_DISPATCH"; then
              # Do not count or summarise a dispatch that never happened; the
              # SHA stays undispatched and is picked up by the next scan.
              echo "::error::[$SHORT] failed to dispatch parity.yml for '$ARCH_DISPATCH'"
              FAILED_DISPATCHES=$((FAILED_DISPATCHES + 1))
              continue
            fi

            DISPATCHED_COUNT=$((DISPATCHED_COUNT + 1))
            DISPATCHED_SUMMARY="${DISPATCHED_SUMMARY}${SHORT}:${ARCH_DISPATCH}"$'\n'
            if [ "$DISPATCHED_COUNT" -ge "$MAX_DISPATCHES" ]; then
              echo "Reached max dispatches for this scan ($MAX_DISPATCHES); stopping"
              break
            fi
          done <<< "$COMMITS"

          # --- 4. Summary -------------------------------------------------------
          {
            echo "### Parity auto-trigger"
            echo ""
            echo "- Upstream: \`$UPSTREAM@$BRANCH\`"
            echo "- Scope archs: \`$ARCHS\`"
            echo "- Max commits: $MAX_COMMITS"
            echo "- Max dispatches: $MAX_DISPATCHES"
            echo "- Max age: ${MAX_AGE_HOURS}h"
            echo "- Target ref: \`$TARGET_REF\`"
            if [ "$DISPATCHED_COUNT" -gt 0 ]; then
              if [ "$DRY_RUN" = "true" ]; then
                echo "- Result: would dispatch $DISPATCHED_COUNT parity run(s) (dry-run)"
              else
                echo "- Result: dispatched $DISPATCHED_COUNT parity run(s)"
              fi
              echo ""
              echo "$DISPATCHED_SUMMARY" | while IFS= read -r LINE; do
                [ -z "$LINE" ] && continue
                echo "- $LINE"
              done
            else
              echo "- Result: no ready unprocessed SHAs found"
            fi
            if [ "$FAILED_DISPATCHES" -gt 0 ]; then
              echo ""
              echo "- Failed dispatches: $FAILED_DISPATCHES"
            fi
            if [ "$LOOKUP_FAILURES" -gt 0 ]; then
              echo ""
              echo "- SHAs skipped because an upstream lookup failed: $LOOKUP_FAILURES"
            fi
          } >> "$GITHUB_STEP_SUMMARY"

          # Surface dispatch failures as a failed job instead of a green run.
          if [ "$FAILED_DISPATCHES" -gt 0 ]; then
            exit 1
          fi