ROCm · ethanwee1 · May 18, 2026 · May 27, 2026 · May 27, 2026 · May 28, 2026
diff --git a/.github/workflows/parity-auto.yml b/.github/workflows/parity-auto.yml
@@ -0,0 +1,358 @@
+name: Parity Auto Trigger
+run-name: "Parity auto-trigger · pytorch/pytorch main"
+
+# Polls completed pytorch/pytorch trunk.yml pushes on main and dispatches
+# parity.yml once for each SHA where all CI consumed by the report has finished,
+# covering every arch whose test shards actually ran on it.
+#
+# Arch participation is detected at the workflow_run level: if rocm-mi300
+# never ran on a SHA, we don't wait for it or include it. Readiness is then
+# evaluated at the *check-run* level because ROCm test shards post check-runs
+# independently, and a single failing shard flips the parent workflow_run to
+# conclusion=failure while siblings are still executing.
+#
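+# Per-SHA checks (gate numbers match the "Gate N" comments in the script):
+#   - Detect which ROCm arch workflows actually ran on this SHA.
+#   - Gate 1: require every test check-run for those ROCm arch workflows, plus
+#     every CUDA test check-run consumed by download_testlogs, to be
+#     status=completed. We don't want to dispatch mi355 while a mi300 workflow
+#     that ran on the same SHA is still creating or running test shards.
+#   - Gate 2: require every arch whose workflow ran to have at least one
+#     matching test check-run; a workflow with no shards yet is not ready.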
+#
+# We dispatch at most once per SHA with the ready subset of arches, so mi355
+# (run as part of trunk) gets a parity report per commit, and mi300/
+# mi200 join the same dispatch whenever their periodic workflow
+# happens to finish on that SHA.
+
+# Triggers. The job's `env:` block supplies a literal fallback for every
+# input below, which is what runs without dispatch inputs end up using.
+on:
+  # Poll upstream every 10 minutes.
+  schedule:
+    - cron: '*/10 * * * *'
+  # Exercise the workflow when this file itself changes. The job's `env:`
+  # block forces dry-run and smaller scan limits for this event.
+  pull_request:
+    paths:
+      - '.github/workflows/parity-auto.yml'
+  # Manual run with optional overrides.
+  workflow_dispatch:
+    inputs:
+      max_commits:
+        description: 'How many of the most recent completed upstream trunk.yml pushes on main to scan.'
+        required: false
+        default: '200'
+        type: string
+      max_dispatches:
+        description: 'Maximum number of ready upstream commits to dispatch in one scan.'
+        required: false
+        default: '50'
+        type: string
+      max_age_hours:
+        description: 'Skip commits older than this (avoid back-filling ancient SHAs).'
+        required: false
+        default: '72'
+        type: string
+      archs:
+        description: 'Architectures to consider (comma/space separated).'
+        required: false
+        default: 'mi355, mi300, mi200'
+        type: string
+      # The two regex maps below are repeated verbatim as fallbacks in the
+      # job's `env:` block; keep both copies in sync. They also carry entries
+      # for navi31 and nightly, which are not in the default `archs` list.
+      arch_jobname_regex_map:
+        description: 'JSON: arch -> PCRE regex that matches the check-run names of that arch''s ROCm test shards on pytorch/pytorch. An arch is considered "ready" only when every check-run whose name matches has status=completed (so we wait for all test shards, not just workflow completion).'
+        required: false
+        default: '{"mi355":"rocm.*mi355.*/ test [(](default|distributed|inductor),","mi300":"rocm.*mi300.*/ test [(](default|distributed|inductor),","mi200":"(rocm.*(mi200|mi210).*/ test [(](default|distributed|inductor),|linux-jammy-rocm-py3[.]10 / test [(](default|distributed|inductor),)","navi31":"rocm.*navi31.*/ test [(]default,","nightly":"rocm-nightly.*/ test [(](default|distributed|inductor),"}'
+        type: string
+      arch_workflow_regex_map:
+        description: 'JSON: arch -> PCRE regex that matches workflow file paths for upstream ROCm workflows that mean this arch ran on the SHA. Missing workflows mean the arch is not expected for that commit.'
+        required: false
+        default: '{"mi355":"(^|/)(trunk|rocm-mi355|periodic-rocm-mi355|inductor-rocm-mi355)[.]yml$","mi300":"(^|/)(rocm-mi300|periodic-rocm-mi300|inductor-rocm-mi300)[.]yml$","mi200":"(^|/)(trunk-rocm-sandbox|rocm-mi200|periodic-rocm-mi200|inductor-rocm-mi200)[.]yml$","navi31":"(^|/)(rocm-navi31|periodic-rocm-navi31|inductor-rocm-navi31)[.]yml$","nightly":"(^|/)rocm-nightly[.]yml$"}'
+        type: string
+      # Blank means the script falls back to $GITHUB_REF_NAME.
+      target_ref:
+        description: 'Ref of this repo to dispatch parity.yml against. Leave blank to use this workflow run''s ref.'
+        required: false
+        default: ''
+        type: string
+      dry_run:
+        description: 'Scan and log, but do not actually dispatch parity.yml.'
+        required: false
+        default: false
+        type: boolean
+
+# The job's token is used by `gh api` to list this repo's parity.yml runs and
+# by `gh workflow run` to create a workflow_dispatch event; the latter is what
+# needs actions: write.
+permissions:
+  contents: read
+  actions: write
+
+# Every run shares one fixed group and an in-progress run is never cancelled,
+# so a new scan waits for the running one instead of overlapping with it.
+concurrency:
+  group: parity-auto-trigger
+  cancel-in-progress: false
+
+jobs:
+  # Single job: one bash step scans upstream and dispatches parity.yml.
+  scan-and-dispatch:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Find ready arches per upstream commit and dispatch parity.yml
+        env:
+          # Token consumed by every `gh` call in the script.
+          GH_TOKEN: ${{ github.token }}
+          UPSTREAM: pytorch/pytorch
+          BRANCH: main
+          # pull_request runs are capped at 20 scanned runs / 5 dispatches;
+          # other events use the dispatch input when set, else the literal
+          # fallback.
+          MAX_COMMITS: ${{ github.event_name == 'pull_request' && '20' || inputs.max_commits || '200' }}
+          MAX_DISPATCHES: ${{ github.event_name == 'pull_request' && '5' || inputs.max_dispatches || '50' }}
+          MAX_AGE_HOURS: ${{ inputs.max_age_hours || '72' }}
+          ARCHS_IN: ${{ inputs.archs || 'mi355, mi300, mi200' }}
+          # The two fallbacks below duplicate the workflow_dispatch input
+          # defaults of the same name; keep both copies in sync.
+          ARCH_JOBNAME_REGEX_MAP: ${{ inputs.arch_jobname_regex_map || '{"mi355":"rocm.*mi355.*/ test [(](default|distributed|inductor),","mi300":"rocm.*mi300.*/ test [(](default|distributed|inductor),","mi200":"(rocm.*(mi200|mi210).*/ test [(](default|distributed|inductor),|linux-jammy-rocm-py3[.]10 / test [(](default|distributed|inductor),)","navi31":"rocm.*navi31.*/ test [(]default,","nightly":"rocm-nightly.*/ test [(](default|distributed|inductor),"}' }}
+          ARCH_WORKFLOW_REGEX_MAP: ${{ inputs.arch_workflow_regex_map || '{"mi355":"(^|/)(trunk|rocm-mi355|periodic-rocm-mi355|inductor-rocm-mi355)[.]yml$","mi300":"(^|/)(rocm-mi300|periodic-rocm-mi300|inductor-rocm-mi300)[.]yml$","mi200":"(^|/)(trunk-rocm-sandbox|rocm-mi200|periodic-rocm-mi200|inductor-rocm-mi200)[.]yml$","navi31":"(^|/)(rocm-navi31|periodic-rocm-navi31|inductor-rocm-navi31)[.]yml$","nightly":"(^|/)rocm-nightly[.]yml$"}' }}
+          TARGET_REF_IN: ${{ inputs.target_ref || '' }}
+          # pull_request runs are always dry-run.
+          DRY_RUN: ${{ github.event_name == 'pull_request' && 'true' || inputs.dry_run || 'false' }}
+        run: |
+          # The default Actions shell for a `run` step is `bash -e {0}`, so
+          # errexit is already active when the first line executes. This script
+          # is full of commands whose non-zero status is expected (grep -q
+          # returning 1, date -d edge cases, paginated API calls, ...), and
+          # errexit has made the scan loop abort silently after the first
+          # "no ready archs" commit. Switch errexit OFF; keep nounset and
+          # pipefail so undefined-variable bugs still surface.
+          set +e -u -o pipefail
+
+          NOW_EPOCH=$(date -u '+%s')
+          MAX_AGE_EPOCH=$(( NOW_EPOCH - MAX_AGE_HOURS * 3600 ))
+          TARGET_REF=${TARGET_REF_IN:-$GITHUB_REF_NAME}
+          # Accept comma- and/or space-separated arch names; xargs squeezes
+          # the result into a single space-separated list.
+          ARCHS=$(tr ',' ' ' <<< "$ARCHS_IN" | xargs)
+
+          # Log the effective configuration, followed by one blank line.
+          printf '%s\n' \
+            "Upstream:       $UPSTREAM@$BRANCH" \
+            "Target ref:     $TARGET_REF" \
+            "Scope archs:    $ARCHS" \
+            "Max trunk runs: $MAX_COMMITS" \
+            "Max dispatches: $MAX_DISPATCHES" \
+            "Max age:        ${MAX_AGE_HOURS}h" \
+            "Dry run:        $DRY_RUN" \
+            "Arch->jobs:      $ARCH_JOBNAME_REGEX_MAP" \
+            "Arch->workflows: $ARCH_WORKFLOW_REGEX_MAP" \
+            ""
+
+          # --- 1. Recent completed upstream trunk pushes -----------------------
+          # Use trunk.yml as the candidate source instead of raw main commits.
+          # The parity report consumes trunk's CUDA/ROCm jobs, so a completed
+          # trunk push is the first point where a SHA can reasonably be ready.
+          #
+          # Pages of 100 runs are fetched until MAX_COMMITS distinct head SHAs
+          # have been collected or a page comes back empty. Runs are
+          # de-duplicated by head_sha, keeping the first run seen for each SHA.
+          COMMITS_JSON='[]'
+          PAGE=1
+          while [ "$(echo "$COMMITS_JSON" | jq 'length')" -lt "$MAX_COMMITS" ]; do
+            # errexit is off, so a failed request has to be handled here. An
+            # empty PAGE_RUNS would otherwise make the "empty page" test below
+            # error out instead of breaking, COMMITS_JSON would never grow, and
+            # the loop would spin until the job times out.
+            if ! PAGE_RUNS=$(gh api \
+              "repos/$UPSTREAM/actions/workflows/trunk.yml/runs?branch=$BRANCH&event=push&status=completed&per_page=100&page=$PAGE" \
+              --jq '.workflow_runs | map({head_sha, created_at})'); then
+              echo "::warning::Failed to list trunk.yml runs from $UPSTREAM (page $PAGE); continuing with the runs collected so far"
+              break
+            fi
+            # Treat a missing or unparseable length the same as an empty page.
+            PAGE_LEN=$(echo "$PAGE_RUNS" | jq 'length' 2>/dev/null)
+            if [ "${PAGE_LEN:-0}" -eq 0 ]; then
+              break
+            fi
+            COMMITS_JSON=$(jq -s --arg max "$MAX_COMMITS" '
+              (.[0] + .[1]) as $runs
+              | reduce $runs[] as $run ({seen:{}, rows:[]};
+                  if .seen[$run.head_sha] then .
+                  else .seen[$run.head_sha] = true | .rows += [$run]
+                  end
+                )
+              | .rows[:($max | tonumber)]
+            ' <(echo "$COMMITS_JSON") <(echo "$PAGE_RUNS"))
+            PAGE=$((PAGE + 1))
+          done
+          # One "<head_sha> <created_at>" line per candidate.
+          COMMITS=$(echo "$COMMITS_JSON" | jq -r '.[] | "\(.head_sha) \(.created_at)"')
+
+          if [ -z "$COMMITS" ]; then
+            echo "::warning::No completed trunk.yml push runs returned from $UPSTREAM@$BRANCH"
+            exit 0
+          fi
+
+          # --- 2. Already-dispatched SHAs in our repo --------------------------
+          # Deduplicate auto-parity-created parity runs without changing
+          # parity.yml's own output naming. New auto-dispatched runs are created
+          # by github-actions[bot]; keep the old autoparity-* title match so
+          # runs created before this workflow stopped passing csv_name still
+          # suppress duplicate dispatches.
+          #
+          # Only runs created inside the MAX_AGE_HOURS window are listed. The
+          # script relies on pipefail, so a failed or partially paginated
+          # `gh api` call fails the whole assignment. Without this list every
+          # SHA would look undispatched and be dispatched again, so abort the
+          # scan rather than continue without it.
+          if ! EXISTING=$(gh api --paginate \
+            "repos/$GITHUB_REPOSITORY/actions/workflows/parity.yml/runs?event=workflow_dispatch&created=%3E%3D$(date -u -d "@$MAX_AGE_EPOCH" '+%Y-%m-%dT%H:%M:%SZ')&per_page=100" \
+            --jq '.workflow_runs[] | {display_title, actor: .actor.login}' |
+            jq -s '.'); then
+            echo "::error::Failed to list existing parity.yml runs in $GITHUB_REPOSITORY; refusing to dispatch without the de-duplication list"
+            exit 1
+          fi
+
+          # Succeeds (exit 0) when EXISTING holds a run whose display title
+          # contains the full SHA and that either has a legacy "autoparity-"
+          # title or was created by github-actions[bot].
+          # NOTE(review): this assumes parity.yml's run title includes the
+          # dispatched SHA - confirm against parity.yml's run-name.
+          sha_already_dispatched() {
+            local sha="$1"
+            echo "$EXISTING" | jq -e --arg sha "$sha" \
+              'any(.[]; ((.display_title // "") | contains($sha)) and (((.display_title // "") | startswith("autoparity-")) or (.actor == "github-actions[bot]")))' >/dev/null
+          }
+
+          # --- 3. Walk trunk SHAs, dispatch each ready unprocessed SHA ---------
+          # For each candidate ("<sha> <created_at>" line of COMMITS): stop at
+          # the first one older than MAX_AGE_HOURS, skip SHAs that already have
+          # a parity run, detect which arches ran, then apply Gate 1 and Gate 2.
+          # An upstream API failure or a failed dispatch skips the SHA for this
+          # scan only; it stays undispatched, so a later scan retries it.
+          DISPATCHED_COUNT=0
+          DISPATCHED_SUMMARY=""
+          while IFS=' ' read -r SHA DATE; do
+            [ -z "$SHA" ] && continue
+            SHORT=$(echo "$SHA" | cut -c1-8)
+            # An unparseable timestamp yields 0, which bypasses the age check
+            # for this row instead of ending the scan.
+            COMMIT_EPOCH=$(date -u -d "$DATE" +%s 2>/dev/null || echo 0)
+
+            if [ "$COMMIT_EPOCH" -ne 0 ] && [ "$COMMIT_EPOCH" -lt "$MAX_AGE_EPOCH" ]; then
+              echo "[$SHORT] $DATE  too old (>${MAX_AGE_HOURS}h) - stopping scan"
+              break
+            fi
+
+            if sha_already_dispatched "$SHA"; then
+              echo "[$SHORT] parity report already exists for this SHA - skip"
+              continue
+            fi
+
+            # First determine which ROCm arch workflows actually ran on this
+            # SHA. If a periodic arch workflow never ran, the arch is not
+            # expected for the report. If it did run, we must wait for its
+            # matching test shards below.
+            #
+            # The script relies on pipefail, so a failed or partially paginated
+            # listing fails the assignment. Skip the SHA instead of deciding on
+            # incomplete data.
+            if ! ALL_WORKFLOW_RUNS=$(gh api --paginate \
+              "repos/$UPSTREAM/actions/runs?head_sha=$SHA&per_page=100" \
+              --jq '.workflow_runs[] | {name,path,status,conclusion}' \
+              2>/dev/null | jq -s '.'); then
+              echo "[$SHORT] $DATE  failed to list upstream workflow runs - skip (retried on a later scan)"
+              continue
+            fi
+
+            RUN_ARCHS=""
+            NOT_RUN_NOTES=""
+            for ARCH in $ARCHS; do
+              WF_REGEX=$(echo "$ARCH_WORKFLOW_REGEX_MAP" | jq -r --arg a "$ARCH" '.[$a] // ""')
+              if [ -z "$WF_REGEX" ]; then
+                NOT_RUN_NOTES="$NOT_RUN_NOTES $ARCH:no-workflow-regex"
+                continue
+              fi
+              WF_TOTAL=$(echo "$ALL_WORKFLOW_RUNS" | jq --arg rx "$WF_REGEX" \
+                'map(select((.path // "") | test($rx))) | length')
+              if [ "$WF_TOTAL" -eq 0 ]; then
+                NOT_RUN_NOTES="$NOT_RUN_NOTES $ARCH:no-workflow"
+              else
+                RUN_ARCHS="$RUN_ARCHS $ARCH"
+              fi
+            done
+            RUN_ARCHS=$(echo "$RUN_ARCHS" | xargs)
+            NOT_RUN_NOTES=$(echo "$NOT_RUN_NOTES" | xargs)
+
+            if [ -z "$RUN_ARCHS" ]; then
+              echo "[$SHORT] $DATE  no in-scope ROCm workflows ran on upstream (${NOT_RUN_NOTES:-none}) - skip"
+              continue
+            fi
+
+            # Pull relevant upstream check-runs for this SHA. Test shards post
+            # check-runs independently, and workflow_run conclusion can flip to
+            # failure before sibling shards finish. We need per-shard state.
+            #
+            # A partial list is dangerous here: the missing pages could hold
+            # the still-pending shards and Gate 1 would pass too early. Skip
+            # the SHA whenever the listing does not complete.
+            if ! ALL_CHECK_RUNS=$(gh api --paginate \
+              "repos/$UPSTREAM/commits/$SHA/check-runs?per_page=100" \
+              --jq '.check_runs[] | {name,status,conclusion}' \
+              2>/dev/null | jq -s '.'); then
+              echo "[$SHORT] $DATE  failed to list upstream check-runs - skip (retried on a later scan)"
+              continue
+            fi
+
+            # Union of the ROCm test check-runs of every arch that ran, minus
+            # mem_leak_check / rerun_disabled_tests variants, one per name.
+            CHECK_RUNS='[]'
+            for ARCH in $RUN_ARCHS; do
+              REGEX=$(echo "$ARCH_JOBNAME_REGEX_MAP" | jq -r --arg a "$ARCH" '.[$a] // ""')
+              [ -z "$REGEX" ] && continue
+              ARCH_CHECK_RUNS=$(echo "$ALL_CHECK_RUNS" | jq --arg rx "$REGEX" \
+                '[.[] | select((.name | test($rx)) and (.name | test("mem_leak_check|rerun_disabled_tests") | not))]')
+              CHECK_RUNS=$(jq -s 'add | unique_by(.name)' \
+                <(echo "$CHECK_RUNS") \
+                <(echo "$ARCH_CHECK_RUNS"))
+            done
+
+            CUDA_JOBNAME_REGEX='(linux-jammy-cuda13[.]0-py3[.]10-gcc11 / (test-osdc|test) [(](default|distributed),|unit-test / inductor-test / (test-osdc|test) [(]inductor,)'
+            CUDA_CHECK_RUNS=$(echo "$ALL_CHECK_RUNS" | jq --arg rx "$CUDA_JOBNAME_REGEX" \
+              '[.[] | select((.name | test($rx)) and (.name | test("mem_leak_check|rerun_disabled_tests") | not))]')
+
+            if [ "$(echo "$CHECK_RUNS" | jq 'length')" -eq 0 ]; then
+              echo "[$SHORT] $DATE  ROCm workflows ran ($RUN_ARCHS) but no parity check-runs yet - skip"
+              continue
+            fi
+
+            if [ "$(echo "$CUDA_CHECK_RUNS" | jq 'length')" -eq 0 ]; then
+              echo "[$SHORT] $DATE  no CUDA parity check-runs yet on upstream - skip"
+              continue
+            fi
+
+            # Gate 1: require EVERY upstream check-run consumed by the
+            # parity report for this SHA to be status=completed (ROCm test
+            # shards for arch workflows that ran, plus CUDA default/
+            # distributed/inductor tests). Once we dispatch for a SHA the
+            # parity report is authored, so dispatching before CUDA or
+            # another arch finishes produces partial reports.
+            GATE_CHECK_RUNS=$(jq -s 'add' \
+              <(echo "$CHECK_RUNS") \
+              <(echo "$CUDA_CHECK_RUNS"))
+            TOTAL_CR=$(echo "$GATE_CHECK_RUNS" | jq 'length')
+            PENDING_CR=$(echo "$GATE_CHECK_RUNS" | jq 'map(select(.status != "completed")) | length')
+            if [ "$PENDING_CR" -ne 0 ]; then
+              # Name up to three pending check-runs in the log line.
+              PENDING_SAMPLE=$(echo "$GATE_CHECK_RUNS" | jq -r '
+                map(select(.status != "completed"))
+                | .[0:3]
+                | map(.name)
+                | join(", ")')
+              echo "[$SHORT] $DATE  ${PENDING_CR}/${TOTAL_CR} parity check-runs still pending - skip (e.g. $PENDING_SAMPLE)"
+              continue
+            fi
+
+            # Gate 2: every arch workflow that ran on this SHA must have
+            # matching test shards before we author the one-and-only report
+            # for the SHA. Missing arch workflows are not expected; missing
+            # shards for a workflow that ran means the workflow is not ready.
+            READY=""
+            NOT_READY_NOTES=""
+            for ARCH in $RUN_ARCHS; do
+              REGEX=$(echo "$ARCH_JOBNAME_REGEX_MAP" | jq -r --arg a "$ARCH" '.[$a] // ""')
+              if [ -z "$REGEX" ]; then
+                NOT_READY_NOTES="$NOT_READY_NOTES $ARCH:no-regex"
+                continue
+              fi
+              TOTAL=$(echo "$CHECK_RUNS" | jq --arg rx "$REGEX" \
+                'map(select(.name | test($rx))) | length')
+              if [ "$TOTAL" -eq 0 ]; then
+                NOT_READY_NOTES="$NOT_READY_NOTES $ARCH:workflow-run-no-shards-yet"
+              else
+                READY="$READY $ARCH"
+              fi
+            done
+            READY=$(echo "$READY" | xargs)
+            NOT_READY_NOTES=$(echo "$NOT_READY_NOTES" | xargs)
+
+            if [ -n "$NOT_READY_NOTES" ]; then
+              echo "[$SHORT] $DATE  ROCm workflows ran ($RUN_ARCHS) but some test shards are missing - skip (${NOT_READY_NOTES})"
+              continue
+            fi
+
+            # Defensive: with RUN_ARCHS non-empty and no not-ready notes, READY
+            # cannot be empty, but keep the guard so an empty arch list is
+            # never dispatched.
+            if [ -z "$READY" ]; then
+              echo "[$SHORT] $DATE  ROCm workflows ran ($RUN_ARCHS) but no in-scope arches are ready"
+              continue
+            fi
+
+            # parity.yml receives the arch list as "a, b, c".
+            ARCH_DISPATCH=$(echo "$READY" | sed 's/ /, /g')
+            echo "[$SHORT] READY archs: '$(echo "$READY" | tr ' ' ',')' (committed $DATE; not-run: ${NOT_RUN_NOTES:-none})"
+            echo "[$SHORT] dispatching for: '$(echo "$READY" | tr ' ' ',')'"
+
+            # errexit is off, so the dispatch status must be checked
+            # explicitly; a failed dispatch must not be counted or reported as
+            # dispatched.
+            if [ "$DRY_RUN" = "true" ]; then
+              echo "[$SHORT] DRY_RUN=true - not dispatching"
+            elif ! gh workflow run parity.yml \
+              --repo "$GITHUB_REPOSITORY" \
+              --ref  "$TARGET_REF" \
+              -f sha="$SHA" \
+              -f arch="$ARCH_DISPATCH"; then
+              echo "::warning::[$SHORT] gh workflow run parity.yml failed for archs '$ARCH_DISPATCH' - not counted; a later scan will retry"
+              continue
+            fi
+
+            DISPATCHED_COUNT=$((DISPATCHED_COUNT + 1))
+            DISPATCHED_SUMMARY="${DISPATCHED_SUMMARY}${SHORT}:${ARCH_DISPATCH}"$'\n'
+            if [ "$DISPATCHED_COUNT" -ge "$MAX_DISPATCHES" ]; then
+              echo "Reached max dispatches for this scan ($MAX_DISPATCHES); stopping"
+              break
+            fi
+          done <<< "$COMMITS"
+
+          # --- 4. Summary -------------------------------------------------------
+          # Append a Markdown recap of this scan to the job summary: the
+          # effective settings, then either one bullet per dispatched SHA or a
+          # note that nothing was ready.
+          {
+            printf '%s\n' \
+              "### Parity auto-trigger" \
+              "" \
+              "- Upstream:     \`$UPSTREAM@$BRANCH\`" \
+              "- Scope archs:  \`$ARCHS\`" \
+              "- Max commits:  $MAX_COMMITS" \
+              "- Max dispatches: $MAX_DISPATCHES" \
+              "- Max age:      ${MAX_AGE_HOURS}h" \
+              "- Target ref:   \`$TARGET_REF\`"
+            if [ "$DISPATCHED_COUNT" -le 0 ]; then
+              printf '%s\n' "- Result:       no ready unprocessed SHAs found"
+            else
+              if [ "$DRY_RUN" != "true" ]; then
+                printf '%s\n' "- Result:       dispatched $DISPATCHED_COUNT parity run(s)"
+              else
+                printf '%s\n' "- Result:       would dispatch $DISPATCHED_COUNT parity run(s) (dry-run)"
+              fi
+              printf '\n'
+              # DISPATCHED_SUMMARY holds one "<short-sha>:<archs>" entry per
+              # line; blank lines are dropped.
+              while IFS= read -r ENTRY; do
+                [ -n "$ENTRY" ] || continue
+                printf '%s\n' "- $ENTRY"
+              done <<< "$DISPATCHED_SUMMARY"
+            fi
+          } >> "$GITHUB_STEP_SUMMARY"