Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
358 changes: 358 additions & 0 deletions .github/workflows/parity-auto.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,358 @@
# Workflow display name, followed by the fixed title shown for every run.
name: 'Parity Auto Trigger'
run-name: 'Parity auto-trigger · pytorch/pytorch main'

# Polls completed pytorch/pytorch trunk.yml pushes on main and dispatches
# parity.yml once for each SHA where all CI consumed by the report has finished,
# covering every arch whose test shards actually ran on it.
#
# Arch participation is detected at the workflow_run level: if rocm-mi300
# never ran on a SHA, we don't wait for it or include it. Readiness is then
# evaluated at the *check-run* level because ROCm test shards post check-runs
# independently, and a single failing shard flips the parent workflow_run to
# conclusion=failure while siblings are still executing.
#
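# Three checks, applied to each SHA in this order:
#   1. Detect which ROCm arch workflows actually ran on this SHA.
#   2. Require every test check-run for those ROCm arch workflows, plus every
#      CUDA test check-run consumed by download_testlogs, to be status=completed.
#      We don't want to dispatch mi355 while a mi300 workflow that ran on the
#      same SHA is still creating or running test shards.
#   3. Require each arch whose workflow ran to have at least one matching test
#      check-run; a workflow that ran but has not posted shards is not ready.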
#
# We dispatch at most once per SHA with the ready subset of arches, so mi355
# (run as part of trunk) gets a parity report per commit, and mi300/
# mi200 join the same dispatch whenever their periodic workflow
# happens to finish on that SHA.

on:
  # Poll upstream every 10 minutes.
  schedule:
    - cron: '*/10 * * * *'
  # Also run on pull requests that touch this workflow file.
  pull_request:
    paths:
      - '.github/workflows/parity-auto.yml'
  # Manual runs: every input is optional and arrives as a string unless noted.
  workflow_dispatch:
    inputs:
      max_commits:
        description: >-
          How many of the most recent completed upstream trunk.yml pushes on
          main to scan.
        required: false
        default: '200'
        type: string
      max_dispatches:
        description: >-
          Maximum number of ready upstream commits to dispatch in one scan.
        required: false
        default: '50'
        type: string
      max_age_hours:
        description: >-
          Skip commits older than this (avoid back-filling ancient SHAs).
        required: false
        default: '72'
        type: string
      archs:
        description: 'Architectures to consider (comma/space separated).'
        required: false
        default: 'mi355, mi300, mi200'
        type: string
      arch_jobname_regex_map:
        description: >-
          JSON: arch -> PCRE regex that matches the check-run names of that
          arch's ROCm test shards on pytorch/pytorch. An arch is considered
          "ready" only when every check-run whose name matches has
          status=completed (so we wait for all test shards, not just workflow
          completion).
        required: false
        default: '{"mi355":"rocm.*mi355.*/ test [(](default|distributed|inductor),","mi300":"rocm.*mi300.*/ test [(](default|distributed|inductor),","mi200":"(rocm.*(mi200|mi210).*/ test [(](default|distributed|inductor),|linux-jammy-rocm-py3[.]10 / test [(](default|distributed|inductor),)","navi31":"rocm.*navi31.*/ test [(]default,","nightly":"rocm-nightly.*/ test [(](default|distributed|inductor),"}'
        type: string
      arch_workflow_regex_map:
        description: >-
          JSON: arch -> PCRE regex that matches workflow file paths for upstream
          ROCm workflows that mean this arch ran on the SHA. Missing workflows
          mean the arch is not expected for that commit.
        required: false
        default: '{"mi355":"(^|/)(trunk|rocm-mi355|periodic-rocm-mi355|inductor-rocm-mi355)[.]yml$","mi300":"(^|/)(rocm-mi300|periodic-rocm-mi300|inductor-rocm-mi300)[.]yml$","mi200":"(^|/)(trunk-rocm-sandbox|rocm-mi200|periodic-rocm-mi200|inductor-rocm-mi200)[.]yml$","navi31":"(^|/)(rocm-navi31|periodic-rocm-navi31|inductor-rocm-navi31)[.]yml$","nightly":"(^|/)rocm-nightly[.]yml$"}'
        type: string
      target_ref:
        description: "Ref of this repo to dispatch parity.yml against. Leave blank to use this workflow run's ref."
        required: false
        default: ''
        type: string
      dry_run:
        description: 'Scan and log, but do not actually dispatch parity.yml.'
        required: false
        default: false
        type: boolean

# Token scopes for the job: repository contents are read-only, while the
# Actions scope is writable (the script dispatches parity.yml with this token).
permissions:
  actions: write
  contents: read

# Every run joins the same concurrency group, and a run that is already in
# progress is never cancelled in favour of a newer one.
concurrency:
  cancel-in-progress: false
  group: parity-auto-trigger

jobs:
  scan-and-dispatch:
    runs-on: ubuntu-latest
    steps:
      - name: Find ready arches per upstream commit and dispatch parity.yml
        # All configuration reaches the script as environment variables. Each
        # `||` chain ends in the same default the workflow_dispatch input
        # declares, so runs without inputs still get a value.
        env:
          # Credentials and the upstream being watched.
          GH_TOKEN: ${{ github.token }}
          UPSTREAM: 'pytorch/pytorch'
          BRANCH: 'main'
          # Where parity.yml is dispatched; empty means "decide in the script".
          TARGET_REF_IN: ${{ inputs.target_ref || '' }}
          # Scan limits. Pull-request runs use smaller caps (20 commits, 5
          # dispatches) and are always forced into dry-run mode.
          MAX_COMMITS: ${{ github.event_name == 'pull_request' && '20' || inputs.max_commits || '200' }}
          MAX_DISPATCHES: ${{ github.event_name == 'pull_request' && '5' || inputs.max_dispatches || '50' }}
          MAX_AGE_HOURS: ${{ inputs.max_age_hours || '72' }}
          DRY_RUN: ${{ github.event_name == 'pull_request' && 'true' || inputs.dry_run || 'false' }}
          # Arch scope and the two arch -> regex JSON maps.
          ARCHS_IN: ${{ inputs.archs || 'mi355, mi300, mi200' }}
          ARCH_JOBNAME_REGEX_MAP: ${{ inputs.arch_jobname_regex_map || '{"mi355":"rocm.*mi355.*/ test [(](default|distributed|inductor),","mi300":"rocm.*mi300.*/ test [(](default|distributed|inductor),","mi200":"(rocm.*(mi200|mi210).*/ test [(](default|distributed|inductor),|linux-jammy-rocm-py3[.]10 / test [(](default|distributed|inductor),)","navi31":"rocm.*navi31.*/ test [(]default,","nightly":"rocm-nightly.*/ test [(](default|distributed|inductor),"}' }}
          ARCH_WORKFLOW_REGEX_MAP: ${{ inputs.arch_workflow_regex_map || '{"mi355":"(^|/)(trunk|rocm-mi355|periodic-rocm-mi355|inductor-rocm-mi355)[.]yml$","mi300":"(^|/)(rocm-mi300|periodic-rocm-mi300|inductor-rocm-mi300)[.]yml$","mi200":"(^|/)(trunk-rocm-sandbox|rocm-mi200|periodic-rocm-mi200|inductor-rocm-mi200)[.]yml$","navi31":"(^|/)(rocm-navi31|periodic-rocm-navi31|inductor-rocm-navi31)[.]yml$","nightly":"(^|/)rocm-nightly[.]yml$"}' }}
        run: |
# A step without an explicit `shell:` is launched by GitHub Actions as
# `bash -e {0}`, so errexit is already on when this script starts (it comes
# from that invocation, not from a shebang). errexit is too aggressive for
# the many pipelines here (grep -q returning 1, date -d edge cases,
# paginated API calls, etc.) and has caused the loop to silently abort
# after the first "no ready archs" commit. Explicitly turn -e OFF and keep
# -u + pipefail so undefined-variable bugs still surface.
set +e
set -uo pipefail

# Validate configuration up front. Without this, a non-numeric limit only
# shows up later as a confusing arithmetic / `[ -lt ]` error, and a malformed
# regex map makes every arch lookup return empty so each SHA is skipped.
UINT_RE='^(0|[1-9][0-9]*)$'
for NUMERIC_VAR in MAX_COMMITS MAX_DISPATCHES MAX_AGE_HOURS; do
  if ! [[ "${!NUMERIC_VAR}" =~ $UINT_RE ]]; then
    echo "::error::$NUMERIC_VAR must be a non-negative integer without leading zeros, got '${!NUMERIC_VAR}'"
    exit 1
  fi
done
for MAP_VAR in ARCH_JOBNAME_REGEX_MAP ARCH_WORKFLOW_REGEX_MAP; do
  if ! jq -e 'type == "object"' >/dev/null 2>&1 <<< "${!MAP_VAR}"; then
    echo "::error::$MAP_VAR must be a JSON object mapping arch -> regex"
    exit 1
  fi
done

NOW_EPOCH=$(date -u +%s)
# Oldest run creation time (epoch seconds) that is still in scope.
MAX_AGE_EPOCH=$((NOW_EPOCH - MAX_AGE_HOURS * 3600))
# A blank target_ref input falls back to the ref this workflow run is on.
TARGET_REF="${TARGET_REF_IN:-$GITHUB_REF_NAME}"
# Normalise "a, b,c" into a single-space separated word list.
ARCHS=$(echo "$ARCHS_IN" | tr ',' ' ' | xargs)
if [ -z "$ARCHS" ]; then
  echo "::error::archs resolved to an empty list (got '$ARCHS_IN')"
  exit 1
fi

echo "Upstream: $UPSTREAM@$BRANCH"
echo "Target ref: $TARGET_REF"
echo "Scope archs: $ARCHS"
echo "Max trunk runs: $MAX_COMMITS"
echo "Max dispatches: $MAX_DISPATCHES"
echo "Max age: ${MAX_AGE_HOURS}h"
echo "Dry run: $DRY_RUN"
echo "Arch->jobs: $ARCH_JOBNAME_REGEX_MAP"
echo "Arch->workflows: $ARCH_WORKFLOW_REGEX_MAP"
echo

# --- 1. Recent completed upstream trunk pushes -----------------------
# Use trunk.yml as the candidate source instead of raw main commits.
# The parity report consumes trunk's CUDA/ROCm jobs, so a completed
# trunk push is the first point where a SHA can reasonably be ready.
#
# Pages are merged into COMMITS_JSON, keeping the first run seen for each
# head_sha and truncating to MAX_COMMITS. The loop ends when enough SHAs are
# collected, when a page comes back empty, or when a request fails. A failed
# request must end the loop explicitly: errexit is off, so otherwise an
# empty PAGE_RUNS would neither break nor grow COMMITS_JSON and the loop
# would keep calling the API forever.
COMMITS_JSON='[]'
PAGE=1
while [ "$(echo "$COMMITS_JSON" | jq 'length')" -lt "$MAX_COMMITS" ]; do
  if ! PAGE_RUNS=$(gh api \
    "repos/$UPSTREAM/actions/workflows/trunk.yml/runs?branch=$BRANCH&event=push&status=completed&per_page=100&page=$PAGE" \
    --jq '.workflow_runs | map({head_sha, created_at})'); then
    echo "::warning::Listing trunk.yml runs failed on page $PAGE; continuing with the SHAs collected so far"
    break
  fi

  PAGE_COUNT=$(echo "$PAGE_RUNS" | jq 'length' 2>/dev/null)
  if [ -z "$PAGE_COUNT" ] || [ "$PAGE_COUNT" -eq 0 ]; then
    break
  fi

  # Only replace the accumulator when the merge itself succeeded, so a jq
  # failure cannot wipe the SHAs already collected.
  if ! MERGED_JSON=$(jq -s --arg max "$MAX_COMMITS" '
    (.[0] + .[1]) as $runs
    | reduce $runs[] as $run ({seen:{}, rows:[]};
        if .seen[$run.head_sha] then .
        else .seen[$run.head_sha] = true | .rows += [$run]
        end
      )
    | .rows[:($max | tonumber)]
  ' <(echo "$COMMITS_JSON") <(echo "$PAGE_RUNS")); then
    echo "::warning::Could not merge trunk.yml runs from page $PAGE; continuing with the SHAs collected so far"
    break
  fi
  COMMITS_JSON="$MERGED_JSON"
  PAGE=$((PAGE + 1))
done
# One "<sha> <created_at>" line per candidate, in the order the API returned.
COMMITS=$(echo "$COMMITS_JSON" | jq -r '.[] | "\(.head_sha) \(.created_at)"')

if [ -z "$COMMITS" ]; then
  echo "::warning::No completed trunk.yml push runs returned from $UPSTREAM@$BRANCH"
  exit 0
fi

# --- 2. Already-dispatched SHAs in our repo --------------------------
# Deduplicate auto-parity-created parity runs without changing
# parity.yml's own output naming. New auto-dispatched runs are created
# by github-actions[bot]; keep the old autoparity-* title match so
# runs created before this workflow stopped passing csv_name still
# suppress duplicate dispatches.
#
# The listing is limited to runs created inside the max-age window. If it
# cannot be fetched, a real run stops here: continuing with an empty list
# would make every ready SHA look new and dispatch it again. Dry runs only
# log, so they continue with an empty list.
CREATED_SINCE=$(date -u -d "@$MAX_AGE_EPOCH" '+%Y-%m-%dT%H:%M:%SZ')
EXISTING=''
if EXISTING_RAW=$(gh api --paginate \
  "repos/$GITHUB_REPOSITORY/actions/workflows/parity.yml/runs?event=workflow_dispatch&created=%3E%3D${CREATED_SINCE}&per_page=100" \
  --jq '.workflow_runs[] | {display_title, actor: .actor.login}'); then
  # jq -s prints [] for empty input, so EXISTING stays empty only on failure.
  EXISTING=$(printf '%s' "$EXISTING_RAW" | jq -s '.' 2>/dev/null)
fi
if [ -z "$EXISTING" ]; then
  if [ "$DRY_RUN" = "true" ]; then
    echo "::warning::Could not list existing parity.yml runs in $GITHUB_REPOSITORY; dry run continues without de-duplication"
    EXISTING='[]'
  else
    echo "::error::Could not list existing parity.yml runs in $GITHUB_REPOSITORY; refusing to dispatch without de-duplication data"
    exit 1
  fi
fi

# Succeeds (exit 0) when EXISTING holds a run whose display_title contains
# the given SHA and that is either titled autoparity-* or was started by
# github-actions[bot].
#   $1 - full commit SHA to look for
sha_already_dispatched() {
  local sha="$1"
  echo "$EXISTING" | jq -e --arg sha "$sha" \
    'any(.[]; ((.display_title // "") | contains($sha)) and (((.display_title // "") | startswith("autoparity-")) or (.actor == "github-actions[bot]")))' >/dev/null
}

# --- 3. Walk trunk SHAs, dispatch each ready unprocessed SHA ---------

# Print one JSON array holding every item of a paginated upstream listing.
#   $1 - gh api endpoint, query string included
#   $2 - jq filter that emits one object per listed item
# Returns non-zero when the API call fails. The fetch and the slurp are
# separate commands so a failed or truncated fetch is never hidden behind a
# successful slurp, and no fallback document is appended to the output.
fetch_upstream_list() {
  local endpoint="$1" filter="$2" raw
  raw=$(gh api --paginate "$endpoint" --jq "$filter" 2>/dev/null) || return 1
  printf '%s' "$raw" | jq -s '.'
}

# Loop-invariant patterns. Check-runs whose name matches the exclusion
# pattern are left out of every gate below.
EXCLUDED_CHECK_RUN_REGEX='mem_leak_check|rerun_disabled_tests'
CUDA_JOBNAME_REGEX='(linux-jammy-cuda13[.]0-py3[.]10-gcc11 / (test-osdc|test) [(](default|distributed),|unit-test / inductor-test / (test-osdc|test) [(]inductor,)'

DISPATCHED_COUNT=0
DISPATCHED_SUMMARY=""
while IFS=' ' read -r SHA DATE; do
  [ -z "$SHA" ] && continue
  SHORT=${SHA:0:8}
  # An unparseable timestamp becomes 0, which disables the age check below.
  COMMIT_EPOCH=$(date -u -d "$DATE" +%s 2>/dev/null || echo 0)

  if [ "$COMMIT_EPOCH" -ne 0 ] && [ "$COMMIT_EPOCH" -lt "$MAX_AGE_EPOCH" ]; then
    echo "[$SHORT] $DATE too old (>${MAX_AGE_HOURS}h) - stopping scan"
    break
  fi

  if sha_already_dispatched "$SHA"; then
    echo "[$SHORT] parity report already exists for this SHA - skip"
    continue
  fi

  # First determine which ROCm arch workflows actually ran on this
  # SHA. If a periodic arch workflow never ran, the arch is not
  # expected for the report. If it did run, we must wait for its
  # matching test shards below.
  if ! ALL_WORKFLOW_RUNS=$(fetch_upstream_list \
    "repos/$UPSTREAM/actions/runs?head_sha=$SHA&per_page=100" \
    '.workflow_runs[] | {name,path,status,conclusion}'); then
    echo "[$SHORT] $DATE could not list upstream workflow runs - skip (retried on the next scan)"
    continue
  fi

  RUN_ARCHS=""
  NOT_RUN_NOTES=""
  for ARCH in $ARCHS; do
    WF_REGEX=$(echo "$ARCH_WORKFLOW_REGEX_MAP" | jq -r --arg a "$ARCH" '.[$a] // ""')
    if [ -z "$WF_REGEX" ]; then
      NOT_RUN_NOTES="$NOT_RUN_NOTES $ARCH:no-workflow-regex"
      continue
    fi
    # Empty output means jq could not evaluate the regex; record that rather
    # than letting an empty string fall through the integer comparison.
    WF_TOTAL=$(echo "$ALL_WORKFLOW_RUNS" | jq --arg rx "$WF_REGEX" \
      'map(select((.path // "") | test($rx))) | length' 2>/dev/null)
    if [ -z "$WF_TOTAL" ]; then
      NOT_RUN_NOTES="$NOT_RUN_NOTES $ARCH:bad-workflow-regex"
    elif [ "$WF_TOTAL" -eq 0 ]; then
      NOT_RUN_NOTES="$NOT_RUN_NOTES $ARCH:no-workflow"
    else
      RUN_ARCHS="$RUN_ARCHS $ARCH"
    fi
  done
  RUN_ARCHS=$(echo "$RUN_ARCHS" | xargs)
  NOT_RUN_NOTES=$(echo "$NOT_RUN_NOTES" | xargs)

  if [ -z "$RUN_ARCHS" ]; then
    echo "[$SHORT] $DATE no in-scope ROCm workflows ran on upstream (${NOT_RUN_NOTES:-none}) - skip"
    continue
  fi

  # Pull relevant upstream check-runs for this SHA. Test shards post
  # check-runs independently, and workflow_run conclusion can flip to
  # failure before sibling shards finish. We need per-shard state, and we
  # need the complete listing: a truncated one could hide pending shards.
  if ! ALL_CHECK_RUNS=$(fetch_upstream_list \
    "repos/$UPSTREAM/commits/$SHA/check-runs?per_page=100" \
    '.check_runs[] | {name,status,conclusion}'); then
    echo "[$SHORT] $DATE could not list upstream check-runs - skip (retried on the next scan)"
    continue
  fi

  # Union of the ROCm check-runs matched by each participating arch. Exact
  # duplicates (overlapping arch regexes) are dropped with `unique`; entries
  # that share a name but differ in status are all kept, so a pending
  # check-run cannot be collapsed into a completed one of the same name.
  CHECK_RUNS='[]'
  for ARCH in $RUN_ARCHS; do
    REGEX=$(echo "$ARCH_JOBNAME_REGEX_MAP" | jq -r --arg a "$ARCH" '.[$a] // ""')
    [ -z "$REGEX" ] && continue
    ARCH_CHECK_RUNS=$(echo "$ALL_CHECK_RUNS" | jq --arg rx "$REGEX" --arg skip "$EXCLUDED_CHECK_RUN_REGEX" \
      '[.[] | select((.name | test($rx)) and (.name | test($skip) | not))]' 2>/dev/null)
    CHECK_RUNS=$(jq -s 'add | unique' \
      <(echo "$CHECK_RUNS") \
      <(echo "${ARCH_CHECK_RUNS:-[]}"))
  done

  CUDA_CHECK_RUNS=$(echo "$ALL_CHECK_RUNS" | jq --arg rx "$CUDA_JOBNAME_REGEX" --arg skip "$EXCLUDED_CHECK_RUN_REGEX" \
    '[.[] | select((.name | test($rx)) and (.name | test($skip) | not))]')

  if [ "$(echo "$CHECK_RUNS" | jq 'length')" -eq 0 ]; then
    echo "[$SHORT] $DATE ROCm workflows ran ($RUN_ARCHS) but no parity check-runs yet - skip"
    continue
  fi

  if [ "$(echo "$CUDA_CHECK_RUNS" | jq 'length')" -eq 0 ]; then
    echo "[$SHORT] $DATE no CUDA parity check-runs yet on upstream - skip"
    continue
  fi

  # Completion check: require EVERY upstream check-run consumed by the
  # parity report for this SHA to be status=completed (ROCm test
  # shards for arch workflows that ran, plus CUDA default/
  # distributed/inductor tests). Once we dispatch for a SHA the
  # parity report is authored, so dispatching before CUDA or
  # another arch finishes produces partial reports.
  GATE_CHECK_RUNS=$(jq -s 'add' \
    <(echo "$CHECK_RUNS") \
    <(echo "$CUDA_CHECK_RUNS"))
  TOTAL_CR=$(echo "$GATE_CHECK_RUNS" | jq 'length')
  PENDING_CR=$(echo "$GATE_CHECK_RUNS" | jq 'map(select(.status != "completed")) | length')
  if [ "$PENDING_CR" -ne 0 ]; then
    PENDING_SAMPLE=$(echo "$GATE_CHECK_RUNS" | jq -r '
      map(select(.status != "completed"))
      | .[0:3]
      | map(.name)
      | join(", ")')
    echo "[$SHORT] $DATE ${PENDING_CR}/${TOTAL_CR} parity check-runs still pending - skip (e.g. $PENDING_SAMPLE)"
    continue
  fi

  # Shard-presence check: every arch workflow that ran on this SHA must
  # have matching test shards before we author the one-and-only report
  # for the SHA. Missing arch workflows are not expected; missing
  # shards for a workflow that ran means the workflow is not ready.
  READY=""
  NOT_READY_NOTES=""
  for ARCH in $RUN_ARCHS; do
    REGEX=$(echo "$ARCH_JOBNAME_REGEX_MAP" | jq -r --arg a "$ARCH" '.[$a] // ""')
    if [ -z "$REGEX" ]; then
      NOT_READY_NOTES="$NOT_READY_NOTES $ARCH:no-regex"
      continue
    fi
    TOTAL=$(echo "$CHECK_RUNS" | jq --arg rx "$REGEX" \
      'map(select(.name | test($rx))) | length' 2>/dev/null)
    if [ -z "$TOTAL" ]; then
      # jq could not evaluate the regex; never count that as "ready".
      NOT_READY_NOTES="$NOT_READY_NOTES $ARCH:bad-regex"
    elif [ "$TOTAL" -eq 0 ]; then
      NOT_READY_NOTES="$NOT_READY_NOTES $ARCH:workflow-run-no-shards-yet"
    else
      READY="$READY $ARCH"
    fi
  done
  READY=$(echo "$READY" | xargs)
  NOT_READY_NOTES=$(echo "$NOT_READY_NOTES" | xargs)

  if [ -n "$NOT_READY_NOTES" ]; then
    echo "[$SHORT] $DATE ROCm workflows ran ($RUN_ARCHS) but some test shards are missing - skip (${NOT_READY_NOTES})"
    continue
  fi
  # From here on READY equals RUN_ARCHS: each arch in the non-empty RUN_ARCHS
  # list went either to READY or to NOT_READY_NOTES, and the latter is empty.

  ARCH_DISPATCH=$(echo "$READY" | sed 's/ /, /g')
  READY_CSV=$(echo "$READY" | tr ' ' ',')
  echo "[$SHORT] READY archs: '$READY_CSV' (committed $DATE; not-run: ${NOT_RUN_NOTES:-none})"
  echo "[$SHORT] dispatching for: '$READY_CSV'"

  if [ "$DRY_RUN" = "true" ]; then
    echo "[$SHORT] DRY_RUN=true - not dispatching"
  elif ! gh workflow run parity.yml \
    --repo "$GITHUB_REPOSITORY" \
    --ref "$TARGET_REF" \
    -f sha="$SHA" \
    -f arch="$ARCH_DISPATCH"; then
    # Do not count or report a dispatch that did not happen. Stop the scan
    # instead of repeating the same failing call for every remaining SHA;
    # nothing was recorded for this SHA, so the next scan picks it up again.
    echo "::error::[$SHORT] failed to dispatch parity.yml on ref '$TARGET_REF' - stopping scan"
    break
  fi

  DISPATCHED_COUNT=$((DISPATCHED_COUNT + 1))
  DISPATCHED_SUMMARY="${DISPATCHED_SUMMARY}${SHORT}:${ARCH_DISPATCH}"$'\n'
  if [ "$DISPATCHED_COUNT" -ge "$MAX_DISPATCHES" ]; then
    echo "Reached max dispatches for this scan ($MAX_DISPATCHES); stopping"
    break
  fi
done <<< "$COMMITS"

# --- 4. Summary -------------------------------------------------------
# Append a Markdown recap of this scan to the step summary file: the
# effective settings, the outcome line, and one bullet per dispatched
# (or, in dry-run mode, would-be dispatched) SHA.
{
  echo "### Parity auto-trigger"
  echo ""
  echo "- Upstream: \`$UPSTREAM@$BRANCH\`"
  echo "- Scope archs: \`$ARCHS\`"
  echo "- Max commits: $MAX_COMMITS"
  echo "- Max dispatches: $MAX_DISPATCHES"
  echo "- Max age: ${MAX_AGE_HOURS}h"
  echo "- Target ref: \`$TARGET_REF\`"
  if [ "$DISPATCHED_COUNT" -gt 0 ]; then
    case "$DRY_RUN" in
      true)
        echo "- Result: would dispatch $DISPATCHED_COUNT parity run(s) (dry-run)"
        ;;
      *)
        echo "- Result: dispatched $DISPATCHED_COUNT parity run(s)"
        ;;
    esac
    echo ""
    # DISPATCHED_SUMMARY is newline-terminated; blank entries are dropped.
    while IFS= read -r ENTRY; do
      if [ -n "$ENTRY" ]; then
        echo "- $ENTRY"
      fi
    done <<< "$DISPATCHED_SUMMARY"
  else
    echo "- Result: no ready unprocessed SHAs found"
  fi
} >> "$GITHUB_STEP_SUMMARY"