diff --git a/.automation_scripts/pytorch-unit-test-scripts/download_testlogs b/.automation_scripts/pytorch-unit-test-scripts/download_testlogs index d391a7c7c10c5..75d00fbe17ed9 100755 --- a/.automation_scripts/pytorch-unit-test-scripts/download_testlogs +++ b/.automation_scripts/pytorch-unit-test-scripts/download_testlogs @@ -210,8 +210,10 @@ def _shorten_unzipped_dirs(): Converts names like: unzipped-test-reports-runattempt1-test-default-1-6-linux.rocm.gpu.gfx942.1_68613413431.zip + unzipped-test-reports-runattempt1-test-osdc-default-1-5-mt-l-x86aavx2-29-113-l4_73385044118.zip to: test-default-1-6 + test-default-1-5 Preserves the 'test-' prefix so that summarize_xml_testreports.py can still detect workflow type via substring matching. @@ -220,9 +222,9 @@ def _shorten_unzipped_dirs(): for d in sorted(Path(".").glob("unzipped-*")): if not d.is_dir(): continue - m = re.search(r'(test-\w+-\d+-\d+)', d.name) + m = re.search(r'test-(?:osdc-)?(default|distributed|inductor)-(\d+)-(\d+)', d.name) if m: - short_name = m.group(1) + short_name = f"test-{m.group(1)}-{m.group(2)}-{m.group(3)}" if not Path(short_name).exists(): d.rename(short_name) print(f" Renamed {d.name} -> {short_name}") @@ -662,6 +664,7 @@ def main(): if not args.no_cuda: cuda_job_prefix = "linux-jammy-cuda13.0-py3.10-gcc11" + cuda_test_job_kind = "test-osdc" print("==========================================") print(f"Finding CUDA tests in workflow '{CUDAWorkflowNames['default']}' by sha: {sha}") print("==========================================") @@ -686,7 +689,10 @@ def main(): for run in trunk_runs: jobs = get_workflow_jobs(run) - test_jobs = [j for j in jobs if cuda_job_prefix in j['name'] and '/ test' in j['name']] + test_jobs = [ + j for j in jobs + if cuda_job_prefix in j['name'] and f'/ {cuda_test_job_kind} (' in j['name'] + ] if test_jobs: trunk_wf = run all_cuda_jobs = jobs @@ -699,7 +705,7 @@ def main(): # by the jobs API. Use check-runs API to find the actual run. 
print("No CUDA test jobs in any trunk run's jobs API, trying check-runs API...") check_runs = get_check_runs_for_commit(sha, cuda_job_prefix) - cuda_test_jobs = [cr for cr in check_runs if '/ test' in cr['name']] + cuda_test_jobs = [cr for cr in check_runs if f'/ {cuda_test_job_kind} (' in cr['name']] if cuda_test_jobs: # Extract the actual workflow run ID from the check-run details URL import re as _re @@ -737,18 +743,18 @@ def main(): # Download logs if not args.artifacts_only: test_log_list_cuda_default = [ - ["cuda1.txt", f"{cuda_job_prefix} / test (default, 1, 5"], - ["cuda2.txt", f"{cuda_job_prefix} / test (default, 2, 5"], - ["cuda3.txt", f"{cuda_job_prefix} / test (default, 3, 5"], - ["cuda4.txt", f"{cuda_job_prefix} / test (default, 4, 5"], - ["cuda5.txt", f"{cuda_job_prefix} / test (default, 5, 5"], + ["cuda1.txt", f"{cuda_job_prefix} / {cuda_test_job_kind} (default, 1, 5"], + ["cuda2.txt", f"{cuda_job_prefix} / {cuda_test_job_kind} (default, 2, 5"], + ["cuda3.txt", f"{cuda_job_prefix} / {cuda_test_job_kind} (default, 3, 5"], + ["cuda4.txt", f"{cuda_job_prefix} / {cuda_test_job_kind} (default, 4, 5"], + ["cuda5.txt", f"{cuda_job_prefix} / {cuda_test_job_kind} (default, 5, 5"], ] test_log_list_cuda = test_log_list_cuda_default if not args.exclude_distributed: test_log_list_cuda_distributed = [ - ["cuda_dist1.txt", f"{cuda_job_prefix} / test (distributed, 1, 3"], - ["cuda_dist2.txt", f"{cuda_job_prefix} / test (distributed, 2, 3"], - ["cuda_dist3.txt", f"{cuda_job_prefix} / test (distributed, 3, 3"], + ["cuda_dist1.txt", f"{cuda_job_prefix} / {cuda_test_job_kind} (distributed, 1, 3"], + ["cuda_dist2.txt", f"{cuda_job_prefix} / {cuda_test_job_kind} (distributed, 2, 3"], + ["cuda_dist3.txt", f"{cuda_job_prefix} / {cuda_test_job_kind} (distributed, 3, 3"], ] test_log_list_cuda += test_log_list_cuda_distributed @@ -756,11 +762,11 @@ def main(): # Download artifacts test_artifacts_list_cuda_default = [ - "test-reports-test-default-1-5", - 
"test-reports-test-default-2-5", - "test-reports-test-default-3-5", - "test-reports-test-default-4-5", - "test-reports-test-default-5-5", + "test-reports-test-osdc-default-1-5", + "test-reports-test-osdc-default-2-5", + "test-reports-test-osdc-default-3-5", + "test-reports-test-osdc-default-4-5", + "test-reports-test-osdc-default-5-5", ] test_artifacts_list_cuda = [] @@ -769,9 +775,9 @@ def main(): if not args.exclude_distributed: test_artifacts_list_cuda_distributed = [ - "test-reports-test-distributed-1-3", - "test-reports-test-distributed-2-3", - "test-reports-test-distributed-3-3", + "test-reports-test-osdc-distributed-1-3", + "test-reports-test-osdc-distributed-2-3", + "test-reports-test-osdc-distributed-3-3", ] test_artifacts_list_cuda += test_artifacts_list_cuda_distributed diff --git a/.github/workflows/parity-auto.yml b/.github/workflows/parity-auto.yml new file mode 100644 index 0000000000000..110f6f00359bd --- /dev/null +++ b/.github/workflows/parity-auto.yml @@ -0,0 +1,295 @@ +name: Parity Auto Trigger +run-name: "Parity auto-trigger · pytorch/pytorch main" + +# Polls pytorch/pytorch main for commits where all CI consumed by +# parity.yml has finished, and dispatches parity.yml once for that SHA +# covering every arch whose test shards actually ran on it. +# +# Readiness is evaluated at the *check-run* level, not the workflow +# level: ROCm test shards post check-runs independently, and a single +# failing shard flips the parent workflow_run to conclusion=failure +# while siblings are still executing. Relying on workflow_runs would +# fire too early and produce empty reports. +# +# Two gates: +# 1. Every ROCm arch test check-run and every CUDA test check-run +# consumed by download_testlogs on the SHA must be status=completed. +# We don't want to dispatch mi355 while mi300, CUDA trunk OSDC, or +# CUDA inductor shards are still running on the same commit. +# 2. 
For each arch in scope, we check whether its test shards +# actually ran on this SHA; mi300/mi200 etc. only show up when +# their periodic workflow happens to land on that SHA. +# +# We dispatch at most once per SHA with the ready subset of arches, so mi355 +# (run as part of trunk) gets a parity report per commit, and mi300/ +# mi200 join the same dispatch whenever their periodic workflow +# happens to finish on that SHA. + +on: + schedule: + - cron: '*/10 * * * *' + workflow_dispatch: + inputs: + max_commits: + description: 'How many of the most recent upstream commits to scan.' + required: false + default: '15' + type: string + max_dispatches: + description: 'Maximum number of ready upstream commits to dispatch in one scan.' + required: false + default: '10' + type: string + max_age_hours: + description: 'Skip commits older than this (avoid back-filling ancient SHAs).' + required: false + default: '10' + type: string + archs: + description: 'Architectures to consider (comma/space separated).' + required: false + default: 'mi355, mi300, mi200' + type: string + arch_jobname_regex_map: + description: 'JSON: arch → PCRE regex that matches the check-run names of that arch''s ROCm test shards on pytorch/pytorch. An arch is considered "ready" only when every check-run whose name matches has status=completed (so we wait for all test shards, not just workflow completion).' + required: false + default: '{"mi355":"rocm.*mi355.*/ test \\(","mi300":"rocm.*mi300.*/ test \\(","mi200":"rocm.*(mi200|mi210).*/ test \\(","navi31":"rocm.*navi31.*/ test \\(","nightly":"rocm-nightly.*/ test \\("}' + type: string + target_ref: + description: 'Ref of this repo to dispatch parity.yml against. Leave blank to use this workflow run''s ref.' + required: false + default: '' + type: string + dry_run: + description: 'Scan and log, but do not actually dispatch parity.yml.' 
+ required: false + default: false + type: boolean + +permissions: + contents: read + actions: write + +concurrency: + group: parity-auto-trigger + cancel-in-progress: false + +jobs: + scan-and-dispatch: + runs-on: ubuntu-latest + steps: + - name: Find ready arches per upstream commit and dispatch parity.yml + env: + GH_TOKEN: ${{ github.token }} + UPSTREAM: pytorch/pytorch + BRANCH: main + MAX_COMMITS: ${{ inputs.max_commits || '15' }} + MAX_DISPATCHES: ${{ inputs.max_dispatches || '10' }} + MAX_AGE_HOURS: ${{ inputs.max_age_hours || '10' }} + ARCHS_IN: ${{ inputs.archs || 'mi355, mi300, mi200' }} + ARCH_JOBNAME_REGEX_MAP: ${{ inputs.arch_jobname_regex_map || '{"mi355":"rocm.*mi355.*/ test \\(","mi300":"rocm.*mi300.*/ test \\(","mi200":"rocm.*(mi200|mi210).*/ test \\(","navi31":"rocm.*navi31.*/ test \\(","nightly":"rocm-nightly.*/ test \\("}' }} + TARGET_REF_IN: ${{ inputs.target_ref || '' }} + DRY_RUN: ${{ inputs.dry_run || 'false' }} + run: | + # GitHub Actions launches this with `bash -e {0}`, so -e is already on + # from the shell command line. It's too aggressive for the many pipelines here + # (grep -q returning 1, date -d edge cases, paginated API calls, + # etc.) and has caused the loop to silently abort after the first + # "no ready archs" commit. Explicitly turn -e OFF and keep -u + + # pipefail so undefined-variable bugs still surface. + set +e + set -uo pipefail + + NOW_EPOCH=$(date -u +%s) + MAX_AGE_EPOCH=$((NOW_EPOCH - MAX_AGE_HOURS * 3600)) + TARGET_REF="${TARGET_REF_IN:-$GITHUB_REF_NAME}" + ARCHS=$(echo "$ARCHS_IN" | tr ',' ' ' | xargs) + + echo "Upstream: $UPSTREAM@$BRANCH" + echo "Target ref: $TARGET_REF" + echo "Scope archs: $ARCHS" + echo "Max commits: $MAX_COMMITS" + echo "Max dispatches: $MAX_DISPATCHES" + echo "Max age: ${MAX_AGE_HOURS}h" + echo "Dry run: $DRY_RUN" + echo "Arch→regex: $ARCH_JOBNAME_REGEX_MAP" + echo + + # --- 1. 
Recent upstream commits -------------------------------------- + COMMITS=$(gh api "repos/$UPSTREAM/commits?sha=$BRANCH&per_page=$MAX_COMMITS" \ + --jq '.[] | "\(.sha) \(.commit.committer.date)"') + + if [ -z "$COMMITS" ]; then + echo "::warning::No commits returned from $UPSTREAM@$BRANCH" + exit 0 + fi + + # --- 2. Already-dispatched SHAs in our repo -------------------------- + # Pull last 200 parity runs. Run titles are expected to contain the full upstream SHA, e.g.: + # "$SHA · mi355, mi300, mi200" + # Once any parity run exists for a SHA, we do not dispatch another + # report for that SHA. This keeps the dashboard to one report per + # upstream commit. If the run list cannot be fetched we skip this scan instead of treating every SHA as undispatched. + EXISTING=$(gh run list \ + --repo "$GITHUB_REPOSITORY" \ + --workflow parity.yml \ + --limit 200 \ + --json displayTitle) || { echo "::warning::Could not list existing parity runs in $GITHUB_REPOSITORY; skipping this scan to avoid duplicate dispatches"; exit 0; } + + sha_already_dispatched() { + local sha="$1" + echo "$EXISTING" | jq -e --arg sha "$sha" \ + 'any(.[]; .displayTitle | contains($sha))' >/dev/null + } + + # --- 3. Walk commits, dispatch each ready unprocessed SHA ------------ + DISPATCHED_COUNT=0 + DISPATCHED_SUMMARY="" + while IFS=' ' read -r SHA DATE; do + [ -z "$SHA" ] && continue + SHORT=$(echo "$SHA" | cut -c1-8) + COMMIT_EPOCH=$(date -u -d "$DATE" +%s 2>/dev/null || echo 0) + + if [ "$COMMIT_EPOCH" -ne 0 ] && [ "$COMMIT_EPOCH" -lt "$MAX_AGE_EPOCH" ]; then + echo "[$SHORT] $DATE too old (>${MAX_AGE_HOURS}h) — stopping scan" + break + fi + + if sha_already_dispatched "$SHA"; then + echo "[$SHORT] parity report already exists for this SHA — skip" + continue + fi + + # Pull relevant upstream check-runs for this SHA. We use + # check-runs (not workflow_runs) because test shards post + # check-runs independently and workflow_run conclusion flips + # to 'failure' the moment any shard fails, even while siblings + # are still running. We need per-shard state. + ALL_CHECK_RUNS=$(gh api --paginate \ + "repos/$UPSTREAM/commits/$SHA/check-runs?per_page=100" \ + --jq '.check_runs[] | {name,status,conclusion}' \ + 2>/dev/null | jq -s '.' 
|| echo '[]') + + CHECK_RUNS='[]' + for ARCH in $ARCHS; do + REGEX=$(echo "$ARCH_JOBNAME_REGEX_MAP" | jq -r --arg a "$ARCH" '.[$a] // ""') + [ -z "$REGEX" ] && continue + ARCH_CHECK_RUNS=$(echo "$ALL_CHECK_RUNS" | jq --arg rx "$REGEX" \ + '[.[] | select(.name | test($rx))]') + CHECK_RUNS=$(jq -s 'add | unique_by(.name)' \ + <(echo "$CHECK_RUNS") \ + <(echo "$ARCH_CHECK_RUNS")) + done + + CUDA_JOBNAME_REGEX='(linux-jammy-cuda13[.]0-py3[.]10-gcc11.*/ test-osdc [(](default|distributed),|unit-test / inductor-test / test [(]inductor,)' + CUDA_CHECK_RUNS=$(echo "$ALL_CHECK_RUNS" | jq --arg rx "$CUDA_JOBNAME_REGEX" \ + '[.[] | select(.name | test($rx))]') + + if [ "$(echo "$CHECK_RUNS" | jq 'length')" -eq 0 ]; then + echo "[$SHORT] $DATE no in-scope ROCm parity check-runs yet on upstream — skip" + continue + fi + + if [ "$(echo "$CUDA_CHECK_RUNS" | jq 'length')" -eq 0 ]; then + echo "[$SHORT] $DATE no CUDA parity check-runs yet on upstream — skip" + continue + fi + + # Gate 1: require EVERY upstream check-run consumed by the + # parity report for this SHA to be status=completed (ROCm + # arch test shards, plus CUDA default/distributed/inductor + # tests). Once we dispatch for a SHA the parity report is + # authored, so dispatching before CUDA or another arch + # finishes produces partial reports. + GATE_CHECK_RUNS=$(jq -s 'add' \ + <(echo "$CHECK_RUNS") \ + <(echo "$CUDA_CHECK_RUNS")) + TOTAL_CR=$(echo "$GATE_CHECK_RUNS" | jq 'length') + PENDING_CR=$(echo "$GATE_CHECK_RUNS" | jq 'map(select(.status != "completed")) | length') + if [ "$PENDING_CR" -ne 0 ]; then + PENDING_SAMPLE=$(echo "$GATE_CHECK_RUNS" | jq -r ' + map(select(.status != "completed")) + | .[0:3] + | map(.name) + | join(", ")') + echo "[$SHORT] $DATE ${PENDING_CR}/${TOTAL_CR} parity check-runs still pending — skip (e.g. 
$PENDING_SAMPLE)" + continue + fi + + # Gate 2: for each arch in scope, identify whether that + # arch's test shards actually ran on this SHA (some archs + # run only on periodic workflows that don't fire every + # commit, so mi300/mi200 may legitimately have no shards). + READY="" + NOT_READY_NOTES="" + for ARCH in $ARCHS; do + REGEX=$(echo "$ARCH_JOBNAME_REGEX_MAP" | jq -r --arg a "$ARCH" '.[$a] // ""') + if [ -z "$REGEX" ]; then + NOT_READY_NOTES="$NOT_READY_NOTES $ARCH:no-regex" + continue + fi + TOTAL=$(echo "$CHECK_RUNS" | jq --arg rx "$REGEX" \ + 'map(select(.name | test($rx))) | length') + if [ "$TOTAL" -eq 0 ]; then + NOT_READY_NOTES="$NOT_READY_NOTES $ARCH:no-shards" + else + READY="$READY $ARCH" + fi + done + READY=$(echo "$READY" | xargs) + NOT_READY_NOTES=$(echo "$NOT_READY_NOTES" | xargs) + + if [ -z "$READY" ]; then + echo "[$SHORT] $DATE all ROCm check-runs complete but no in-scope arches ran (${NOT_READY_NOTES:-none})" + continue + fi + + ARCH_DISPATCH=$(echo "$READY" | sed 's/ /, /g') + CSV_NAME="autoparity-$(date -u +%Y%m%d)-$SHA" + echo "[$SHORT] READY archs: '$(echo "$READY" | tr ' ' ',')' (committed $DATE; skipped archs: ${NOT_READY_NOTES:-none})" + echo "[$SHORT] dispatching for: '$(echo "$READY" | tr ' ' ',')'" + + if [ "$DRY_RUN" = "true" ]; then + echo "[$SHORT] DRY_RUN=true — not dispatching" + else + gh workflow run parity.yml \ + --repo "$GITHUB_REPOSITORY" \ + --ref "$TARGET_REF" \ + -f sha="$SHA" \ + -f arch="$ARCH_DISPATCH" \ + -f csv_name="$CSV_NAME" || { echo "::warning::[$SHORT] gh workflow run parity.yml failed — not counted as dispatched; a later scan will retry"; continue; } + fi + + DISPATCHED_COUNT=$((DISPATCHED_COUNT + 1)) + DISPATCHED_SUMMARY="${DISPATCHED_SUMMARY}${SHORT}:${ARCH_DISPATCH}"$'\n' + if [ "$DISPATCHED_COUNT" -ge "$MAX_DISPATCHES" ]; then + echo "Reached max dispatches for this scan ($MAX_DISPATCHES); stopping" + break + fi + done <<< "$COMMITS" + + # --- 4. 
Summary ------------------------------------------------------- + { + echo "### Parity auto-trigger" + echo "" + echo "- Upstream: \`$UPSTREAM@$BRANCH\`" + echo "- Scope archs: \`$ARCHS\`" + echo "- Max commits: $MAX_COMMITS" + echo "- Max dispatches: $MAX_DISPATCHES" + echo "- Max age: ${MAX_AGE_HOURS}h" + echo "- Target ref: \`$TARGET_REF\`" + if [ "$DISPATCHED_COUNT" -gt 0 ]; then + if [ "$DRY_RUN" = "true" ]; then + echo "- Result: would dispatch $DISPATCHED_COUNT parity run(s) (dry-run)" + else + echo "- Result: dispatched $DISPATCHED_COUNT parity run(s)" + fi + echo "" + echo "$DISPATCHED_SUMMARY" | while IFS= read -r LINE; do + [ -z "$LINE" ] && continue + echo "- $LINE" + done + else + echo "- Result: no ready unprocessed SHAs found" + fi + } >> "$GITHUB_STEP_SUMMARY"