From af5c4ec030daf123c0260a0223388420e1479fa6 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 16 Feb 2026 13:13:36 -0500 Subject: [PATCH 01/87] [RELEASE 2.11] Release only changes (#175091) * [RELEASE 2.11] Release only changes * remove_file * Trigger rebuild --- .ci/pytorch/common_utils.sh | 2 +- .github/ci_commit_pins/xla.txt | 2 +- .github/scripts/filter_test_configs.py | 6 +- .github/templates/common.yml.j2 | 2 +- .../linux_binary_build_workflow.yml.j2 | 18 +- .../macos_binary_build_workflow.yml.j2 | 2 +- .../windows_binary_build_workflow.yml.j2 | 6 +- .github/workflows/_bazel-build-test.yml | 14 +- ...nary-build-flash-attention-wheel-linux.yml | 10 +- ...ry-build-flash-attention-wheel-windows.yml | 6 +- .github/workflows/_binary-build-linux.yml | 13 +- .github/workflows/_binary-test-linux.yml | 13 +- .github/workflows/_binary-upload.yml | 2 +- .github/workflows/_docs.yml | 10 +- .github/workflows/_link_check.yml | 4 +- .github/workflows/_linux-build.yml | 10 +- .github/workflows/_linux-test-stable-fa3.yml | 8 +- .github/workflows/_linux-test.yml | 14 +- .github/workflows/_mac-build.yml | 8 +- .github/workflows/_mac-test.yml | 10 +- .github/workflows/_rocm-test.yml | 8 +- .github/workflows/_runner-determinator.yml | 2 +- .github/workflows/_vllm-benchmark.yml | 4 +- .github/workflows/_win-build.yml | 4 +- .github/workflows/_win-test.yml | 4 +- .github/workflows/_xpu-test.yml | 8 +- .github/workflows/b200-distributed.yml | 2 +- .github/workflows/b200-symm-mem.yml | 2 +- .github/workflows/build-almalinux-images.yml | 2 +- .github/workflows/build-libtorch-images.yml | 4 +- .../build-manywheel-images-s390x.yml | 2 +- .github/workflows/build-manywheel-images.yml | 4 +- .github/workflows/build-triton-wheel.yml | 14 +- .github/workflows/build-vllm-wheel.yml | 6 +- .github/workflows/claude-code.yml | 2 +- .github/workflows/claude-issue-triage-run.yml | 2 +- .../close-nonexistent-disable-issues.yml | 2 +- .github/workflows/create_release.yml | 2 +- 
.github/workflows/docker-builds.yml | 10 +- .github/workflows/docker-cache-rocm.yml | 4 +- .github/workflows/docker-release.yml | 10 +- .github/workflows/dynamo-unittest.yml | 2 +- ...linux-aarch64-binary-manywheel-nightly.yml | 2 +- ...enerated-linux-binary-libtorch-nightly.yml | 12 +- ...nerated-linux-binary-manywheel-nightly.yml | 135 +++++------ ...d-linux-s390x-binary-manywheel-nightly.yml | 2 +- ...-arm64-binary-libtorch-release-nightly.yml | 1 - ...rated-macos-arm64-binary-wheel-nightly.yml | 7 - ...ws-arm64-binary-libtorch-debug-nightly.yml | 2 +- ...-arm64-binary-libtorch-release-nightly.yml | 2 +- ...ted-windows-arm64-binary-wheel-nightly.yml | 2 +- ...-windows-binary-libtorch-debug-nightly.yml | 26 +-- ...indows-binary-libtorch-release-nightly.yml | 26 +-- ...generated-windows-binary-wheel-nightly.yml | 212 ++++++------------ .github/workflows/h100-cutlass-backend.yml | 2 +- .github/workflows/h100-distributed.yml | 2 +- .github/workflows/h100-symm-mem.yml | 2 +- .../workflows/inductor-micro-benchmark.yml | 2 +- .github/workflows/inductor-nightly.yml | 2 +- .github/workflows/inductor-pallas.yml | 2 +- .github/workflows/inductor-perf-compare.yml | 2 +- .github/workflows/inductor-perf-test-b200.yml | 2 +- .../inductor-perf-test-nightly-aarch64.yml | 2 +- .../inductor-perf-test-nightly-h100.yml | 2 +- .../inductor-perf-test-nightly-rocm-mi300.yml | 2 +- .../inductor-perf-test-nightly-rocm-mi355.yml | 2 +- .../inductor-perf-test-nightly-x86-zen.yml | 2 +- .../inductor-perf-test-nightly-x86.yml | 2 +- .../inductor-perf-test-nightly-xpu.yml | 2 +- .../workflows/inductor-perf-test-nightly.yml | 2 +- .github/workflows/inductor-periodic.yml | 2 +- .github/workflows/inductor-rocm-mi200.yml | 2 +- .github/workflows/inductor-rocm-mi300.yml | 2 +- .github/workflows/inductor-rocm-mi355.yml | 2 +- .github/workflows/inductor-unittest.yml | 2 +- .github/workflows/inductor.yml | 2 +- .github/workflows/lint-autoformat.yml | 2 +- .github/workflows/lint-bc.yml | 2 +- 
.github/workflows/lint.yml | 23 +- .github/workflows/linux-aarch64.yml | 2 +- .github/workflows/llm_td_retrieval.yml | 4 +- .github/workflows/nightly-s3-uploads.yml | 2 +- .github/workflows/nightly.yml | 4 +- .github/workflows/nitpicker.yml | 2 +- .github/workflows/operator_microbenchmark.yml | 2 +- .github/workflows/periodic-rocm-mi200.yml | 2 +- .github/workflows/periodic-rocm-mi300.yml | 2 +- .github/workflows/periodic-rocm-mi355.yml | 2 +- .github/workflows/periodic.yml | 2 +- .github/workflows/pull.yml | 2 +- .github/workflows/quantization-periodic.yml | 2 +- .github/workflows/rocm-mi200.yml | 2 +- .github/workflows/rocm-mi300.yml | 2 +- .github/workflows/rocm-mi355.yml | 2 +- .github/workflows/rocm-navi31.yml | 2 +- .github/workflows/rocm-nightly.yml | 2 +- .github/workflows/slow-rocm-mi200.yml | 2 +- .github/workflows/slow.yml | 2 +- .../target-determination-indexer.yml | 10 +- .github/workflows/target_determination.yml | 4 +- .github/workflows/test-b200.yml | 2 +- .github/workflows/test-check-binary.yml | 4 +- .github/workflows/test-h100.yml | 2 +- .github/workflows/tools-unit-tests.yml | 4 +- .github/workflows/torchbench.yml | 2 +- .github/workflows/trunk-rocm-sandbox.yml | 2 +- .github/workflows/trunk.yml | 2 +- .github/workflows/unstable.yml | 2 +- .github/workflows/update-viablestrict.yml | 2 +- .github/workflows/update_pytorch_labels.yml | 2 +- .../upload-test-stats-while-running.yml | 2 +- .github/workflows/upload-test-stats.yml | 2 +- .../upload-torch-dynamo-perf-stats.yml | 2 +- .../upload_test_stats_intermediate.yml | 2 +- .github/workflows/vllm-benchmark.yml | 4 +- .github/workflows/weekly.yml | 2 +- .github/workflows/xpu.yml | 2 +- tools/stats/import_test_stats.py | 2 +- 118 files changed, 370 insertions(+), 488 deletions(-) diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh index c4a92c997561e..ad45622823b4c 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -290,7 +290,7 @@ function 
install_torchrec_and_fbgemm() { function clone_pytorch_xla() { if [[ ! -d ./xla ]]; then - git clone --recursive --quiet https://github.com/pytorch/xla.git + git clone --recursive -b r2.11 https://github.com/pytorch/xla.git pushd xla # pin the xla hash so that we don't get broken by changes to xla git checkout "$(cat ../.github/ci_commit_pins/xla.txt)" diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 7d3a8548d145b..9939a0505c6fc 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -c04e61c3424142c0eebcc9e59984b9d8fced18c0 +r2.11 diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py index 182a75f13cad6..087a59be348c4 100755 --- a/.github/scripts/filter_test_configs.py +++ b/.github/scripts/filter_test_configs.py @@ -41,10 +41,10 @@ def is_cuda_or_rocm_job(job_name: Optional[str]) -> bool: "rerun_disabled_tests": lambda job_name: True, } -# The link to the published list of disabled jobs -DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json" +# The link to the published list of disabled jobs. 
+DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=EdtXb8H1wC3KKKfSV9z7QtgG3FngDv3B" # and unstable jobs -UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json" +UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=iafjJg17T2MK7wQiJ0qx32zIPMqqwZqv" # Some constants used to handle disabled and unstable jobs JOB_NAME_SEP = "/" diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index 064eea7592230..201415632b7e1 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -32,7 +32,7 @@ concurrency: {%- macro setup_ec2_windows() -%} !{{ display_ec2_information() }} - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index b9d3a51354e06..e110f33d8ce39 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -56,7 +56,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -137,18 +137,18 @@ jobs: contents: read steps: - name: Setup XPU - uses: pytorch/pytorch/.github/actions/setup-xpu@main + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.11 - name: Login to ECR - uses: pytorch/pytorch/.github/actions/ecr-login@main + uses: pytorch/pytorch/.github/actions/ecr-login@release/2.11 - uses: 
!{{ common.download_artifact_action }} name: Download Build Artifacts with: name: !{{ config["build_name"] }} path: "${{ runner.temp }}/artifacts/" - !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: !{{ config["container_image"] }} @@ -156,7 +156,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -180,7 +180,7 @@ jobs: with: name: !{{ config["build_name"] }} path: "${{ runner.temp }}/artifacts/" - !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} - name: ROCm set GPU_FLAG run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" @@ -194,7 +194,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: !{{ config["container_image"] }} @@ -202,7 +202,7 @@ jobs: docker-build-dir: .ci/docker 
working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2 index 958c6b85902c2..0414de214f123 100644 --- a/.github/templates/macos_binary_build_workflow.yml.j2 +++ b/.github/templates/macos_binary_build_workflow.yml.j2 @@ -71,7 +71,7 @@ jobs: steps: !{{ set_runner_specific_vars() }} !{{ setup_python(config.get("python_version", "3.10")) }} - !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} - name: Populate binary env run: | "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" diff --git a/.github/templates/windows_binary_build_workflow.yml.j2 b/.github/templates/windows_binary_build_workflow.yml.j2 index 34c148270c6bc..10153574304d7 100644 --- a/.github/templates/windows_binary_build_workflow.yml.j2 +++ b/.github/templates/windows_binary_build_workflow.yml.j2 @@ -64,7 +64,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -135,7 +135,7 @@ jobs: {%- else %} !{{ set_runner_specific_vars() }} !{{ common.setup_ec2_windows() }} - !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} {%- endif %} - name: Populate binary env shell: bash @@ -211,7 +211,7 @@ jobs: 
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" {%- else %} !{{ common.setup_ec2_windows() }} - !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} !{{ set_runner_specific_vars() }} {%- endif %} - uses: !{{ common.download_artifact_action }} diff --git a/.github/workflows/_bazel-build-test.yml b/.github/workflows/_bazel-build-test.yml index eaebce92ba898..21508ce0d7f21 100644 --- a/.github/workflows/_bazel-build-test.yml +++ b/.github/workflows/_bazel-build-test.yml @@ -47,7 +47,7 @@ jobs: reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }} steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: fetch-depth: 1 submodules: false @@ -69,13 +69,13 @@ jobs: runs-on: ${{ matrix.runner }} steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 with: github-secret: ${{ secrets.GITHUB_TOKEN }} # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 - name: Setup Linux uses: ./.github/actions/setup-linux @@ -85,12 +85,12 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-image-name: ${{ inputs.docker-image-name }} - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -100,7 +100,7 @@ jobs: run: echo "IN_CONTAINER_RUNNER=$(if [ -f 
/.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - uses: pytorch/test-infra/.github/actions/setup-nvidia@main + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.11 - name: Output disk space left run: | @@ -211,5 +211,5 @@ jobs: file-suffix: bazel-${{ github.job }}_${{ steps.get-job-id.outputs.job-id }} - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11 if: always() diff --git a/.github/workflows/_binary-build-flash-attention-wheel-linux.yml b/.github/workflows/_binary-build-flash-attention-wheel-linux.yml index 3fdc1dc4175c9..68244dc98829d 100644 --- a/.github/workflows/_binary-build-flash-attention-wheel-linux.yml +++ b/.github/workflows/_binary-build-flash-attention-wheel-linux.yml @@ -23,7 +23,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -83,13 +83,13 @@ jobs: TORCH_VERSION: "2.10.0" steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 with: github-secret: ${{ secrets.GITHUB_TOKEN }} fail-silently: false - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: submodules: true @@ -97,7 +97,7 @@ jobs: uses: ./.github/actions/setup-linux - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: 
pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ env.DOCKER_IMAGE }} @@ -138,5 +138,5 @@ jobs: path: ${{ runner.temp }}/artifacts/*.whl - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11 if: always() diff --git a/.github/workflows/_binary-build-flash-attention-wheel-windows.yml b/.github/workflows/_binary-build-flash-attention-wheel-windows.yml index 4fc1dc8a53367..642045c0da492 100644 --- a/.github/workflows/_binary-build-flash-attention-wheel-windows.yml +++ b/.github/workflows/_binary-build-flash-attention-wheel-windows.yml @@ -22,7 +22,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -71,12 +71,12 @@ jobs: git config --global core.ignorecase false git config --global core.fsmonitor false - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 with: github-secret: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: no-sudo: true submodules: true diff --git a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml index 115f6572441a1..c7d55a2e929b8 100644 --- a/.github/workflows/_binary-build-linux.yml +++ b/.github/workflows/_binary-build-linux.yml @@ -142,13 +142,13 @@ jobs: - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" if: inputs.build_environment != 'linux-s390x-binary-manywheel' - 
uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.github-token }} - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }} @@ -179,7 +179,6 @@ jobs: - name: Checkout PyTorch to pytorch dir uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -214,9 +213,9 @@ jobs: - name: Calculate docker image id: calculate-docker-image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: - # If doing this in main or release branch, use docker.io. Otherwise + # If doing this in release/2.11 or release branch, use docker.io. 
Otherwise # use ECR docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: ${{ inputs.DOCKER_IMAGE }} @@ -228,7 +227,7 @@ jobs: - name: Pull Docker image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -281,7 +280,7 @@ jobs: - name: Teardown Linux if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11 - name: Chown workspace if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml index 3dd8235a4c4f2..ed7738ecbdcc2 100644 --- a/.github/workflows/_binary-test-linux.yml +++ b/.github/workflows/_binary-test-linux.yml @@ -125,14 +125,14 @@ jobs: - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" if: inputs.build_environment != 'linux-s390x-binary-manywheel' - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.github-token }} # Setup the environment - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }} @@ -153,7 +153,6 @@ jobs: - name: Checkout PyTorch to pytorch dir uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 
# v4.2.2 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive show-progress: false path: pytorch @@ -185,7 +184,7 @@ jobs: - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG id: install-nvidia-driver - uses: pytorch/test-infra/.github/actions/setup-nvidia@main + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.11 if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' }} - name: configure aws credentials @@ -200,7 +199,7 @@ jobs: - name: Calculate docker image id: calculate-docker-image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: ${{ inputs.DOCKER_IMAGE }} @@ -210,7 +209,7 @@ jobs: - name: Pull Docker image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -222,7 +221,7 @@ jobs: - name: Teardown Linux if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11 - name: Chown workspace if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' diff --git a/.github/workflows/_binary-upload.yml b/.github/workflows/_binary-upload.yml index 636b76d42931a..ce6c61e930620 100644 --- a/.github/workflows/_binary-upload.yml 
+++ b/.github/workflows/_binary-upload.yml @@ -81,7 +81,7 @@ jobs: SHA1: ${{ github.event.pull_request.head.sha || github.sha }} steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: no-sudo: true diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml index f5cb186c6f189..ff21b561c760a 100644 --- a/.github/workflows/_docs.yml +++ b/.github/workflows/_docs.yml @@ -80,7 +80,7 @@ jobs: name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }} steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -91,7 +91,7 @@ jobs: # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 - name: Setup Linux uses: ./.github/actions/setup-linux @@ -103,12 +103,12 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-image-name: ${{ inputs.docker-image }} - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -213,5 +213,5 @@ jobs: echo "https://docs-preview.pytorch.org/pytorch/pytorch/nightly-${{ github.sha }}/cppdocs/index.html" - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11 if: always() diff --git a/.github/workflows/_link_check.yml b/.github/workflows/_link_check.yml 
index 014e6106b0730..b45837ab110d6 100644 --- a/.github/workflows/_link_check.yml +++ b/.github/workflows/_link_check.yml @@ -11,7 +11,7 @@ on: jobs: lint-urls: if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-url-lint') }} - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11 with: job-name: lint-urls timeout: 120 @@ -37,7 +37,7 @@ jobs: lint-xrefs: if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-xref-lint') }} - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11 with: job-name: lint-xrefs timeout: 60 diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index 25501b59c7851..7696674ab2717 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -138,7 +138,7 @@ jobs: build-environment: ${{ inputs.build-environment }} steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -151,7 +151,7 @@ jobs: # checkout because when we run this action we don't *have* a local # checkout. In other cases you should prefer a local checkout. 
- name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: no-sudo: true @@ -185,7 +185,7 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image-name: ${{ inputs.docker-image-name }} @@ -201,7 +201,7 @@ jobs: echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true' with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -464,7 +464,7 @@ jobs: artifact_prefix: usage_log_build_${{ steps.get-job-id.outputs.job-id }} - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11 if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel' - name: Cleanup docker diff --git a/.github/workflows/_linux-test-stable-fa3.yml b/.github/workflows/_linux-test-stable-fa3.yml index f2e16712ff447..ffd5845adc473 100644 --- a/.github/workflows/_linux-test-stable-fa3.yml +++ b/.github/workflows/_linux-test-stable-fa3.yml @@ -60,7 +60,7 @@ jobs: contents: read steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: no-sudo: true @@ -78,7 +78,7 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: 
pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-image-name: ${{ inputs.docker-image }} @@ -92,7 +92,7 @@ jobs: echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -254,5 +254,5 @@ jobs: workflow_attempt: ${{github.run_attempt}} - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11 if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index 979b7a8e55a08..3e9a03befbddd 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -104,7 +104,7 @@ jobs: contents: read steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 if: ${{ !contains(matrix.runner, 'b200') && inputs.build-environment != 'linux-s390x-binary-manywheel' }} with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -113,7 +113,7 @@ jobs: docker exec -it $(docker container ps --format '{{.ID}}') bash - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: no-sudo: true @@ -147,7 +147,7 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image-name: ${{ inputs.docker-image }} @@ -163,7 +163,7 @@ jobs: echo "docker pull 
ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -175,7 +175,7 @@ jobs: - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG id: install-nvidia-driver - uses: pytorch/test-infra/.github/actions/setup-nvidia@main + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.11 with: driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }} if: ${{ !contains(matrix.runner, 'b200') }} @@ -512,7 +512,7 @@ jobs: aws-region: us-east-1 - name: Upload the benchmark results - uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.11 if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.check-tpu.outputs.has_tpu != 'true' with: benchmark-results-dir: test/test-reports @@ -570,7 +570,7 @@ jobs: workflow_attempt: ${{github.run_attempt}} - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11 if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' - name: Cleanup docker diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml index 4fd7874ee0c4d..351ab9376a416 100644 --- a/.github/workflows/_mac-build.yml +++ b/.github/workflows/_mac-build.yml @@ -71,11 +71,11 @@ jobs: build-environment: ${{ inputs.build-environment }} steps: - name: Clean up disk space before running MacOS workflow - uses: pytorch/test-infra/.github/actions/check-disk-space@main + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.11 # [see note: pytorch repo ref] - name: Checkout PyTorch 
- uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 - name: Set xcode version env: @@ -86,7 +86,7 @@ jobs: fi - name: Setup Python - uses: pytorch/test-infra/.github/actions/setup-python@main + uses: pytorch/test-infra/.github/actions/setup-python@release/2.11 with: python-version: ${{ inputs.python-version }} pip-requirements-file: .ci/docker/requirements-ci.txt @@ -192,4 +192,4 @@ jobs: - name: Clean up disk space if: always() continue-on-error: true - uses: pytorch/test-infra/.github/actions/check-disk-space@main + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.11 diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index 82eb3c4bf2c75..67a7320e08edc 100644 --- a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -105,11 +105,11 @@ jobs: done - name: Clean up disk space before running MacOS workflow - uses: pytorch/test-infra/.github/actions/check-disk-space@main + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.11 # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 - name: Get workflow job id id: get-job-id @@ -119,7 +119,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} - name: Setup Python - uses: pytorch/test-infra/.github/actions/setup-python@main + uses: pytorch/test-infra/.github/actions/setup-python@release/2.11 with: python-version: ${{ inputs.python-version }} pip-requirements-file: .ci/docker/requirements-ci.txt @@ -257,7 +257,7 @@ jobs: file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} - name: Upload the benchmark results - uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main + uses: 
pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.11 with: benchmark-results-dir: test/test-reports dry-run: false @@ -287,4 +287,4 @@ jobs: - name: Clean up disk space if: always() continue-on-error: true - uses: pytorch/test-infra/.github/actions/check-disk-space@main + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.11 diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml index 38f3bff66c14e..f07fb9ee8b71e 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -85,7 +85,7 @@ jobs: timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: no-sudo: true @@ -104,12 +104,12 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-image-name: ${{ inputs.docker-image }} - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -332,7 +332,7 @@ jobs: aws-region: us-east-1 - name: Upload the benchmark results - uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.11 with: benchmark-results-dir: test/test-reports dry-run: false diff --git a/.github/workflows/_runner-determinator.yml b/.github/workflows/_runner-determinator.yml index 0d674f044ec42..b127c82266561 100644 --- a/.github/workflows/_runner-determinator.yml +++ b/.github/workflows/_runner-determinator.yml @@ -59,7 +59,7 @@ jobs: PR_NUMBER: ${{ 
github.event.pull_request.number }} steps: # - name: Checkout PyTorch - # uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + # uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 # with: # fetch-depth: 1 # submodules: true diff --git a/.github/workflows/_vllm-benchmark.yml b/.github/workflows/_vllm-benchmark.yml index d25c3c80767b1..d5aa61a6341c7 100644 --- a/.github/workflows/_vllm-benchmark.yml +++ b/.github/workflows/_vllm-benchmark.yml @@ -84,7 +84,7 @@ jobs: name: ${{ inputs.build_environment }} s3-bucket: gha-artifacts - - uses: pytorch/test-infra/.github/actions/setup-uv@main + - uses: pytorch/test-infra/.github/actions/setup-uv@release/2.11 with: python-version: "3.12" activate-environment: "true" @@ -219,7 +219,7 @@ jobs: aws-region: us-east-1 - name: Upload the benchmark results to OSS benchmark database for the dashboard - uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.11 with: benchmark-results-dir: vllm-project/vllm/benchmarks/results benchmark-name: 'PyTorch x vLLM benchmark' diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml index 005d68ece857d..034a308054361 100644 --- a/.github/workflows/_win-build.yml +++ b/.github/workflows/_win-build.yml @@ -89,7 +89,7 @@ jobs: git config --global core.fsmonitor false - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -104,7 +104,7 @@ jobs: # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: no-sudo: true diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml index 3d2fe8a4b3fac..59bfc6a25e4ca 100644 --- 
a/.github/workflows/_win-test.yml +++ b/.github/workflows/_win-test.yml @@ -78,7 +78,7 @@ jobs: git config --global core.fsmonitor false - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -94,7 +94,7 @@ jobs: # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: no-sudo: true diff --git a/.github/workflows/_xpu-test.yml b/.github/workflows/_xpu-test.yml index 5724403e6de44..75c033789a313 100644 --- a/.github/workflows/_xpu-test.yml +++ b/.github/workflows/_xpu-test.yml @@ -86,7 +86,7 @@ jobs: steps: # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 - name: Setup XPU uses: ./.github/actions/setup-xpu @@ -96,7 +96,7 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-image-name: ${{ inputs.docker-image }} @@ -110,7 +110,7 @@ jobs: echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -345,7 +345,7 @@ jobs: aws-region: us-east-1 - name: Upload the benchmark results - uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.11 with: benchmark-results-dir: test/test-reports dry-run: false diff --git 
a/.github/workflows/b200-distributed.yml b/.github/workflows/b200-distributed.yml index e52c7a4b5f5c5..9ba6839858027 100644 --- a/.github/workflows/b200-distributed.yml +++ b/.github/workflows/b200-distributed.yml @@ -24,7 +24,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/b200-symm-mem.yml b/.github/workflows/b200-symm-mem.yml index 62367b61b07b9..6ca86affacc0f 100644 --- a/.github/workflows/b200-symm-mem.yml +++ b/.github/workflows/b200-symm-mem.yml @@ -24,7 +24,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/build-almalinux-images.yml b/.github/workflows/build-almalinux-images.yml index 9090fba0a9773..f0ba752891cff 100644 --- a/.github/workflows/build-almalinux-images.yml +++ b/.github/workflows/build-almalinux-images.yml @@ -39,7 +39,7 @@ jobs: tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm7.0", "rocm7.1", "rocm7.2", "cpu"] steps: - name: Build docker image - uses: pytorch/pytorch/.github/actions/binary-docker-build@main + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.11 with: docker-image-name: almalinux-builder custom-tag-prefix: ${{matrix.tag}} diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml index 47bf15e1db3ab..bdc81c0fc4a3f 100644 --- 
a/.github/workflows/build-libtorch-images.yml +++ b/.github/workflows/build-libtorch-images.yml @@ -32,7 +32,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -59,7 +59,7 @@ jobs: ] steps: - name: Build docker image - uses: pytorch/pytorch/.github/actions/binary-docker-build@main + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.11 with: docker-image-name: libtorch-cxx11-builder custom-tag-prefix: ${{ matrix.tag }} diff --git a/.github/workflows/build-manywheel-images-s390x.yml b/.github/workflows/build-manywheel-images-s390x.yml index c498e169f1aa5..f13d0be04c81b 100644 --- a/.github/workflows/build-manywheel-images-s390x.yml +++ b/.github/workflows/build-manywheel-images-s390x.yml @@ -25,7 +25,7 @@ jobs: runs-on: linux.s390x steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: submodules: false no-sudo: true diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index f86cefd7c7a1a..beabc976f30ad 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -32,7 +32,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -65,7 +65,7 @@ jobs: name: ${{ matrix.name }}:${{ 
matrix.tag }} steps: - name: Build docker image - uses: pytorch/pytorch/.github/actions/binary-docker-build@main + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.11 with: docker-image-name: ${{ matrix.name }} custom-tag-prefix: ${{ matrix.tag }} diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml index 263745ff0fe23..60175432e1557 100644 --- a/.github/workflows/build-triton-wheel.yml +++ b/.github/workflows/build-triton-wheel.yml @@ -3,7 +3,7 @@ name: Build Triton wheels on: push: branches: - - main + - release/2.11 tags: # NOTE: Binary build pipelines should only get triggered on release candidate builds # Release candidate tags look like: v1.11.0-rc1 @@ -36,7 +36,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -74,12 +74,12 @@ jobs: PLATFORM: 'manylinux_2_28_x86_64' steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 with: github-secret: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: submodules: false @@ -90,7 +90,7 @@ jobs: uses: ./.github/actions/ecr-login - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ env.DOCKER_IMAGE }} @@ -179,7 +179,7 @@ jobs: path: ${{ runner.temp }}/artifacts/wheelhouse/* - name: Teardown Linux - uses: 
pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11 if: always() build-wheel-win: @@ -212,7 +212,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-vllm-wheel.yml b/.github/workflows/build-vllm-wheel.yml index 1ef0684688218..9fea4a06c60a2 100644 --- a/.github/workflows/build-vllm-wheel.yml +++ b/.github/workflows/build-vllm-wheel.yml @@ -65,12 +65,12 @@ jobs: BUILD_DEVICE: ${{ matrix.device }} steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 with: github-secret: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: submodules: false @@ -167,7 +167,7 @@ jobs: path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11 if: always() # Copied from build-triton-wheel workflow (mostly) diff --git a/.github/workflows/claude-code.yml b/.github/workflows/claude-code.yml index 6ba4f6c26165e..06b6c5a63fb7a 100644 --- a/.github/workflows/claude-code.yml +++ b/.github/workflows/claude-code.yml @@ -75,4 +75,4 @@ jobs: - name: Upload usage metrics if: always() - uses: pytorch/test-infra/.github/actions/upload-claude-usage@main + uses: pytorch/test-infra/.github/actions/upload-claude-usage@release/2.11 diff --git 
a/.github/workflows/claude-issue-triage-run.yml b/.github/workflows/claude-issue-triage-run.yml index 6d63695b97ecf..655ede7b4ee89 100644 --- a/.github/workflows/claude-issue-triage-run.yml +++ b/.github/workflows/claude-issue-triage-run.yml @@ -106,4 +106,4 @@ jobs: fi - name: Upload usage metrics - uses: pytorch/test-infra/.github/actions/upload-claude-usage@main + uses: pytorch/test-infra/.github/actions/upload-claude-usage@release/2.11 diff --git a/.github/workflows/close-nonexistent-disable-issues.yml b/.github/workflows/close-nonexistent-disable-issues.yml index bef3d8797149c..256cd8e9ec0d0 100644 --- a/.github/workflows/close-nonexistent-disable-issues.yml +++ b/.github/workflows/close-nonexistent-disable-issues.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: submodules: false fetch-depth: 1 diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index d5e0d96fe19f2..4932631f2d2eb 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -19,7 +19,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index c4c989af980e6..a0df8bccc8df9 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -33,7 +33,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: 
pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -106,7 +106,7 @@ jobs: # [see note: pytorch repo ref] # deep clone (fetch-depth 0) required for git merge-base - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 - name: Setup Linux uses: ./.github/actions/setup-linux @@ -116,14 +116,14 @@ jobs: - name: Build docker image id: build-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-image-name: ci-image:${{ matrix.docker-image-name }} always-rebuild: true push: true - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.build-docker-image.outputs.docker-image }} @@ -170,5 +170,5 @@ jobs: if: always() - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11 if: always() diff --git a/.github/workflows/docker-cache-rocm.yml b/.github/workflows/docker-cache-rocm.yml index 2c2a9ba16647f..c53969b40e70c 100644 --- a/.github/workflows/docker-cache-rocm.yml +++ b/.github/workflows/docker-cache-rocm.yml @@ -71,7 +71,7 @@ jobs: echo "Outputs of download-docker-builds-artifacts job: ${JSON_STRINGIFIED}" - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: no-sudo: true @@ -86,7 +86,7 @@ jobs: echo "ghcr_image=${ghcr_image}" >> "$GITHUB_OUTPUT" - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: 
pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.ghcr-io-tag.outputs.ghcr_image }} diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml index 577a8acb5203f..38db8612698f8 100644 --- a/.github/workflows/docker-release.yml +++ b/.github/workflows/docker-release.yml @@ -38,7 +38,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -53,7 +53,7 @@ jobs: matrix: ${{ steps.generate-matrix.outputs.matrix }} steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: fetch-depth: 1 submodules: true @@ -83,7 +83,7 @@ jobs: CUDNN_VERSION: ${{ matrix.cudnn_version }} steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 with: github-secret: ${{ secrets.GITHUB_TOKEN }} # [see note: pytorch repo ref] @@ -169,13 +169,13 @@ jobs: fi - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11 if: always() validate: needs: build if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v')) }} - uses: pytorch/test-infra/.github/workflows/validate-docker-images.yml@main + uses: pytorch/test-infra/.github/workflows/validate-docker-images.yml@release/2.11 with: channel: nightly ref: main diff --git a/.github/workflows/dynamo-unittest.yml b/.github/workflows/dynamo-unittest.yml index 
f7eea350b5644..ac1b8684de9ba 100644 --- a/.github/workflows/dynamo-unittest.yml +++ b/.github/workflows/dynamo-unittest.yml @@ -21,7 +21,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml index e4a8e5e96a88a..0dc6a42e77d24 100644 --- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml index 771681dc123d0..db8ed62b924ef 100644 --- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml +++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} 
@@ -446,7 +446,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -468,7 +467,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: libtorch-cxx11-builder @@ -476,7 +475,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -565,7 +564,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -587,7 +585,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: libtorch-cxx11-builder @@ -595,7 +593,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: 
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index 4e49ac2223ada..fcd006886abed 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -433,7 +433,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -455,7 +454,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -463,7 +462,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -549,7 +548,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && 
github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -571,7 +569,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -579,7 +577,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -654,9 +652,9 @@ jobs: contents: read steps: - name: Setup XPU - uses: pytorch/pytorch/.github/actions/setup-xpu@main + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.11 - name: Login to ECR - uses: pytorch/pytorch/.github/actions/ecr-login@main + uses: pytorch/pytorch/.github/actions/ecr-login@release/2.11 - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: @@ -665,7 +663,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -676,7 +673,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: 
manylinux2_28-builder @@ -684,7 +681,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1100,7 +1097,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1122,7 +1118,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1130,7 +1126,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1216,7 +1212,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1238,7 +1233,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 
'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1246,7 +1241,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1321,9 +1316,9 @@ jobs: contents: read steps: - name: Setup XPU - uses: pytorch/pytorch/.github/actions/setup-xpu@main + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.11 - name: Login to ECR - uses: pytorch/pytorch/.github/actions/ecr-login@main + uses: pytorch/pytorch/.github/actions/ecr-login@release/2.11 - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: @@ -1332,7 +1327,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1343,7 +1337,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1351,7 +1345,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1767,7 +1761,6 @@ jobs: - name: Checkout PyTorch uses: 
actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1789,7 +1782,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1797,7 +1790,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1883,7 +1876,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1905,7 +1897,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1913,7 +1905,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ 
steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1988,9 +1980,9 @@ jobs: contents: read steps: - name: Setup XPU - uses: pytorch/pytorch/.github/actions/setup-xpu@main + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.11 - name: Login to ECR - uses: pytorch/pytorch/.github/actions/ecr-login@main + uses: pytorch/pytorch/.github/actions/ecr-login@release/2.11 - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: @@ -1999,7 +1991,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2010,7 +2001,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -2018,7 +2009,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -2434,7 +2425,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2456,7 +2446,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: 
pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -2464,7 +2454,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -2550,7 +2540,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2572,7 +2561,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -2580,7 +2569,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -2655,9 +2644,9 @@ jobs: contents: read steps: - name: Setup XPU - uses: pytorch/pytorch/.github/actions/setup-xpu@main + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.11 - name: Login to ECR - uses: pytorch/pytorch/.github/actions/ecr-login@main + uses: pytorch/pytorch/.github/actions/ecr-login@release/2.11 - uses: 
actions/download-artifact@v4.1.7 name: Download Build Artifacts with: @@ -2666,7 +2655,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2677,7 +2665,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -2685,7 +2673,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3101,7 +3089,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3123,7 +3110,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3131,7 +3118,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: 
pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3217,7 +3204,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3239,7 +3225,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3247,7 +3233,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3322,9 +3308,9 @@ jobs: contents: read steps: - name: Setup XPU - uses: pytorch/pytorch/.github/actions/setup-xpu@main + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.11 - name: Login to ECR - uses: pytorch/pytorch/.github/actions/ecr-login@main + uses: pytorch/pytorch/.github/actions/ecr-login@release/2.11 - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: @@ -3333,7 +3319,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3344,7 +3329,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: 
pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3352,7 +3337,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3768,7 +3753,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3790,7 +3774,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3798,7 +3782,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3884,7 +3868,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3906,7 +3889,7 @@ 
jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3914,7 +3897,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3989,9 +3972,9 @@ jobs: contents: read steps: - name: Setup XPU - uses: pytorch/pytorch/.github/actions/setup-xpu@main + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.11 - name: Login to ECR - uses: pytorch/pytorch/.github/actions/ecr-login@main + uses: pytorch/pytorch/.github/actions/ecr-login@release/2.11 - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: @@ -4000,7 +3983,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4011,7 +3993,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -4019,7 +4001,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: 
pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -4435,7 +4417,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4457,7 +4438,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -4465,7 +4446,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -4551,7 +4532,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4573,7 +4553,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -4581,7 
+4561,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -4656,9 +4636,9 @@ jobs: contents: read steps: - name: Setup XPU - uses: pytorch/pytorch/.github/actions/setup-xpu@main + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.11 - name: Login to ECR - uses: pytorch/pytorch/.github/actions/ecr-login@main + uses: pytorch/pytorch/.github/actions/ecr-login@release/2.11 - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: @@ -4667,7 +4647,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4678,7 +4657,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -4686,7 +4665,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary diff --git a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml index 2314dfa31db47..9c31484e78781 100644 --- 
a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml @@ -42,7 +42,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml index 2d47b83e5822e..ed793bbc8a1f7 100644 --- a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml @@ -69,7 +69,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml index 70d210c7eabb6..4814baa851180 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml @@ -65,7 +65,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -175,7 +174,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -285,7 +283,6 @@ jobs: - 
name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -395,7 +392,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -505,7 +501,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -615,7 +610,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -725,7 +719,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false diff --git a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml index 7c26dbc3b9eea..1f607424c2a23 100644 --- a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git 
a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml index 5e30b66183840..49dab1a56f13f 100644 --- a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml index 1368bc942350e..3a56869f3343b 100644 --- a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml index 3ca3364e5de88..92b4185acdae2 100644 --- a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml @@ -35,7 +35,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: 
pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -84,7 +84,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -116,7 +116,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -190,7 +189,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -222,7 +221,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -332,7 +330,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -364,7 +362,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ 
github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -439,7 +436,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -471,7 +468,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -582,7 +578,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -614,7 +610,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -689,7 +684,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -721,7 +716,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false 
@@ -832,7 +826,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -864,7 +858,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -939,7 +932,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -971,7 +964,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false diff --git a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml index c6d1e2cf3b017..16075a8568e35 100644 --- a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml @@ -35,7 +35,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -84,7 
+84,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -116,7 +116,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -190,7 +189,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -222,7 +221,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -332,7 +330,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -364,7 +362,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -439,7 +436,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me 
for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -471,7 +468,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -582,7 +578,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -614,7 +610,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -689,7 +684,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -721,7 +716,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -832,7 +826,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 
continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -864,7 +858,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -939,7 +932,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -971,7 +964,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false diff --git a/.github/workflows/generated-windows-binary-wheel-nightly.yml b/.github/workflows/generated-windows-binary-wheel-nightly.yml index e23118631d3a9..8a322048744cd 100644 --- a/.github/workflows/generated-windows-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml @@ -35,7 +35,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -80,7 +80,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ 
secrets.GITHUB_TOKEN }} @@ -112,7 +112,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -182,7 +181,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -214,7 +213,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -316,7 +314,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -348,7 +346,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -419,7 +416,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -451,7 +448,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 
'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -554,7 +550,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -586,7 +582,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -657,7 +652,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -689,7 +684,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -792,7 +786,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -824,7 +818,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -895,7 +888,7 @@ 
jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -927,7 +920,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1030,7 +1022,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1062,7 +1054,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1132,7 +1123,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1164,7 +1155,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1265,7 +1255,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me 
for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1297,7 +1287,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1367,7 +1356,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1399,7 +1388,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1501,7 +1489,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1533,7 +1521,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1604,7 +1591,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: 
pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1636,7 +1623,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1739,7 +1725,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1771,7 +1757,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1842,7 +1827,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1874,7 +1859,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1977,7 +1961,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ 
-2009,7 +1993,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2080,7 +2063,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2112,7 +2095,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2215,7 +2197,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2247,7 +2229,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2317,7 +2298,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2349,7 +2330,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && 
github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2450,7 +2430,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2482,7 +2462,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2552,7 +2531,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2584,7 +2563,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2686,7 +2664,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2718,7 +2696,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2789,7 +2766,7 @@ jobs: 
echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2821,7 +2798,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2924,7 +2900,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2956,7 +2932,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3027,7 +3002,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3059,7 +3034,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3162,7 +3136,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for 
login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3194,7 +3168,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3265,7 +3238,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3297,7 +3270,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3400,7 +3372,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3432,7 +3404,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3502,7 +3473,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: 
pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3534,7 +3505,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3635,7 +3605,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3667,7 +3637,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3737,7 +3706,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3769,7 +3738,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3871,7 +3839,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ 
-3903,7 +3871,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3974,7 +3941,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4006,7 +3973,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4109,7 +4075,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4141,7 +4107,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4212,7 +4177,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4244,7 +4209,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && 
github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4347,7 +4311,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4379,7 +4343,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4450,7 +4413,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4482,7 +4445,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4585,7 +4547,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4617,7 +4579,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4687,7 +4648,7 @@ jobs: 
echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4719,7 +4680,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4820,7 +4780,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4852,7 +4812,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4922,7 +4881,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4954,7 +4913,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5056,7 +5014,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for 
login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5088,7 +5046,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5159,7 +5116,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5191,7 +5148,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5294,7 +5250,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5326,7 +5282,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5397,7 +5352,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: 
pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5429,7 +5384,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5532,7 +5486,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5564,7 +5518,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5635,7 +5588,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5667,7 +5620,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5770,7 +5722,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ 
-5802,7 +5754,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5872,7 +5823,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5904,7 +5855,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6005,7 +5955,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6037,7 +5987,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6107,7 +6056,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6139,7 +6088,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && 
github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6241,7 +6189,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6273,7 +6221,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6344,7 +6291,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6376,7 +6323,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6479,7 +6425,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6511,7 +6457,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6582,7 +6527,7 @@ jobs: 
echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6614,7 +6559,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6717,7 +6661,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6749,7 +6693,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6820,7 +6763,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6852,7 +6795,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6955,7 +6897,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for 
login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6987,7 +6929,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7057,7 +6998,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7089,7 +7030,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7190,7 +7130,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7222,7 +7162,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7292,7 +7231,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: 
pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7324,7 +7263,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7426,7 +7364,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7458,7 +7396,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7529,7 +7466,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7561,7 +7498,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7664,7 +7600,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ 
-7696,7 +7632,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7767,7 +7702,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7799,7 +7734,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7902,7 +7836,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7934,7 +7868,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -8005,7 +7938,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -8037,7 +7970,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && 
github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -8140,7 +8072,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -8172,7 +8104,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -8242,7 +8173,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -8274,7 +8205,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false diff --git a/.github/workflows/h100-cutlass-backend.yml b/.github/workflows/h100-cutlass-backend.yml index e5406f7600133..9f770549f40a3 100644 --- a/.github/workflows/h100-cutlass-backend.yml +++ b/.github/workflows/h100-cutlass-backend.yml @@ -27,7 +27,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff 
--git a/.github/workflows/h100-distributed.yml b/.github/workflows/h100-distributed.yml index 0e5370a51c160..e566df60728e6 100644 --- a/.github/workflows/h100-distributed.yml +++ b/.github/workflows/h100-distributed.yml @@ -24,7 +24,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/h100-symm-mem.yml b/.github/workflows/h100-symm-mem.yml index 09c362a546024..a9f8d3ff31270 100644 --- a/.github/workflows/h100-symm-mem.yml +++ b/.github/workflows/h100-symm-mem.yml @@ -24,7 +24,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/inductor-micro-benchmark.yml b/.github/workflows/inductor-micro-benchmark.yml index 5813aa28365e7..35a1a4ef972a5 100644 --- a/.github/workflows/inductor-micro-benchmark.yml +++ b/.github/workflows/inductor-micro-benchmark.yml @@ -20,7 +20,7 @@ permissions: jobs: get-default-label-prefix: name: get-default-label-prefix - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-nightly.yml 
b/.github/workflows/inductor-nightly.yml index 4258e8fdb0c84..0e8ef5e9ea2bc 100644 --- a/.github/workflows/inductor-nightly.yml +++ b/.github/workflows/inductor-nightly.yml @@ -23,7 +23,7 @@ permissions: jobs: get-default-label-prefix: name: get-default-label-prefix - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-pallas.yml b/.github/workflows/inductor-pallas.yml index 8676434d0e580..e0bb5731ec4b9 100644 --- a/.github/workflows/inductor-pallas.yml +++ b/.github/workflows/inductor-pallas.yml @@ -20,7 +20,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-perf-compare.yml b/.github/workflows/inductor-perf-compare.yml index 17b265500d47e..6235d02970849 100644 --- a/.github/workflows/inductor-perf-compare.yml +++ b/.github/workflows/inductor-perf-compare.yml @@ -18,7 +18,7 @@ jobs: get-default-label-prefix: if: github.repository_owner == 'pytorch' name: get-default-label-prefix - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/inductor-perf-test-b200.yml 
b/.github/workflows/inductor-perf-test-b200.yml index 07c7a0cf987d7..003f27476bcb9 100644 --- a/.github/workflows/inductor-perf-test-b200.yml +++ b/.github/workflows/inductor-perf-test-b200.yml @@ -68,7 +68,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-perf-test-nightly-aarch64.yml b/.github/workflows/inductor-perf-test-nightly-aarch64.yml index f7b3517dccc06..855e87ce6ca5d 100644 --- a/.github/workflows/inductor-perf-test-nightly-aarch64.yml +++ b/.github/workflows/inductor-perf-test-nightly-aarch64.yml @@ -55,7 +55,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-perf-test-nightly-h100.yml b/.github/workflows/inductor-perf-test-nightly-h100.yml index 155b995e3c8a2..a929475355888 100644 --- a/.github/workflows/inductor-perf-test-nightly-h100.yml +++ b/.github/workflows/inductor-perf-test-nightly-h100.yml @@ -73,7 +73,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} 
diff --git a/.github/workflows/inductor-perf-test-nightly-rocm-mi300.yml b/.github/workflows/inductor-perf-test-nightly-rocm-mi300.yml index c556c6b455783..4f4c8461cf994 100644 --- a/.github/workflows/inductor-perf-test-nightly-rocm-mi300.yml +++ b/.github/workflows/inductor-perf-test-nightly-rocm-mi300.yml @@ -68,7 +68,7 @@ permissions: read-all jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-perf-test-nightly-rocm-mi355.yml b/.github/workflows/inductor-perf-test-nightly-rocm-mi355.yml index e6fd83193202c..22ef17f000455 100644 --- a/.github/workflows/inductor-perf-test-nightly-rocm-mi355.yml +++ b/.github/workflows/inductor-perf-test-nightly-rocm-mi355.yml @@ -68,7 +68,7 @@ permissions: read-all jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml index eee51b7ff8889..9d43549fae06a 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml @@ -65,7 +65,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ 
(github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-perf-test-nightly-x86.yml b/.github/workflows/inductor-perf-test-nightly-x86.yml index 87875831e2a0b..7239952de60f2 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86.yml @@ -65,7 +65,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-perf-test-nightly-xpu.yml b/.github/workflows/inductor-perf-test-nightly-xpu.yml index b51795c663957..981720537f3f0 100644 --- a/.github/workflows/inductor-perf-test-nightly-xpu.yml +++ b/.github/workflows/inductor-perf-test-nightly-xpu.yml @@ -68,7 +68,7 @@ permissions: read-all jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-perf-test-nightly.yml b/.github/workflows/inductor-perf-test-nightly.yml index 2a8e29278b8bc..6539a81f7c196 100644 --- a/.github/workflows/inductor-perf-test-nightly.yml +++ b/.github/workflows/inductor-perf-test-nightly.yml @@ -68,7 +68,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: 
pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-periodic.yml b/.github/workflows/inductor-periodic.yml index 18d7c7189f38e..1e87adc965c74 100644 --- a/.github/workflows/inductor-periodic.yml +++ b/.github/workflows/inductor-periodic.yml @@ -22,7 +22,7 @@ permissions: jobs: get-default-label-prefix: name: get-default-label-prefix - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-rocm-mi200.yml b/.github/workflows/inductor-rocm-mi200.yml index 1698eb4fc85fb..e9218f3acaa86 100644 --- a/.github/workflows/inductor-rocm-mi200.yml +++ b/.github/workflows/inductor-rocm-mi200.yml @@ -21,7 +21,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-rocm-mi300.yml b/.github/workflows/inductor-rocm-mi300.yml index 633386aba487b..5828b299590c7 100644 --- a/.github/workflows/inductor-rocm-mi300.yml +++ b/.github/workflows/inductor-rocm-mi300.yml @@ -29,7 +29,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: 
pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-rocm-mi355.yml b/.github/workflows/inductor-rocm-mi355.yml index 70ea41a6da698..396e1f8e65ae0 100644 --- a/.github/workflows/inductor-rocm-mi355.yml +++ b/.github/workflows/inductor-rocm-mi355.yml @@ -28,7 +28,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-unittest.yml b/.github/workflows/inductor-unittest.yml index b5cea8ceb5265..ea6ce55dbd470 100644 --- a/.github/workflows/inductor-unittest.yml +++ b/.github/workflows/inductor-unittest.yml @@ -22,7 +22,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index acfa832c88ebb..3736415f11b74 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -35,7 +35,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 
'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/lint-autoformat.yml b/.github/workflows/lint-autoformat.yml index b962970dc5b78..66acb3eab1f89 100644 --- a/.github/workflows/lint-autoformat.yml +++ b/.github/workflows/lint-autoformat.yml @@ -13,7 +13,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' && contains(github.event.pull_request.labels.*.name, 'autoformat') }} steps: - name: Checkout pytorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: submodules: true fetch-depth: 0 diff --git a/.github/workflows/lint-bc.yml b/.github/workflows/lint-bc.yml index e0de9ede35084..43fac2dc9584d 100644 --- a/.github/workflows/lint-bc.yml +++ b/.github/workflows/lint-bc.yml @@ -20,7 +20,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Run BC Lint Action - uses: pytorch/test-infra/.github/actions/bc-lint@main + uses: pytorch/test-infra/.github/actions/bc-lint@release/2.11 with: repo: ${{ github.event.pull_request.head.repo.full_name }} base_sha: ${{ github.event.pull_request.base.sha }} diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index e5d998e100a94..ec215e92e91f3 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -22,7 +22,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -36,7 +36,7 @@ jobs: all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') || github.event_name == 'push' }} lintrunner-clang: - uses: 
pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11 # Needed to prevent deduping on HUD name: lintrunner-clang-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }} needs: [get-label-type, get-changed-files] @@ -78,7 +78,7 @@ jobs: # fails to find types when it should # NOTE: We should be able to disable this and consolidate with Pyrefly lintrunner-pyrefly: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11 name: lintrunner-pyrefly-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }} needs: [get-label-type, get-changed-files] # Only run if there are changed files relevant to pyrefly @@ -103,7 +103,7 @@ jobs: ADDITIONAL_LINTRUNNER_ARGS="--take PYREFLY --all-files" .github/scripts/lintrunner.sh lintrunner-noclang: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11 name: lintrunner-noclang-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }} needs: [get-label-type, get-changed-files] with: @@ -125,7 +125,7 @@ jobs: fi quick-checks: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11 needs: get-label-type with: timeout: 120 @@ -165,7 +165,7 @@ jobs: if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'skip-pr-sanity-checks') steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: submodules: false fetch-depth: -1 @@ -178,7 +178,7 @@ jobs: bash .github/scripts/pr-sanity-check.sh workflow-checks: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: 
pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11 needs: get-label-type with: timeout: 120 @@ -189,6 +189,7 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | # Regenerate workflows + export RELEASE_VERSION_TAG=2.11 .github/scripts/generate_ci_workflows.py RC=0 @@ -212,7 +213,7 @@ jobs: exit $RC toc: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11 needs: get-label-type with: timeout: 120 @@ -248,7 +249,7 @@ jobs: test-tools: name: Test tools if: ${{ github.repository == 'pytorch/pytorch' }} - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11 needs: get-label-type with: timeout: 120 @@ -268,7 +269,7 @@ jobs: runs-on: linux.24_04.4x steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: submodules: false fetch-depth: 1 @@ -305,7 +306,7 @@ jobs: # [see note: pytorch repo ref] # deep clone (fetch-depth 0) required, to allow us to use git log - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: submodules: false fetch-depth: 1 diff --git a/.github/workflows/linux-aarch64.yml b/.github/workflows/linux-aarch64.yml index 2c6e6b6dac39c..3636c4e626c27 100644 --- a/.github/workflows/linux-aarch64.yml +++ b/.github/workflows/linux-aarch64.yml @@ -19,7 +19,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ 
github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/llm_td_retrieval.yml b/.github/workflows/llm_td_retrieval.yml index 565a9b25df50f..ebe04611f0ecb 100644 --- a/.github/workflows/llm_td_retrieval.yml +++ b/.github/workflows/llm_td_retrieval.yml @@ -12,7 +12,7 @@ jobs: name: get-label-type # Don't run on forked repos if: github.repository_owner == 'pytorch' - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -116,5 +116,5 @@ jobs: AWS_REGION: "" - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11 if: always() diff --git a/.github/workflows/nightly-s3-uploads.yml b/.github/workflows/nightly-s3-uploads.yml index acf3504dec9ca..dd22ec672b2b1 100644 --- a/.github/workflows/nightly-s3-uploads.yml +++ b/.github/workflows/nightly-s3-uploads.yml @@ -23,7 +23,7 @@ jobs: environment: upload-stats steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index c47b0c5763078..5202627ade876 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -21,7 +21,7 @@ concurrency: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor 
}} @@ -94,7 +94,7 @@ jobs: if: github.repository_owner == 'pytorch' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') steps: - name: "${{ matrix.repo-owner }}/${{ matrix.repo-name }} update-commit-hash" - uses: pytorch/test-infra/.github/actions/update-commit-hash@main + uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.11 with: repo-owner: ${{ matrix.repo-owner }} repo-name: ${{ matrix.repo-name }} diff --git a/.github/workflows/nitpicker.yml b/.github/workflows/nitpicker.yml index 40bd245ce913f..ce7f51b9bfb29 100644 --- a/.github/workflows/nitpicker.yml +++ b/.github/workflows/nitpicker.yml @@ -19,7 +19,7 @@ jobs: if: ${{ github.event.pull_request.number != 26921 && github.repository_owner == 'pytorch' }} steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 - uses: ethanis/nitpicker@v1 with: nitpicks: '.github/nitpicks.yml' diff --git a/.github/workflows/operator_microbenchmark.yml b/.github/workflows/operator_microbenchmark.yml index 445cdcc4be04a..8fc1f9f319a45 100644 --- a/.github/workflows/operator_microbenchmark.yml +++ b/.github/workflows/operator_microbenchmark.yml @@ -21,7 +21,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/periodic-rocm-mi200.yml b/.github/workflows/periodic-rocm-mi200.yml index 865d999623cbb..041bb5a1bf18f 100644 --- a/.github/workflows/periodic-rocm-mi200.yml +++ b/.github/workflows/periodic-rocm-mi200.yml @@ -37,7 +37,7 @@ jobs: get-label-type: name: get-label-type - uses: 
pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/periodic-rocm-mi300.yml b/.github/workflows/periodic-rocm-mi300.yml index 88da168926444..eaec4ab7aac40 100644 --- a/.github/workflows/periodic-rocm-mi300.yml +++ b/.github/workflows/periodic-rocm-mi300.yml @@ -35,7 +35,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/periodic-rocm-mi355.yml b/.github/workflows/periodic-rocm-mi355.yml index 9885ffb2d3832..07135377fd44f 100644 --- a/.github/workflows/periodic-rocm-mi355.yml +++ b/.github/workflows/periodic-rocm-mi355.yml @@ -35,7 +35,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 16a706a7be6be..70403093e2568 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -43,7 +43,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: (github.event_name 
!= 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 538e163fb84e1..22989263dd22f 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -61,7 +61,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/quantization-periodic.yml b/.github/workflows/quantization-periodic.yml index 8dd97ff9308db..7ae408a021bf4 100644 --- a/.github/workflows/quantization-periodic.yml +++ b/.github/workflows/quantization-periodic.yml @@ -20,7 +20,7 @@ permissions: jobs: get-default-label-prefix: name: get-default-label-prefix - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/rocm-mi200.yml b/.github/workflows/rocm-mi200.yml index 1bbb538527f13..e9ffdc91a18f5 100644 --- a/.github/workflows/rocm-mi200.yml +++ b/.github/workflows/rocm-mi200.yml @@ -28,7 +28,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff 
--git a/.github/workflows/rocm-mi300.yml b/.github/workflows/rocm-mi300.yml index 9c2bae06f32bd..06e43244d7d63 100644 --- a/.github/workflows/rocm-mi300.yml +++ b/.github/workflows/rocm-mi300.yml @@ -27,7 +27,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/rocm-mi355.yml b/.github/workflows/rocm-mi355.yml index 5a77695011f3c..777667dda2372 100644 --- a/.github/workflows/rocm-mi355.yml +++ b/.github/workflows/rocm-mi355.yml @@ -25,7 +25,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/rocm-navi31.yml b/.github/workflows/rocm-navi31.yml index bf1661b35e210..c04d5c2040d4b 100644 --- a/.github/workflows/rocm-navi31.yml +++ b/.github/workflows/rocm-navi31.yml @@ -28,7 +28,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/rocm-nightly.yml b/.github/workflows/rocm-nightly.yml index 649de4ab2f689..9c58062c79b02 100644 --- a/.github/workflows/rocm-nightly.yml +++ b/.github/workflows/rocm-nightly.yml @@ 
-15,7 +15,7 @@ permissions: read-all jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/slow-rocm-mi200.yml b/.github/workflows/slow-rocm-mi200.yml index 937f04980522e..3bd00e3d193db 100644 --- a/.github/workflows/slow-rocm-mi200.yml +++ b/.github/workflows/slow-rocm-mi200.yml @@ -41,7 +41,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index 8da9c9bd219d5..c73f6d2d48b22 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -41,7 +41,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/target-determination-indexer.yml b/.github/workflows/target-determination-indexer.yml index 3438b1dd5ac57..5f3f3725e7577 100644 --- a/.github/workflows/target-determination-indexer.yml +++ b/.github/workflows/target-determination-indexer.yml @@ -13,7 +13,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: 
pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -38,7 +38,7 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 working-directory: pytorch @@ -53,13 +53,13 @@ jobs: echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG id: install-nvidia-driver - uses: pytorch/test-infra/.github/actions/setup-nvidia@main + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.11 - name: Clone CodeLlama uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -152,7 +152,7 @@ jobs: "s3://target-determinator-assets/indexes/latest/${ZIP_NAME}" - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11 if: always() concurrency: diff --git a/.github/workflows/target_determination.yml b/.github/workflows/target_determination.yml index c712b11185a76..0c2e4d458c7ae 100644 --- a/.github/workflows/target_determination.yml +++ b/.github/workflows/target_determination.yml @@ -9,7 +9,7 @@ jobs: name: get-label-type # Don't run on forked repos if: github.repository_owner == 'pytorch' - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: 
pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -27,7 +27,7 @@ jobs: # checkout because when we run this action we don't *have* a local # checkout. In other cases you should prefer a local checkout. - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: submodules: false diff --git a/.github/workflows/test-b200.yml b/.github/workflows/test-b200.yml index 19dcb07c29844..e800ab6cf1c3d 100644 --- a/.github/workflows/test-b200.yml +++ b/.github/workflows/test-b200.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/test-check-binary.yml b/.github/workflows/test-check-binary.yml index 883b2d253aa8f..bd0ac343ee04a 100644 --- a/.github/workflows/test-check-binary.yml +++ b/.github/workflows/test-check-binary.yml @@ -15,7 +15,7 @@ jobs: check_binary_linux_cpu: if: github.repository_owner == 'pytorch' name: Test check_binary.sh for Linux CPU - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11 with: docker-image: python:3.11 docker-build-dir: "skip-docker-build" @@ -30,7 +30,7 @@ jobs: check_binary_linux_cuda: if: github.repository_owner == 'pytorch' name: Test check_binary.sh for Linux CUDA - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11 with: 
runner: linux.g4dn.4xlarge.nvidia.gpu docker-image: python:3.11 diff --git a/.github/workflows/test-h100.yml b/.github/workflows/test-h100.yml index 4351b427b0b8a..7d75675ebcb78 100644 --- a/.github/workflows/test-h100.yml +++ b/.github/workflows/test-h100.yml @@ -28,7 +28,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/tools-unit-tests.yml b/.github/workflows/tools-unit-tests.yml index 4f87992eb5d72..6559b1852205e 100644 --- a/.github/workflows/tools-unit-tests.yml +++ b/.github/workflows/tools-unit-tests.yml @@ -25,7 +25,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout pytorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: submodules: true fetch-depth: 0 @@ -52,7 +52,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout pytorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: submodules: true fetch-depth: 0 diff --git a/.github/workflows/torchbench.yml b/.github/workflows/torchbench.yml index 508c39a653600..a84ff38e72471 100644 --- a/.github/workflows/torchbench.yml +++ b/.github/workflows/torchbench.yml @@ -18,7 +18,7 @@ jobs: get-default-label-prefix: if: github.repository_owner == 'pytorch' name: get-default-label-prefix - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login 
}} diff --git a/.github/workflows/trunk-rocm-sandbox.yml b/.github/workflows/trunk-rocm-sandbox.yml index aee6a5d87df09..200b071eb6693 100644 --- a/.github/workflows/trunk-rocm-sandbox.yml +++ b/.github/workflows/trunk-rocm-sandbox.yml @@ -36,7 +36,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 65bc108c842af..75966ef5e5c4c 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -58,7 +58,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/unstable.yml b/.github/workflows/unstable.yml index b5955127d9fb3..916a0c2d342c1 100644 --- a/.github/workflows/unstable.yml +++ b/.github/workflows/unstable.yml @@ -46,7 +46,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index 1b4af0f274913..ca00236186c6f 100644 --- a/.github/workflows/update-viablestrict.yml +++ 
b/.github/workflows/update-viablestrict.yml @@ -18,7 +18,7 @@ jobs: environment: ${{ (github.event_name == 'schedule') && 'mergebot' || '' }} steps: - name: Update viable/strict - uses: pytorch/test-infra/.github/actions/update-viablestrict@main + uses: pytorch/test-infra/.github/actions/update-viablestrict@release/2.11 id: update_viablestrict with: repository: pytorch/pytorch diff --git a/.github/workflows/update_pytorch_labels.yml b/.github/workflows/update_pytorch_labels.yml index a1b8c38141ae8..4bd09fbbfa5aa 100644 --- a/.github/workflows/update_pytorch_labels.yml +++ b/.github/workflows/update_pytorch_labels.yml @@ -17,7 +17,7 @@ jobs: contents: read steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/upload-test-stats-while-running.yml b/.github/workflows/upload-test-stats-while-running.yml index 9aecaad0e068f..e7683c8af5eeb 100644 --- a/.github/workflows/upload-test-stats-while-running.yml +++ b/.github/workflows/upload-test-stats-while-running.yml @@ -16,7 +16,7 @@ jobs: runs-on: linux.2xlarge steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml index 5dcbfe7fd65fa..a6cc99c1efb1c 100644 --- a/.github/workflows/upload-test-stats.yml +++ b/.github/workflows/upload-test-stats.yml @@ -66,7 +66,7 @@ jobs: run: echo "${TRIGGERING_WORKFLOW}" - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 - name: Configure aws credentials uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 diff --git 
a/.github/workflows/upload-torch-dynamo-perf-stats.yml b/.github/workflows/upload-torch-dynamo-perf-stats.yml index 07471619437a2..b3221fb0144d0 100644 --- a/.github/workflows/upload-torch-dynamo-perf-stats.yml +++ b/.github/workflows/upload-torch-dynamo-perf-stats.yml @@ -32,7 +32,7 @@ jobs: name: Upload dynamo performance stats for ${{ github.event.workflow_run.id }}, attempt ${{ github.event.workflow_run.run_attempt }} steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: submodules: false fetch-depth: 1 diff --git a/.github/workflows/upload_test_stats_intermediate.yml b/.github/workflows/upload_test_stats_intermediate.yml index 5702562006055..856f6e7ccedce 100644 --- a/.github/workflows/upload_test_stats_intermediate.yml +++ b/.github/workflows/upload_test_stats_intermediate.yml @@ -17,7 +17,7 @@ jobs: environment: upload-stats steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11 with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index 24c57119cae85..b15dbc7c2db2e 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -44,7 +44,7 @@ jobs: torch_cuda_arch_list: '8.0 8.9 9.0 10.0' build_environment: linux-jammy-cuda12.9-py3.12-gcc11 steps: - - uses: pytorch/test-infra/.github/actions/setup-uv@main + - uses: pytorch/test-infra/.github/actions/setup-uv@release/2.11 with: python-version: "3.12" activate-environment: "true" @@ -82,7 +82,7 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: working-directory: pytorch/pytorch docker-image-name: 
ci-image:pytorch-linux-jammy-cuda12.9-cudnn9-py3.12-gcc11-vllm diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml index 7bed6c785d4db..b8aadb37fc528 100644 --- a/.github/workflows/weekly.yml +++ b/.github/workflows/weekly.yml @@ -22,7 +22,7 @@ jobs: fetch-depth: 0 - name: update-xla-commit-hash continue-on-error: true - uses: pytorch/test-infra/.github/actions/update-commit-hash@main + uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.11 with: repo-name: xla branch: master diff --git a/.github/workflows/xpu.yml b/.github/workflows/xpu.yml index 84b8aa3cd91d4..440580b475945 100644 --- a/.github/workflows/xpu.yml +++ b/.github/workflows/xpu.yml @@ -19,7 +19,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/tools/stats/import_test_stats.py b/tools/stats/import_test_stats.py index a7c661340d13e..fbf4b1d16dfd5 100644 --- a/tools/stats/import_test_stats.py +++ b/tools/stats/import_test_stats.py @@ -112,7 +112,7 @@ def process_disabled_test(the_response: dict[str, Any]) -> dict[str, Any]: return disabled_test_from_issues try: - url = "https://ossci-metrics.s3.amazonaws.com/disabled-tests-condensed.json" + url = "https://ossci-metrics.s3.amazonaws.com/disabled-tests-condensed.json?versionId=cnSTGFIe2xdODOeLj3qZMwi4tgoH6y67" return fetch_and_cache(dirpath, filename, url, process_disabled_test) except Exception: print("Couldn't download test skip set, leaving all tests enabled...") From 8b1d03b6bf5032c71abf4bde80c4001f989ccd5a Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Mon, 16 Feb 2026 11:00:49 -0800 Subject: [PATCH 02/87] Update inductor expected accuracy files (#175096) Update 
inductor expected accuracy files (#175041) ## Summary This PR updates the expected accuracy CSV files for inductor benchmarks based on CI results from PyTorch commit 93dd7743c6577271a81f2fef0fdeafc5fe06e553. These files serve as reference points for dynamo/inductor CI to track: - Graph breaks - Model accuracy ## Changes - Updated CUDA expected accuracy files in `benchmarks/dynamo/ci_expected_accuracy/` - Updated ROCm expected accuracy files in `benchmarks/dynamo/ci_expected_accuracy/rocm/` ## Test Plan - [ ] Verify that the CI jobs pass with the updated expected accuracy files - [ ] Review the diff to ensure changes are reasonable and expected - [ ] Check that no unexpected regressions are being marked as "expected" Pull Request resolved: https://github.com/pytorch/pytorch/pull/175041 Approved by: https://github.com/atalman (cherry picked from commit f90c091c44cf4e8feffbf5d5afdebd20798a86fa) --- .../ci_expected_accuracy/cpu_inductor_torchbench_inference.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv index 0157149df8cb8..5fb09f9e69f1d 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv @@ -186,7 +186,7 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 +pytorch_CycleGAN_and_pix2pix,eager_fail_to_run,0 From 148630ba62bb9b216eea3737d90a2c2d6d772ba2 Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Mon, 16 Feb 2026 12:29:29 -0800 Subject: [PATCH 03/87] Revert "[fix] DISABLED test_index (__main__.DistTensorOpsTest) (#172373)" (#175094) This reverts commit 70726364e8565902d6f9ed9e47cd197caf544399. 
Reverted https://github.com/pytorch/pytorch/pull/172373 on behalf of https://github.com/jeffdaily due to PR claims to fix ROCm DISABLED issue but it did not ([comment](https://github.com/pytorch/pytorch/pull/172373#issuecomment-3909564537)) Co-authored-by: PyTorch MergeBot --- test/distributed/tensor/test_tensor_ops.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/test/distributed/tensor/test_tensor_ops.py b/test/distributed/tensor/test_tensor_ops.py index c59c7aba540d5..692f074a91665 100644 --- a/test/distributed/tensor/test_tensor_ops.py +++ b/test/distributed/tensor/test_tensor_ops.py @@ -19,12 +19,7 @@ from torch.distributed.tensor._sharding_prop import ShardingPropagator from torch.distributed.tensor.debug import CommDebugMode from torch.testing._internal.common_distributed import skip_if_lt_x_gpu -from torch.testing._internal.common_utils import ( - MI200_ARCH, - run_tests, - serialTest, - skipIfRocmArch, -) +from torch.testing._internal.common_utils import MI200_ARCH, run_tests, skipIfRocmArch from torch.testing._internal.distributed._tensor.common_dtensor import ( create_local_tensor_test_class, DTensorConverter, @@ -619,7 +614,6 @@ def test_gather(self): @skipIfRocmArch(MI200_ARCH) @with_comms - @serialTest() def test_index(self): meshes = [ self.build_device_mesh(), # 1D mesh From 1b0497e1dc77f60d79fd365524684e83f19fcf80 Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Mon, 16 Feb 2026 12:30:16 -0800 Subject: [PATCH 04/87] Revert "[CI] Enable TIMM pretrained model caching on shared HF cache (#174596)" (#175095) This reverts commit 781b5d1dc94544bcc3841ad7babcd1be783a5056. Reverted https://github.com/pytorch/pytorch/pull/174596 on behalf of https://github.com/jeffdaily due to This broke ROCm dynamo benchmarks. Lots of permission denied errors. 
([comment](https://github.com/pytorch/pytorch/pull/174596#issuecomment-3909918521)) Co-authored-by: PyTorch MergeBot --- .ci/pytorch/test.sh | 6 ------ .github/workflows/_linux-test.yml | 16 ---------------- benchmarks/dynamo/common.py | 32 +++---------------------------- 3 files changed, 3 insertions(+), 51 deletions(-) diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 7bc94541a7558..e3e71bd0d54fa 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -1937,12 +1937,6 @@ elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then test_dynamo_benchmark huggingface "$id" elif [[ "${TEST_CONFIG}" == *timm* ]]; then install_torchvision - TIMM_PIN="$(< .ci/docker/ci_commit_pins/timm.txt)" - export HF_HOME="${HF_HOME}/timm_${TIMM_PIN}" - if [[ "${TRANSFORMERS_OFFLINE:-1}" == "0" ]]; then - python benchmarks/dynamo/timm_models.py --download-only \ - && touch "${HF_HOME}/.timm_cache_complete" - fi id=$((SHARD_NUMBER-1)) test_dynamo_benchmark timm_models "$id" elif [[ "${TEST_CONFIG}" == cachebench ]]; then diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index 3e9a03befbddd..74d89aea3f869 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -391,22 +391,6 @@ jobs: export HF_DATASETS_OFFLINE=0 fi - # For TIMM jobs, create a pin-specific cache directory and make it - # world-writable so the jenkins user inside docker can write to it. - # Only enable online mode if the cache hasn't been fully populated. - if [[ "${TEST_CONFIG}" == *timm* ]]; then - TIMM_PIN="$(< .ci/docker/ci_commit_pins/timm.txt)" - TIMM_CACHE_DIR="${HF_CACHE}/timm_${TIMM_PIN}" - if [[ ! -d "${TIMM_CACHE_DIR}" ]]; then - mkdir -p "${TIMM_CACHE_DIR}" - chmod -R a+rwX "${TIMM_CACHE_DIR}" - fi - if [[ ! 
-f "${TIMM_CACHE_DIR}/.timm_cache_complete" ]]; then - export TRANSFORMERS_OFFLINE=0 - export HF_DATASETS_OFFLINE=0 - fi - fi - # detached container should get cleaned up by teardown_ec2_linux # TODO: Stop building test binaries as part of the build phase # Used for GPU_FLAG, SHM_OPTS, JENKINS_USER and DOCKER_SHELL_CMD since that doesn't play nice diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index ee8a95f5c8459..3771ddd65e2e9 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -475,7 +475,6 @@ def output_signpost(data, args, suite, error=None): "log_conv_args", "recompile_profiler", "find_batch_sizes", - "download_only", # Redundant "batch_size", "batch_size_file", @@ -3683,13 +3682,8 @@ def get_example_inputs(self): action="store_true", help="finds the largest batch size that could fit on GPUs", ) - group.add_argument( - "--download-only", - action="store_true", - help="Download all models and exit without running benchmarks.", - ) - mode_group = parser.add_mutually_exclusive_group(required=False) + mode_group = parser.add_mutually_exclusive_group(required=True) mode_group.add_argument( "--accuracy", action="store_true", @@ -3703,7 +3697,7 @@ def get_example_inputs(self): action="store_true", help="extracts the tolerance for each model with small batch size and eval mode", ) - run_mode_group = parser.add_mutually_exclusive_group(required=False) + run_mode_group = parser.add_mutually_exclusive_group(required=True) run_mode_group.add_argument( "--training", action="store_true", @@ -3712,13 +3706,7 @@ def get_example_inputs(self): run_mode_group.add_argument( "--inference", action="store_true", help="Performs inference" ) - args = parser.parse_args(args) - if not args.download_only: - if not any([args.accuracy, args.performance, args.tolerance]): - parser.error("one of --accuracy/--performance/--tolerance is required") - if not any([args.training, args.inference]): - parser.error("one of --training/--inference is 
required") - return args + return parser.parse_args(args) def process_caching_precompile(): @@ -4282,20 +4270,6 @@ def model_iter_fn_and_mark_step(*args, **kwargs): write_outputs(output_filename, [], [args.only, batch_size]) return - if args.download_only: - model_names = list(runner.iter_model_names(args)) - failed = [] - for name in model_names: - try: - runner._download_model(name) - print(f"Downloaded: {name}") - except Exception as e: - print(f"Failed: {name}: {e}") - failed.append(name) - if failed: - sys.exit(1) - return - should_profile_details = args.profile_details args.profile_details = {} if args.export_profiler_trace: From 7afdbae2d55f95c6065101e727bd1e0cd2a72d34 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Mon, 16 Feb 2026 17:11:49 -0800 Subject: [PATCH 05/87] Fix macOS arm64 libtorch release upload failure (#175108) Fix macOS arm64 libtorch release upload failure (#175100) **Summary** Failures introduced by following PR: https://github.com/pytorch/pytorch/pull/173541 The change from RENAME_WHEEL=true to RENAME_WHEEL=false as the default in build_wheel.sh (landed in the 2026-01-31 nightly) broke libtorch builds on macOS arm64. The elif branch at line 220 was missing a BUILD_PYTHONLESS guard, so libtorch builds (BUILD_PYTHONLESS=1) entered the wheel-copy path instead of the libtorch zip-packaging path. This caused the build to produce a .whl artifact instead of the expected .zip files, and the upload script then failed because it looks for *.zip files. The fix adds -z "$BUILD_PYTHONLESS" to the elif condition, matching the guard already present on the if branch. 
Failures can be seen here: https://hud.pytorch.org/hud/pytorch/pytorch/nightly/1?per_page=50&name_filter=macos-arm64-binary-libtorch-release%20%2F%20libtorch-cpu Failing run: https://github.com/pytorch/pytorch/actions/runs/21541142799/job/62076418921 Successful run (previous nightly): https://github.com/pytorch/pytorch/actions/runs/21508411052/job/61971405484 **Test plan** In CI run ciflow/binaries. Make sure the Rename/Copy log is same as successful run above Pull Request resolved: https://github.com/pytorch/pytorch/pull/175100 Approved by: https://github.com/huydhn, https://github.com/isuruf (cherry picked from commit bad1df73015bd733d84be9fe90765fd17d30b89e) Co-authored-by: atalman --- .ci/wheel/build_wheel.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/wheel/build_wheel.sh b/.ci/wheel/build_wheel.sh index afd1faf2a5f7c..6fb63c361f018 100755 --- a/.ci/wheel/build_wheel.sh +++ b/.ci/wheel/build_wheel.sh @@ -217,7 +217,7 @@ if [[ -z "$BUILD_PYTHONLESS" && $RENAME_WHEEL == true ]]; then # Copy the whl to a final destination before tests are run echo "Renaming Wheel file: $wheel_filename_gen to $wheel_filename_new" cp "$whl_tmp_dir/$wheel_filename_gen" "$PYTORCH_FINAL_PACKAGE_DIR/$wheel_filename_new" -elif [[ $RENAME_WHEEL == false ]]; then +elif [[ -z "$BUILD_PYTHONLESS" && $RENAME_WHEEL == false ]]; then echo "Copying Wheel file: $wheel_filename_gen to $PYTORCH_FINAL_PACKAGE_DIR" cp "$whl_tmp_dir/$wheel_filename_gen" "$PYTORCH_FINAL_PACKAGE_DIR/$wheel_filename_gen" if [[ "$VERIFY_WHEELNAME" == "true" && "$wheel_filename_gen" != "$wheel_filename_new" ]]; then From 2ceed7407ca27576a06a64a95f0879ba9b9071cc Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Thu, 19 Feb 2026 05:57:22 -0800 Subject: [PATCH 06/87] [benchmark] Skip pytorch_CycleGAN_and_pix2pix from inductor benchmarks (#175299) [benchmark] Skip pytorch_CycleGAN_and_pix2pix from inductor benchmarks (#175066) ## Summary Skip the `pytorch_CycleGAN_and_pix2pix` benchmark model from the 
inductor benchmark suite. This legacy 2017 model has been failing with `eager_fail_to_run` on 100% of commits since mid-2025, providing zero CI signal while consuming ~5.3M GPU-seconds/week across 7+ benchmark jobs on CUDA, CPU, and ROCm. **Estimated savings: ~310 GPU-hours/week (~1,240 GPU-hours/month)** Skip it in `torchbench.yaml` and remove its entries from all 31 expected accuracy CSV files. Also remove it from the `higher_fp16` tolerance list. See P2188981399 for the full CI workflow analysis. ## Test Plan - CI should pass with CycleGAN skipped (it was already failing 100% of the time) - No other benchmark models affected Pull Request resolved: https://github.com/pytorch/pytorch/pull/175066 Approved by: https://github.com/huydhn, https://github.com/malfet (cherry picked from commit 688c943324013bfdf1e24aac37d02602c4bbf02a) Co-authored-by: Eli Uriegas --- .../ci_expected_accuracy/aot_eager_torchbench_inference.csv | 4 ---- .../ci_expected_accuracy/aot_eager_torchbench_training.csv | 4 ---- .../aot_inductor_torchbench_inference.csv | 4 ---- .../cpu_aot_inductor_amp_freezing_torchbench_inference.csv | 4 ---- .../cpu_aot_inductor_freezing_torchbench_inference.csv | 4 ---- .../cpu_inductor_amp_freezing_torchbench_inference.csv | 4 ---- .../cpu_inductor_freezing_torchbench_inference.csv | 4 ---- .../cpu_inductor_torchbench_inference.csv | 4 ---- .../dynamic_aot_eager_torchbench_inference.csv | 4 ---- .../dynamic_aot_eager_torchbench_training.csv | 4 ---- ...mic_cpu_aot_inductor_amp_freezing_torchbench_inference.csv | 4 ---- ...dynamic_cpu_aot_inductor_freezing_torchbench_inference.csv | 4 ---- .../dynamic_cpu_inductor_torchbench_inference.csv | 4 ---- ...ax_autotune_inductor_amp_freezing_torchbench_inference.csv | 4 ---- .../dynamic_inductor_torchbench_inference.csv | 4 ---- .../dynamic_inductor_torchbench_training.csv | 4 ---- .../dynamo_eager_torchbench_inference.csv | 4 ---- .../ci_expected_accuracy/dynamo_eager_torchbench_training.csv | 4 ---- 
.../ci_expected_accuracy/inductor_torchbench_inference.csv | 4 ---- .../ci_expected_accuracy/inductor_torchbench_training.csv | 4 ---- .../rocm/aot_eager_torchbench_inference.csv | 1 - .../rocm/aot_eager_torchbench_training.csv | 1 - .../rocm/aot_inductor_torchbench_inference.csv | 1 - .../rocm/dynamic_aot_eager_torchbench_inference.csv | 1 - .../rocm/dynamic_aot_eager_torchbench_training.csv | 1 - .../rocm/dynamic_inductor_torchbench_inference.csv | 1 - .../rocm/dynamic_inductor_torchbench_training.csv | 1 - .../rocm/dynamo_eager_torchbench_inference.csv | 1 - .../rocm/dynamo_eager_torchbench_training.csv | 1 - .../rocm/inductor_torchbench_inference.csv | 1 - .../rocm/inductor_torchbench_training.csv | 1 - benchmarks/dynamo/torchbench.yaml | 4 +++- 32 files changed, 3 insertions(+), 92 deletions(-) diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv index bdf3313659b66..b0daa9a947ad4 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv @@ -202,10 +202,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 - - - pytorch_stargan,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv index ee7838505d67c..9ff11382f67ad 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv @@ -130,10 +130,6 @@ phlippe_resnet,pass,6 -pytorch_CycleGAN_and_pix2pix,pass,6 - - - pytorch_stargan,pass,6 diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv index 6fdfc6e72bda3..5d9d432e644a7 100644 --- 
a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv @@ -178,10 +178,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 - - - pytorch_stargan,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_amp_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_amp_freezing_torchbench_inference.csv index 061727b22329c..c9a93b51d10c7 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_amp_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_amp_freezing_torchbench_inference.csv @@ -162,10 +162,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 - - - pytorch_stargan,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_torchbench_inference.csv index 061727b22329c..c9a93b51d10c7 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_torchbench_inference.csv @@ -162,10 +162,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 - - - pytorch_stargan,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv index fa64abf3c8246..0799b804bbf8c 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv @@ -186,10 +186,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 - - - pytorch_stargan,pass,0 diff --git 
a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv index 1d5a70739dd79..813520528261e 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv @@ -186,10 +186,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 - - - pytorch_stargan,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv index 5fb09f9e69f1d..0a51409c04ac3 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv @@ -186,10 +186,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,eager_fail_to_run,0 - - - pytorch_stargan,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv index 16034d6bdfe72..8d4895e4a1ccd 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv @@ -202,10 +202,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 - - - pytorch_stargan,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv index 54486aa44d557..6743f7c739ef1 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv @@ -130,10 +130,6 @@ phlippe_resnet,pass,6 
-pytorch_CycleGAN_and_pix2pix,pass,6 - - - pytorch_stargan,pass,6 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_amp_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_amp_freezing_torchbench_inference.csv index 7deb0fbba56b5..f7df046d04cce 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_amp_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_amp_freezing_torchbench_inference.csv @@ -146,10 +146,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 - - - pytorch_stargan,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_freezing_torchbench_inference.csv index 7deb0fbba56b5..f7df046d04cce 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_freezing_torchbench_inference.csv @@ -146,10 +146,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 - - - pytorch_stargan,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv index 57ba90bd512bb..46d453de48c7d 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv @@ -170,10 +170,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 - - - pytorch_stargan,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv 
b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv index 723ef7a272ea1..dfa70e34afc53 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv @@ -190,10 +190,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 - - - pytorch_stargan,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv index 1ea87d2648875..72fd3af5beeda 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv @@ -202,10 +202,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 - - - pytorch_stargan,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv index 24ad3a397e2cc..e4c20cfebf465 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv @@ -130,10 +130,6 @@ phlippe_resnet,pass,6 -pytorch_CycleGAN_and_pix2pix,pass,6 - - - pytorch_stargan,pass,6 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv index bdf3313659b66..b0daa9a947ad4 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv @@ -202,10 +202,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 
-pytorch_CycleGAN_and_pix2pix,pass,0 - - - pytorch_stargan,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv index ee7838505d67c..9ff11382f67ad 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv @@ -130,10 +130,6 @@ phlippe_resnet,pass,6 -pytorch_CycleGAN_and_pix2pix,pass,6 - - - pytorch_stargan,pass,6 diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv index d5395066bbb42..bc98e325ec784 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv @@ -202,10 +202,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 - - - pytorch_stargan,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv index 24ad3a397e2cc..e4c20cfebf465 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv @@ -130,10 +130,6 @@ phlippe_resnet,pass,6 -pytorch_CycleGAN_and_pix2pix,pass,6 - - - pytorch_stargan,pass,6 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv index 9e1d982030b13..5f486d36c45b0 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv @@ -202,7 +202,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 diff --git 
a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_training.csv index fa52782561be2..d171c2e06bffa 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_training.csv @@ -130,7 +130,6 @@ phlippe_resnet,pass,6 -pytorch_CycleGAN_and_pix2pix,pass,6 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_torchbench_inference.csv index 6d4c4c0359a50..fb70c29d79fda 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_torchbench_inference.csv @@ -178,7 +178,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv index 61ffb7197f676..2dd7684e5bd4f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv @@ -202,7 +202,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_training.csv index c9d8189dc8b74..4bb772b41d8f0 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_training.csv @@ -130,7 +130,6 @@ phlippe_resnet,pass,6 -pytorch_CycleGAN_and_pix2pix,pass,6 diff --git 
a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv index a678afd25c32d..d1150f849e2ee 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv @@ -202,7 +202,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_training.csv index e059c66ca3d04..877277a5aa192 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_training.csv @@ -130,7 +130,6 @@ phlippe_resnet,pass,6 -pytorch_CycleGAN_and_pix2pix,pass,6 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv index 8dbc90cc8d2c7..a18f3b215ecc8 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv @@ -202,7 +202,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_training.csv index 12d994100dfe3..d7d0a9b0b3292 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_training.csv @@ -130,7 +130,6 @@ phlippe_resnet,pass,6 -pytorch_CycleGAN_and_pix2pix,pass,6 diff --git 
a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv index ef7c4f2a01e9f..55457b2f60695 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv @@ -210,7 +210,6 @@ pyhpc_turbulent_kinetic_energy,pass,0 -pytorch_CycleGAN_and_pix2pix,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_training.csv index 91e6df19ff02a..529710bbc21c1 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_training.csv @@ -134,7 +134,6 @@ phlippe_resnet,pass,6 -pytorch_CycleGAN_and_pix2pix,pass,6 diff --git a/benchmarks/dynamo/torchbench.yaml b/benchmarks/dynamo/torchbench.yaml index b7f6229bed1cd..9a1825bc3d290 100644 --- a/benchmarks/dynamo/torchbench.yaml +++ b/benchmarks/dynamo/torchbench.yaml @@ -43,7 +43,6 @@ tolerance: - doctr_reco_predictor - drq - phlippe_resnet - - pytorch_CycleGAN_and_pix2pix higher_bf16: - doctr_reco_predictor @@ -219,6 +218,9 @@ skip: # Has never been working correctly # https://github.com/pytorch/pytorch/issues/172015#issuecomment-3730509098 - modded_nanogpt + # Broken since mid-2025, eager_fail_to_run on all platforms. + # Legacy 2017 model providing zero CI signal. + - pytorch_CycleGAN_and_pix2pix device: cpu: From d80a5844806105de9105da07e3d1836c293a46a6 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Thu, 19 Feb 2026 05:58:15 -0800 Subject: [PATCH 07/87] [CI] Move CUDA 12.8 GPU tests from per-commit trunk to periodic (#175300) [CI] Move CUDA 12.8 GPU tests from per-commit trunk to periodic (#175067) ## Summary Move CUDA 12.8 GPU tests from per-commit trunk CI to periodic (~3x/day on weekdays). 
Both CUDA 12.8 and 13.0 are shipping wheel targets (nightly ships cu126, cu128, cu129, cu130), but their trunk CI test suites have **85-90% failure correlation** -- they almost always fail together. Over a 30-day analysis window covering 97 reverts and 38 significant regression events, **CUDA 12.8 never uniquely caught a regression that 13.0 missed**. CUDA 13.0 is kept per-commit because: - It is the **newest** shipping CUDA version - Most likely to surface **novel breakage** from new CUDA runtime behavior - Forward-looking CI should protect what's coming, not what's already stable CUDA 12.8 is moved to periodic because: - It is **mature and well-understood** -- breakage is less likely and less urgent - The rare 12.8-only regression can tolerate the ~8-hour periodic detection window - The 12.8 build job **remains in trunk** because `cross-compile-linux-test` depends on its artifacts **Estimated savings: ~1,270 GPU-hours/week (~5,080 GPU-hours/month)** This is the #2 savings opportunity from a broader CI workflow analysis (P2188981399) covering 128 PR+trunk jobs over 30 days. Combined with #175066 (CycleGAN skip, ~310 GPU-hours/week), total savings from this stack: **~1,580 GPU-hours/week (~6,320 GPU-hours/month)**. 
### Changes - `trunk.yml`: remove CUDA 12.8 test job (5 default + 3 distributed + 1 pr_time_benchmarks + 1 libtorch shards) and no-ops build - `periodic.yml`: add default (5 GPU shards on g6.4xlarge) and distributed (3 multi-GPU shards on g4dn.12xlarge) to existing CUDA 12.8 periodic entry ## Test Plan - CUDA 12.8 GPU tests continue to run in periodic (3x/day weekdays) - CUDA 13.0 per-commit coverage is unchanged - Cross-compile-linux-test continues to work (12.8 build job kept) Pull Request resolved: https://github.com/pytorch/pytorch/pull/175067 Approved by: https://github.com/malfet ghstack dependencies: #175066 (cherry picked from commit ef0353feec832b691bfae4631f54475455c42e94) Co-authored-by: Eli Uriegas --- .github/workflows/periodic.yml | 13 +++++++++++- .github/workflows/trunk.yml | 37 +++++----------------------------- 2 files changed, 17 insertions(+), 33 deletions(-) diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 70403093e2568..2914329f1c48b 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -90,9 +90,20 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-cuda12.8-py3.10-gcc11 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 - cuda-arch-list: 8.6 + # GPU default + distributed tests moved here from trunk.yml. + # CUDA 13.0 remains per-commit in trunk; 12.8 GPU tests run periodically. + # See P2188981399 for the full CI workflow analysis. 
+ cuda-arch-list: '7.5 8.6 8.9' test-matrix: | { include: [ + { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, + { config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, + { config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, { config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, { config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, { config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 75966ef5e5c4c..5b741eb67954a 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -112,21 +112,10 @@ jobs: ]} secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc11-test: - if: ${{ needs.job-filter.outputs.jobs == '' || contains(needs.job-filter.outputs.jobs, ' linux-jammy-cuda12.8-py3.10-gcc11 ') }} 
- name: linux-jammy-cuda12.8-py3.10-gcc11 - uses: ./.github/workflows/_linux-test.yml - needs: - - linux-jammy-cuda12_8-py3_10-gcc11-build - - target-determination - - job-filter - with: - timeout-minutes: 360 - build-environment: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.build-environment }} - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }} - tests-to-include: ${{ github.event.inputs.tests-to-include || '' }} - secrets: inherit + # CUDA 12.8 GPU tests moved to periodic.yml to reduce per-commit compute. + # CUDA 13.0 (the newer shipping version) remains per-commit for forward-looking coverage. + # The 12.8 build job is kept because cross-compile-linux-test depends on it. + # See P2188981399 for the full CI workflow analysis. linux-jammy-cuda13_0-py3_10-gcc11-build: if: ${{ needs.job-filter.outputs.jobs == '' || contains(needs.job-filter.outputs.jobs, ' linux-jammy-cuda13.0-py3.10-gcc11 ') }} @@ -171,23 +160,7 @@ jobs: secrets: inherit # no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated - linux-jammy-cuda12_8-py3_10-gcc11-no-ops-build: - if: ${{ needs.job-filter.outputs.jobs == '' || contains(needs.job-filter.outputs.jobs, ' linux-jammy-cuda12.8-py3.10-gcc11-no-ops ') }} - name: linux-jammy-cuda12.8-py3.10-gcc11-no-ops - uses: ./.github/workflows/_linux-build.yml - needs: - - get-label-type - - job-filter - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-cuda12.8-py3.10-gcc11-no-ops - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 1 }, - ]} - secrets: inherit - + # CUDA 12.8 no-ops build moved to periodic (only 13.0 remains per-commit) linux-jammy-cuda13_0-py3_10-gcc11-no-ops-build: if: ${{ needs.job-filter.outputs.jobs == '' || 
contains(needs.job-filter.outputs.jobs, ' linux-jammy-cuda13.0-py3.10-gcc11-no-ops ') }} name: linux-jammy-cuda13.0-py3.10-gcc11-no-ops From c649f1ae141e8970c86846dba846d7c32d135b8b Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Thu, 19 Feb 2026 15:50:48 -0800 Subject: [PATCH 08/87] [BE] Remove cuda 12.4 periodic tests (#175362) [BE] Remove cuda 12.4 periodic tests (#175170) These tests are either timing out or failing for a couple of months now. No reason to keep them around: https://hud.pytorch.org/hud/pytorch/pytorch/main/2?per_page=50&name_filter=12.4 Failures go back as far as 9.29.2025 : https://hud.pytorch.org/pytorch/pytorch/commit/efd7fd5ed5ac7ec03201a546a09fb19ec59de431 Pull Request resolved: https://github.com/pytorch/pytorch/pull/175170 Approved by: https://github.com/malfet (cherry picked from commit 174157a10161342694636d7733bfff22b21bda8c) Co-authored-by: atalman --- .github/workflows/periodic.yml | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 2914329f1c48b..16728fa24dd7f 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -51,37 +51,6 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - linux-jammy-cuda12_4-py3_10-gcc11-build: - name: linux-jammy-cuda12.4-py3.10-gcc11 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-cuda12.4-py3.10-gcc11 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11 - cuda-arch-list: 7.5 - test-matrix: | - { include: [ - { config: "legacy_nvidia_driver", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type
}}linux.g4dn.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, - ]} - secrets: inherit - - linux-jammy-cuda12_4-py3_10-gcc11-test: - name: linux-jammy-cuda12.4-py3.10-gcc11 - uses: ./.github/workflows/_linux-test.yml - needs: - - linux-jammy-cuda12_4-py3_10-gcc11-build - - target-determination - with: - build-environment: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-build.outputs.build-environment }} - docker-image: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-build.outputs.test-matrix }} - secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc11-build: name: linux-jammy-cuda12.8-py3.10-gcc11 uses: ./.github/workflows/_linux-build.yml From 4b8a514606230b60bb8f27be5f11612f21b4aec1 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Thu, 19 Feb 2026 16:04:05 -0800 Subject: [PATCH 09/87] [CI] Add CUDA 13 periodic tests (#175380) [CI] Add CUDA 13 periodic tests (#174850) https://github.com/pytorch/pytorch/issues/173950 To prepare moving CUDA 13 wheels to stable wheels, need to add CUDA 13 periodic cuda tests. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/174850 Approved by: https://github.com/atalman (cherry picked from commit 7cdd4b16cad708e2083ea9ff2ec724876485cf90) Co-authored-by: Ting Lu Co-authored-by: Andrey Talman --- .github/workflows/periodic.yml | 93 +++++++++++----------------------- 1 file changed, 29 insertions(+), 64 deletions(-) diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 16728fa24dd7f..1a739986715c8 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -51,17 +51,14 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - linux-jammy-cuda12_8-py3_10-gcc11-build: - name: linux-jammy-cuda12.8-py3.10-gcc11 + linux-jammy-cuda13_0-py3_10-gcc11-build: + name: linux-jammy-cuda13.0-py3.10-gcc11 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-cuda12.8-py3.10-gcc11 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 - # GPU default + distributed tests moved here from trunk.yml. - # CUDA 13.0 remains per-commit in trunk; 12.8 GPU tests run periodically. - # See P2188981399 for the full CI workflow analysis. 
+ build-environment: linux-jammy-cuda13.0-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11 cuda-arch-list: '7.5 8.6 8.9' test-matrix: | { include: [ @@ -84,26 +81,26 @@ jobs: ]} secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc11-test: - name: linux-jammy-cuda12.8-py3.10-gcc11 + linux-jammy-cuda13_0-py3_10-gcc11-test: + name: linux-jammy-cuda13.0-py3.10-gcc11 uses: ./.github/workflows/_linux-test.yml needs: - - linux-jammy-cuda12_8-py3_10-gcc11-build + - linux-jammy-cuda13_0-py3_10-gcc11-build - target-determination with: - build-environment: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.build-environment }} - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }} + build-environment: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.build-environment }} + docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc11-debug-build: - name: linux-jammy-cuda12.8-py3.10-gcc11-debug + linux-jammy-cuda13_0-py3_10-gcc11-debug-build: + name: linux-jammy-cuda13.0-py3.10-gcc11-debug uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-cuda12.8-py3.10-gcc11-debug - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + build-environment: linux-jammy-cuda13.0-py3.10-gcc11-debug + docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11 cuda-arch-list: 8.9 test-matrix: | { include: [ @@ -117,58 +114,26 @@ jobs: ]} secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc11-debug-test: - name: linux-jammy-cuda12.8-py3.10-gcc11-debug + linux-jammy-cuda13_0-py3_10-gcc11-debug-test: + name: 
linux-jammy-cuda13.0-py3.10-gcc11-debug uses: ./.github/workflows/_linux-test.yml needs: - - linux-jammy-cuda12_8-py3_10-gcc11-debug-build + - linux-jammy-cuda13_0-py3_10-gcc11-debug-build - target-determination with: - build-environment: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-debug-build.outputs.build-environment }} - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-debug-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-debug-build.outputs.test-matrix }} + build-environment: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-debug-build.outputs.build-environment }} + docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-debug-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-debug-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cuda13_0-py3_10-gcc11-build: - name: linux-jammy-cuda13.0-py3.10-gcc11 + linux-jammy-cuda13_0-py3-gcc11-slow-gradcheck-build: + name: linux-jammy-cuda13.0-py3-gcc11-slow-gradcheck uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - cuda-arch-list: 7.5 - build-environment: linux-jammy-cuda13.0-py3.10-gcc11 + build-environment: linux-jammy-cuda13.0-py3-gcc11-slow-gradcheck docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11 - test-matrix: | - { include: [ - { config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ 
needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, - ]} - secrets: inherit - - linux-jammy-cuda13_0-py3_10-gcc11-test: - name: linux-jammy-cuda13.0-py3.10-gcc11 - uses: ./.github/workflows/_linux-test.yml - needs: - - linux-jammy-cuda13_0-py3_10-gcc11-build - - target-determination - with: - build-environment: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.build-environment }} - docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }} - secrets: inherit - - linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build: - name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 cuda-arch-list: 8.6 test-matrix: | { include: [ @@ -183,15 +148,15 @@ jobs: ]} secrets: inherit - linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-test: - name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck + linux-jammy-cuda13_0-py3-gcc11-slow-gradcheck-test: + name: linux-jammy-cuda13.0-py3-gcc11-slow-gradcheck uses: ./.github/workflows/_linux-test.yml needs: - - linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build + - linux-jammy-cuda13_0-py3-gcc11-slow-gradcheck-build - target-determination with: - build-environment: ${{ needs.linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build.outputs.build-environment }} - docker-image: ${{ needs.linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build.outputs.test-matrix }} + build-environment: ${{ 
needs.linux-jammy-cuda13_0-py3-gcc11-slow-gradcheck-build.outputs.build-environment }} + docker-image: ${{ needs.linux-jammy-cuda13_0-py3-gcc11-slow-gradcheck-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda13_0-py3-gcc11-slow-gradcheck-build.outputs.test-matrix }} timeout-minutes: 300 secrets: inherit From 5decbe0079eb387c203ab654b2481e0e4676ee19 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Fri, 20 Feb 2026 07:53:48 -0800 Subject: [PATCH 10/87] [ROCm] forward fix #174087, take 4 (#175159) [ROCm] forward fix #174087, take 4 (#175098) vllm build broke due to missing getCurrentHIPStreamMasqueradingAsCUDA. Though it existed in the header aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h, this header was not included directly or indirectly by vllm. PR #174087 subtly broke this even when trying to be backward compatible. Moving the declarations of these Masquerading functions into c10/cuda/CUDAStream.h (c10/hip/HIPStream.h when hipified) fixes the vllm build. Any external projects that had included the HIPStreamMasqueradingAsCUDA.h forward to c10/hip/HIPStream.h anyway. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/175098 Approved by: https://github.com/atalman (cherry picked from commit e6d6f0465ae435b4b73757553d3aa4504dd92d7d) Co-authored-by: Jeff Daily --- .../hip/impl/HIPStreamMasqueradingAsCUDA.h | 27 ------------------- c10/cuda/CUDAStream.h | 22 +++++++++++++++ 2 files changed, 22 insertions(+), 27 deletions(-) diff --git a/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h index a6d24a55c7b0d..1c0047371d7b6 100644 --- a/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h @@ -24,33 +24,6 @@ class HIPStreamMasqueradingAsCUDA final : public c10::cuda::CUDAStream { c10::cuda::CUDAStream hip_stream() const { return *this; } }; -HIPStreamMasqueradingAsCUDA -inline getStreamFromPoolMasqueradingAsCUDA(const bool isHighPriority = false, DeviceIndex device = -1) { - return HIPStreamMasqueradingAsCUDA(c10::cuda::getStreamFromPool(isHighPriority, device)); -} - -HIPStreamMasqueradingAsCUDA -inline getStreamFromPoolMasqueradingAsCUDA(const int priority, DeviceIndex device = -1) { - return HIPStreamMasqueradingAsCUDA(c10::cuda::getStreamFromPool(priority, device)); -} - -HIPStreamMasqueradingAsCUDA -inline getStreamFromExternalMasqueradingAsCUDA(hipStream_t ext_stream, DeviceIndex device) { - return HIPStreamMasqueradingAsCUDA(c10::cuda::getStreamFromExternal(ext_stream, device)); -} - -inline HIPStreamMasqueradingAsCUDA getDefaultHIPStreamMasqueradingAsCUDA(DeviceIndex device_index = -1) { - return HIPStreamMasqueradingAsCUDA(c10::cuda::getDefaultCUDAStream(device_index)); -} - -inline HIPStreamMasqueradingAsCUDA getCurrentHIPStreamMasqueradingAsCUDA(DeviceIndex device_index = -1) { - return HIPStreamMasqueradingAsCUDA(c10::cuda::getCurrentCUDAStream(device_index)); -} - -inline void setCurrentHIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA stream) { - 
c10::cuda::setCurrentCUDAStream(stream.hip_stream()); -} - inline std::ostream& operator<<(std::ostream& stream, const HIPStreamMasqueradingAsCUDA& s) { stream << s.hip_stream() << " (masquerading as CUDA)"; return stream; diff --git a/c10/cuda/CUDAStream.h b/c10/cuda/CUDAStream.h index d3d1402593751..f27c7c9176631 100644 --- a/c10/cuda/CUDAStream.h +++ b/c10/cuda/CUDAStream.h @@ -256,6 +256,28 @@ inline c10::cuda::CUDAStream getCurrentHIPStream( return c10::cuda::getCurrentCUDAStream(device_index); } inline auto& setCurrentHIPStream = c10::cuda::setCurrentCUDAStream; +inline c10::cuda::CUDAStream getStreamFromPoolMasqueradingAsCUDA( + const bool isHighPriority = false, + DeviceIndex device = -1) { + return c10::cuda::getStreamFromPool(isHighPriority, device); +} +inline c10::cuda::CUDAStream getStreamFromPoolMasqueradingAsCUDA( + const int priority, + DeviceIndex device = -1) { + return c10::cuda::getStreamFromPool(priority, device); +} +inline auto& getStreamFromExternalMasqueradingAsCUDA = + c10::cuda::getStreamFromExternal; +inline c10::cuda::CUDAStream getDefaultHIPStreamMasqueradingAsCUDA( + DeviceIndex device_index = -1) { + return c10::cuda::getDefaultCUDAStream(device_index); +} +inline c10::cuda::CUDAStream getCurrentHIPStreamMasqueradingAsCUDA( + DeviceIndex device_index = -1) { + return c10::cuda::getCurrentCUDAStream(device_index); +} +inline auto& setCurrentHIPStreamMasqueradingAsCUDA = + c10::cuda::setCurrentCUDAStream; } // namespace c10::hip #endif From a2813af6564cf10ae3ec37e568c53f91707d45c2 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 23 Feb 2026 17:29:28 -0500 Subject: [PATCH 11/87] [release-only] Remove +ptx from cuda 13.0 builds (#175567) --- .ci/manywheel/build_cuda.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/manywheel/build_cuda.sh b/.ci/manywheel/build_cuda.sh index 94bf6a6b4b26c..bd952d5e08fd5 100644 --- a/.ci/manywheel/build_cuda.sh +++ b/.ci/manywheel/build_cuda.sh @@ -115,7 +115,7 @@ case 
${CUDA_VERSION} in fi ;; 13.0) - TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};$([[ "$ARCH" == "aarch64" ]] && echo "11.0;" || echo "")12.0+PTX" + TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};$([[ "$ARCH" == "aarch64" ]] && echo "11.0;" || echo "")12.0" export TORCH_NVCC_FLAGS="-compress-mode=size" export BUILD_BUNDLE_PTXAS=1 ;; From 98e35020c7423c304778a7044f6baa3c8a98ba6d Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Mon, 23 Feb 2026 17:01:30 -0800 Subject: [PATCH 12/87] [MPS] Fix 2-pass SDPA memory corruption by forcing float accumulators (#175580) [MPS] Fix 2-pass SDPA memory corruption by forcing float accumulators (#174945) Ensure `sums` and `maxs` buffers in `sdpa_vector_2pass_mps` are allocated as `kFloat` instead of inheriting the input dtype. This fixes out-of-bounds memory access and nondeterministic/corrupt results, as reported in #174861 (reproducible with bf16/fp16 and GQA, seq_len > 1023). Adds a regression test covering bf16/fp16/fp32 and relaxes tolerance for bf16 to validate numerical correctness and determinism on MPS. 
Fixes #174861 Pull Request resolved: https://github.com/pytorch/pytorch/pull/174945 Approved by: https://github.com/malfet (cherry picked from commit c68a1d2c01dfba9da53a1bd495cc263d6b802150) Co-authored-by: Roy Hvaara --- .../ATen/native/mps/operations/Attention.mm | 4 ++-- test/test_mps.py | 22 +++++++++++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/native/mps/operations/Attention.mm b/aten/src/ATen/native/mps/operations/Attention.mm index ce57174177885..26477fef2ed61 100644 --- a/aten/src/ATen/native/mps/operations/Attention.mm +++ b/aten/src/ATen/native/mps/operations/Attention.mm @@ -265,8 +265,8 @@ auto out = at::empty({batchSize, num_heads, seq_len_q, headSize}, q_.options()); auto intermediate = at::empty({batchSize, num_heads, seq_len_q, blocks, headSize}, q_.options()); - auto sums = at::empty({batchSize, num_heads, seq_len_q, blocks}, q_.options()); - auto maxs = at::empty({batchSize, num_heads, seq_len_q, blocks}, q_.options()); + auto sums = at::empty({batchSize, num_heads, seq_len_q, blocks}, q_.options().dtype(kFloat)); + auto maxs = at::empty({batchSize, num_heads, seq_len_q, blocks}, q_.options().dtype(kFloat)); auto scale_factor = sdp::calculate_scale(orig_query, scale).expect_float(); bool has_mask = mask_.has_value(); diff --git a/test/test_mps.py b/test/test_mps.py index 482a163986c17..02e291017582e 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -9629,10 +9629,10 @@ def weight_int8pack_mm(a, b_int8pack, b_scales): class TestSDPA(TestCaseMPS): - def _compare_tensors(self, y, ref): + def _compare_tensors(self, y, ref, tol=0.01): denom = torch.maximum(ref.abs(), torch.tensor([1e-6], device=ref.device, dtype=ref.dtype)) err = ((y - ref).abs() / denom).mean().item() - self.assertLess(err, 0.01) + self.assertLess(err, tol) def _test_sdpa_no_mask( self, @@ -9736,6 +9736,24 @@ def test_sdpa_full_mask(self, dtype): out_mps = F.scaled_dot_product_attention(q.to('mps'), k.to('mps'), v.to('mps'), 
attn_mask=mask.to('mps')) self._compare_tensors(out_mps.cpu(), out_cpu) + @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float]) + def test_sdpa_2pass(self, dtype): + # Regression test for https://github.com/pytorch/pytorch/issues/174861 + q = torch.randn(1, 32, 1, 128, dtype=dtype) + k = torch.randn(1, 2, 1024, 128, dtype=dtype) + v = torch.randn(1, 2, 1024, 128, dtype=dtype) + sdpa_kwargs = {"enable_gqa": True} + + out_cpu = F.scaled_dot_product_attention(q, k, v, **sdpa_kwargs) + out_mps = F.scaled_dot_product_attention( + q.to("mps"), k.to("mps"), v.to("mps"), **sdpa_kwargs + ) + + tol = 0.1 if dtype == torch.bfloat16 else 0.01 + + self.assertEqual(out_mps, out_cpu, atol=1e-3, rtol=1e-6) + self._compare_tensors(out_mps.cpu(), out_cpu, tol=tol) + @parametrize("dtype", [torch.float16, torch.float32]) def test_sdpa_3d_input(self, dtype): head_num, seq_len, embed_dim = 16, 16, 80 From ef5c69dec13056e58810a59663db7f8a6b8bebbd Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Tue, 24 Feb 2026 09:25:58 -0800 Subject: [PATCH 13/87] Disable einops 0.8.2 check on PyTorch (#175442) Disable einops 0.8.2 check on PyTorch (#175351) Partially revert #173611 and fallback to the previous behavior on einops, which uses `allow_in_graph`. **Context** * Dynamo does not trace into `@lru_cache` and warns on any usage. * einops uses `@lru_cache` as part of `_prepare_transformation_recipe`. * Every einops op goes through this function. * Dynamo warns on every einops op trace and this creates a logspam problem. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/175351 Approved by: https://github.com/Lucaskabela (cherry picked from commit 1fe0f51a5f14f566c6ab58a386eb86f3f5ca227e) Co-authored-by: Guilherme Leobas --- test/dynamo/test_einops.py | 11 +++++++++++ torch/_dynamo/decorators.py | 17 +++++++++-------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/test/dynamo/test_einops.py b/test/dynamo/test_einops.py index fcd86e50b944c..2c445be38ee8e 100644 --- a/test/dynamo/test_einops.py +++ b/test/dynamo/test_einops.py @@ -11,6 +11,7 @@ from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, parametrize, + xfailIf, ) @@ -190,6 +191,7 @@ def f(x): else: self.assertIn(einops_method, output) + @xfailIf(einops_version == "0.8.2") @parametrize( "method", ["reduce", "repeat", "pack", "unpack", "einsum", "rearrange"], @@ -222,6 +224,15 @@ def test_einops_method(self, method): self.fail(method) self._run_in_subprocess(flag, method, einops_method, snippet) + def test_no_warning(self): + # checks that this doesn't produce any warnings + @torch.compile(backend="eager", fullgraph=True) + def fn(x): + return einops.rearrange(x, "... 
-> (...)") + + x = torch.randn(5) + self.assertNotWarn(lambda: fn(x)) + instantiate_parametrized_tests( TestEinops, diff --git a/torch/_dynamo/decorators.py b/torch/_dynamo/decorators.py index ed1354555a7d7..b444eaea9fb70 100644 --- a/torch/_dynamo/decorators.py +++ b/torch/_dynamo/decorators.py @@ -1156,14 +1156,15 @@ def mark_static_address(t: Any, guard: bool = False) -> None: def _allow_in_graph_einops() -> None: import einops - if einops.__version__ >= "0.8.2": - if hasattr(einops, "einops") and hasattr(einops.einops, "get_backend"): - # trigger backend registration up front to avoid a later guard failure - # that would otherwise cause a recompilation - einops.rearrange(torch.randn(1), "i -> i") - - # einops 0.8.2+ don't need explicit allow_in_graph calls - return + # There is a lru_cache logspam issue with einops when allow_in_graph is not + # used. Disabling this for now until the lru_cache issue is resolved. + # if einops.__version__ >= "0.8.2": + # if hasattr(einops, "einops") and hasattr(einops.einops, "get_backend"): + # # trigger backend registration up front to avoid a later guard failure + # # that would otherwise cause a recompilation + # einops.rearrange(torch.randn(1), "i -> i") + # # einops 0.8.2+ don't need explicit allow_in_graph calls + # return try: # requires einops > 0.6.1, torch >= 2.0 From 013fdc238535f85f3ad603016a88ab0f6fabecce Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Tue, 24 Feb 2026 14:47:31 -0800 Subject: [PATCH 14/87] [CPUBLAS] Fix UB: use vector::resize() instead of reserve() before operator[] access (#175579) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [CPUBLAS] Fix UB: use vector::resize() instead of reserve() before operator[] access (#175315) Fixes #175302 ## Summary `reserve(1)` → `resize(1)`. See issue for details. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/175315 Approved by: https://github.com/zou3519, https://github.com/malfet (cherry picked from commit f08aafa9e82c5ae142b97dbfcac1ebd5d9ca7fde) Co-authored-by: mulatta <67085791+mulatta@users.noreply.github.com> --- aten/src/ATen/native/CPUBlas.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp index 87351cdb98717..d36b5c813a140 100644 --- a/aten/src/ATen/native/CPUBlas.cpp +++ b/aten/src/ATen/native/CPUBlas.cpp @@ -1107,7 +1107,7 @@ struct GemmHelper { // Create a scratchpad buffer for the brgemm execution scratchpad = std::vector(brg.get_scratchpad_size()); // Prepare default vector of pairs of tensors A and B offsets for each batch. - A_B_offsets.reserve(1); + A_B_offsets.resize(1); A_B_offsets[0] = std::make_pair(0, 0); } dnnl::ukernel::brgemm brg; From d70d867311dd195f80832486ff33ba6b3fdc271f Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Tue, 24 Feb 2026 14:49:38 -0800 Subject: [PATCH 15/87] Remove python constraint on setuptools (#175627) Remove python constraint on setuptools (#175577) Fixes https://github.com/pytorch/pytorch/issues/173823 Dependency on setuptools was added 8 years ago here: https://github.com/pytorch/pytorch/pull/5207 This issue remained hidden since we run smoke test in conda env. Conda create env installs setuptools by default. 
This became apparent when testing using uv Pull Request resolved: https://github.com/pytorch/pytorch/pull/175577 Approved by: https://github.com/malfet, https://github.com/seemethere (cherry picked from commit eaa022177ba8b2d8b38e27316a18696c49299cd8) Co-authored-by: atalman --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7b9fba42e746e..0decaf66ed625 100644 --- a/setup.py +++ b/setup.py @@ -1703,7 +1703,7 @@ def main() -> None: install_requires = [ "filelock", "typing-extensions>=4.10.0", - 'setuptools<82 ; python_version >= "3.12"', + "setuptools<82", "sympy>=1.13.3", "networkx>=2.5.1", "jinja2", From 883a7e2b5f257dcafb31da0b9210f292243ef7f7 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Tue, 24 Feb 2026 15:06:00 -0800 Subject: [PATCH 16/87] Supports custom empty tensor in InputObserver (#175581) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Supports custom empty tensor in InputObserver (#174964) When running an LLM handling images and text (Gemma3), the first call to the forward method has input_ids and pixel_values but no past_key_values. Next calls do not have pixel_values but have past_key_values. The InputObserver knows the whole list of inputs, but since there is only one example of pixel_values (and the batch dimension is usually constant across all calls), we need a way to tell the InputObserver what an empty tensor for pixel_values looks like when it is missing.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/174964 Approved by: https://github.com/titaiwangms, https://github.com/justinchuby (cherry picked from commit bc9adaa4524c67014ed4945292930c215392d192) Co-authored-by: Xavier Dupré Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Justin Chu --- test/onnx/exporter/test_input_observer.py | 285 ++++++++++++++ .../_internal/exporter/_input_observer.py | 368 +++++++++++++++--- 2 files changed, 604 insertions(+), 49 deletions(-) diff --git a/test/onnx/exporter/test_input_observer.py b/test/onnx/exporter/test_input_observer.py index 39386a0775fc2..68e74c40b7cd3 100644 --- a/test/onnx/exporter/test_input_observer.py +++ b/test/onnx/exporter/test_input_observer.py @@ -872,6 +872,291 @@ def forward(self, x=None, y=None): with self.assertRaises(RuntimeError): observer.infer_dynamic_shapes() + def test_infer_dynamic_shapes_missing(self): + class Model(torch.nn.Module): + def forward( + self, + input_ids=None, + pixel_values=None, + attention_mask=None, + position_ids=None, + past_key_values=None, + token_type_ids=None, + cache_position=None, + ): + return input_ids + + inputs = [ + dict( + input_ids=torch.ones((1, 28), dtype=torch.int64), + pixel_values=torch.ones((1, 3, 112, 112), dtype=torch.int64), + attention_mask=torch.ones((1, 28), dtype=torch.int64), + position_ids=torch.ones((1, 28), dtype=torch.int64), + token_type_ids=torch.ones((1, 28), dtype=torch.int64), + cache_position=torch.ones((28,), dtype=torch.int64), + ), + dict( + input_ids=torch.ones((1, 1), dtype=torch.int64), + attention_mask=torch.ones((1, 29), dtype=torch.int64), + position_ids=torch.ones((1, 1), dtype=torch.int64), + past_key_values=torch.rand((1, 1, 28, 32)), + token_type_ids=torch.ones((1, 1), dtype=torch.int64), + cache_position=torch.ones((1,), dtype=torch.int64), + ), + dict( + input_ids=torch.ones((1, 1), dtype=torch.int64), + attention_mask=torch.ones((1, 30), dtype=torch.int64), + 
position_ids=torch.ones((1, 1), dtype=torch.int64), + past_key_values=torch.rand((1, 1, 29, 32)), + token_type_ids=torch.ones((1, 1), dtype=torch.int64), + cache_position=torch.ones((1,), dtype=torch.int64), + ), + ] + + model = Model() + observer = InputObserver( + value_if_missing=dict(pixel_values=torch.empty((0, 3, 112, 112))) + ) + with observer(model): + for kwargs in inputs: + model(**kwargs) + + shapes = observer.infer_dynamic_shapes(set_batch_dimension_for=True) + cst = torch.export.Dim.DYNAMIC + expected = { + "input_ids": {0: cst, 1: cst}, + "pixel_values": {0: cst}, + "attention_mask": {0: cst, 1: cst}, + "position_ids": {0: cst, 1: cst}, + "past_key_values": {0: cst, 2: cst}, + "token_type_ids": {0: cst, 1: cst}, + "cache_position": {0: cst}, + } + self.assertEqual(expected, shapes) + kwargs = observer.infer_arguments() + self.assertEqual(list(expected), list(kwargs)) + self.assertEqual((0, 3, 112, 112), kwargs["pixel_values"].shape) + + def test_infer_dynamic_shapes_missing_args(self): + class Model(torch.nn.Module): + def forward( + self, + input_ids=None, + pixel_values=None, + attention_mask=None, + past_key_values=None, + ): + return input_ids + + inputs = [ + ( + torch.ones((1, 28), dtype=torch.int64), + torch.ones((1, 3, 112, 112), dtype=torch.int64), + torch.ones((1, 28), dtype=torch.int64), + ), + ( + torch.ones((1, 1), dtype=torch.int64), + None, + torch.ones((1, 29), dtype=torch.int64), + torch.rand((1, 1, 28, 32)), + ), + ( + torch.ones((1, 1), dtype=torch.int64), + None, + torch.ones((1, 30), dtype=torch.int64), + torch.rand((1, 1, 29, 32)), + ), + ] + + model = Model() + observer = InputObserver( + value_if_missing={1: torch.empty((0, 3, 112, 112), dtype=torch.int64)} + ) + with observer(model): + for args in inputs: + model(*args) + + shapes = observer.infer_dynamic_shapes(set_batch_dimension_for=True) + cst = torch.export.Dim.DYNAMIC + expected = ({0: cst, 1: cst}, {0: cst}, {0: cst, 1: cst}, {0: cst, 2: cst}) + 
self.assertEqual(expected, shapes) + args = observer.infer_arguments() + self.assertEqual(len(expected), len(args)) + self.assertEqual((0, 3, 112, 112), args[1].shape) + + def test_infer_dynamic_shapes_missing_kwargs_nested(self): + class Model(torch.nn.Module): + def forward( + self, + input_ids=None, + pixel_values=None, + attention_mask=None, + position_ids=None, + past_key_values=None, + token_type_ids=None, + cache_position=None, + ): + return input_ids + + inputs = [ + dict( + input_ids=torch.ones((1, 28), dtype=torch.int64), + pixel_values=( + torch.ones((1, 3, 112, 112), dtype=torch.int64), + torch.ones((1, 3, 112, 112), dtype=torch.int64), + ), + attention_mask=torch.ones((1, 28), dtype=torch.int64), + position_ids=torch.ones((1, 28), dtype=torch.int64), + token_type_ids=torch.ones((1, 28), dtype=torch.int64), + cache_position=torch.ones((28,), dtype=torch.int64), + ), + dict( + input_ids=torch.ones((1, 1), dtype=torch.int64), + attention_mask=torch.ones((1, 29), dtype=torch.int64), + position_ids=torch.ones((1, 1), dtype=torch.int64), + past_key_values=torch.rand((1, 1, 28, 32)), + token_type_ids=torch.ones((1, 1), dtype=torch.int64), + cache_position=torch.ones((1,), dtype=torch.int64), + ), + dict( + input_ids=torch.ones((1, 1), dtype=torch.int64), + attention_mask=torch.ones((1, 30), dtype=torch.int64), + position_ids=torch.ones((1, 1), dtype=torch.int64), + past_key_values=torch.rand((1, 1, 29, 32)), + token_type_ids=torch.ones((1, 1), dtype=torch.int64), + cache_position=torch.ones((1,), dtype=torch.int64), + ), + ] + + model = Model() + observer = InputObserver( + value_if_missing=dict( + pixel_values=( + torch.empty((0, 3, 112, 112), dtype=torch.int64), + torch.empty((0, 3, 112, 112), dtype=torch.int64), + ) + ) + ) + with observer(model): + for kwargs in inputs: + model(**kwargs) + + shapes = observer.infer_dynamic_shapes(set_batch_dimension_for=True) + cst = torch.export.Dim.DYNAMIC + expected = { + "input_ids": {0: cst, 1: cst}, + 
"pixel_values": ({0: cst}, {0: cst}), + "attention_mask": {0: cst, 1: cst}, + "position_ids": {0: cst, 1: cst}, + "past_key_values": {0: cst, 2: cst}, + "token_type_ids": {0: cst, 1: cst}, + "cache_position": {0: cst}, + } + self.assertEqual(expected, shapes) + kwargs = observer.infer_arguments() + self.assertEqual(list(expected), list(kwargs)) + self.assertIsInstance(kwargs["pixel_values"], tuple) + self.assertEqual(2, len(kwargs["pixel_values"])) + self.assertEqual((0, 3, 112, 112), kwargs["pixel_values"][0].shape) + self.assertEqual((0, 3, 112, 112), kwargs["pixel_values"][1].shape) + + def test_io_captured_kwargs_kwargs(self): + class Model(torch.nn.Module): + def forward(self, x, **kwargs): + return x + kwargs["y"] + + inputs = [ + dict(x=torch.randn((5, 6)), y=torch.randn((1, 6))), + dict(x=torch.randn((7, 7)), y=torch.randn((1, 7))), + dict(x=torch.randn((7, 8)), y=torch.randn((1, 8))), + dict(x=torch.randn((7, 9)), y=torch.randn((1, 9))), + ] + + model = Model() + expected = [model(**kwargs) for kwargs in inputs] + observer = InputObserver() + with observer(model): + for kwargs in inputs: + model(**kwargs) + self.assertEqual(len(observer.info), 3) + for i in range(3): + self.assertEqual(len(observer.info.flat_outputs[i]), 1) + torch.testing.assert_close(expected[i], observer.info.flat_outputs[i][0]) + + cst = torch.export.Dim.DYNAMIC + ds = observer.infer_dynamic_shapes() + self.assertEqual(dict(x={0: cst, 1: cst}, kwargs=dict(y={1: cst})), ds) + args = observer.infer_arguments() + self.assertIsInstance(args, dict) + self.assertEqual(2, len(args)) + self.assertEqual(["x", "y"], list(args)) + + dynamic_shapes = torch.export.AdditionalInputs() + for kwargs in inputs: + dynamic_shapes.add((), kwargs) + dss = dynamic_shapes.dynamic_shapes(model, (), inputs[0]) + self.assertEqual({"x": (cst, cst), "kwargs": {"y": (None, cst)}}, dss) + + def test_io_captured_kwargs_kwargs_with_args(self): + class Model(torch.nn.Module): + def forward(self, a, *args, **kwargs): + 
return a - args[0] * args[1] + kwargs["x"] - kwargs["y"] + + inputs = [ + ( + (torch.randn((5, 6)), torch.randn((5, 6)), torch.randn((5, 6))), + dict(x=torch.randn((5, 6)), y=torch.randn((1, 6))), + ), + ( + (torch.randn((7, 7)), torch.randn((7, 7)), torch.randn((7, 7))), + dict(x=torch.randn((7, 7)), y=torch.randn((1, 7))), + ), + ] + + model = Model() + expected = [model(*args, **kwargs) for args, kwargs in inputs] + observer = InputObserver() + with observer(model): + for args, kwargs in inputs: + model(*args, **kwargs) + self.assertEqual(len(observer.info), 2) + for i in range(2): + self.assertEqual(len(observer.info.flat_outputs[i]), 1) + torch.testing.assert_close(expected[i], observer.info.flat_outputs[i][0]) + + cst = torch.export.Dim.DYNAMIC + ds = observer.infer_dynamic_shapes() + self.assertEqual( + { + "a": {0: cst, 1: cst}, + "args": ({0: cst, 1: cst}, {0: cst, 1: cst}), + "kwargs": {"x": {0: cst, 1: cst}, "y": {1: cst}}, + }, + ds, + ) + + dynamic_shapes = torch.export.AdditionalInputs() + for args, kwargs in inputs: + dynamic_shapes.add(args, kwargs) + dss = dynamic_shapes.dynamic_shapes(model, *inputs[0]) + self.assertEqual( + { + "a": (cst, cst), + "args": ((cst, cst), (cst, cst)), + "kwargs": {"x": (cst, cst), "y": (None, cst)}, + }, + dss, + ) + + with self.assertRaises(RuntimeError): + observer.infer_arguments() + + args, kwargs = observer.infer_arguments(as_args_kwargs=True) + self.assertIsInstance(kwargs, dict) + self.assertEqual(["x", "y"], list(kwargs)) + self.assertIsInstance(args, tuple) + self.assertEqual(len(args), 3) + if __name__ == "__main__": common_utils.run_tests() diff --git a/torch/onnx/_internal/exporter/_input_observer.py b/torch/onnx/_internal/exporter/_input_observer.py index 2a4392919e120..f6fb62b41d474 100644 --- a/torch/onnx/_internal/exporter/_input_observer.py +++ b/torch/onnx/_internal/exporter/_input_observer.py @@ -27,11 +27,11 @@ def _flatten_unflatten_for_dynamic_shapes( Args: obj: Object from a custom class. 
- change_function: Function to modify the tensor in the structure itself, - like replace them by a shape. + change_function: If not None, this function is called to modify the tensors + in the structure itself, like replace them by a shape. Returns: - the serialized object + The flattened object. """ if isinstance(obj, torch.Tensor): return change_function(obj) if change_function else obj @@ -77,10 +77,10 @@ def _infer_dynamic_dimensions( Args: shape_list: - list of shapes, they must all have the same length + List of shapes, they must all have the same length. set_batch_dimension: - forces the first dimension to be treated as dynamic, - even if all shapes have the same value for that dimension + Forces the first dimension to be treated as dynamic, + even if all shapes have the same value for that dimension. Returns: list of dynamic dimensions @@ -88,7 +88,7 @@ def _infer_dynamic_dimensions( unique_ranks = {len(shape) for shape in shape_list} torch._check( len(unique_ranks) == 1, - lambda: "all shapes in shape_list must have the same rank", + lambda: f"All shapes in shape_list must have the same rank but {shape_list=}.", ) rank = unique_ranks.pop() dynamic = [] @@ -102,6 +102,7 @@ def _infer_dynamic_dimensions( class InputCandidate: """Retains one set of inputs given to the forward method or any other method the class :class:`InputObserver` is stealing from. + Any class is allowed as long as it can be flattened. Args: args: Positional arguments. @@ -110,6 +111,9 @@ class InputCandidate: may be modified inplace, the original value must be retained. cst_kwargs: Any optional arguments constant over multiple calls. int, float, str, bool values must be stored here. + + The constructor flattens the received arguments. + Any necessary flattening function should have been registered first. """ def __init__( @@ -283,18 +287,36 @@ class InputObserverInfo: to be the same in the ordered dictionaries `add_inputs` receive. 
default_values: Default values defined by the signature of the function, any value equal to that is ignored to simplify the export. + value_if_missing: If an argument is missing, + a default value will be taken in this dictionary, + this is used when after the prefill step, an argument + disappears (such as `pixel_values`) and another one + is added (such as `past_key_values`). + The values are only to infer dynamic shapes and arguments, + not to run the model. + args_name_and_position: Name of parameter `*args` + and its position if it exists. + kwargs_name: Name of the variable keyword parameter `**kwargs` if it exists. + + This is used by class :class:`InputObserver`. """ def __init__( self, signature_names: list[str], default_values: dict[str, int | bool | str | float], + value_if_missing: dict[str | int, Any], + args_name_and_position: tuple[str, int] | None, + kwargs_name: str | None, ): self.default_values = default_values + self.value_if_missing = value_if_missing self.inputs: list[InputCandidate] = [] self.outputs_specs: list[torch.utils._pytree.PyTreeSpec] = [] self.flat_outputs: list[list[torch.Tensor | None]] = [] self.latencies: list[float] = [] + self.args_name_and_position = args_name_and_position + self.kwargs_name = kwargs_name self.signature_names = signature_names self._best_candidate: InputCandidate | None = None self._captured_inputs: dict[int | str, int] | None = None @@ -316,6 +338,7 @@ def add_inputs(self, args: tuple[Any, ...], kwargs: dict[str, Any]): if k in self.signature_names and isinstance(v, (int, float, bool, str)) and v != self.default_values.get(k, None) + and self.default_values.get(k, None) is not None } kwargs = { k: v @@ -323,6 +346,50 @@ def add_inputs(self, args: tuple[Any, ...], kwargs: dict[str, Any]): if v is not None and not isinstance(v, (int, float, bool, str)) } + # adds value_if_missing attributes + for k, v in self.value_if_missing.items(): + if isinstance(k, str): + if k not in kwargs: + # Validate that 
`value_if_missing` keys are compatible + # with the observed signature. + # If the function does not accept **kwargs, + # all value_if_missing keys must be + # present in the observed signature names. + if k not in self.signature_names and not self.kwargs_name: + raise ValueError( + f"Unexpected keyword argument {k!r} " + f"provided as a value_if_missing input " + "for a function that does not accept it. " + f"All value_if_missing keys must " + f"be in the observed signature: {tuple(self.signature_names)}." + ) + kwargs[k] = v + elif isinstance(k, int): + if k >= len(self.signature_names): + raise ValueError( + f"Unexpected keyword argument {k=} " + f"provided as a value_if_missing input " + "for a function that does not accept it. " + f"All value_if_missing indices must " + f"be in the observed signature: {tuple(self.signature_names)}." + ) + if k >= len(args): + raise NotImplementedError( + f"Unexpected keyword argument {k=} " + f"provided as a value_if_missing input " + "for a function that does not accept it. " + f"All value_if_missing indices must " + f"be in the observed signature: {tuple(self.signature_names)}, " + f"only {len(args)} were given." + ) + list_args = list(args) + list_args[k] = v + args = tuple(list_args) + else: + raise TypeError( + f"Unexpected type {type(k)} for a missing value. The key is {k!r}." + ) + # kwargs may come in a different order each time. # dictionaries are ordered and torch.export.export expects # dynamic shapes and kwargs to follow the same order. @@ -458,27 +525,61 @@ def _set_batch_dimension_for_flat_index(index) -> bool: flat_dynamic_shapes = [dict.fromkeys(dims, cst) for dims in dynamic_shapes] if return_flat: return tuple(flat_dynamic_shapes) + + # Let's regroup. if len(flat_dynamic_shapes) == len(self._best_candidate.args) + len( self._best_candidate.kwargs ): # It means forward method is called with tensors only. 
- if not self._best_candidate.kwargs and not self._best_candidate.cst_kwargs: + if ( + not self._best_candidate.kwargs + and not self._best_candidate.cst_kwargs + and not self.args_name_and_position + ): # only positional arguments return tuple(flat_dynamic_shapes) if not self._best_candidate.args: # only named arguments ds = dict(zip(list(self._best_candidate.kwargs), flat_dynamic_shapes)) - return {**ds, **dict.fromkeys(self._best_candidate.cst_kwargs, None)} + return self._post_process_for_kwargs( + {**ds, **dict.fromkeys(self._best_candidate.cst_kwargs, None)} + ) + if not self.args_name_and_position: + # positional arguments needs to be moved to the named arguments + n_args = len(self._best_candidate.args) + pos_names = self.signature_names[:n_args] + return self._post_process_for_kwargs( + { + **dict(zip(pos_names, flat_dynamic_shapes[:n_args])), + **dict( + zip( + list(self._best_candidate.kwargs), + flat_dynamic_shapes[n_args:], + ) + ), + **dict.fromkeys(self._best_candidate.cst_kwargs, None), + } + ) # positional arguments needs to be moved to the named arguments - n_args = len(self._best_candidate.args) + n_args = min(len(self._best_candidate.args), self.args_name_and_position[1]) + i_kwargs = max( + len(self._best_candidate.args), self.args_name_and_position[1] + ) + var_pos = self.args_name_and_position[0] pos_names = self.signature_names[:n_args] - return { - **dict(zip(pos_names, flat_dynamic_shapes[:n_args])), - **dict( - zip(list(self._best_candidate.kwargs), flat_dynamic_shapes[n_args:]) - ), - **dict.fromkeys(self._best_candidate.cst_kwargs, None), - } + return self._post_process_for_kwargs( + { + **dict(zip(pos_names, flat_dynamic_shapes[:n_args])), + var_pos: tuple(flat_dynamic_shapes[n_args:i_kwargs]), + **dict( + zip( + list(self._best_candidate.kwargs), + flat_dynamic_shapes[i_kwargs:], + ) + ), + **dict.fromkeys(self._best_candidate.cst_kwargs, None), + } + ) # nested types, here comes the fun part because the shapes cannot be unflattened, 
# custom classes must appear in their flattened shape. @@ -518,20 +619,62 @@ def change_function(t): **ds_kwargs, **dict.fromkeys(self._best_candidate.cst_kwargs, None), } - if not ds_kwargs: + if not ds_kwargs and not self.args_name_and_position: return tuple(ds_args) if not ds_args: - return ds_kwargs - pos_names = self.signature_names[: len(ds_args)] - return {**dict(zip(pos_names, ds_args)), **ds_kwargs} + return self._post_process_for_kwargs(ds_kwargs) + + if not self.args_name_and_position: + pos_names = self.signature_names[: len(ds_args)] + return self._post_process_for_kwargs( + {**dict(zip(pos_names, ds_args)), **ds_kwargs} + ) + + n_args = min(len(ds_args), self.args_name_and_position[1]) + pos_names = self.signature_names[:n_args] + return self._post_process_for_kwargs( + { + **dict(zip(pos_names, ds_args[:n_args])), + self.args_name_and_position[0]: tuple(ds_args[n_args:]), + **ds_kwargs, + } + ) def infer_arguments( self, index_or_candidate: InputCandidate | int | None = None, /, flat: bool = False, - ) -> list[torch.Tensor | None] | tuple[torch.Tensor, ...] | dict[str, torch.Tensor]: - """Infers arguments based on the collected tensors.""" + as_args_kwargs: bool = False, + ) -> ( + list[torch.Tensor | None] + | tuple[torch.Tensor, ...] + | dict[str, torch.Tensor] + | tuple[list[torch.Tensor] | tuple[torch.Tensor, ...], dict[str, torch.Tensor]] + ): + """Infers arguments based on the collected tensors. + + Args: + index_or_candidate: If missing, the method selects one set of inputs + among the available ones, usually the set of inputs containing + with the highest number of tensors. + It then replaces None values and missing tensors with empty tensors. + If not missing, it can be an integer to fetch one of the stored set + or some inputs. + flat: If True, it returns a flattened list of tensors, + if False, it returns a tuple or a dictionary preserving + the nested structures. The flat version is used internally. 
+ It produces a single list of tensors easier to process or modify + rather than a nested structure holding the same tensors. + The original structure can be restored with + ``torch.utils._pytree.tree_unflatten(flat_list, self.aligned_spec)``. + This mechanism is used to replace None values by empty tensors. + as_args_kwargs: If True, the method always returns `(args, kwargs)`, + otherwise, it returns either a tuple (only args) or a dictionary + (only kwargs) or raises an exception if it cannot do so. + Returns: + Inferred arguments, every optional tensor is replaced by an empty tensor. + """ # This is already checked by _build_inputs_completed_with_none_values # but this is not always well captured by tools checking types. self.align_inputs_none_values() @@ -540,9 +683,9 @@ def infer_arguments( if index_or_candidate is None: for cand in self.inputs: args, kwargs = cand.args, cand.kwargs - if len(args) == len(self._best_candidate.args) and len(kwargs) == len( - self._best_candidate.kwargs - ): + if len(args) == len(self._best_candidate.args or ()) and len( + kwargs + ) == len(self._best_candidate.kwargs or {}): candidate = cand break elif isinstance(index_or_candidate, int): @@ -622,13 +765,50 @@ def infer_arguments( # pyrefly: ignore[invalid-argument] kwargs = {**kwargs, **self._best_candidate.cst_kwargs} - if not kwargs: - return args - if not args: + if not as_args_kwargs: + if not kwargs: + return args + if not args: + return kwargs + + # We need to move args to kwargs + if self.args_name_and_position: + raise RuntimeError( + "Cannot return arguments " + "as a single tuple or a single dictionary " + "because of '*args' in the function signature. " + "You need to set `as_args_kwargs=True`." + ) + n_args = len(args) + pos_names = self.signature_names[:n_args] + return {**dict(zip(pos_names, args[:n_args])), **kwargs} + + # Generic case. 
+ return tuple(args), kwargs + + def _post_process_for_kwargs(self, kwargs: dict[str, Any]) -> dict[str, Any]: + """:func:`torch.export.export` requires dynamic shapes and keyword arguments + that are not part of the explicit function signature but are collected via + ``**`` to be wrapped under the corresponding parameter name + (``self.kwargs_name``) as ``{: {'param': shape or tensor}}``. + This function ensures this wrapping is performed when ``self.kwargs_name`` is set. + """ + if not self.kwargs_name: + # Nothing to do here. return kwargs - # We need to move args to kwargs - pos_names = self.signature_names[: len(args)] - return {**dict(zip(pos_names, args)), **kwargs} + to_be_moved = {k for k in kwargs if k not in self.signature_names} + if not to_be_moved: + return kwargs + keywords = {k: v for k, v in kwargs.items() if k in to_be_moved} + new_kwargs = {k: v for k, v in kwargs.items() if k not in to_be_moved} + if self.kwargs_name in new_kwargs: + raise ValueError( + f"Keyword argument name collision: received a keyword argument " + f"'{self.kwargs_name}' which conflicts with the **{self.kwargs_name} " + "parameter used to collect extra keyword arguments. " + "Passing a keyword argument with this name is not supported." + ) + return {**new_kwargs, self.kwargs_name: keywords} class InputObserver: @@ -636,6 +816,15 @@ class InputObserver: This information is used to infer dynamic shapes and export arguments. + Args: + value_if_missing: If an argument is missing, + a default value will be taken in this dictionary, + this is used when after the prefill step, an argument + disappears (such as `pixel_values`) and another one + is added (such as `past_key_values`). + The values are only to infer dynamic shapes and arguments, + not to run the model. 
+ Examples -------- >>> input_observer = InputObserver() @@ -660,11 +849,59 @@ class InputObserver: >>> dynamic_shapes.input_observer.infer_dynamic_shapes(), >>> ) + The last example considers an LLM taking images and text as inputs. + The first call to the forward method which we try to export has `pixel_values` + but no `past_key_values`. The next calls do not have `pixel_values` but + `past_key_values`. The observer understands `pixel_values` and `past_key_values` + are needed but they may not be both specified at the same time. + Since `pixel_values` only appears in the first call, the observer cannot + tell how to infer an empty tensor for this argument. That's what the argument + `value_if_missing` is for. The following example is more than a dummy example + but shows how to use it with ``transformers``. + + .. code-block:: python + + from transformers import pipeline + + model_id = "tiny-random/gemma-3" + pipe = pipeline( + "image-text-to-text", + model=model_id, + device="cpu", + trust_remote_code=True, + max_new_tokens=3, + dtype=torch.float16, + ) + messages = [ + { + "role": "system", + "content": [{"type": "text", "text": "You are a helpful assistant."}], + }, + { + "role": "user", + "content": [ + { + "type": "image", + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG", + }, + {"type": "text", "text": "What animal is on the candy?"}, + ], + }, + ] + observer = InputObserver( + value_if_missing=dict( + pixel_values=torch.empty((0, 3, 896, 896), dtype=torch.float16) + ) + ) + with observer(pipe.model): + pipe(text=messages, max_new_tokens=4) + .. 
versionadded:: 2.11.0 """ - def __init__(self): + def __init__(self, value_if_missing: dict[str | int, Any] | None = None): self.info: InputObserverInfo | None = None + self.value_if_missing = value_if_missing or {} def _replaced_method( self, @@ -716,6 +953,16 @@ def __call__( captured_method = getattr(model, method_name) sig = inspect.signature(captured_method) if self.info is None: + kwargs_names = [ + p + for p in sig.parameters + if sig.parameters[p].kind == inspect.Parameter.VAR_KEYWORD + ] + args_names = [ + (p, i) + for (i, p) in enumerate(sig.parameters) + if sig.parameters[p].kind == inspect.Parameter.VAR_POSITIONAL + ] self.info = InputObserverInfo( signature_names=list(sig.parameters), default_values={ @@ -724,6 +971,9 @@ def __call__( if p.default != inspect.Parameter.empty and isinstance(p.default, (int, bool, str, float)) }, + value_if_missing=self.value_if_missing, + args_name_and_position=args_names[0] if args_names else None, + kwargs_name=kwargs_names[0] if kwargs_names else None, ) n_already_stored = len(self.info) lambda_method = lambda *args, _cm=captured_method, _snc=( # noqa: E731 @@ -777,7 +1027,13 @@ def infer_arguments( self, index_or_args_or_kwargs: tuple[Any] | dict[str, Any] | int | None = None, flat: bool = False, - ) -> list[torch.Tensor | None] | tuple[torch.Tensor, ...] | dict[str, torch.Tensor]: + as_args_kwargs: bool = False, + ) -> ( + list[torch.Tensor | None] + | tuple[torch.Tensor, ...] + | dict[str, torch.Tensor] + | tuple[list[torch.Tensor] | tuple[torch.Tensor, ...], dict[str, torch.Tensor]] + ): """Infers arguments based on the collected tensors. Args: @@ -789,8 +1045,15 @@ def infer_arguments( or some inputs. flat: If True, it returns a flattened list of tensors, if False, it returns a tuple or a dictionary preserving - the nested structures. - + the nested structures. The flat version is used internally. 
+ It produces a single list of tensors easier to process or modify + rather than a nested structure holding the same tensors. + The original structure can be restored with + ``torch.utils._pytree.tree_unflatten(flat_list, self.aligned_spec)``. + This mechanism is used to replace None values by empty tensors. + as_args_kwargs: If True, the method always returns `(args, kwargs)`, + otherwise, it returns either a tuple (only args) or a dictionary + (only kwargs) or raises an exception if it cannot do so. Returns: Inferred arguments, every optional tensor is replaced by an empty tensor. """ @@ -832,7 +1095,11 @@ def infer_arguments( self.info._captured_inputs, self.info.signature_names, ) - return self.info.infer_arguments(index_or_candidate, flat=flat) + return self.info.infer_arguments( + index_or_candidate, + flat=flat, + as_args_kwargs=as_args_kwargs, + ) def check_discrepancies( self, @@ -843,25 +1110,26 @@ def check_discrepancies( initializer: Callable[ [str | bytes], ort.InferenceSession ] = _onnx_program._ort_session_initializer, - ) -> list[dict[str, str | int | float]]: + skip_none: bool = True, + ) -> list[dict[str, str | int | float | bool]]: """Computes the discrepancies between the saved inputs and outputs with the saved onnx model. Args: - onnx_program: - Exported Model to verify. - atol: - Absolute tolerance, recommended values, 1e-4 for float, 1e-2 for float16. - rtol: - Relative tolerance. - progress_bar: - Shows a progress bar (requires `tqdm`). - initializer: The function to initialize the ONNX Runtime inference + onnx_program: Exported Model to verify. + atol: Absolute tolerance, recommended values, 1e-4 for float, 1e-2 for float16. + rtol: Relative tolerance. + progress_bar: Shows a progress bar (requires `tqdm`). + initializer: The function called to initialize the ONNX Runtime inference session with the specified model. By default, it uses the `_ort_session_initializer` function. + skip_none: Does not check discrepancies when an output is None. 
Returns: A list of dictionaries, ready to be consumed by a dataframe. + + The function catches exceptions, it shows the error in the returned + summary. """ # For big models, we should consider taking a filename to avoid the users # creating the model proto twice. @@ -904,7 +1172,9 @@ def check_discrepancies( duration = time.perf_counter() - begin if error: - diff: dict[str, Any] = dict(error=error, SUCCESS=False) + diff: dict[str, str | int | float | bool] = dict( + error=error, SUCCESS=False + ) elif ort_outputs is None or len(outputs) != len(ort_outputs): diff = dict(SUCCESS=False, error="not the same number of outputs") else: @@ -915,7 +1185,7 @@ def check_discrepancies( # pyrefly: ignore[no-matching-overload] for torch_tensor, ort_tensor in zip(outputs, ort_outputs): if torch_tensor is None or ort_tensor is None: - if type(torch_tensor) is not type(ort_tensor): + if type(torch_tensor) is not type(ort_tensor) and not skip_none: success = False error = "missing output" break From 9bb65cc0961441322f2b5e1acb4d7b92fe199507 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Tue, 24 Feb 2026 15:15:36 -0800 Subject: [PATCH 17/87] Bump transformers version to 5.2.0 (#175661) Bump transformers version to 5.2.0 (#175274) Take over the Dependabot PR from https://github.com/pytorch/pytorch/pull/175147 to fix the failures there Pull Request resolved: https://github.com/pytorch/pytorch/pull/175274 Approved by: https://github.com/xmfan, https://github.com/malfet (cherry picked from commit 268cfa727a55fda9a68a791bc3d4e0d941b220ba) Co-authored-by: Huy Do --- .../ci_commit_pins/huggingface-requirements.txt | 2 +- .../aot_eager_huggingface_inference.csv | 2 +- .../aot_eager_huggingface_training.csv | 2 +- ...ductor_amp_freezing_huggingface_inference.csv | 2 +- ...u_inductor_freezing_huggingface_inference.csv | 2 +- .../cpu_inductor_huggingface_inference.csv | 2 +- .../dynamic_aot_eager_huggingface_inference.csv | 2 +- .../dynamic_aot_eager_huggingface_training.csv | 2 +- 
...ynamic_cpu_inductor_huggingface_inference.csv | 2 +- ...ductor_amp_freezing_huggingface_inference.csv | 2 +- .../dynamic_inductor_huggingface_inference.csv | 2 +- .../dynamic_inductor_huggingface_training.csv | 2 +- .../dynamo_eager_huggingface_inference.csv | 2 +- .../dynamo_eager_huggingface_training.csv | 2 +- .../inductor_huggingface_inference.csv | 2 +- .../inductor_huggingface_training.csv | 2 +- .../rocm/aot_eager_huggingface_inference.csv | 2 +- .../rocm/aot_eager_huggingface_training.csv | 4 ++-- .../dynamic_aot_eager_huggingface_inference.csv | 2 +- .../dynamic_aot_eager_huggingface_training.csv | 4 ++-- .../dynamic_inductor_huggingface_inference.csv | 2 +- .../dynamic_inductor_huggingface_training.csv | 4 ++-- .../rocm/dynamo_eager_huggingface_inference.csv | 2 +- .../rocm/dynamo_eager_huggingface_training.csv | 4 ++-- .../rocm/inductor_huggingface_inference.csv | 2 +- .../rocm/inductor_huggingface_training.csv | 2 +- test/distributed/test_dynamo_distributed.py | 16 ++++++++++++++++ 27 files changed, 46 insertions(+), 30 deletions(-) diff --git a/.ci/docker/ci_commit_pins/huggingface-requirements.txt b/.ci/docker/ci_commit_pins/huggingface-requirements.txt index 408343c9099c8..08538ff511057 100644 --- a/.ci/docker/ci_commit_pins/huggingface-requirements.txt +++ b/.ci/docker/ci_commit_pins/huggingface-requirements.txt @@ -1,2 +1,2 @@ -transformers==4.57.5 +transformers==5.2.0 soxr==0.5.0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv index d3f0fbba71826..87dd88078f222 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0 -M2M100ForConditionalGeneration,pass,0 +M2M100ForConditionalGeneration,pass,7 diff --git 
a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv index 82d21ea3cb298..5ca03b5ecf9fb 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5 -M2M100ForConditionalGeneration,pass,4 +M2M100ForConditionalGeneration,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv index 439c9bf530468..e4aabce10466d 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0 -M2M100ForConditionalGeneration,pass,0 +M2M100ForConditionalGeneration,pass,7 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv index 439c9bf530468..e4aabce10466d 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0 -M2M100ForConditionalGeneration,pass,0 +M2M100ForConditionalGeneration,pass,7 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv index 439c9bf530468..e4aabce10466d 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0 
-M2M100ForConditionalGeneration,pass,0 +M2M100ForConditionalGeneration,pass,7 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv index d3f0fbba71826..87dd88078f222 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0 -M2M100ForConditionalGeneration,pass,0 +M2M100ForConditionalGeneration,pass,7 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv index 82d21ea3cb298..5ca03b5ecf9fb 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5 -M2M100ForConditionalGeneration,pass,4 +M2M100ForConditionalGeneration,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv index 439c9bf530468..e4aabce10466d 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0 -M2M100ForConditionalGeneration,pass,0 +M2M100ForConditionalGeneration,pass,7 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv index 439c9bf530468..e4aabce10466d 100644 --- 
a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0 -M2M100ForConditionalGeneration,pass,0 +M2M100ForConditionalGeneration,pass,7 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv index d3f0fbba71826..87dd88078f222 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0 -M2M100ForConditionalGeneration,pass,0 +M2M100ForConditionalGeneration,pass,7 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_training.csv index 82d21ea3cb298..5ca03b5ecf9fb 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_training.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5 -M2M100ForConditionalGeneration,pass,4 +M2M100ForConditionalGeneration,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv index d3f0fbba71826..87dd88078f222 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0 -M2M100ForConditionalGeneration,pass,0 +M2M100ForConditionalGeneration,pass,7 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv 
b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv index 82d21ea3cb298..5ca03b5ecf9fb 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5 -M2M100ForConditionalGeneration,pass,4 +M2M100ForConditionalGeneration,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv index d3f0fbba71826..87dd88078f222 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0 -M2M100ForConditionalGeneration,pass,0 +M2M100ForConditionalGeneration,pass,7 diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv index 82d21ea3cb298..5ca03b5ecf9fb 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5 -M2M100ForConditionalGeneration,pass,4 +M2M100ForConditionalGeneration,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_inference.csv index d3f0fbba71826..87dd88078f222 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_inference.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0 -M2M100ForConditionalGeneration,pass,0 +M2M100ForConditionalGeneration,pass,7 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_training.csv 
b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_training.csv index deea60d3203e2..af120d3d31b3b 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_training.csv @@ -30,7 +30,7 @@ DistilBertForMaskedLM,pass,5 -DistillGPT2,pass,7 +DistillGPT2,fail_accuracy,7 @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5 -M2M100ForConditionalGeneration,pass,4 +M2M100ForConditionalGeneration,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_inference.csv index d3f0fbba71826..87dd88078f222 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_inference.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0 -M2M100ForConditionalGeneration,pass,0 +M2M100ForConditionalGeneration,pass,7 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_training.csv index deea60d3203e2..af120d3d31b3b 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_training.csv @@ -30,7 +30,7 @@ DistilBertForMaskedLM,pass,5 -DistillGPT2,pass,7 +DistillGPT2,fail_accuracy,7 @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5 -M2M100ForConditionalGeneration,pass,4 +M2M100ForConditionalGeneration,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_inference.csv index d3f0fbba71826..87dd88078f222 100644 --- 
a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_inference.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0 -M2M100ForConditionalGeneration,pass,0 +M2M100ForConditionalGeneration,pass,7 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_training.csv index deea60d3203e2..af120d3d31b3b 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_training.csv @@ -30,7 +30,7 @@ DistilBertForMaskedLM,pass,5 -DistillGPT2,pass,7 +DistillGPT2,fail_accuracy,7 @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5 -M2M100ForConditionalGeneration,pass,4 +M2M100ForConditionalGeneration,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_inference.csv index d3f0fbba71826..87dd88078f222 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_inference.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0 -M2M100ForConditionalGeneration,pass,0 +M2M100ForConditionalGeneration,pass,7 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv index deea60d3203e2..af120d3d31b3b 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv @@ -30,7 +30,7 @@ DistilBertForMaskedLM,pass,5 -DistillGPT2,pass,7 +DistillGPT2,fail_accuracy,7 @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5 
-M2M100ForConditionalGeneration,pass,4 +M2M100ForConditionalGeneration,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_inference.csv index 46f1e5adf4ec9..6f65795e3f04e 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_inference.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0 -M2M100ForConditionalGeneration,pass,0 +M2M100ForConditionalGeneration,pass,7 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_training.csv index e06f3bde8af13..07ec2bb634b39 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_training.csv @@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5 -M2M100ForConditionalGeneration,pass,4 +M2M100ForConditionalGeneration,pass,11 diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py index 61186034c746f..418c845c88e86 100644 --- a/test/distributed/test_dynamo_distributed.py +++ b/test/distributed/test_dynamo_distributed.py @@ -351,6 +351,8 @@ def run_hf_bert_ddp(self, model, inputs, backend): class TestFakeDistributedSingleProc(torch._dynamo.test_case.TestCase): + @unittest.expectedFailure + # https://github.com/huggingface/transformers/issues/44188 @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") @patch.object(config, "optimize_ddp", True) @patch.object(torch._inductor.config, "fallback_random", True) @@ -363,6 +365,8 @@ def test_hf_bert_ddp_inductor(self): model = FakeDDP(model) run_hf_bert_ddp(self, model, inputs, "inductor") + @unittest.expectedFailure + # https://github.com/huggingface/transformers/issues/44188 @patch.object(config, 
"optimize_ddp", True) def test_hf_bert_ddp_aot_eager(self): model, inputs = get_hf_bert(0) @@ -597,6 +601,8 @@ def _test_hf_bert_ddp_inductor(self, static_graph): model = DDP(model, static_graph=static_graph) run_hf_bert_ddp(self, model, inputs, "inductor") + @unittest.expectedFailure + # https://github.com/huggingface/transformers/issues/44188 @skip_if_lt_x_gpu(2) @import_transformers_or_skip() @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") @@ -605,6 +611,8 @@ def _test_hf_bert_ddp_inductor(self, static_graph): def test_hf_bert_ddp_inductor(self): self._test_hf_bert_ddp_inductor(static_graph=False) + @unittest.expectedFailure + # https://github.com/huggingface/transformers/issues/44188 @skip_if_lt_x_gpu(2) @import_transformers_or_skip() @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") @@ -619,12 +627,16 @@ def _test_hf_bert_aot_eager(self, static_graph): model = DDP(model, static_graph=static_graph) run_hf_bert_ddp(self, model, inputs, "aot_eager") + @unittest.expectedFailure + # https://github.com/huggingface/transformers/issues/44188 @skip_if_lt_x_gpu(2) @import_transformers_or_skip() @config.patch(optimize_ddp=True, enable_compiler_collectives=True) def test_hf_bert_ddp_aot_eager(self): self._test_hf_bert_aot_eager(static_graph=False) + @unittest.expectedFailure + # https://github.com/huggingface/transformers/issues/44188 @skip_if_lt_x_gpu(2) @import_transformers_or_skip() @config.patch(optimize_ddp=True, enable_compiler_collectives=True) @@ -843,6 +855,8 @@ def test_fsdp_activation_checkpointing(self): find_first_node(cnt.graphs[0], tag_activation_checkpoint) is not None ) + @unittest.expectedFailure + # https://github.com/huggingface/transformers/issues/44188 @import_transformers_or_skip() @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") # TODO(whc) Investigate why cudagraphs breaks inductor+fsdp for hf_bert @@ -888,6 +902,8 @@ def apply_fsdp(model, wrap_policy): ) 
self.assertTrue(same(correct_results, opt_results)) + @unittest.expectedFailure + # https://github.com/huggingface/transformers/issues/44188 @import_transformers_or_skip() @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") # TODO(whc) Investigate why cudagraphs breaks inductor+fsdp for hf_bert From 8fcce8e16da24ecf53d6a4b1227e76fbf1d8bd46 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 25 Feb 2026 12:12:08 -0800 Subject: [PATCH 18/87] [CI] Switch vLLM test and benchmark workflows to CUDA 13.0 (#175781) [CI] Switch vLLM test and benchmark workflows to CUDA 13.0 (#175393) We should run vLLM test and benchmark on CUDA 13.0 now Pull Request resolved: https://github.com/pytorch/pytorch/pull/175393 Approved by: https://github.com/zou3519 (cherry picked from commit 72d0e643eb90f14085bab5e9cab8d3cceb0d7847) Co-authored-by: Huy Do --- .ci/docker/build.sh | 4 ++-- .ci/lumen_cli/cli/lib/core/vllm/vllm_test_library.yaml | 4 ++-- .github/workflows/_vllm-benchmark.yml | 4 ++-- .github/workflows/_vllm-build.yml | 2 +- .github/workflows/docker-builds.yml | 3 +-- .github/workflows/vllm-benchmark.yml | 6 +++--- .github/workflows/vllm.yml | 8 ++++---- 7 files changed, 15 insertions(+), 16 deletions(-) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 37c082e7d378e..1bc7286d4abd0 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -151,8 +151,8 @@ case "$tag" in TRITON=yes INDUCTOR_BENCHMARKS=yes ;; - pytorch-linux-jammy-cuda12.9-cudnn9-py3.12-gcc11-vllm) - CUDA_VERSION=12.9.1 + pytorch-linux-jammy-cuda13.0-cudnn9-py3.12-gcc11-vllm) + CUDA_VERSION=13.0.2 ANACONDA_PYTHON_VERSION=3.12 GCC_VERSION=11 VISION=yes diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test_library.yaml b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test_library.yaml index f2f450b6f9004..0327172b414a2 100644 --- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test_library.yaml +++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test_library.yaml @@ -104,10 +104,10 @@ 
vllm_pytorch_compilation_unit_tests: vllm_language_model_test_extended_generation_28_failure_test: title: Language Models Test (Extended Generation) 2.8 release failure - id: vllm_languagde_model_test_extended_generation_28_failure_test + id: vllm_language_model_test_extended_generation_28_failure_test package_install: - --no-build-isolation - - git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8 + - git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0 steps: - pytest -v -s models/language/generation/test_mistral.py diff --git a/.github/workflows/_vllm-benchmark.yml b/.github/workflows/_vllm-benchmark.yml index d5aa61a6341c7..7d43c271c4f1e 100644 --- a/.github/workflows/_vllm-benchmark.yml +++ b/.github/workflows/_vllm-benchmark.yml @@ -14,7 +14,7 @@ on: build_environment: required: true type: string - description: The build environment name, e.g. linux-jammy-cuda12.9-py3.12-gcc11 + description: The build environment name, e.g. linux-jammy-cuda13.0-py3.12-gcc11 pytorch_branch: required: false type: string @@ -106,7 +106,7 @@ jobs: dist/ao/torchao-*.whl \ dist/vllm/vllm-*.whl \ dist/deepgemm/deep_gemm-*.whl \ - --extra-index-url https://download.pytorch.org/whl/cu129 \ + --extra-index-url https://download.pytorch.org/whl/cu130 \ --index-strategy unsafe-best-match - name: Print some debug information diff --git a/.github/workflows/_vllm-build.yml b/.github/workflows/_vllm-build.yml index 630b5e8b6075d..b3e8c546c66f0 100644 --- a/.github/workflows/_vllm-build.yml +++ b/.github/workflows/_vllm-build.yml @@ -14,7 +14,7 @@ on: build_environment: required: true type: string - description: The build environment name, e.g. linux-jammy-cuda12.9-py3.12-gcc11 + description: The build environment name, e.g. 
linux-jammy-cuda13.0-py3.12-gcc11 pytorch_branch: required: false type: string diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index a0df8bccc8df9..dc9ecef7860ae 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -51,10 +51,9 @@ jobs: docker-image-name: [ pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11, pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11, - pytorch-linux-jammy-cuda12.9-cudnn9-py3.12-gcc11-vllm, + pytorch-linux-jammy-cuda13.0-cudnn9-py3.12-gcc11-vllm, pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks, pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks, - pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11, pytorch-linux-jammy-py3.10-clang15, pytorch-linux-jammy-py3.11-clang15, pytorch-linux-jammy-py3.12-clang15, diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index b15dbc7c2db2e..2d066241594b3 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -41,8 +41,8 @@ jobs: outputs: benchmark_matrix: ${{ steps.set-parameters.outputs.benchmark_matrix }} docker_image: ${{ steps.calculate-docker-image.outputs.docker-image }} - torch_cuda_arch_list: '8.0 8.9 9.0 10.0' - build_environment: linux-jammy-cuda12.9-py3.12-gcc11 + torch_cuda_arch_list: '8.0 8.9 9.0 10.0 12.0' + build_environment: linux-jammy-cuda13.0-py3.12-gcc11 steps: - uses: pytorch/test-infra/.github/actions/setup-uv@release/2.11 with: @@ -85,7 +85,7 @@ jobs: uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11 with: working-directory: pytorch/pytorch - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.9-cudnn9-py3.12-gcc11-vllm + docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3.12-gcc11-vllm build: name: Build PyTorch and vLLM diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml index df657d057bbd0..eb1c78019b10e 100644 --- a/.github/workflows/vllm.yml 
+++ b/.github/workflows/vllm.yml @@ -27,9 +27,9 @@ jobs: allow-reuse-old-whl: false build-additional-packages: "vision audio" build-external-packages: "vllm" - build-environment: linux-jammy-cuda12.9-py3.12-gcc11 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.9-cudnn9-py3.12-gcc11-vllm - cuda-arch-list: '8.0 8.9 9.0 10.0' + build-environment: linux-jammy-cuda13.0-py3.12-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3.12-gcc11-vllm + cuda-arch-list: '8.0 8.9 9.0 10.0 12.0' runner: linux.24xlarge.memory test-matrix: | { include: [ @@ -58,7 +58,7 @@ jobs: uses: ./.github/workflows/_linux-test.yml needs: build with: - build-environment: linux-jammy-cuda12.9-py3.12-gcc11 + build-environment: linux-jammy-cuda13.0-py3.12-gcc11 docker-image: ${{ needs.build.outputs.docker-image }} test-matrix: ${{ needs.build.outputs.test-matrix }} secrets: inherit From 6fe4bfbdcf13df37ee8297072d0977cde30d51e3 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 25 Feb 2026 12:15:15 -0800 Subject: [PATCH 19/87] Update vLLM pinned commit (#175238) (#175783) Two tweaks: * Move some tests around to match what they are in vLLM. I'll work on a proper fix for this later to avoid the need to do this manually * Fix 12.8 build. 
See https://github.com/vllm-project/vllm/pull/34791 Pull Request resolved: https://github.com/pytorch/pytorch/pull/175238 Approved by: https://github.com/angelayi, https://github.com/zou3519 Co-authored-by: PyTorch UpdateBot --- .../cli/lib/core/vllm/vllm_test_library.yaml | 24 +++++++++---------- .github/ci_commit_pins/vllm.txt | 2 +- .github/ci_configs/vllm/Dockerfile | 6 ++--- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test_library.yaml b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test_library.yaml index 0327172b414a2..948a771385686 100644 --- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test_library.yaml +++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test_library.yaml @@ -20,7 +20,8 @@ vllm_basic_models_test: - pytest -v -s models/test_registry.py - pytest -v -s models/test_utils.py - pytest -v -s models/test_vision.py - - pytest -v -s models/test_initialization.py + - pytest -v -s models/test_initialization.py -k 'not voxtral' + - HF_DATASETS_OFFLINE=0 TRANSFORMERS_OFFLINE=0 pytest -v -s models/test_initialization.py -k voxtral vllm_entrypoints_test: title: Entrypoints Test @@ -60,7 +61,7 @@ vllm_distributed_test_28_failure_test: VLLM_WORKER_MULTIPROC_METHOD: spawn num_gpus: 4 steps: - - pytest -v -s distributed/test_sequence_parallel.py + - pytest -v -s compile/correctness_e2e/test_sequence_parallel.py vllm_lora_28_failure_test: title: LoRA pytorch 2.8 failure test @@ -85,21 +86,20 @@ vllm_multi_model_test_28_failure_test: package_install: - git+https://github.com/TIGER-AI-Lab/Mantis.git steps: - - pytest -v -s models/multimodal/generation/test_voxtral.py -k 'not 5-128-half' - - HF_DATASETS_OFFLINE=0 TRANSFORMERS_OFFLINE=0 pytest -v -s models/multimodal/generation/test_voxtral.py -k 5-128-half + - HF_DATASETS_OFFLINE=0 TRANSFORMERS_OFFLINE=0 pytest -v -s models/multimodal/generation/test_voxtral.py - pytest -v -s models/multimodal/pooling vllm_pytorch_compilation_unit_tests: title: PyTorch Compilation Unit Tests id: 
vllm_pytorch_compilation_unit_tests steps: - - pytest -v -s compile/test_pass_manager.py - - pytest -v -s compile/test_fusion.py - - pytest -v -s compile/test_fusion_attn.py - - pytest -v -s compile/test_silu_mul_quant_fusion.py - - pytest -v -s compile/distributed/test_sequence_parallelism.py - - pytest -v -s compile/distributed/test_async_tp.py - - pytest -v -s compile/distributed/test_fusion_all_reduce.py + - pytest -v -s compile/passes/test_pass_manager.py + - pytest -v -s compile/passes/test_fusion.py + - pytest -v -s compile/passes/test_fusion_attn.py + - pytest -v -s compile/passes/test_silu_mul_quant_fusion.py + - pytest -v -s compile/passes/distributed/test_sequence_parallelism.py + - pytest -v -s compile/passes/distributed/test_async_tp.py + - pytest -v -s compile/passes/distributed/test_fusion_all_reduce.py - pytest -v -s compile/test_decorator.py vllm_language_model_test_extended_generation_28_failure_test: @@ -118,7 +118,7 @@ vllm_distributed_test_2_gpu_28_failure_test: VLLM_WORKER_MULTIPROC_METHOD: spawn num_gpus: 4 steps: - - pytest -v -s distributed/test_sequence_parallel.py + - pytest -v -s compile/correctness_e2e/test_sequence_parallel.py vllm_lora_test: title: LoRA Test %N diff --git a/.github/ci_commit_pins/vllm.txt b/.github/ci_commit_pins/vllm.txt index 235f99edfa759..c211a526574ec 100644 --- a/.github/ci_commit_pins/vllm.txt +++ b/.github/ci_commit_pins/vllm.txt @@ -1 +1 @@ -52ee21021a87735d46c4245c60bc0be42dd58c73 +a4047d4ea993fd52038433d87c16e603bee4f214 diff --git a/.github/ci_configs/vllm/Dockerfile b/.github/ci_configs/vllm/Dockerfile index 1d6ac16926834..549c336a444cb 100644 --- a/.github/ci_configs/vllm/Dockerfile +++ b/.github/ci_configs/vllm/Dockerfile @@ -146,6 +146,9 @@ ARG SCCACHE_BUCKET_NAME=vllm-build-sccache ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_S3_NO_CREDENTIALS=0 +ARG torch_cuda_arch_list='8.0;8.9;9.0;10.0;12.0' +ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} + # Use sccache to speed up compilation RUN 
--mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=.git,target=.git \ @@ -171,9 +174,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && sccache --show-stats; \ fi -ARG torch_cuda_arch_list='8.0 8.6 8.9 9.0' -ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} - ARG vllm_target_device="cuda" ENV VLLM_TARGET_DEVICE=${vllm_target_device} ENV CCACHE_DIR=/root/.cache/ccache From 57614edc43a7e6c902b7a12389058252171a49eb Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 25 Feb 2026 12:31:27 -0800 Subject: [PATCH 20/87] [ROCm][CI] Upgrade ROCm CI to 7.2 - 4/N (#175767) [ROCm][CI] Upgrade ROCm CI to 7.2 - 4/N (#173188) In parallel with https://github.com/pytorch/pytorch/pull/173187 Pull Request resolved: https://github.com/pytorch/pytorch/pull/173188 Approved by: https://github.com/jeffdaily (cherry picked from commit 8301e14b7003034ed707e3164361f789c93a45f5) Co-authored-by: Jithun Nair Co-authored-by: Jeff Daily Co-authored-by: Jack Taylor --- .ci/docker/build.sh | 2 +- .ci/docker/common/install_rocm.sh | 23 ++++++++++++ c10/util/complex_math.h | 35 +++++++++++++++++++ test/distributed/test_dynamo_distributed.py | 5 ++- test/inductor/test_aot_inductor.py | 1 + test/inductor/test_ck_backend.py | 4 +++ .../test_torchinductor_dynamic_shapes.py | 2 ++ test/run_test.py | 4 +++ test/test_linalg.py | 10 +++++- .../_internal/common_methods_invocations.py | 16 +++++++++ 10 files changed, 99 insertions(+), 3 deletions(-) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 1bc7286d4abd0..9a051dc84aef5 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -195,7 +195,7 @@ case "$tag" in fi GCC_VERSION=11 VISION=yes - ROCM_VERSION=7.1 + ROCM_VERSION=7.2 NINJA_VERSION=1.9.0 TRITON=yes KATEX=yes diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh index 21e5968016bd6..8b673a23f9de5 100644 --- a/.ci/docker/common/install_rocm.sh +++ b/.ci/docker/common/install_rocm.sh @@ -154,6 +154,29 @@ EOF fi fi + # ROCm 7.2 needs a 
fix from procprof sdk that isn't available until 7.2.1 + if [[ $(ver $ROCM_VERSION) -eq $(ver 7.2) ]]; then + git clone --no-checkout --filter=blob:none https://github.com/ROCm/rocm-systems.git + pushd rocm-systems/ + git sparse-checkout init --cone + git sparse-checkout set projects/rocprofiler-sdk shared/rocprofiler-compute + git checkout develop + git checkout rocm-7.2.0 + git config --global user.email "you@example.com" + git config --global user.name "Your Name" + git cherry-pick a71cc3cc88ed68b24c40cefec77d764053044862 + sudo apt install -y cmake libdw-dev libsqlite3-dev + cmake \ + -B rocprofiler-sdk-build \ + -DCMAKE_INSTALL_PREFIX=/opt/rocm \ + -DCMAKE_PREFIX_PATH=/opt/rocm \ + -DGPU_TARGETS="${PYTORCH_ROCM_ARCH}" \ + projects/rocprofiler-sdk + cmake --build rocprofiler-sdk-build --target all --parallel $(nproc) + cmake --build rocprofiler-sdk-build --target install + popd + fi + # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime for kdb in /opt/rocm/share/miopen/db/*.kdb do diff --git a/c10/util/complex_math.h b/c10/util/complex_math.h index 2b591026c94da..d369df5059231 100644 --- a/c10/util/complex_math.h +++ b/c10/util/complex_math.h @@ -86,6 +86,41 @@ C10_HOST_DEVICE inline c10::complex pow( #endif } +// Regression in ROCm 7.2. See https://github.com/ROCm/rocm-libraries/pull/3836. +// Specialized version for complex on AMD GPUs to use FMA-based +// multiplication +#if defined(__HIPCC__) +namespace detail { +// FMA-aware complex multiplication for float precision on AMD GPUs. +// This prevents SLP vectorizer from breaking FMA formation, which causes +// numerical precision loss in complex arithmetic. +// The issue occurs when vectorizer packs scalar multiplies before backend +// can form FMA instructions, resulting in double rounding instead of single. 
+C10_HOST_DEVICE inline thrust::complex complex_mul_fma( + thrust::complex a, + thrust::complex b) { + // Complex multiplication: (a.r + a.i*i) * (b.r + b.i*i) + // = (a.r*b.r - a.i*b.i) + (a.r*b.i + a.i*b.r)*i + // Using __builtin_fmaf ensures FMA at source level: + // real: a.r*b.r + (-(a.i*b.i)) = FMA(a.r, b.r, -(a.i*b.i)) + // imag: a.i*b.r + a.r*b.i = FMA(a.r, b.i, a.i*b.r) + float real_part = __builtin_fmaf(a.real(), b.real(), -(a.imag() * b.imag())); + float imag_part = __builtin_fmaf(a.real(), b.imag(), a.imag() * b.real()); + return thrust::complex(real_part, imag_part); +} +} // namespace detail + +template <> +C10_HOST_DEVICE inline c10::complex pow( + const c10::complex& x, + const c10::complex& y) { + auto log_x = thrust::log(static_cast>(x)); + auto y_log_x = + detail::complex_mul_fma(static_cast>(y), log_x); + return static_cast>(thrust::exp(y_log_x)); +} +#endif + template C10_HOST_DEVICE inline c10::complex pow( const c10::complex& x, diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py index 418c845c88e86..cf09bf7ed9606 100644 --- a/test/distributed/test_dynamo_distributed.py +++ b/test/distributed/test_dynamo_distributed.py @@ -47,7 +47,7 @@ requires_accelerator_dist_backend, skip_if_lt_x_gpu, ) -from torch.testing._internal.common_utils import skipIfXpu +from torch.testing._internal.common_utils import MI350_ARCH, skipIfRocmArch, skipIfXpu from torch.testing._internal.inductor_utils import HAS_GPU from torch.testing._internal.triton_utils import requires_cuda_and_triton @@ -808,6 +808,7 @@ def test_fsdp_unspecialized_forced_getattr_inline(self): outputs = fsdp_m(inputs) self.assertTrue(same(correct_outputs, outputs)) + @skipIfRocmArch(MI350_ARCH) # regression in ROCm 7.2 @config.patch(enable_compiler_collectives=True) @skip_if_lt_x_gpu(1) @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") @@ -1495,6 +1496,7 @@ def test_ddp_baseline_aot_eager(self): outputs = 
ddp_m(inputs) self.assertTrue(same(correct_outputs, outputs)) + @skipIfRocmArch(MI350_ARCH) # regression in ROCm 7.2 @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") @patch.object(config, "optimize_ddp", False) def test_ddp_baseline_inductor(self): @@ -1700,6 +1702,7 @@ def alibi_score_mod(self, score, b, h, q_idx, kv_idx): model(hidden_states) torch.accelerator.synchronize() + @skipIfRocmArch(MI350_ARCH) # regression in ROCm 7.2 @patch.object(config, "optimize_ddp", True) @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") def test_graph_split_inductor(self): diff --git a/test/inductor/test_aot_inductor.py b/test/inductor/test_aot_inductor.py index 0f0fcdb842085..8ac981f0bbc37 100644 --- a/test/inductor/test_aot_inductor.py +++ b/test/inductor/test_aot_inductor.py @@ -3502,6 +3502,7 @@ def forward(self, x): example_inputs = (torch.randn(3, 10, device=self.device),) self.check_model(Model(), example_inputs) + @skipIfRocmArch(NAVI_ARCH) # regression on ROCm 7.2 def test_repeated_calling(self): if self.device != "cuda": raise unittest.SkipTest("requires CUDA") diff --git a/test/inductor/test_ck_backend.py b/test/inductor/test_ck_backend.py index 079be79fcc9d8..8cae41dfbae37 100644 --- a/test/inductor/test_ck_backend.py +++ b/test/inductor/test_ck_backend.py @@ -16,7 +16,9 @@ from torch.testing._internal.common_cuda import tf32_off from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, + MI350_ARCH, parametrize, + skipIfRocmArch, ) from torch.testing._internal.inductor_utils import ( _quantize_rowwise, @@ -235,6 +237,8 @@ def mm(a, b): Y_eager = a @ b torch.testing.assert_close(Y_compiled, Y_eager, equal_nan=True) + # regression in ROCm 7.2, Mismatched elements, significantly + @skipIfRocmArch(MI350_ARCH) @unittest.skipIf(not torch.version.hip, "ROCM only") @unittest.mock.patch.dict(os.environ, _test_env) @parametrize("max_autotune_gemm_backends", ("CK", "ATen,Triton,CK")) diff --git 
a/test/inductor/test_torchinductor_dynamic_shapes.py b/test/inductor/test_torchinductor_dynamic_shapes.py index e579184978349..0d1d2427855f8 100644 --- a/test/inductor/test_torchinductor_dynamic_shapes.py +++ b/test/inductor/test_torchinductor_dynamic_shapes.py @@ -29,6 +29,7 @@ IS_FBCODE, parametrize, serialTest, + skipIfRocm, TEST_CUDA_MEM_LEAK_CHECK, TEST_WITH_ASAN, ) @@ -630,6 +631,7 @@ def f(x, w): torch.compile(fullgraph=True)(f)(x, w).sum().backward() self.assertEqual(orig_w, w.grad) + @skipIfRocm # regression in ROCm 7.2, XBLOCK should remain 64 (got 256) @torch._dynamo.config.patch( capture_scalar_outputs=True, capture_dynamic_output_shape_ops=True ) diff --git a/test/run_test.py b/test/run_test.py index 820206480be11..a4eaaedb0f544 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -202,6 +202,10 @@ def __contains__(self, item): if TEST_WITH_ROCM and isRocmArchAnyOf(("gfx1100",)): # Some autotune tests on gfx1100 are hanging, disable for now ROCM_BLOCKLIST.append("inductor/test_max_autotune") + # ROCm 7.2 gfx1100 started timing out due to these + ROCM_BLOCKLIST.append("inductor/test_torchinductor_dynamic_shapes") + ROCM_BLOCKLIST.append("inductor/test_torchinductor_opinfo") + ROCM_BLOCKLIST.append("inductor/test_ck_backend") S390X_BLOCKLIST = [ # these tests fail due to various reasons diff --git a/test/test_linalg.py b/test/test_linalg.py index 9a84211dde6c7..346a6c0204479 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -3767,7 +3767,15 @@ def run_test(a_shape, ind): self.assertEqual(ans, result) # compare to NumPy output - run_test((12, 3, 4), ind=1) + if not torch.version.hip: + # https://github.com/pytorch/pytorch/issues/174913 + # Skip one config due to regression on ROCm 7.2 for hipSolver. + # Rather than skip entire unit test using @skipIfRocm + # This happened on MI355, MI300, and MI200. 
+ # Mismatched elements: 1 / 144 (0.7%) + # Greatest absolute difference: 0.00130462646484375 at index (1, 3, 6) (up to 0.001 allowed) + # Greatest relative difference: 1.5133813576539978e-05 at index (1, 3, 6) (up to 1.3e-06 allowed) + run_test((12, 3, 4), ind=1) run_test((3, 8, 24), ind=2) run_test((18, 3, 3, 2), ind=1) run_test((1, 4, 2, 2), ind=2) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 7a9e9df8a519d..e3ac2e1c4aade 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -16052,6 +16052,10 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo( toleranceOverride({torch.chalf: tol(atol=9e-2, rtol=9e-2), }), 'TestCommon', 'test_complex_half_reference_testing'), + DecorateInfo( + toleranceOverride({torch.float32: tol(atol=5e-5, rtol=5e-6)}), + 'TestOperators', 'test_vjpvmap', device_type='cuda' + ), DecorateInfo( toleranceOverride({torch.half: tol(atol=9e-3, rtol=2e-1), }), 'TestInductorOpInfo', 'test_comprehensive', device_type='cpu')], @@ -16205,6 +16209,18 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): toleranceOverride({torch.float32: tol(atol=5e-5, rtol=5e-6)}), 'TestOperators', 'test_vjpvmap', ), + DecorateInfo( + toleranceOverride({torch.float32: tol(atol=5e-5, rtol=5e-6)}), + 'TestOperators', 'test_jvpvjp', device_type="cuda" + ), + DecorateInfo( + toleranceOverride({torch.float32: tol(atol=5e-5, rtol=5e-6)}), + 'TestOperators', 'test_vjp', device_type="cuda" + ), + DecorateInfo( + toleranceOverride({torch.float32: tol(atol=5e-5, rtol=5e-6)}), + 'TestCompositeCompliance', 'test_backward', device_type="cuda" + ), DecorateInfo( toleranceOverride({torch.float16: tol(atol=5e-3, rtol=1e-3)}), 'TestInductorOpInfo', 'test_comprehensive', From c67ec25fdb87e850f542408b2610656b9da66c4a Mon Sep 17 00:00:00 2001 From: pytorchbot Date: 
Wed, 25 Feb 2026 12:33:32 -0800 Subject: [PATCH 21/87] [ROCm] Added CUDA check to test_pattern_matcher (#175766) [ROCm] Added CUDA check to test_pattern_matcher (#175092) Forward fix to #173856. Pull Request resolved: https://github.com/pytorch/pytorch/pull/175092 Approved by: https://github.com/jeffdaily, https://github.com/Skylion007 (cherry picked from commit f6dcaa37201c0b6499f31e264b482574507b3085) Co-authored-by: Arash Pakbin --- test/dynamo/test_activation_checkpointing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/dynamo/test_activation_checkpointing.py b/test/dynamo/test_activation_checkpointing.py index 5d3e52612309f..dc6b910f9b111 100644 --- a/test/dynamo/test_activation_checkpointing.py +++ b/test/dynamo/test_activation_checkpointing.py @@ -1821,7 +1821,7 @@ def debug_compile_fx_inner(graph, example_inputs, *args, **kwargs): prefer_cudnn = ( cudnn_version > 91500 and dprops.major in (9, 10) and dprops.minor in (0, 3) ) - if prefer_cudnn: + if prefer_cudnn and torch.version.cuda: sdpa_op = torch.ops.aten._scaled_dot_product_cudnn_attention.default else: sdpa_op = torch.ops.aten._scaled_dot_product_flash_attention.default From 9e94d2dace4cdea1661fb0ba6123d8906bd0e568 Mon Sep 17 00:00:00 2001 From: eqy Date: Wed, 25 Feb 2026 14:56:03 -0800 Subject: [PATCH 22/87] [cuDNN][2.11] cuDNN upgrade / sync to 9.19 for linux/windows on 2.11 (#175672) * [WINDOWS][cuDNN] Fix cuDNN version mismatch in Windows (#175547) Authored with claude code Previous PRs such as https://github.com/pytorch/pytorch/pull/174310 updated cuDNN versions for Linux builds but neglected to do so for Windows. 
Claude wrote all of the lintrunner additions for consistency checking Pull Request resolved: https://github.com/pytorch/pytorch/pull/175547 Approved by: https://github.com/Skylion007, https://github.com/atalman, https://github.com/malfet * [cuDNN] Upgrade cuDNN to 9.19 for 12.8 and 13.0 wheels (#174310) Currently being tested internally, currently looks OK also needed for https://github.com/pytorch/pytorch/pull/172108 Pull Request resolved: https://github.com/pytorch/pytorch/pull/174310 Approved by: https://github.com/Skylion007, https://github.com/ngimel, https://github.com/malfet --- .ci/docker/common/install_cuda.sh | 4 +- .ci/pytorch/windows/internal/cuda_install.bat | 8 +-- .../scripts/generate_binary_build_matrix.py | 66 ++++++++++++++++++- ...linux-aarch64-binary-manywheel-nightly.yml | 28 ++++---- ...nerated-linux-binary-manywheel-nightly.yml | 28 ++++---- test/jit/test_freezing.py | 3 +- 6 files changed, 100 insertions(+), 37 deletions(-) diff --git a/.ci/docker/common/install_cuda.sh b/.ci/docker/common/install_cuda.sh index 2d1db795d9cb4..c031e0928784b 100644 --- a/.ci/docker/common/install_cuda.sh +++ b/.ci/docker/common/install_cuda.sh @@ -129,7 +129,7 @@ function install_129 { } function install_128 { - CUDNN_VERSION=9.17.1.4 + CUDNN_VERSION=9.19.0.56 echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" # install CUDA 12.8.1 in the same container install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux @@ -147,7 +147,7 @@ function install_128 { } function install_130 { - CUDNN_VERSION=9.17.1.4 + CUDNN_VERSION=9.19.0.56 echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" # install CUDA 13.0 in the same container install_cuda 13.0.2 cuda_13.0.2_580.95.05_linux diff --git a/.ci/pytorch/windows/internal/cuda_install.bat b/.ci/pytorch/windows/internal/cuda_install.bat index 1349d3e661f55..0c8c023831e45 100644 --- a/.ci/pytorch/windows/internal/cuda_install.bat +++ 
b/.ci/pytorch/windows/internal/cuda_install.bat @@ -43,7 +43,7 @@ if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( set "ARGS=cuda_profiler_api_12.6 thrust_12.6 nvcc_12.6 cuobjdump_12.6 nvprune_12.6 nvprof_12.6 cupti_12.6 cublas_12.6 cublas_dev_12.6 cudart_12.6 cufft_12.6 cufft_dev_12.6 curand_12.6 curand_dev_12.6 cusolver_12.6 cusolver_dev_12.6 cusparse_12.6 cusparse_dev_12.6 npp_12.6 npp_dev_12.6 nvrtc_12.6 nvrtc_dev_12.6 nvml_dev_12.6 nvjitlink_12.6 nvtx_12.6" ) -set CUDNN_FOLDER=cudnn-windows-x86_64-9.5.0.50_cuda12-archive +set CUDNN_FOLDER=cudnn-windows-x86_64-9.10.2.21_cuda12-archive set CUDNN_LIB_FOLDER="lib" set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( @@ -70,7 +70,7 @@ if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( set "ARGS=cuda_profiler_api_12.8 thrust_12.8 nvcc_12.8 cuobjdump_12.8 nvprune_12.8 nvprof_12.8 cupti_12.8 cublas_12.8 cublas_dev_12.8 cudart_12.8 cufft_12.8 cufft_dev_12.8 curand_12.8 curand_dev_12.8 cusolver_12.8 cusolver_dev_12.8 cusparse_12.8 cusparse_dev_12.8 npp_12.8 npp_dev_12.8 nvrtc_12.8 nvrtc_dev_12.8 nvml_dev_12.8 nvjitlink_12.8 nvtx_12.8" ) -set CUDNN_FOLDER=cudnn-windows-x86_64-9.7.0.66_cuda12-archive +set CUDNN_FOLDER=cudnn-windows-x86_64-9.19.0.56_cuda12-archive set CUDNN_LIB_FOLDER="lib" set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( @@ -97,7 +97,7 @@ if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( set "ARGS=cuda_profiler_api_12.9 thrust_12.9 nvcc_12.9 cuobjdump_12.9 nvprune_12.9 nvprof_12.9 cupti_12.9 cublas_12.9 cublas_dev_12.9 cudart_12.9 cufft_12.9 cufft_dev_12.9 curand_12.9 curand_dev_12.9 cusolver_12.9 cusolver_dev_12.9 cusparse_12.9 cusparse_dev_12.9 npp_12.9 npp_dev_12.9 nvrtc_12.9 nvrtc_dev_12.9 nvml_dev_12.9 nvjitlink_12.9 nvtx_12.9" ) -set CUDNN_FOLDER=cudnn-windows-x86_64-9.10.2.21_cuda12-archive +set CUDNN_FOLDER=cudnn-windows-x86_64-9.17.1.4_cuda12-archive set CUDNN_LIB_FOLDER="lib" 
set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( @@ -124,7 +124,7 @@ if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( set "ARGS=" ) -set CUDNN_FOLDER=cudnn-windows-x86_64-9.12.0.46_cuda13-archive +set CUDNN_FOLDER=cudnn-windows-x86_64-9.19.0.56_cuda13-archive set CUDNN_LIB_FOLDER="lib" set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index 267afbe216fa4..eda03260446be 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -60,7 +60,7 @@ "12.8": ( "cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | " # noqa: B950 "cuda-bindings==12.9.4; platform_system == 'Linux' | " - "nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | " + "nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | " "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " "nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | " "nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'" @@ -76,7 +76,7 @@ "13.0": ( "cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | " # noqa: B950 "cuda-bindings==13.0.3; platform_system == 'Linux' | " - "nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | " + "nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | " "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | " "nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | " "nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'" @@ -175,6 +175,67 @@ def validate_nccl_dep_consistency(arch_version: str) -> None: ) +def _parse_linux_cudnn_versions() -> dict[str, str]: + """Return {cuda_short_version: cudnn_version} from install_cuda.sh.""" + text = 
(REPO_ROOT / ".ci" / "docker" / "common" / "install_cuda.sh").read_text() + results: dict[str, str] = {} + func_re = re.compile(r"^function install_(\d+)\s*\{") + cudnn_re = re.compile(r"^\s*CUDNN_VERSION=(\S+)") + current_func: str | None = None + for line in text.splitlines(): + m = func_re.match(line) + if m: + digits = m.group(1) + current_func = digits[:-1] + "." + digits[-1] + continue + if current_func is not None: + m = cudnn_re.match(line) + if m: + results[current_func] = m.group(1) + current_func = None + return results + + +def _parse_windows_cudnn_versions() -> dict[str, str]: + """Return {cuda_short_version: cudnn_version} from cuda_install.bat.""" + text = ( + REPO_ROOT / ".ci" / "pytorch" / "windows" / "internal" / "cuda_install.bat" + ).read_text() + results: dict[str, str] = {} + label_re = re.compile(r"^:cuda(\d+)\s*$") + cudnn_re = re.compile( + r"^set CUDNN_FOLDER=cudnn-windows-x86_64-([0-9.]+)_cuda\d+-archive" + ) + current_label: str | None = None + for line in text.splitlines(): + m = label_re.match(line) + if m: + digits = m.group(1) + current_label = digits[:-1] + "." 
+ digits[-1] + continue + if current_label is not None: + m = cudnn_re.match(line) + if m: + results[current_label] = m.group(1) + current_label = None + return results + + +def validate_cudnn_version_consistency(arch_version: str) -> None: + linux_versions = _parse_linux_cudnn_versions() + windows_versions = _parse_windows_cudnn_versions() + linux_ver = linux_versions.get(arch_version) + windows_ver = windows_versions.get(arch_version) + if linux_ver is None or windows_ver is None: + return + if linux_ver != windows_ver: + raise RuntimeError( + f"cuDNN version mismatch for CUDA {arch_version}: " + f"Linux has {linux_ver} (.ci/docker/common/install_cuda.sh) " + f"but Windows has {windows_ver} (.ci/pytorch/windows/internal/cuda_install.bat)" + ) + + def arch_type(arch_version: str) -> str: if arch_version in CUDA_ARCHES: return "cuda" @@ -421,6 +482,7 @@ def generate_wheels_matrix( arch_version = "" for arch_version in CUDA_ARCHES: validate_nccl_dep_consistency(arch_version) + validate_cudnn_version_consistency(arch_version) del arch_version diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml index 0dc6a42e77d24..567d099675a60 100644 --- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml @@ -204,7 +204,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; 
platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -346,7 +346,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -554,7 +554,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | 
cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -696,7 +696,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -904,7 +904,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: 
manywheel-py3_12-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1046,7 +1046,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' 
| nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1254,7 +1254,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1396,7 +1396,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1604,7 +1604,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1746,7 +1746,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | 
nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1954,7 +1954,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2096,7 +2096,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cuda-aarch64-13_0 
build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2304,7 +2304,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; 
platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2446,7 +2446,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index fcd006886abed..2e3aaa64f4d42 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -195,7 +195,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | 
nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -329,7 +329,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -859,7 +859,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_8 build_environment: 
linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -993,7 +993,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | 
nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1523,7 +1523,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1657,7 +1657,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; 
platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2187,7 +2187,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2321,7 +2321,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | 
nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2851,7 +2851,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2985,7 +2985,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -3515,7 +3515,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: 
github-token: ${{ secrets.GITHUB_TOKEN }} @@ -3649,7 +3649,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -4179,7 +4179,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 
'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -4313,7 +4313,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py index d7fbbdab0096e..a695e0e9b3f7d 100644 --- a/test/jit/test_freezing.py +++ b/test/jit/test_freezing.py @@ -11,7 +11,7 @@ import torch.nn.functional as F from torch.jit._recursive import wrap_cpp_module from torch.testing import FileCheck -from torch.testing._internal.common_cuda import TEST_CUDA, TEST_CUDNN +from torch.testing._internal.common_cuda import TEST_CUDA, TEST_CUDNN, tf32_on_and_off from torch.testing._internal.common_quantization import skipIfNoFBGEMM from torch.testing._internal.common_quantized 
import override_quantized_engine from torch.testing._internal.common_utils import ( @@ -2964,6 +2964,7 @@ def test_conv_to_mkldnn_no_mkldnn(self): inp = torch.rand([4, 3, 4, 4]) self.assertEqual(frozen(inp), mod(inp)) + @tf32_on_and_off(0.005) @unittest.skipIf(not (TEST_CUDNN or TEST_WITH_ROCM), "requires CUDNN") def test_freeze_conv_relu_fusion(self): with set_default_dtype(torch.float): From 195f9cd260844ec2ba9644cdb5a8d3c98e7d602e Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Thu, 26 Feb 2026 07:19:59 -0800 Subject: [PATCH 23/87] Fix pep517 release handling (#175793) Fix pep517 release handling (#175635) Fix pep517 release handling Fix sdist upload: correct PEP 440 version and file path PYTORCH_BUILD_VERSION was being set unconditionally to the raw tag/branch name (including 'v' prefix for tags), which fails PEP 440 validation in get_torch_version(), and was not exported so Python subprocesses couldn't see it anyway. Fix both issues: set and export PYTORCH_BUILD_VERSION only for release/RC tags, stripping the 'v' prefix and converting '-rc' to 'rc' for PEP 440 compliance. For branch pushes and PRs, leave it unset so get_torch_version falls back to version.txt. Also fix the sdist upload path: python -m build places the sdist in dist/, so move it to the workspace root for consistency with all upload steps (release, GHA artifact, and S3). These fixes are tested/verified in the second PR in this stack. This commit was created with the help of Claude Sonnet 4.6. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/175635 Approved by: https://github.com/atalman, https://github.com/malfet (cherry picked from commit 11eba5b6efcd78f68730bdff9bb701d47a9e256f) Co-authored-by: Klaus Zimmermann --- .github/workflows/create_release.yml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index 4932631f2d2eb..2506ad4192bfd 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -54,11 +54,19 @@ jobs: tag_or_branch="${tag_or_branch#refs/heads/}" # replace directory separators with _ in branch name tag_or_branch="${tag_or_branch//\//_}" + # Set PYTORCH_BUILD_VERSION only for release/RC tags; convert to PEP 440 + if [[ "$PT_GITHUB_REF" =~ ^refs/tags/v[0-9]+\.[0-9]+\.[0-9]+(-rc[0-9]+)?$ ]]; then + ver="${PT_GITHUB_REF#refs/tags/v}" + export PYTORCH_BUILD_VERSION="${ver/-rc/rc}" + export PYTORCH_BUILD_NUMBER=0 + fi torch_version="$(python -c 'from tools.generate_torch_version import get_torch_version; print(get_torch_version())')" { echo "PT_RELEASE_NAME=pytorch-$tag_or_branch"; echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz"; echo "PT_PEP517_RELEASE_FILE=torch-${torch_version}.tar.gz"; + echo "PYTORCH_BUILD_VERSION=${PYTORCH_BUILD_VERSION:-}"; + echo "PYTORCH_BUILD_NUMBER=${PYTORCH_BUILD_NUMBER:-}"; } >> "$GITHUB_ENV" - name: Checkout optional submodules run: python3 tools/optional_submodules.py @@ -83,7 +91,7 @@ jobs: run: | pip install build==1.2.2.post1 || exit 1 python -m build --sdist || exit 1 - cd dist || exit 1 + mv dist/$PT_PEP517_RELEASE_FILE . 
|| exit 1 - name: Upload source distribution for release if: ${{ github.event_name == 'release' }} uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2 @@ -102,7 +110,7 @@ jobs: uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: name: ${{ env.PT_PEP517_RELEASE_FILE }} - path: dist/${{ env.PT_PEP517_RELEASE_FILE }} + path: ${{ env.PT_PEP517_RELEASE_FILE }} - name: Set output id: release_name run: | From f99ab991dcd3719ee25dd3377a53ea12e518308e Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 27 Feb 2026 11:06:08 -0800 Subject: [PATCH 24/87] [CI] Update inductor CI jobs to CUDA 13.0 (#175826) (#175955) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Docker image switch — All workflows that used pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks now use the cuda13.0 variant. The unused CUDA 12.8 image definition was removed from .ci/docker/build.sh and its duplicate entry dropped from docker-builds.yml. 3. Duplicate cleanup — Five workflows previously had both a CUDA 12.8 build and a separate -cuda13 build. 
After migrating the main build to CUDA 13.0, the -cuda13 duplicates were removed: - inductor-periodic.yml — removed periodic-dynamo-benchmarks-build-cuda13 + test - inductor-micro-benchmark.yml — removed build-cuda13 + test-cuda13 - inductor-perf-compare.yml — removed build-cuda13 + test-cuda13 - inductor-perf-test-nightly.yml — removed build-cuda13 + 3 test jobs - trunk.yml — removed inductor-build-cuda13 Pull Request resolved: https://github.com/pytorch/pytorch/pull/175826 Approved by: https://github.com/atalman Signed-off-by: Huy Do --- .ci/docker/build.sh | 11 --- .github/workflows/docker-builds.yml | 1 - .../workflows/inductor-micro-benchmark.yml | 35 +------ .github/workflows/inductor-perf-compare.yml | 43 +------- .github/workflows/inductor-perf-test-b200.yml | 12 +-- .../inductor-perf-test-nightly-h100.yml | 4 +- .../workflows/inductor-perf-test-nightly.yml | 98 ++----------------- .github/workflows/inductor-periodic.yml | 59 +---------- .github/workflows/inductor-unittest.yml | 4 +- .github/workflows/inductor.yml | 35 +------ .github/workflows/pull.yml | 32 ------ .github/workflows/torchbench.yml | 8 +- .github/workflows/trunk.yml | 13 --- 13 files changed, 34 insertions(+), 321 deletions(-) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 9a051dc84aef5..7df6453c22da9 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -129,17 +129,6 @@ case "$tag" in UCC_COMMIT=${_UCC_COMMIT} TRITON=yes ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks) - CUDA_VERSION=12.8.1 - ANACONDA_PYTHON_VERSION=3.10 - GCC_VERSION=11 - VISION=yes - KATEX=yes - UCX_COMMIT=${_UCX_COMMIT} - UCC_COMMIT=${_UCC_COMMIT} - TRITON=yes - INDUCTOR_BENCHMARKS=yes - ;; pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks) CUDA_VERSION=13.0.2 ANACONDA_PYTHON_VERSION=3.10 diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index dc9ecef7860ae..10dc09fcec2f1 100644 --- a/.github/workflows/docker-builds.yml +++ 
b/.github/workflows/docker-builds.yml @@ -52,7 +52,6 @@ jobs: pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11, pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11, pytorch-linux-jammy-cuda13.0-cudnn9-py3.12-gcc11-vllm, - pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks, pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks, pytorch-linux-jammy-py3.10-clang15, pytorch-linux-jammy-py3.11-clang15, diff --git a/.github/workflows/inductor-micro-benchmark.yml b/.github/workflows/inductor-micro-benchmark.yml index 35a1a4ef972a5..19a6c764e403f 100644 --- a/.github/workflows/inductor-micro-benchmark.yml +++ b/.github/workflows/inductor-micro-benchmark.yml @@ -30,14 +30,14 @@ jobs: opt_out_experiments: lf build: - name: cuda12.8-py3.10-gcc11-sm80 + name: cuda13.0-py3.10-gcc11-sm80 uses: ./.github/workflows/_linux-build.yml needs: - get-default-label-prefix with: runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" - build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks + build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks cuda-arch-list: '8.0' test-matrix: | { include: [ @@ -46,7 +46,7 @@ jobs: secrets: inherit test: - name: cuda12.8-py3.10-gcc11-sm80 + name: cuda13.0-py3.10-gcc11-sm80 uses: ./.github/workflows/_linux-test.yml needs: build with: @@ -55,30 +55,3 @@ jobs: test-matrix: ${{ needs.build.outputs.test-matrix }} timeout-minutes: 720 secrets: inherit - - build-cuda13: - name: cuda13.0-py3.10-gcc11-sm80 - uses: ./.github/workflows/_linux-build.yml - needs: - - get-default-label-prefix - with: - runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" - build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80 - docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks - 
cuda-arch-list: '8.0' - test-matrix: | - { include: [ - { config: "inductor-micro-benchmark", shard: 1, num_shards: 1, runner: "linux.aws.a100", owners: ["oncall:pt2"] }, - ]} - secrets: inherit - - test-cuda13: - name: cuda13.0-py3.10-gcc11-sm80 - uses: ./.github/workflows/_linux-test.yml - needs: build-cuda13 - with: - build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80 - docker-image: ${{ needs.build-cuda13.outputs.docker-image }} - test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }} - timeout-minutes: 720 - secrets: inherit diff --git a/.github/workflows/inductor-perf-compare.yml b/.github/workflows/inductor-perf-compare.yml index 6235d02970849..3a9b01e97aed5 100644 --- a/.github/workflows/inductor-perf-compare.yml +++ b/.github/workflows/inductor-perf-compare.yml @@ -27,15 +27,15 @@ jobs: opt_out_experiments: lf build: - name: cuda12.8-py3.10-gcc11-sm80 + name: cuda13.0-py3.10-gcc11-sm80 uses: ./.github/workflows/_linux-build.yml needs: - get-default-label-prefix with: runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" runner: linux.4xlarge.memory - build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks + build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks cuda-arch-list: '8.0' test-matrix: | { include: [ @@ -48,7 +48,7 @@ jobs: secrets: inherit test: - name: cuda12.8-py3.10-gcc11-sm80 + name: cuda13.0-py3.10-gcc11-sm80 uses: ./.github/workflows/_linux-test.yml needs: build with: @@ -60,38 +60,3 @@ jobs: monitor-log-interval: 15 monitor-data-collect-interval: 4 secrets: inherit - - build-cuda13: - name: cuda13.0-py3.10-gcc11-sm80 - uses: ./.github/workflows/_linux-build.yml - needs: - - get-default-label-prefix - with: - runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" - runner: linux.4xlarge.memory - 
build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80 - docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks - cuda-arch-list: '8.0' - test-matrix: | - { include: [ - { config: "inductor_huggingface_perf_compare", shard: 1, num_shards: 1, runner: "linux.aws.a100" }, - { config: "inductor_timm_perf_compare", shard: 1, num_shards: 2, runner: "linux.aws.a100" }, - { config: "inductor_timm_perf_compare", shard: 2, num_shards: 2, runner: "linux.aws.a100" }, - { config: "inductor_torchbench_perf_compare", shard: 1, num_shards: 1, runner: "linux.aws.a100" }, - ]} - build-additional-packages: "vision audio torchao" - secrets: inherit - - test-cuda13: - name: cuda13.0-py3.10-gcc11-sm80 - uses: ./.github/workflows/_linux-test.yml - needs: build-cuda13 - with: - build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80 - docker-image: ${{ needs.build-cuda13.outputs.docker-image }} - test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }} - # disable monitor in perf tests for more investigation - disable-monitor: false - monitor-log-interval: 15 - monitor-data-collect-interval: 4 - secrets: inherit diff --git a/.github/workflows/inductor-perf-test-b200.yml b/.github/workflows/inductor-perf-test-b200.yml index 003f27476bcb9..0c2558fb772a8 100644 --- a/.github/workflows/inductor-perf-test-b200.yml +++ b/.github/workflows/inductor-perf-test-b200.yml @@ -78,7 +78,7 @@ jobs: opt_out_experiments: lf build: - name: cuda12.8-py3.10-gcc11-sm100 + name: cuda13.0-py3.10-gcc11-sm100 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -88,8 +88,8 @@ jobs: # from trunk. 
Also use a memory-intensive runner here because memory is # usually the bottleneck runner: linux.12xlarge.memory - build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks + build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm100 + docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks cuda-arch-list: '10.0' test-matrix: | { include: [ @@ -102,7 +102,7 @@ jobs: secrets: inherit test-periodically: - name: cuda12.8-py3.10-gcc11-sm100 + name: cuda13.0-py3.10-gcc11-sm100 uses: ./.github/workflows/_linux-test.yml needs: build if: github.event.schedule == '0 7 * * 1-6' @@ -119,7 +119,7 @@ jobs: secrets: inherit test-weekly: - name: cuda12.8-py3.10-gcc11-sm100 + name: cuda13.0-py3.10-gcc11-sm100 uses: ./.github/workflows/_linux-test.yml needs: build if: github.event.schedule == '0 7 * * 0' @@ -136,7 +136,7 @@ jobs: secrets: inherit test: - name: cuda12.8-py3.10-gcc11-sm100 + name: cuda13.0-py3.10-gcc11-sm100 uses: ./.github/workflows/_linux-test.yml needs: build with: diff --git a/.github/workflows/inductor-perf-test-nightly-h100.yml b/.github/workflows/inductor-perf-test-nightly-h100.yml index a929475355888..0c027682cc168 100644 --- a/.github/workflows/inductor-perf-test-nightly-h100.yml +++ b/.github/workflows/inductor-perf-test-nightly-h100.yml @@ -93,8 +93,8 @@ jobs: # from trunk. 
Also use a memory-intensive runner here because memory is # usually the bottleneck runner: linux.12xlarge.memory - build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks + build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm90 + docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks cuda-arch-list: '9.0' test-matrix: | { include: [ diff --git a/.github/workflows/inductor-perf-test-nightly.yml b/.github/workflows/inductor-perf-test-nightly.yml index 6539a81f7c196..1c684a9d7a270 100644 --- a/.github/workflows/inductor-perf-test-nightly.yml +++ b/.github/workflows/inductor-perf-test-nightly.yml @@ -78,15 +78,15 @@ jobs: opt_out_experiments: lf build: - name: cuda12.8-py3.10-gcc11-sm80 + name: cuda13.0-py3.10-gcc11-sm80 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" # Every bit to make perf run faster helps runner: linux.12xlarge.memory - build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks + build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks cuda-arch-list: '8.0' test-matrix: | { include: [ @@ -115,7 +115,7 @@ jobs: secrets: inherit test-nightly: - name: cuda12.8-py3.10-gcc11-sm80 + name: cuda13.0-py3.10-gcc11-sm80 uses: ./.github/workflows/_linux-test.yml needs: build if: github.event.schedule == '0 7 * * 1-6' @@ -131,7 +131,7 @@ jobs: secrets: inherit test-weekly: - name: cuda12.8-py3.10-gcc11-sm80 + name: cuda13.0-py3.10-gcc11-sm80 uses: ./.github/workflows/_linux-test.yml needs: build if: github.event.schedule == '0 7 * * 0' @@ -148,7 +148,7 @@ jobs: secrets: inherit test: - name: cuda12.8-py3.10-gcc11-sm80 + name: 
cuda13.0-py3.10-gcc11-sm80 uses: ./.github/workflows/_linux-test.yml needs: build if: github.event_name == 'workflow_dispatch' @@ -162,89 +162,3 @@ jobs: monitor-log-interval: 15 monitor-data-collect-interval: 4 secrets: inherit - - build-cuda13: - name: cuda13.0-py3.10-gcc11-sm80 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - # Every bit to make perf run faster helps - runner: linux.12xlarge.memory - build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80 - docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks - cuda-arch-list: '8.0' - test-matrix: | - { include: [ - { config: "inductor_huggingface_perf", shard: 1, num_shards: 5, runner: "linux.aws.a100" }, - { config: "inductor_huggingface_perf", shard: 2, num_shards: 5, runner: "linux.aws.a100" }, - { config: "inductor_huggingface_perf", shard: 3, num_shards: 5, runner: "linux.aws.a100" }, - { config: "inductor_huggingface_perf", shard: 4, num_shards: 5, runner: "linux.aws.a100" }, - { config: "inductor_huggingface_perf", shard: 5, num_shards: 5, runner: "linux.aws.a100" }, - { config: "inductor_timm_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" }, - { config: "inductor_timm_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" }, - { config: "inductor_timm_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" }, - { config: "inductor_timm_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" }, - { config: "inductor_timm_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" }, - { config: "inductor_timm_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" }, - { config: "inductor_torchbench_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" }, - { config: "inductor_torchbench_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" }, - { config: "inductor_torchbench_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" }, - { config: 
"inductor_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" }, - { config: "inductor_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" }, - { config: "inductor_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" }, - { config: "cachebench", shard: 1, num_shards: 2, runner: "linux.aws.a100" }, - { config: "cachebench", shard: 2, num_shards: 2, runner: "linux.aws.a100" }, - ]} - selected-test-configs: ${{ inputs.benchmark_configs }} - build-additional-packages: "vision audio torchao" - secrets: inherit - - test-nightly-cuda13: - name: cuda13.0-py3.10-gcc11-sm80 - uses: ./.github/workflows/_linux-test.yml - needs: build-cuda13 - if: github.event.schedule == '0 7 * * 1-6' - with: - build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80 - dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true - docker-image: ${{ needs.build-cuda13.outputs.docker-image }} - test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }} - timeout-minutes: 720 - disable-monitor: false - monitor-log-interval: 15 - monitor-data-collect-interval: 4 - secrets: inherit - - test-weekly-cuda13: - name: cuda13.0-py3.10-gcc11-sm80 - uses: ./.github/workflows/_linux-test.yml - needs: build-cuda13 - if: github.event.schedule == '0 7 * * 0' - with: - build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80 - dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true - docker-image: ${{ needs.build-cuda13.outputs.docker-image }} - test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }} - timeout-minutes: 1440 - # disable monitor in perf tests, next step is to enable it - disable-monitor: false - monitor-log-interval: 15 - monitor-data-collect-interval: 4 - secrets: inherit - - test-cuda13: - name: cuda13.0-py3.10-gcc11-sm80 - 
uses: ./.github/workflows/_linux-test.yml - needs: build-cuda13 - if: github.event_name == 'workflow_dispatch' - with: - build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80 - dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }} - docker-image: ${{ needs.build-cuda13.outputs.docker-image }} - test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }} - timeout-minutes: 720 - disable-monitor: false - monitor-log-interval: 15 - monitor-data-collect-interval: 4 - secrets: inherit diff --git a/.github/workflows/inductor-periodic.yml b/.github/workflows/inductor-periodic.yml index 1e87adc965c74..1506f1ac375d9 100644 --- a/.github/workflows/inductor-periodic.yml +++ b/.github/workflows/inductor-periodic.yml @@ -38,8 +38,8 @@ jobs: with: runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" runner: linux.4xlarge.memory - build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks + build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks cuda-arch-list: '8.0;8.6' test-matrix: | { include: [ @@ -82,57 +82,6 @@ jobs: test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }} secrets: inherit - periodic-dynamo-benchmarks-build-cuda13: - name: periodic-dynamo-benchmarks-build-cuda13 - uses: ./.github/workflows/_linux-build.yml - needs: get-default-label-prefix - with: - runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" - runner: linux.4xlarge.memory - build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm86 - docker-image-name: 
ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks - cuda-arch-list: '8.0;8.6' - test-matrix: | - { include: [ - { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" 
}, - { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" }, - { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - ]} - build-additional-packages: "vision audio torchao" - secrets: inherit - - periodic-dynamo-benchmarks-test-cuda13: - name: periodic-dynamo-benchmarks-test-cuda13 - uses: ./.github/workflows/_linux-test.yml - needs: periodic-dynamo-benchmarks-build-cuda13 - with: - build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm86 - docker-image: ${{ needs.periodic-dynamo-benchmarks-build-cuda13.outputs.docker-image }} - test-matrix: ${{ needs.periodic-dynamo-benchmarks-build-cuda13.outputs.test-matrix }} - secrets: inherit - rocm-periodic-dynamo-benchmarks-build: if: github.repository_owner == 'pytorch' name: rocm-periodic-dynamo-benchmarks-build @@ -191,8 +140,8 @@ jobs: with: runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" runner: linux.4xlarge.memory - build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks + build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks cuda-arch-list: '8.0' test-matrix: | { include: [ diff --git a/.github/workflows/inductor-unittest.yml b/.github/workflows/inductor-unittest.yml index ea6ce55dbd470..bce149251a477 
100644 --- a/.github/workflows/inductor-unittest.yml +++ b/.github/workflows/inductor-unittest.yml @@ -36,8 +36,8 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks + build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks cuda-arch-list: '8.6' runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index 3736415f11b74..7fe6a193283f5 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -49,8 +49,8 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks + build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks cuda-arch-list: '8.6' runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner: linux.4xlarge.memory @@ -75,37 +75,6 @@ jobs: test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} secrets: inherit - inductor-build-cuda13: - name: inductor-build-cuda13 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm86 - docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks - cuda-arch-list: '8.6' - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runner: linux.4xlarge.memory - test-matrix: | - { include: [ - { config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type 
}}linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - ]} - build-additional-packages: "vision audio torchao" - secrets: inherit - - inductor-test-cuda13: - name: inductor-test-cuda13 - uses: ./.github/workflows/_linux-test.yml - needs: inductor-build-cuda13 - with: - build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm86 - docker-image: ${{ needs.inductor-build-cuda13.outputs.docker-image }} - test-matrix: ${{ needs.inductor-build-cuda13.outputs.test-matrix }} - secrets: inherit - inductor-cpu-build: name: inductor-cpu-build uses: ./.github/workflows/_linux-build.yml diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 22989263dd22f..96024e2ef1a6d 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -430,38 +430,6 @@ jobs: ]} secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc11-inductor-build: - if: ${{ needs.job-filter.outputs.jobs == '' || contains(needs.job-filter.outputs.jobs, ' cuda12.8-py3.10-gcc11-sm75 ') }} - name: cuda12.8-py3.10-gcc11-sm75 - uses: ./.github/workflows/_linux-build.yml - needs: - - get-label-type - - job-filter - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm75 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks - cuda-arch-list: '7.5' - test-matrix: | - { include: [ - { config: "pr_time_benchmarks", shard: 1, num_shards: 
1, runner: "linux.g4dn.metal.nvidia.gpu" }, - ]} - secrets: inherit - - linux-jammy-cuda12_8-py3_10-gcc11-inductor-test: - if: ${{ needs.job-filter.outputs.jobs == '' || contains(needs.job-filter.outputs.jobs, ' cuda12.8-py3.10-gcc11-sm75 ') }} - name: cuda12.8-py3.10-gcc11-sm75 - uses: ./.github/workflows/_linux-test.yml - needs: - - linux-jammy-cuda12_8-py3_10-gcc11-inductor-build - - job-filter - with: - build-environment: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-inductor-build.outputs.build-environment }} - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-inductor-build.outputs.test-matrix }} - tests-to-include: ${{ github.event.inputs.tests-to-include || '' }} - secrets: inherit - linux-jammy-cuda13_0-py3_10-gcc11-inductor-build: if: ${{ needs.job-filter.outputs.jobs == '' || contains(needs.job-filter.outputs.jobs, ' cuda13.0-py3.10-gcc11-sm75 ') }} name: cuda13.0-py3.10-gcc11-sm75 diff --git a/.github/workflows/torchbench.yml b/.github/workflows/torchbench.yml index a84ff38e72471..ba18a1c8c2a32 100644 --- a/.github/workflows/torchbench.yml +++ b/.github/workflows/torchbench.yml @@ -26,14 +26,14 @@ jobs: curr_ref_type: ${{ github.ref_type }} build: - name: cuda12.8-py3.10-gcc11-sm80 + name: cuda13.0-py3.10-gcc11-sm80 uses: ./.github/workflows/_linux-build.yml needs: - get-default-label-prefix with: runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" - build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks + build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks cuda-arch-list: '8.0' test-matrix: | { include: [ @@ -42,7 +42,7 @@ jobs: secrets: inherit test: - name: cuda12.8-py3.10-gcc11-sm80 + name: cuda13.0-py3.10-gcc11-sm80 uses: 
./.github/workflows/_linux-test.yml needs: build with: diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 5b741eb67954a..18de36f752130 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -313,19 +313,6 @@ jobs: if: ${{ needs.job-filter.outputs.jobs == '' || contains(needs.job-filter.outputs.jobs, ' inductor-build ') }} name: inductor-build uses: ./.github/workflows/_linux-build.yml - needs: - - get-label-type - - job-filter - with: - build-environment: linux-jammy-cuda12.8-py3.12-gcc11-sm80 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks - cuda-arch-list: '8.0' - secrets: inherit - - inductor-build-cuda13: - if: ${{ needs.job-filter.outputs.jobs == '' || contains(needs.job-filter.outputs.jobs, ' inductor-build-cuda13 ') }} - name: inductor-build-cuda13 - uses: ./.github/workflows/_linux-build.yml needs: - get-label-type - job-filter From f95d7a4bacff6a1e4f11a232c0f8a3f2b42bed4e Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Sun, 8 Mar 2026 06:55:48 -0700 Subject: [PATCH 25/87] update previous version 2.10 installation in get start xpu (#176408) update previous version 2.10 installation in get start xpu (#176141) update previous version 2.10 installation in get start xpu for release 2.11 Pull Request resolved: https://github.com/pytorch/pytorch/pull/176141 Approved by: https://github.com/EikanWang (cherry picked from commit 14f828cb8c2ac10e66497b3bfe32ffe557753d5f) Co-authored-by: ZhaoqiongZ <106125927+ZhaoqiongZ@users.noreply.github.com> --- docs/source/notes/get_start_xpu.rst | 45 +++++++++++++++++------------ 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/docs/source/notes/get_start_xpu.rst b/docs/source/notes/get_start_xpu.rst index 5a1442e813805..dc0cb984c47fc 100644 --- a/docs/source/notes/get_start_xpu.rst +++ b/docs/source/notes/get_start_xpu.rst @@ -4,7 +4,8 @@ Getting Started on Intel GPU Hardware Prerequisite --------------------- -For 
Intel Data Center GPU +Intel Data Center GPU +^^^^^^^^^^^^^^^^^^^^^ .. list-table:: :widths: 50 50 50 50 @@ -19,7 +20,8 @@ For Intel Data Center GPU - yes - yes -For Intel Client GPU +Intel Client GPU +^^^^^^^^^^^^^^^^ +---------------------------------------+-----------------------------------------------------------------------------------------------------+ | Supported OS | Validated Hardware | @@ -51,37 +53,42 @@ Binaries Now that we have `Intel GPU Driver `_ installed, use the following commands to install ``pytorch``, ``torchvision``, ``torchaudio``. -For release wheels +Stable Releases +~~~~~~~~~~~~~~~ + +To install the latest stable release wheels for Intel GPU (XPU): .. code-block:: bash pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu -For nightly wheels +Nightly Builds +~~~~~~~~~~~~~~ + +To install the latest preview/nightly wheels: .. code-block:: bash pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu -For previous versions +Previous Versions +~~~~~~~~~~~~~~~~~ -v2.9.1 +**v2.10.0** .. code-block:: bash - pip install torch==2.9.1 torchvision==0.24.1 torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/xpu + pip install torch==2.10.0 torchvision==0.25.0 torchaudio==2.10.0 --index-url https://download.pytorch.org/whl/xpu -v2.9.0 +**v2.9.1** .. code-block:: bash - pip install torch==2.9.0 torchvision==0.24.0 torchaudio==2.9.0 --index-url https://download.pytorch.org/whl/xpu - -v2.8.0 + pip install torch==2.9.1 torchvision==0.24.1 torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/xpu -.. code-block:: bash +.. note:: - pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/xpu + For older wheels, please refer to the `previous versions `_ page and ensure you use the ``xpu`` index URL. From Source ^^^^^^^^^^^ @@ -137,7 +144,7 @@ Here are a few inference workflow examples. 
Inference with FP32 -""""""""""""""""""" +~~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -157,7 +164,7 @@ Inference with FP32 print("Execution finished") Inference with AMP -"""""""""""""""""" +~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -181,7 +188,7 @@ Inference with AMP print("Execution finished") Inference with ``torch.compile`` -"""""""""""""""""""""""""""""""" +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -222,7 +229,7 @@ Training Examples Here is a few training workflow examples. Train with FP32 -""""""""""""""" +~~~~~~~~~~~~~~~ .. code-block:: python @@ -279,7 +286,7 @@ Train with FP32 print("Execution finished") Train with AMP -"""""""""""""" +~~~~~~~~~~~~~~ .. note:: Training with ``GradScaler`` requires hardware support for ``FP64``. ``FP64`` is not natively supported by the Intel® Arc™ A-Series Graphics. If you run your workloads on Intel® Arc™ A-Series Graphics, please disable ``GradScaler``. @@ -347,7 +354,7 @@ Train with AMP print("Execution finished") Train with ``torch.compile`` -"""""""""""""""""""""""""""" +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python From dc12b65cd31ba18cbdc7f2e12e7d1564a67770d0 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Sun, 8 Mar 2026 06:57:24 -0700 Subject: [PATCH 26/87] [inductor] Fix Identity comparability and evalf recursion (#176783) [inductor] Fix Identity comparability and evalf recursion (#175975) Fixes #175856 ## Summary This PR adds a narrow `Identity._eval_evalf(self, prec)` override in `torch/utils/_sympy/functions.py` to fix the SymPy recursion/comparison failure seen in Inductor simplification (e.g. `Max(0, Identity(-6))`). The implementation only unwraps comparable integer constants: ```python def _eval_evalf(self, prec): arg = self.args[0] if arg.is_Integer and arg.is_comparable: return arg return None ``` This keeps the fix minimal for the index-math path involved in the bug. 
Tests Added targeted tests in test/inductor/test_utils.py: `testIdentityComparisonNoRecursion` `testIdentityComparableNumbersInMinMax` `testIdentityEvalfIntegerOnly` Validation Repro fails on unpatched builds in the same SymPy/Inductor path. Repro passes with this fix applied. Pull Request resolved: https://github.com/pytorch/pytorch/pull/175975 Approved by: https://github.com/azahed98, https://github.com/laithsakka (cherry picked from commit cea64de6cc14c55cf3d909787a51b2cd64b3aa04) Co-authored-by: bhack --- test/inductor/test_utils.py | 16 ++++++++++++++++ torch/utils/_sympy/functions.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/test/inductor/test_utils.py b/test/inductor/test_utils.py index 41bb05c4cf594..cd37bcb57ede3 100644 --- a/test/inductor/test_utils.py +++ b/test/inductor/test_utils.py @@ -87,6 +87,22 @@ def testSympySubsIdentityNonComparable(self): result = sympy_subs(expr, {q0: I}) self.assertTrue(result.has(I)) + def testIdentityComparisonNoRecursion(self): + self.assertTrue(Identity(sympify("0")) >= 0) + self.assertFalse(Identity(sympify("-6")) >= 0) + self.assertTrue(0 >= Identity(sympify("-6"))) + + def testIdentityComparableNumbersInMinMax(self): + expr = Identity(sympify("-6")) + self.assertTrue(expr.is_number) + self.assertTrue(expr.is_comparable) + self.assertEqual(Max(0, expr), 0) + + def testIdentityRationalComparisonNoRecursion(self): + expr = Identity(sympify("1/7")) + self.assertTrue(expr >= 0) + self.assertTrue(Max(0, expr).has(expr)) + def test_sympy_str(self): self.assertEqual(sympy_str(sympify("a+b+c")), "a + b + c") self.assertEqual(sympy_str(sympify("a*b+c")), "c + a * b") diff --git a/torch/utils/_sympy/functions.py b/torch/utils/_sympy/functions.py index ee04c2461f3c6..5102afd9b530e 100644 --- a/torch/utils/_sympy/functions.py +++ b/torch/utils/_sympy/functions.py @@ -1366,6 +1366,38 @@ def __int__(self) -> int: # pyrefly: ignore [missing-attribute] return int(self.args[0]) + def 
_identity_atom_compare(self, other, op): + """ + Fast path for comparing wrapped numeric atomics against other numeric atomics. + Keep compound expressions on SymPy's default symbolic path. + """ + arg = self.args[0] + if isinstance(other, int): + other = sympy.Integer(other) + if not isinstance(other, sympy.Expr): + return None + if not (arg.is_Atom and arg.is_number and arg.is_comparable): + return None + if not (other.is_Atom and other.is_number and other.is_comparable): + return None + return sympy.S.true if op(arg, other) else sympy.S.false + + def __ge__(self, other): + out = self._identity_atom_compare(other, lambda a, b: a >= b) + return out if out is not None else super().__ge__(other) + + def __gt__(self, other): + out = self._identity_atom_compare(other, lambda a, b: a > b) + return out if out is not None else super().__gt__(other) + + def __le__(self, other): + out = self._identity_atom_compare(other, lambda a, b: a <= b) + return out if out is not None else super().__le__(other) + + def __lt__(self, other): + out = self._identity_atom_compare(other, lambda a, b: a < b) + return out if out is not None else super().__lt__(other) + def __float__(self) -> float: # pyrefly: ignore [missing-attribute] return float(self.args[0]) From 63fcbe1040ffef63e82abd4e66da1d7554d23aa4 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Sun, 8 Mar 2026 09:40:08 -0700 Subject: [PATCH 27/87] [XPU] Fix SyclExtension Windows build for oneAPI 2025.3+ breaking change (#175333) [XPU] Fix SyclExtension Windows build for oneAPI 2025.3+ breaking change (#170701) ## Summary Fixes SyclExtension compilation on Windows when using oneAPI 2025.3 or higher. ## Problem oneAPI 2025.3 introduced a breaking change in how include paths are ordered to align with MSVC behavior. This causes build failures when compiling SyclExtension on Windows. The issue occurs because MSVC include directories are explicitly passed on the compiler command line. 
With the new include path ordering in oneAPI 2025.3, this causes the wrong std headers to be included. These MSVC directories are already added as correctly-ordered implicit include paths by the compiler, so they should not need to be passed explicitly on the command line. Passing them explicitly disrupts the intended include order. ## Solution When building SYCL extensions on Windows with oneAPI version >= 2025.3, filter out Microsoft Visual Studio paths from the compiler's include directories. The fix is version-gated to only apply for oneAPI 2025.3+ to avoid affecting users on older oneAPI versions. Fixes: https://github.com/intel/torch-xpu-ops/issues/2574 Pull Request resolved: https://github.com/pytorch/pytorch/pull/170701 Approved by: https://github.com/dvrogozh, https://github.com/EikanWang, https://github.com/atalman (cherry picked from commit a09b29e732f52a690d4ca3764256f26369115858) Co-authored-by: astachowiczhabana --- torch/utils/cpp_extension.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index a63bff50d5ec3..109a6608aa4bd 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -947,6 +947,20 @@ def win_cuda_flags(cflags): def win_hip_flags(cflags): return (COMMON_HIPCC_FLAGS + COMMON_HIP_FLAGS + cflags + _get_rocm_arch_flags(cflags)) + def win_filter_msvc_include_dirs(pp_opts) -> list[str]: + """Filter out MSVC include dirs from pp_opts for oneAPI 2025.3+.""" + # oneAPI 2025.3+ changed include path ordering to match MSVC behavior. + # Filter out MSVC headers to avoid conflicting declarations with oneAPI's std headers. 
+ icpx_version = int(_get_icpx_version()) + if icpx_version >= 20250300: + vc_tools_dir = os.path.normcase(os.environ.get('VCToolsInstallDir', '')) + if vc_tools_dir: + pp_opts = [ + path for path in pp_opts + if vc_tools_dir not in os.path.normcase(path) + ] + return pp_opts + def win_wrap_single_compile(sources, output_dir=None, macros=None, @@ -1116,7 +1130,7 @@ def win_wrap_ninja_compile(sources, sycl_post_cflags = None sycl_dlink_post_cflags = None if with_sycl: - sycl_cflags = common_cflags + pp_opts + _COMMON_SYCL_FLAGS + sycl_cflags = common_cflags + win_filter_msvc_include_dirs(pp_opts) + _COMMON_SYCL_FLAGS if isinstance(extra_postargs, dict): sycl_post_cflags = extra_postargs['sycl'] else: From 5d919bfe0f2ba7c7aabdb75ef6a20512f163e662 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Mon, 9 Mar 2026 06:02:02 -0700 Subject: [PATCH 28/87] [Inductor] Reject non-contiguous subnode fusion in mix-order reduction. (#176410) [Inductor] Reject non-contiguous subnode fusion in mix-order reduction. (#176131) We observed assert error after PR #174947 on XPU in https://github.com/intel/torch-xpu-ops/issues/2932: The assert error in line L2125: https://github.com/pytorch/pytorch/blob/f99ab991dcd3719ee25dd3377a53ea12e518308e/torch/_inductor/scheduler.py#L2122-L2125 which is caused by: https://github.com/pytorch/pytorch/blob/f99ab991dcd3719ee25dd3377a53ea12e518308e/torch/_inductor/scheduler.py#L2200-L2203 Root cause: - MixOrderReduction.can_fuse is a pre-fusion heuristic; it only checks static conditions (both reductions, reversed orders, common reads, one contiguous pre-fusion, size/heuristics). It cannot see access-pattern changes introduced by backend.fuse. - In the failing case, self.node1=op1115 (reduction, contiguous=True) is fused with other=op1123 (pointwise, contiguous=False), producing fused_node=op1115_op1123 (non-contiguous). self.node2=op1117_op1119 is already non-contiguous. 
The mix-order reduction invariant (at least one side contiguous) is violated, so FusedMixOrderReductions would assert. ``` self.node1 = op1115 (SchedulerNode, reduction, contiguous=True) other = op1123 (SchedulerNode, pointwise, contiguous=False) backend.fuse(self.node1, other) | v fused_node = op1115_op1123 (FusedSchedulerNode, reduction+pointwise, contiguous=False) self.node2 = op1117_op1119 (FusedSchedulerNode, reduction+reduction, contiguous=False) mix-order reduction attempt: fused_node + self.node2 -> FusedMixOrderReductions (assert fails) ``` Fix: - Add a general post-fusion validation in FusedMixOrderReductions.fuse_with: after backend.fuse, re-check the contiguity invariant and reject the fusion if both sides are non-contiguous. - Implement a FusionRejected signal and catch it in Scheduler.fuse_two_nodes to keep nodes unfused. Test: - Added a regression test which reproduced the assert error on **cuda/xpu** and pass with this PR. -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Pull Request resolved: https://github.com/pytorch/pytorch/pull/176131 Approved by: https://github.com/shunting314 (cherry picked from commit 5a6d6b3ece55cc4f4db0e377fa33b9c8374a507f) Co-authored-by: xinan.lin --- test/inductor/test_mix_order_reduction.py | 24 +++++++++++++++++++++++ torch/_inductor/scheduler.py | 7 +++++++ 2 files changed, 31 insertions(+) diff --git a/test/inductor/test_mix_order_reduction.py b/test/inductor/test_mix_order_reduction.py index 5dc30015976b6..e858f542e0e75 100644 --- a/test/inductor/test_mix_order_reduction.py +++ b/test/inductor/test_mix_order_reduction.py @@ -160,6 +160,30 @@ def f(x, y): # shared memory. 
self.assertEqual(metrics.codegen_mix_order_reduction, 0) + @inductor_config.patch(split_reductions=False) + def test_fuse_non_contiguous_pointwise(self): + if not inductor_config.triton.mix_order_reduction: + self.skipTest("Mix order reduction not enabled") + + # Regression: mix-order reduction can appear valid pre-fusion, but a pointwise + # fused into one side can change access patterns and break the contiguity + # invariant. This test builds a reduction + pointwise path plus a second + # reduction, matching the shape/ordering pattern seen in the E2E failure. + + def f(x): + # First reduction (contiguous on its own). + r1 = x.sum(dim=1) + # Pointwise depends on both reduced and unreduced data, so fusing it + # with the reduction can change access strides. + y = r1 * x[:, 0] + # Second reduction across a different dimension to trigger mix-order logic. + r2 = x.sum(dim=0) + return y, r2 + + # Large, asymmetric shape encourages mix-order reduction heuristics. + x = torch.randn(32768, 768, dtype=torch.float, device=GPU_TYPE) + self.check_numeric(f, (x,)) + @inductor_config.patch(coordinate_descent_tuning=True) def test_XBLOCK_coordest_tuning(self): """ diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py index 6e1323ca942a3..55e55a6eda421 100644 --- a/torch/_inductor/scheduler.py +++ b/torch/_inductor/scheduler.py @@ -2153,6 +2153,13 @@ def sub_node_can_fuse( if not self.scheduler.can_fuse(node1, node2, allow_mix_order_reduction=False): return False + # Since node1 is from the current mix order reduction, if node1 is + # contiguous, the fused node should also be contiguous. 
+ if MixOrderReduction.is_contiguous_node( + node1 + ) and not MixOrderReduction.is_contiguous_node(node2): + return False + def _get_ancestors(nodes: tuple[BaseSchedulerNode, ...]) -> OrderedSet[str]: out = OrderedSet() return out.union(*(n.ancestors for n in nodes)) From 3c40486f8a515b3f6f851a0cc4b3a2dc07744f6c Mon Sep 17 00:00:00 2001 From: shunting314 <52589240+shunting314@users.noreply.github.com> Date: Mon, 9 Mar 2026 14:26:09 -0700 Subject: [PATCH 29/87] [inductor] avoid multi-stage for mix-order-red by default (#176228) (#176495) The default num_stages > 1 has been causing multiple out-of-shared-memory issues. Make it 1 by default. We could explore other alternatives 1. always add a config with num_stages=1 while keeping the current heuristics. Could increase compilation time 2. dynamically scale down num-stages if all configs fail to compile due to out of shared memory 3. mimic Triton logic to estimate the amount of shared memory needed per stage and set num-stages accordingly based on smem capacity. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/176228 Approved by: https://github.com/eellison, https://github.com/drisspg, https://github.com/jansel (cherry picked from commit ab17a385d9ce099ac68080b5493ab4e6e8b3131b) --- test/inductor/test_mix_order_reduction.py | 123 +++++++++++++++++++ torch/_inductor/codegen/triton.py | 1 + torch/_inductor/config.py | 5 + torch/_inductor/runtime/triton_heuristics.py | 5 +- 4 files changed, 133 insertions(+), 1 deletion(-) diff --git a/test/inductor/test_mix_order_reduction.py b/test/inductor/test_mix_order_reduction.py index e858f542e0e75..de96245ed51d3 100644 --- a/test/inductor/test_mix_order_reduction.py +++ b/test/inductor/test_mix_order_reduction.py @@ -6,11 +6,13 @@ import torch import torch._inductor.config as inductor_config import torch.nn.functional as F +from torch import nn from torch._dynamo.utils import same from torch._inductor import metrics, utils from torch._inductor.scheduler import MixOrderReduction from torch._inductor.test_case import run_tests, TestCase from torch.testing import FileCheck +from torch.testing._internal.common_device_type import largeTensorTest from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, isRocmArchAnyOf, @@ -750,6 +752,127 @@ def f(x): compile_metrics = torch._dynamo.utils._compilation_metrics self.assertEqual(len(compile_metrics), 1, "Don't recompile") + @largeTensorTest("36GB", device=GPU_TYPE, inductor=True) + def test_out_of_shared_memory(self): + """ + Fix https://github.com/pytorch/pytorch/issues/175250 + """ + if not inductor_config.triton.mix_order_reduction: + self.skipTest("Mix order reduction not enabled") + + NUM_HEADS = 32 + NUM_KV_HEADS = 8 + HEAD_DIM = 128 + HIDDEN_SIZE = NUM_HEADS * HEAD_DIM * 2 + SEQ_LEN = 8192 * 2 + + def rotate_half(x): + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin): + cos = cos[:, None, :, :] + 
sin = sin[:, None, :, :] + return (q * cos) + (rotate_half(q) * sin), (k * cos) + ( + rotate_half(k) * sin + ) + + @torch.compile + def forward( + x, + q_proj, + k_proj, + v_proj, + o_proj, + embed_norm, + hidden_norm, + cos, + sin, + ): + batch, seq_len, _ = x.shape + + # Eagle3 first layer: split concatenated [embeds, hidden] input + mid = x.shape[2] // 2 + embeds, hidden = x.split(mid, dim=-1) + + # Dual RMSNorm (pow, sum, div, mul in backward) + embeds = embed_norm(embeds) + hidden = hidden_norm(hidden) + residual = hidden + + # Recombine for attention input (2 * HIDDEN_SIZE) + x = torch.cat([embeds, hidden], dim=-1) + + # Adding a graph break here "fixes" the issue + # by breaking up the fused op + # torch._dynamo.graph_break() + + # Q/K/V projections from 2*hidden_size input + q = q_proj(x).view(batch, seq_len, NUM_HEADS, HEAD_DIM).transpose(1, 2) + k = k_proj(x).view(batch, seq_len, NUM_KV_HEADS, HEAD_DIM).transpose(1, 2) + v = v_proj(x).view(batch, seq_len, NUM_KV_HEADS, HEAD_DIM).transpose(1, 2) + + q, k = apply_rotary_pos_emb(q, k, cos, sin) + k = torch.repeat_interleave(k, NUM_HEADS // NUM_KV_HEADS, dim=1) + v = torch.repeat_interleave(v, NUM_HEADS // NUM_KV_HEADS, dim=1) + out = q.contiguous() @ k.contiguous().transpose(-2, -1) @ v.contiguous() + + out = out.transpose(1, 2).contiguous().reshape(batch, seq_len, -1) + return o_proj(out) + residual + + # Layers + embed_norm = nn.RMSNorm(HIDDEN_SIZE).to(GPU_TYPE) + hidden_norm = nn.RMSNorm(HIDDEN_SIZE).to(GPU_TYPE) + # Q/K/V project from 2*HIDDEN_SIZE (concatenated embeds + hidden) + q_proj = nn.Linear(2 * HIDDEN_SIZE, NUM_HEADS * HEAD_DIM, bias=False).to( + GPU_TYPE + ) + k_proj = nn.Linear(2 * HIDDEN_SIZE, NUM_KV_HEADS * HEAD_DIM, bias=False).to( + GPU_TYPE + ) + v_proj = nn.Linear(2 * HIDDEN_SIZE, NUM_KV_HEADS * HEAD_DIM, bias=False).to( + GPU_TYPE + ) + o_proj = nn.Linear(NUM_HEADS * HEAD_DIM, HIDDEN_SIZE, bias=False).to(GPU_TYPE) + + # Block mask - simple causal only + def causal_mask(_b, _h, q, kv): + 
return q >= kv + + # Rotary embeddings (precomputed, no grad needed) + inv_freq = 1.0 / ( + 500000.0 + ** ( + torch.arange(0, HEAD_DIM, 2, dtype=torch.float32, device=GPU_TYPE) + / HEAD_DIM + ) + ) + pos = torch.arange(1, SEQ_LEN + 1, dtype=torch.float32, device=GPU_TYPE) + freqs = torch.outer(pos, inv_freq) + emb = torch.cat((freqs, freqs), dim=-1).unsqueeze(0) + cos, sin = emb.cos(), emb.sin() + + # Input: 2*HIDDEN_SIZE to match split [embeds, hidden] + x = torch.randn( + 1, SEQ_LEN, 2 * HIDDEN_SIZE, device=GPU_TYPE, requires_grad=True + ) + + out = forward( + x, + q_proj, + k_proj, + v_proj, + o_proj, + embed_norm, + hidden_norm, + cos, + sin, + ) + loss = out.sum() + loss.backward() + self.assertTrue(metrics.codegen_mix_order_reduction > 1) + @inductor_config.patch( "triton.mix_order_reduction", not inductor_config.triton.mix_order_reduction diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index bca5da15e3e22..039b53ee1f2fc 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -5297,6 +5297,7 @@ def inductor_meta_common(cls): "store_cubin": config.triton.store_cubin, "deterministic": config.deterministic, "force_filter_reduction_configs": config.test_configs.force_filter_reduction_configs, + "mix_order_reduction_allow_multi_stages": config.triton.mix_order_reduction_allow_multi_stages, } if config.write_are_deterministic_algorithms_enabled: diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py index 9bf1f70fa4c4c..e2fee26f45cc1 100644 --- a/torch/_inductor/config.py +++ b/torch/_inductor/config.py @@ -1793,6 +1793,11 @@ class triton: # this could be helpful to avoid recompilations in some cases mix_order_reduction_non_strict_mode = False + # Don't allow multi-stages by default to avoid out of shared memory + mix_order_reduction_allow_multi_stages = ( + os.environ.get("TORCHINDUCTOR_MIX_ORDER_REDUCTION_ALLOW_MULTI_STAGES") == "1" + ) + enable_tlx_templates: bool = ( 
os.environ.get("TORCHINDUCTOR_ENABLE_TLX_TEMPLATES", "0") == "1" ) diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 61bb640f5a072..2a1447fbf0bda 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -3776,7 +3776,10 @@ def persistent_reduction( # With large rnumel, we have higher chance of out-of-shared memory # To avoid adding too much autotuning overhead, we just constrain NUM_STAGES # if rnumel is large - MAX_NUM_STAGES = 2 if rnumel_hint > 8192 else 3 + if inductor_meta.get("mix_order_reduction_allow_multi_stages", True): + MAX_NUM_STAGES = 2 if rnumel_hint > 8192 else 3 + else: + MAX_NUM_STAGES = 1 c.kwargs["NUM_STAGES"] = min(max(num_iters // 4, 1), MAX_NUM_STAGES) if rnumel_hint <= 1024: From f31baaae5d066cf833ac96a270495d71bdc1d508 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Mon, 9 Mar 2026 14:49:46 -0700 Subject: [PATCH 30/87] Fix the torch.Stream context manager reentrance (#176603) Fix the torch.Stream context manager reentrance (#176568) # Motivation This PR aims to fix `torch.Stream` as a context manager nested/reentrance scenario. `torch.cuda.stream` and `torch.xpu.stream` could support these usages. 
The following scenario would be fixed with this PR: ```python import torch s0 = torch.Stream() with s0, s0: pass ``` ```python import torch s0 = torch.Stream() s1 = torch.Stream() with s0, s1: with s0, s1: pass ``` # Additional Context Fix https://github.com/pytorch/pytorch/issues/176560 Pull Request resolved: https://github.com/pytorch/pytorch/pull/176568 Approved by: https://github.com/albanD (cherry picked from commit d43570c98bf31b1f7a14e821b0e197abecf92758) Co-authored-by: Yu, Guangye --- test/test_accelerator.py | 15 ++++++++ torch/csrc/Stream.cpp | 74 ++++++++++++++++++++++++++++++++++------ 2 files changed, 78 insertions(+), 11 deletions(-) diff --git a/test/test_accelerator.py b/test/test_accelerator.py index 7daebc01adfe9..67b92969aefd0 100644 --- a/test/test_accelerator.py +++ b/test/test_accelerator.py @@ -111,6 +111,21 @@ def test_stream_context_manager(self): self.assertEqual(torch.accelerator.current_stream(), s) self.assertEqual(torch.accelerator.current_stream(), prev_stream) + def test_stream_context_manager_reentrance(self): + prev_stream = torch.accelerator.current_stream() + s0 = torch.Stream() + with s0, s0: + self.assertEqual(torch.accelerator.current_stream(), s0) + self.assertEqual(torch.accelerator.current_stream(), prev_stream) + s1 = torch.Stream() + with s0: + self.assertEqual(torch.accelerator.current_stream(), s0) + with s1: + self.assertEqual(torch.accelerator.current_stream(), s1) + with s0: + self.assertEqual(torch.accelerator.current_stream(), s0) + self.assertEqual(torch.accelerator.current_stream(), prev_stream) + @unittest.skipIf(not TEST_MULTIACCELERATOR, "only one accelerator detected") def test_multi_device_stream_context_manager(self): src_device = 0 diff --git a/torch/csrc/Stream.cpp b/torch/csrc/Stream.cpp index fc1ca916cfbed..c5a8e343e6b27 100644 --- a/torch/csrc/Stream.cpp +++ b/torch/csrc/Stream.cpp @@ -119,6 +119,7 @@ PyObject* THPStream_Wrap(const c10::Stream& stream) { static void THPStream_dealloc(THPStream* self) { 
PyObject_ClearWeakRefs((PyObject*)self); + Py_CLEAR(self->context); Py_TYPE(self)->tp_free(reinterpret_cast(self)); } @@ -277,38 +278,71 @@ static PyObject* THPStream_enter(PyObject* _self, PyObject* unused) { auto self = reinterpret_cast(_self); c10::DeviceType stream_device_type = static_cast(self->device_type); + // No operation is performed if the stream does not belong to an accelerator. if (C10_UNLIKELY(!at::accelerator::isAccelerator(stream_device_type))) { Py_INCREF(_self); return _self; } + + // Note [Reentrant Stream Context Manager] + // + // We maintain a stack of context entries to support nested/reentrant + // stream context managers. Each entry records the previously active + // stream and device so that they can be restored in __exit__. + // + // The stack is stored as a Python list where each entry is either: + // - Py_None: no-op enter (stream was already current); + // - dict: {_ctx_stream, _ctx_device_index} saved before switching. + // + // self->context is initialized lazily as a PyList on first __enter__. + if (!self->context) { + auto list = THPObjectPtr(PyList_New(0)); + if (!list) { + throw python_error(); + } + self->context = list.release(); + } + c10::DeviceIndex cur_device_idx = at::accelerator::getDeviceIndex(); c10::DeviceIndex stream_device_idx = static_cast(self->device_index); + c10::Stream cur_stream = at::accelerator::getCurrentStream(stream_device_idx); + + // If the stream is already current, push None as a no-op sentinel. + if (cur_stream.id() == self->stream_id && + cur_stream.device_index() == stream_device_idx) { + if (PyList_Append(self->context, Py_None) < 0) { + throw python_error(); + } + Py_INCREF(_self); + return _self; + } + // If the stream is not on the current device, switch the current device to // the device of the stream. 
if (stream_device_idx != cur_device_idx) { at::accelerator::setDeviceIndex(stream_device_idx); } - c10::Stream cur_stream = at::accelerator::getCurrentStream(stream_device_idx); at::accelerator::setCurrentStream(c10::Stream::unpack3( self->stream_id, stream_device_idx, stream_device_type)); - // Save the current device index and previous stream to the context. + + // Save the current device index and previous stream as a dict on the stack. auto ctx_device_index = THPObjectPtr(THPUtils_packDeviceIndex(cur_device_idx)); auto ctx_stream = THPObjectPtr(THPStream_Wrap(cur_stream)); - TORCH_CHECK(!(self->context), "Stream's context should not be initialized."); auto dict = THPObjectPtr(PyDict_New()); if (!dict) { throw python_error(); } - self->context = dict.release(); if (PyDict_SetItemString( - self->context, "_ctx_device_index", ctx_device_index.get()) < 0) { + dict.get(), "_ctx_device_index", ctx_device_index.get()) < 0) { + throw python_error(); + } + if (PyDict_SetItemString(dict.get(), "_ctx_stream", ctx_stream.get()) < 0) { throw python_error(); } - if (PyDict_SetItemString(self->context, "_ctx_stream", ctx_stream.get()) < - 0) { + if (PyList_Append(self->context, dict.get()) < 0) { throw python_error(); } Py_INCREF(_self); @@ -319,19 +353,34 @@ static PyObject* THPStream_enter(PyObject* _self, PyObject* unused) { static PyObject* THPStream_exit(PyObject* _self, PyObject* unused) { HANDLE_TH_ERRORS auto self = reinterpret_cast(_self); + // No operation is performed if the stream does not belong to an accelerator. if (C10_UNLIKELY(!at::accelerator::isAccelerator( static_cast(self->device_type)))) { Py_RETURN_NONE; } + + // Pop the top entry from the stack. + Py_ssize_t stack_size = PyList_Size(self->context); + TORCH_INTERNAL_ASSERT(stack_size > 0, "Stream context stack is empty."); + PyObject* top = PyList_GET_ITEM(self->context, stack_size - 1); + + // Sentinel: this __enter__ was a no-op, nothing to restore. 
+ if (top == Py_None) { + if (PyList_SetSlice(self->context, stack_size - 1, stack_size, nullptr) < + 0) { + throw python_error(); + } + Py_RETURN_NONE; + } + PyObject* py_stream = nullptr; - if (PyDict_GetItemStringRef(self->context, "_ctx_stream", &py_stream) < 0) { + if (PyDict_GetItemStringRef(top, "_ctx_stream", &py_stream) < 0) { throw python_error(); } auto ctx_stream = THPObjectPtr(py_stream); PyObject* py_device_index = nullptr; - if (PyDict_GetItemStringRef( - self->context, "_ctx_device_index", &py_device_index) < 0) { + if (PyDict_GetItemStringRef(top, "_ctx_device_index", &py_device_index) < 0) { throw python_error(); } auto ctx_device_index = THPObjectPtr(py_device_index); @@ -342,6 +391,7 @@ static PyObject* THPStream_exit(PyObject* _self, PyObject* unused) { ctx_device_index.get(), "ctx_device_index should be present on the context dict."); auto prev_device_index = THPUtils_unpackDeviceIndex(ctx_device_index.get()); + at::accelerator::setCurrentStream(c10::Stream::unpack3( prev_stream->stream_id, static_cast(prev_stream->device_index), @@ -350,7 +400,9 @@ static PyObject* THPStream_exit(PyObject* _self, PyObject* unused) { if (static_cast(self->device_index) != prev_device_index) { at::accelerator::setDeviceIndex(prev_device_index); } - Py_CLEAR(self->context); + if (PyList_SetSlice(self->context, stack_size - 1, stack_size, nullptr) < 0) { + throw python_error(); + } Py_RETURN_NONE; END_HANDLE_TH_ERRORS } From 052ff5c474718ac457abc5cfb7674f2876231eb3 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 11 Mar 2026 10:15:02 -0700 Subject: [PATCH 31/87] Windows override AMI pre-installed cudnn (#177094) Windows override AMI pre-installed cudnn (#177027) Fixes: https://github.com/pytorch/pytorch/issues/167242 Also refactor code to avoid duplication. Test via ciflow/binaries. 
I do see: ``` cuDNN version : 9.19.0 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/177027 Approved by: https://github.com/malfet (cherry picked from commit 61fae89be8357e04d4dc38a5424d6b307025d2e6) Co-authored-by: atalman --- .ci/pytorch/windows/internal/cuda_install.bat | 172 +++++++++--------- 1 file changed, 91 insertions(+), 81 deletions(-) diff --git a/.ci/pytorch/windows/internal/cuda_install.bat b/.ci/pytorch/windows/internal/cuda_install.bat index 0c8c023831e45..456b53183f186 100644 --- a/.ci/pytorch/windows/internal/cuda_install.bat +++ b/.ci/pytorch/windows/internal/cuda_install.bat @@ -20,8 +20,8 @@ set CUDA_VERSION_STR=%CUDA_VER_MAJOR%.%CUDA_VER_MINOR% set CUDNN_FOLDER="cuda" set CUDNN_LIB_FOLDER="lib\x64" -:: Skip all of this if we already have cuda installed -if exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" goto set_cuda_env_vars +:: If CUDA is already installed, skip CUDA installation but still verify cuDNN +if exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" goto check_cudnn if %CUDA_VER% EQU 126 goto cuda126 if %CUDA_VER% EQU 128 goto cuda128 @@ -34,110 +34,47 @@ exit /b 1 goto cuda_common :cuda126 - set CUDA_INSTALL_EXE=cuda_12.6.2_560.94_windows.exe -if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( - curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore - if errorlevel 1 exit /b 1 - set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - set "ARGS=cuda_profiler_api_12.6 thrust_12.6 nvcc_12.6 cuobjdump_12.6 nvprune_12.6 nvprof_12.6 cupti_12.6 cublas_12.6 cublas_dev_12.6 cudart_12.6 cufft_12.6 cufft_dev_12.6 curand_12.6 curand_dev_12.6 cusolver_12.6 cusolver_dev_12.6 cusparse_12.6 cusparse_dev_12.6 npp_12.6 npp_dev_12.6 nvrtc_12.6 nvrtc_dev_12.6 nvml_dev_12.6 nvjitlink_12.6 nvtx_12.6" -) - +set "ARGS=cuda_profiler_api_12.6 thrust_12.6 
nvcc_12.6 cuobjdump_12.6 nvprune_12.6 nvprof_12.6 cupti_12.6 cublas_12.6 cublas_dev_12.6 cudart_12.6 cufft_12.6 cufft_dev_12.6 curand_12.6 curand_dev_12.6 cusolver_12.6 cusolver_dev_12.6 cusparse_12.6 cusparse_dev_12.6 npp_12.6 npp_dev_12.6 nvrtc_12.6 nvrtc_dev_12.6 nvml_dev_12.6 nvjitlink_12.6 nvtx_12.6" set CUDNN_FOLDER=cudnn-windows-x86_64-9.10.2.21_cuda12-archive -set CUDNN_LIB_FOLDER="lib" -set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" -if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( - curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore - if errorlevel 1 exit /b 1 - set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" -) - -@REM cuDNN 8.3+ required zlib to be installed on the path -echo Installing ZLIB dlls -curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip" -7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" -xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" - -goto cuda_common +goto cuda_download :cuda128 - set CUDA_INSTALL_EXE=cuda_12.8.0_571.96_windows.exe -if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( - curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore - if errorlevel 1 exit /b 1 - set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - set "ARGS=cuda_profiler_api_12.8 thrust_12.8 nvcc_12.8 cuobjdump_12.8 nvprune_12.8 nvprof_12.8 cupti_12.8 cublas_12.8 cublas_dev_12.8 cudart_12.8 cufft_12.8 cufft_dev_12.8 curand_12.8 curand_dev_12.8 cusolver_12.8 cusolver_dev_12.8 cusparse_12.8 cusparse_dev_12.8 npp_12.8 npp_dev_12.8 nvrtc_12.8 nvrtc_dev_12.8 nvml_dev_12.8 nvjitlink_12.8 nvtx_12.8" -) - +set "ARGS=cuda_profiler_api_12.8 thrust_12.8 nvcc_12.8 cuobjdump_12.8 nvprune_12.8 nvprof_12.8 cupti_12.8 cublas_12.8 cublas_dev_12.8 
cudart_12.8 cufft_12.8 cufft_dev_12.8 curand_12.8 curand_dev_12.8 cusolver_12.8 cusolver_dev_12.8 cusparse_12.8 cusparse_dev_12.8 npp_12.8 npp_dev_12.8 nvrtc_12.8 nvrtc_dev_12.8 nvml_dev_12.8 nvjitlink_12.8 nvtx_12.8" set CUDNN_FOLDER=cudnn-windows-x86_64-9.19.0.56_cuda12-archive -set CUDNN_LIB_FOLDER="lib" -set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" -if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( - curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore - if errorlevel 1 exit /b 1 - set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" -) - -@REM cuDNN 8.3+ required zlib to be installed on the path -echo Installing ZLIB dlls -curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip" -7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" -xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" - -goto cuda_common +goto cuda_download :cuda129 - set CUDA_INSTALL_EXE=cuda_12.9.1_576.57_windows.exe -if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( - curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore - if errorlevel 1 exit /b 1 - set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - set "ARGS=cuda_profiler_api_12.9 thrust_12.9 nvcc_12.9 cuobjdump_12.9 nvprune_12.9 nvprof_12.9 cupti_12.9 cublas_12.9 cublas_dev_12.9 cudart_12.9 cufft_12.9 cufft_dev_12.9 curand_12.9 curand_dev_12.9 cusolver_12.9 cusolver_dev_12.9 cusparse_12.9 cusparse_dev_12.9 npp_12.9 npp_dev_12.9 nvrtc_12.9 nvrtc_dev_12.9 nvml_dev_12.9 nvjitlink_12.9 nvtx_12.9" -) - +set "ARGS=cuda_profiler_api_12.9 thrust_12.9 nvcc_12.9 cuobjdump_12.9 nvprune_12.9 nvprof_12.9 cupti_12.9 cublas_12.9 cublas_dev_12.9 cudart_12.9 cufft_12.9 cufft_dev_12.9 curand_12.9 curand_dev_12.9 cusolver_12.9 cusolver_dev_12.9 
cusparse_12.9 cusparse_dev_12.9 npp_12.9 npp_dev_12.9 nvrtc_12.9 nvrtc_dev_12.9 nvml_dev_12.9 nvjitlink_12.9 nvtx_12.9" set CUDNN_FOLDER=cudnn-windows-x86_64-9.17.1.4_cuda12-archive -set CUDNN_LIB_FOLDER="lib" -set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" -if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( - curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore - if errorlevel 1 exit /b 1 - set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" -) - -@REM cuDNN 8.3+ required zlib to be installed on the path -echo Installing ZLIB dlls -curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip" -7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" -xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" - -goto cuda_common +goto cuda_download :cuda130 - set CUDA_INSTALL_EXE=cuda_13.0.0_windows.exe +set "ARGS=" +set CUDNN_FOLDER=cudnn-windows-x86_64-9.19.0.56_cuda13-archive +goto cuda_download + +:: Common download logic for CUDA toolkit, cuDNN, and ZLIB +:cuda_download +set CUDNN_LIB_FOLDER="lib" +set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" + if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore if errorlevel 1 exit /b 1 set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - set "ARGS=" ) -set CUDNN_FOLDER=cudnn-windows-x86_64-9.19.0.56_cuda13-archive -set CUDNN_LIB_FOLDER="lib" -set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore if errorlevel 1 exit /b 1 set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ) -@REM cuDNN 
8.3+ required zlib to be installed on the path -echo Installing ZLIB dlls -curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip" -7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" -xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" +call :install_zlib goto cuda_common @@ -211,6 +148,69 @@ if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_ goto set_cuda_env_vars +:check_cudnn +:: When CUDA is pre-installed on the AMI, cuDNN may still be missing. +:: Set the correct cuDNN variables for the CUDA version, then install if needed. + +set CUDNN_LIB_FOLDER="lib" +if %CUDA_VER% EQU 126 ( + set CUDNN_FOLDER=cudnn-windows-x86_64-9.10.2.21_cuda12-archive + set EXPECTED_CUDNN_VERSION=9.10.2 +) +if %CUDA_VER% EQU 128 ( + set CUDNN_FOLDER=cudnn-windows-x86_64-9.19.0.56_cuda12-archive + set EXPECTED_CUDNN_VERSION=9.19.0 +) +if %CUDA_VER% EQU 129 ( + set CUDNN_FOLDER=cudnn-windows-x86_64-9.17.1.4_cuda12-archive + set EXPECTED_CUDNN_VERSION=9.17.1 +) +if %CUDA_VER% EQU 130 ( + set CUDNN_FOLDER=cudnn-windows-x86_64-9.19.0.56_cuda13-archive + set EXPECTED_CUDNN_VERSION=9.19.0 +) +set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" + +set "CUDNN_VERSION_FILE=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include\cudnn_version.h" + +if not exist "%CUDNN_VERSION_FILE%" ( + echo cuDNN not found, installing %CUDNN_FOLDER%... 
+ goto install_cudnn +) + +for /f "tokens=3" %%a in ('findstr /C:"#define CUDNN_MAJOR " "%CUDNN_VERSION_FILE%"') do set INSTALLED_MAJOR=%%a +for /f "tokens=3" %%a in ('findstr /C:"#define CUDNN_MINOR " "%CUDNN_VERSION_FILE%"') do set INSTALLED_MINOR=%%a +for /f "tokens=3" %%a in ('findstr /C:"#define CUDNN_PATCHLEVEL " "%CUDNN_VERSION_FILE%"') do set INSTALLED_PATCHLEVEL=%%a +set "INSTALLED_CUDNN_VERSION=%INSTALLED_MAJOR%.%INSTALLED_MINOR%.%INSTALLED_PATCHLEVEL%" + +if "%INSTALLED_CUDNN_VERSION%" == "%EXPECTED_CUDNN_VERSION%" ( + echo cuDNN %INSTALLED_CUDNN_VERSION% already installed at %ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR% + goto set_cuda_env_vars +) + +echo cuDNN version mismatch: installed %INSTALLED_CUDNN_VERSION%, expected %EXPECTED_CUDNN_VERSION%. Reinstalling... + +:install_cudnn + +if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build" + +curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore +if errorlevel 1 exit /b 1 + +7z x "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" -o"%SRC_DIR%\temp_build\cudnn" +if errorlevel 1 ( + echo Failed to extract cuDNN archive %CUDNN_INSTALL_ZIP% + exit /b 1 +) +xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin" +xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\%CUDNN_LIB_FOLDER%\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\lib\x64" +xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\include\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include" + +call :install_zlib + +echo Cleaning temp files +rd /s /q "%SRC_DIR%\temp_build" || ver > nul + :set_cuda_env_vars echo Setting up environment... 
@@ -218,3 +218,13 @@ set "PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\b set "CUDA_PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" set "CUDA_PATH_V%CUDA_VER_MAJOR%_%CUDA_VER_MINOR%=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" set "NVTOOLSEXT_PATH=%ProgramFiles%\NVIDIA Corporation\NvToolsExt" + +goto :eof + +@REM cuDNN 8.3+ requires zlib to be installed on the path +:install_zlib +echo Installing ZLIB dlls +curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip" +7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" +xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" +goto :eof From bac7b59c6fe3241bb6d6cca89cb4bf1da0662788 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 11 Mar 2026 11:47:12 -0700 Subject: [PATCH 32/87] fix acc failure for vit_base_patch14_dinov2.lvd142m (#177142) fix acc failure for vit_base_patch14_dinov2.lvd142m (#177042) Pull Request resolved: https://github.com/pytorch/pytorch/pull/177042 Approved by: https://github.com/v0i0, https://github.com/jansel (cherry picked from commit 78eae0472f45575d2d45b7d45d5fe5eccc4a8dcd) Co-authored-by: Shunting Zhang --- benchmarks/dynamo/timm_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/dynamo/timm_models.py b/benchmarks/dynamo/timm_models.py index 5d845e4fcee00..94da21d25bc1b 100755 --- a/benchmarks/dynamo/timm_models.py +++ b/benchmarks/dynamo/timm_models.py @@ -100,6 +100,7 @@ def pip_install(package): REQUIRE_LARGER_MULTIPLIER_FOR_SMALLER_TENSOR = { "inception_v3", "mobilenetv3_large_100", + "vit_base_patch14_dinov2.lvd142m", } From e04ddeaf45651e14b819f232af8e19d5615adfcd Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 11 Mar 2026 11:47:54 -0700 Subject: [PATCH 33/87] [Inductor] Don't unfuse addmm for bf16/fp16 to avoid precision loss (#177144) [Inductor] Don't unfuse addmm for bf16/fp16 
to avoid precision loss (#176848) This fixes https://github.com/pytorch/pytorch/issues/176411 (see: [PyTorch Hud](https://hud.pytorch.org/benchmark/v3/dashboard/compiler_inductor?renderGroupId=detail_view&time.start=2026-03-02T00%3A00%3A00.000Z&time.end=2026-03-09T23%3A59%3A59.999Z&filters.benchmarkName=compiler&filters.mode=inference&filters.dtype=bfloat16&filters.deviceName=cuda+%28h100%29&filters.device=cuda&filters.arch=h100&filters.suite=huggingface&filters.compiler=cudagraphs&filters.model=openai%2Fwhisper-tiny&lcommit.commit=3bfa1aaa46152e895089d5314002e216092e924a&lcommit.workflow_id=22865042382&lcommit.date=2026-03-09T18%3A00%3A00Z&lcommit.branch=gh%2FNikhilAPatel%2F124%2Fhead&rcommit.commit=3bfa1aaa46152e895089d5314002e216092e924a&rcommit.workflow_id=22865042382&rcommit.date=2026-03-09T18%3A00%3A00Z&rcommit.branch=gh%2FNikhilAPatel%2F124%2Fhead&lbranch=gh%2FNikhilAPatel%2F124%2Fhead&rbranch=gh%2FNikhilAPatel%2F124%2Fhead&maxSampling=110)) It looks like the cuDNN frontend 1.16.1 upgrade changed the default SDPA backend. With addmm, cuBLAS keeps the matmul result in higher precision before combining with bias. With mm + add, the result is truncated to bf16 first. This ~1.4x per-layer RMSE difference compounds through whisper's 8 attention layers, exceeding the 3.0x accuracy threshold. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/176848 Approved by: https://github.com/jansel, https://github.com/mlazos (cherry picked from commit 1a270b4aa73b7169a76dd3aa09c52a04d1312c13) Co-authored-by: NikhilAPatel --- test/inductor/test_pattern_matcher.py | 17 +++++++++++++++++ torch/_inductor/fx_passes/post_grad.py | 5 +++++ 2 files changed, 22 insertions(+) diff --git a/test/inductor/test_pattern_matcher.py b/test/inductor/test_pattern_matcher.py index 08575462bef5f..d0504470bd976 100644 --- a/test/inductor/test_pattern_matcher.py +++ b/test/inductor/test_pattern_matcher.py @@ -1212,6 +1212,23 @@ def fn2(inp, a, b): _, (code) = run_and_get_code(fn2, args[0], args[1], args[2]) FileCheck().check_not("extern_kernels.addmm(").run(code[0]) + @parametrize("dtype", [torch.bfloat16, torch.float16]) + def test_unfuse_bias_addmm_half_dtypes(self, dtype): + args = [ + torch.randn(20, device=GPU_TYPE, dtype=dtype), + torch.randn(10, 15, device=GPU_TYPE, dtype=dtype), + torch.randn(15, 20, device=GPU_TYPE, dtype=dtype), + ] + + # addmm with pointwise consumer should not be unfused for half dtypes + # to avoid precision loss from extra truncation at the mm output + @torch.compile() + def fn(inp, a, b): + return torch.nn.functional.gelu(torch.ops.aten.addmm(inp, a, b)) + + _, (code) = run_and_get_code(fn, args[0], args[1], args[2]) + FileCheck().check("extern_kernels.addmm(").run(code[0]) + def test_addmm_alpha_beta_with_pointwise(self): # Test that addmm with alpha/beta != 1 is unfused correctly with pointwise ops # See https://github.com/pytorch/pytorch/issues/167313 diff --git a/torch/_inductor/fx_passes/post_grad.py b/torch/_inductor/fx_passes/post_grad.py index 427a6918a9cea..5c560b9dda4b7 100644 --- a/torch/_inductor/fx_passes/post_grad.py +++ b/torch/_inductor/fx_passes/post_grad.py @@ -1517,6 +1517,11 @@ def should_prefer_unfused_addmm(match): extra_check=should_prefer_unfused_addmm, ) def unfuse_bias_add_to_pointwise(match: Match, mat1, mat2, *, 
inp, alpha, beta): + # Unfusing addmm introduces an extra bf16/fp16 truncation at the mm output + # that compounds through deep models and causes accuracy failures. + if inp.meta["val"].dtype in (torch.bfloat16, torch.float16): + return + def repl(inp, x1, x2, alpha, beta): mm_result = x1 @ x2 if alpha != 1: From e2fa2953033020ad7e0f823ec534044fac15a3c7 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 11 Mar 2026 11:49:21 -0700 Subject: [PATCH 34/87] [CD] Unpin cuda-bindings dependencies (#177159) [CD] Unpin cuda-bindings dependencies (#176042) Within the same CUDA major version Fixes https://github.com/pytorch/pytorch/issues/175948 Pull Request resolved: https://github.com/pytorch/pytorch/pull/176042 Approved by: https://github.com/ngimel, https://github.com/drisspg (cherry picked from commit 87f052cebb66c799b6fef71c5e5fa13af2165ac3) Co-authored-by: Nikita Shulga --- .../scripts/generate_binary_build_matrix.py | 8 +-- ...linux-aarch64-binary-manywheel-nightly.yml | 56 +++++++++---------- ...nerated-linux-binary-manywheel-nightly.yml | 56 +++++++++---------- 3 files changed, 60 insertions(+), 60 deletions(-) diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index eda03260446be..afcd637c6c57c 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -51,7 +51,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { "12.6": ( "cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | " # noqa: B950 - "cuda-bindings==12.9.4; platform_system == 'Linux' | " + "cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | " "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | " "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " "nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | " @@ -59,7 +59,7 @@ ), "12.8": ( 
"cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | " # noqa: B950 - "cuda-bindings==12.9.4; platform_system == 'Linux' | " + "cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | " "nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | " "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " "nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | " @@ -67,7 +67,7 @@ ), "12.9": ( "cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | " # noqa: B950 - "cuda-bindings==12.9.4; platform_system == 'Linux' | " + "cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | " "nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | " "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " "nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | " @@ -75,7 +75,7 @@ ), "13.0": ( "cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | " # noqa: B950 - "cuda-bindings==13.0.3; platform_system == 'Linux' | " + "cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | " "nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | " "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | " "nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | " diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml index 567d099675a60..fe4f51edf2c19 100644 --- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml @@ -133,7 +133,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -204,7 +204,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' 
timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -275,7 +275,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -346,7 +346,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; 
platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -483,7 +483,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -554,7 +554,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 
'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -625,7 +625,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -696,7 +696,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -833,7 +833,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' 
timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -904,7 +904,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -975,7 +975,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | 
cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1046,7 +1046,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1183,7 +1183,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | 
nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1254,7 +1254,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1325,7 +1325,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1396,7 +1396,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' 
timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1533,7 +1533,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1604,7 +1604,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | 
cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1675,7 +1675,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1746,7 +1746,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | 
nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1883,7 +1883,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1954,7 +1954,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2025,7 +2025,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' 
timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2096,7 +2096,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2233,7 +2233,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | 
cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2304,7 +2304,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2375,7 +2375,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | 
nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2446,7 +2446,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index 2e3aaa64f4d42..b820042fba170 100644 --- 
a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -128,7 +128,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -195,7 +195,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -262,7 +262,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_9 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -329,7 +329,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | 
nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -792,7 +792,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -859,7 +859,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_8 
build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -926,7 +926,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_9 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; 
platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -993,7 +993,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1456,7 +1456,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1523,7 +1523,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1590,7 +1590,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_9 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | 
nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1657,7 +1657,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2120,7 +2120,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_6 
build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2187,7 +2187,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; 
platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2254,7 +2254,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_9 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2321,7 +2321,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2784,7 +2784,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2851,7 +2851,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | 
nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2918,7 +2918,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_9 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -2985,7 +2985,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda13_0 
build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -3448,7 +3448,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; 
platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -3515,7 +3515,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -3582,7 +3582,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda12_9 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -3649,7 +3649,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -4112,7 +4112,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | 
nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -4179,7 +4179,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -4246,7 +4246,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cuda12_9 
build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -4313,7 +4313,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; 
platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} From 41f8e3e0381395e1669ca4bc6e36a7872d25cdcd Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 11 Mar 2026 11:52:13 -0700 Subject: [PATCH 35/87] [CI] Stop using G3 runners (#177161) [CI] Stop using G3 runners (#175938) Which is an old Tesla M60, that reached EOL back in October 2025 It's really hard to find an official doc, but here is public issue about it https://github.com/SummitRoute/aws_breaking_changes/issues/114 Pull Request resolved: https://github.com/pytorch/pytorch/pull/175938 Approved by: https://github.com/seemethere, https://github.com/jeanschmidt (cherry picked from commit 3b68f13463ea5499c5af8ca1a3138ea06a26c852) Co-authored-by: Nikita Shulga --- .github/actionlint.yaml | 6 -- .../linux_binary_build_workflow.yml.j2 | 5 +- .github/workflows/_binary-test-linux.yml | 2 +- ...nerated-linux-binary-manywheel-nightly.yml | 56 +++++++++---------- 4 files changed, 30 insertions(+), 39 deletions(-) diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 95637501e1069..8ef1ee2240a2e 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -25,9 +25,6 @@ self-hosted-runner: - linux.aws.h100 - linux.aws.h100.4 - linux.aws.h100.8 - - linux.4xlarge.nvidia.gpu - - linux.8xlarge.nvidia.gpu - - linux.16xlarge.nvidia.gpu - linux.g5.4xlarge.nvidia.gpu - linux.c7i.2xlarge # Pytorch/pytorch AWS Linux Runners on Linux Foundation account @@ -37,9 +34,6 @@ self-hosted-runner: - lf.linux.12xlarge - lf.linux.24xlarge - lf.linux.arm64.2xlarge - - lf.linux.4xlarge.nvidia.gpu - - lf.linux.8xlarge.nvidia.gpu - - lf.linux.16xlarge.nvidia.gpu - lf.linux.g5.4xlarge.nvidia.gpu - lf.linux.c7i.2xlarge # Repo-specific IBM hosted S390x runner diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index e110f33d8ce39..3f41256728e52 100644 --- 
a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -116,12 +116,9 @@ jobs: ALPINE_IMAGE: "docker.io/s390x/alpine" {%- elif config["gpu_arch_type"] == "rocm" %} runs_on: linux.rocm.gpu - {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] in ["12.6"] %} - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner {%- elif config["gpu_arch_type"] == "cuda" %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu {%- else %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.4xlarge diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml index ed7738ecbdcc2..9f22bf2a01dda 100644 --- a/.github/workflows/_binary-test-linux.yml +++ b/.github/workflows/_binary-test-linux.yml @@ -63,7 +63,7 @@ on: runs_on: required: true type: string - description: Hardware to run this job on. Valid values are linux.4xlarge, linux.4xlarge.nvidia.gpu, linux.arm64.2xlarge, and linux.rocm.gpu + description: Hardware to run this job on. 
Valid values are linux.4xlarge, linux.g4dn.4xlarge.nvidia.gpu, linux.arm64.2xlarge, and linux.rocm.gpu secrets: github-token: required: true diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index b820042fba170..3dd2d544fd7f0 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -152,7 +152,7 @@ jobs: build_name: manywheel-py3_10-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_6-upload: # Uploading @@ -219,7 +219,7 @@ jobs: build_name: manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_8-upload: # Uploading @@ -286,7 +286,7 @@ jobs: build_name: manywheel-py3_10-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_9-upload: # Uploading @@ -353,7 +353,7 @@ jobs: build_name: manywheel-py3_10-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda13_0-upload: # Uploading 
@@ -816,7 +816,7 @@ jobs: build_name: manywheel-py3_11-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_6-upload: # Uploading @@ -883,7 +883,7 @@ jobs: build_name: manywheel-py3_11-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_8-upload: # Uploading @@ -950,7 +950,7 @@ jobs: build_name: manywheel-py3_11-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_9-upload: # Uploading @@ -1017,7 +1017,7 @@ jobs: build_name: manywheel-py3_11-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda13_0-upload: # Uploading @@ -1480,7 +1480,7 @@ jobs: build_name: manywheel-py3_12-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_6-upload: # Uploading @@ -1547,7 +1547,7 @@ jobs: build_name: manywheel-py3_12-cuda12_8 
build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-upload: # Uploading @@ -1614,7 +1614,7 @@ jobs: build_name: manywheel-py3_12-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_9-upload: # Uploading @@ -1681,7 +1681,7 @@ jobs: build_name: manywheel-py3_12-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda13_0-upload: # Uploading @@ -2144,7 +2144,7 @@ jobs: build_name: manywheel-py3_13-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_6-upload: # Uploading @@ -2211,7 +2211,7 @@ jobs: build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_8-upload: # Uploading @@ -2278,7 +2278,7 @@ jobs: build_name: manywheel-py3_13-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_9-upload: # Uploading @@ -2345,7 +2345,7 @@ jobs: build_name: manywheel-py3_13-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda13_0-upload: # Uploading @@ -2808,7 +2808,7 @@ jobs: build_name: manywheel-py3_13t-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_6-upload: # Uploading @@ -2875,7 +2875,7 @@ jobs: build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_8-upload: # Uploading @@ -2942,7 +2942,7 @@ jobs: build_name: manywheel-py3_13t-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_9-upload: # Uploading @@ -3009,7 +3009,7 @@ jobs: build_name: manywheel-py3_13t-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: 
linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda13_0-upload: # Uploading @@ -3472,7 +3472,7 @@ jobs: build_name: manywheel-py3_14-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_6-upload: # Uploading @@ -3539,7 +3539,7 @@ jobs: build_name: manywheel-py3_14-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_8-upload: # Uploading @@ -3606,7 +3606,7 @@ jobs: build_name: manywheel-py3_14-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_9-upload: # Uploading @@ -3673,7 +3673,7 @@ jobs: build_name: manywheel-py3_14-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda13_0-upload: # Uploading @@ -4136,7 +4136,7 @@ jobs: build_name: manywheel-py3_14t-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) 
runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_6-upload: # Uploading @@ -4203,7 +4203,7 @@ jobs: build_name: manywheel-py3_14t-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_8-upload: # Uploading @@ -4270,7 +4270,7 @@ jobs: build_name: manywheel-py3_14t-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_9-upload: # Uploading @@ -4337,7 +4337,7 @@ jobs: build_name: manywheel-py3_14t-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda13_0-upload: # Uploading From 036b25f5a29dc58cbc62e7b976efb860ff128c3f Mon Sep 17 00:00:00 2001 From: mikaylagawarecki Date: Wed, 11 Mar 2026 15:00:15 -0400 Subject: [PATCH 36/87] Let stable::from_blob accept a lambda as deleter (cherry-pick) (#176440) --- .../csrc/my_from_blob_with_lambda_deleter.cpp | 101 ++++++++++++++++++ .../libtorch_agn_2_11/ops.py | 53 +++++++++ test/cpp_extensions/test_libtorch_agnostic.py | 75 +++++++++++++ torch/csrc/shim_common.cpp | 10 +- torch/csrc/stable/c/shim.h | 8 +- torch/csrc/stable/ops.h | 71 ++++++++---- 6 files changed, 294 insertions(+), 24 deletions(-) create mode 100644 test/cpp_extensions/libtorch_agn_2_11_extension/csrc/my_from_blob_with_lambda_deleter.cpp diff 
--git a/test/cpp_extensions/libtorch_agn_2_11_extension/csrc/my_from_blob_with_lambda_deleter.cpp b/test/cpp_extensions/libtorch_agn_2_11_extension/csrc/my_from_blob_with_lambda_deleter.cpp new file mode 100644 index 0000000000000..8e498cbf2b9f4 --- /dev/null +++ b/test/cpp_extensions/libtorch_agn_2_11_extension/csrc/my_from_blob_with_lambda_deleter.cpp @@ -0,0 +1,101 @@ +#include +#include +#include +#include + +#ifdef LAE_USE_CUDA +#include +#endif + +using torch::stable::Tensor; + +// Global counter to track lambda deleter calls for testing +static int64_t g_lambda_deleter_call_count = 0; + +// Wrapper for from_blob with a capturing-lambda deleter. +// The lambda captures a pointer to the global counter and increments it, +// which exercises the capturing-lambda code path in torch_from_blob. +Tensor my_from_blob_with_lambda_deleter( + int64_t data_ptr, + torch::headeronly::HeaderOnlyArrayRef sizes, + torch::headeronly::HeaderOnlyArrayRef strides, + torch::stable::Device device, + torch::headeronly::ScalarType dtype) { + void* data = reinterpret_cast(data_ptr); + int64_t* counter = &g_lambda_deleter_call_count; + auto deleter = [counter](void* /*data*/) { (*counter)++; }; + return torch::stable::from_blob(data, sizes, strides, device, dtype, deleter); +} + +int64_t get_lambda_deleter_call_count() { + return g_lambda_deleter_call_count; +} + +void reset_lambda_deleter_call_count() { + g_lambda_deleter_call_count = 0; +} + +STABLE_TORCH_LIBRARY_FRAGMENT(STABLE_LIB_NAME, m) { + m.def( + "my_from_blob_with_lambda_deleter(int data_ptr, int[] sizes, int[] strides, Device device, ScalarType dtype) -> Tensor"); + m.def("get_lambda_deleter_call_count() -> int"); + m.def("reset_lambda_deleter_call_count() -> ()"); +} + +STABLE_TORCH_LIBRARY_IMPL( + STABLE_LIB_NAME, + CompositeExplicitAutograd, + m) { + m.impl( + "my_from_blob_with_lambda_deleter", + TORCH_BOX(&my_from_blob_with_lambda_deleter)); + m.impl( + "get_lambda_deleter_call_count", + 
TORCH_BOX(&get_lambda_deleter_call_count)); + m.impl( + "reset_lambda_deleter_call_count", + TORCH_BOX(&reset_lambda_deleter_call_count)); +} + +#ifdef LAE_USE_CUDA + +// Same as my_from_blob_with_cuda_deleter (from 2.11) but uses a non-capturing +// lambda deleter. +Tensor my_from_blob_with_cuda_lambda_deleter( + int64_t numel, + torch::stable::Device device) { + size_t size_bytes = numel * sizeof(float); + + void* data = nullptr; + cudaError_t err = cudaMalloc(&data, size_bytes); + if (err != cudaSuccess) { + throw std::runtime_error("cudaMalloc failed"); + } + + // Zero the memory + cudaMemset(data, 0, size_bytes); + + std::array sizes = {numel}; + std::array strides = {1}; + + // This lambda doesn't capture anything, but capture is tested above in + // my_from_blob_with_lambda_deleter + auto deleter = [](void* data) { cudaFree(data); }; + return torch::stable::from_blob( + data, + torch::headeronly::HeaderOnlyArrayRef(sizes.data(), sizes.size()), + torch::headeronly::HeaderOnlyArrayRef(strides.data(), strides.size()), + device, + torch::headeronly::ScalarType::Float, + deleter); +} + +STABLE_TORCH_LIBRARY_FRAGMENT(STABLE_LIB_NAME, m) { + m.def("my_from_blob_with_cuda_lambda_deleter(int numel, Device device) -> Tensor"); +} + +STABLE_TORCH_LIBRARY_IMPL(STABLE_LIB_NAME, CompositeExplicitAutograd, m) { + m.impl("my_from_blob_with_cuda_lambda_deleter", TORCH_BOX(&my_from_blob_with_cuda_lambda_deleter)); +} + +#endif // LAE_USE_CUDA diff --git a/test/cpp_extensions/libtorch_agn_2_11_extension/libtorch_agn_2_11/ops.py b/test/cpp_extensions/libtorch_agn_2_11_extension/libtorch_agn_2_11/ops.py index 4315898009269..442f9c97ce5db 100644 --- a/test/cpp_extensions/libtorch_agn_2_11_extension/libtorch_agn_2_11/ops.py +++ b/test/cpp_extensions/libtorch_agn_2_11_extension/libtorch_agn_2_11/ops.py @@ -57,6 +57,59 @@ def my_from_blob_with_cuda_deleter(numel: int, device) -> Tensor: ) +def my_from_blob_with_lambda_deleter(data_ptr, sizes, strides, device, dtype) -> Tensor: + """ 
+ Creates a Tensor from existing memory with a capturing-lambda deleter. + + The deleter is a capturing lambda that updates a global call count, + exercising the capturing-lambda code path in torch_from_blob. + + Args: + data_ptr: int - pointer to the data buffer + sizes: tuple[int] - size of the tensor + strides: tuple[int] - strides of the tensor + device: Device - device on which the tensor resides + dtype: ScalarType - data type of the tensor + + Returns: Tensor - tensor wrapping the existing memory + """ + return torch.ops.libtorch_agn_2_11.my_from_blob_with_lambda_deleter.default( + data_ptr, sizes, strides, device, dtype + ) + + +def get_lambda_deleter_call_count() -> int: + """ + Returns the number of times the lambda test deleter has been called. + """ + return torch.ops.libtorch_agn_2_11.get_lambda_deleter_call_count.default() + + +def reset_lambda_deleter_call_count() -> None: + """ + Resets the lambda deleter call counter to zero. + """ + torch.ops.libtorch_agn_2_11.reset_lambda_deleter_call_count.default() + + +def my_from_blob_with_cuda_lambda_deleter(numel: int, device) -> Tensor: + """ + Creates a CUDA tensor that owns its memory via cudaMalloc, using a lambda deleter. + + Similar to my_from_blob_with_cuda_deleter but uses a non-capturing lambda + as the deleter.
+ + Args: + numel: int - number of elements in the tensor + device: Device - CUDA device + + Returns: Tensor - a 1D float32 tensor of zeros + """ + return torch.ops.libtorch_agn_2_11.my_from_blob_with_cuda_lambda_deleter.default( + numel, device + ) + + # ============================================================================= # Proxy for inherited ops (from libtorch_agn_2_9 and libtorch_agn_2_10 csrc/) # diff --git a/test/cpp_extensions/test_libtorch_agnostic.py b/test/cpp_extensions/test_libtorch_agnostic.py index 195651c4284ae..b2843fef09822 100644 --- a/test/cpp_extensions/test_libtorch_agnostic.py +++ b/test/cpp_extensions/test_libtorch_agnostic.py @@ -1809,6 +1809,81 @@ def test_my_from_blob_with_cuda_deleter_no_leak(self, device): curr_mem = torch.cuda.memory_allocated(device) self.assertEqual(curr_mem, init_mem) + @skipIfTorchVersionLessThan(2, 11) + @skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor") + def test_my_from_blob_with_lambda_deleter(self, device): + """Test for from_blob with capturing-lambda deleter (2.11 feature).""" + import libtorch_agn_2_11 as libtorch_agnostic + + from_blob_fn = libtorch_agnostic.ops.my_from_blob_with_lambda_deleter + get_count = libtorch_agnostic.ops.get_lambda_deleter_call_count + reset_count = libtorch_agnostic.ops.reset_lambda_deleter_call_count + + is_cuda = torch.device(device).type == "cuda" + if is_cuda: + init_mem = torch.cuda.memory_allocated(device) + + def inner(): + reset_count() + self.assertEqual(get_count(), 0) + + # We need an original tensor to create the tensor with from_blob. 
+ original = torch.rand(2, 3, device=device, dtype=torch.float32) + blob_tensor = from_blob_fn( + original.data_ptr(), + original.size(), + original.stride(), + device, + torch.float32, + ) + + self.assertEqual(blob_tensor, original) + self.assertEqual(blob_tensor.data_ptr(), original.data_ptr()) + + self.assertEqual(get_count(), 0) + + del blob_tensor + gc.collect() + + # Ensure the deleter was called. The original tensor still exists + # and can be used. + self.assertEqual(get_count(), 1) + original += 1 + # original goes out of scope here and its cuda memory should be + # freed. + + inner() + + if is_cuda: + # original tensor is out of scope, all the memory should be freed + torch.cuda.synchronize(device) + curr_mem = torch.cuda.memory_allocated(device) + self.assertEqual(curr_mem, init_mem) + + @onlyCUDA + @skipIfTorchVersionLessThan(2, 11) + def test_my_from_blob_with_cuda_lambda_deleter_no_leak(self, device): + """Test that from_blob lambda deleter properly frees cudaMalloc'd memory.""" + import libtorch_agn_2_11 as libtorch_agnostic + + from_blob_fn = libtorch_agnostic.ops.my_from_blob_with_cuda_lambda_deleter + + torch.cuda.synchronize(device) + init_mem = torch.cuda.memory_allocated(device) + numel = 1024 * 1024 # 4 MB per tensor + + for _ in range(10): + tensor = from_blob_fn(numel, device) + # Verify tensor was created correctly + self.assertEqual(tensor.numel(), numel) + self.assertEqual(tensor.device, torch.device(device)) + del tensor + gc.collect() + torch.cuda.synchronize(device) + + curr_mem = torch.cuda.memory_allocated(device) + self.assertEqual(curr_mem, init_mem) + @onlyCPU def test_my_layout(self, device): """Test layout() method for various tensor layouts.""" diff --git a/torch/csrc/shim_common.cpp b/torch/csrc/shim_common.cpp index d6c3cd2b41e0d..eac8147a30b29 100644 --- a/torch/csrc/shim_common.cpp +++ b/torch/csrc/shim_common.cpp @@ -667,7 +667,8 @@ AOTI_TORCH_EXPORT AOTITorchError torch_from_blob( int32_t layout, const uint8_t* 
opaque_metadata, int64_t opaque_metadata_size, - void (*deleter)(void*)) { + void (*deleter_callback)(void* data, void* ctx), + void* deleter_ctx) { AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({ c10::IntArrayRef sizes(sizes_ptr, ndim); c10::IntArrayRef strides(strides_ptr, ndim); @@ -676,11 +677,14 @@ AOTI_TORCH_EXPORT AOTITorchError torch_from_blob( static_cast(dtype)); at::Tensor tensor; if (data != nullptr) { - if (deleter != nullptr) { + if (deleter_callback != nullptr) { + auto wrapped_deleter = [deleter_callback, deleter_ctx](void* data) { + deleter_callback(data, deleter_ctx); + }; tensor = at::for_blob(data, sizes) .strides(strides) .storage_offset(storage_offset) - .deleter(deleter) + .deleter(wrapped_deleter) .options(options) .make_tensor(); } else { diff --git a/torch/csrc/stable/c/shim.h b/torch/csrc/stable/c/shim.h index f8a21e6c570f6..ec3caec593beb 100644 --- a/torch/csrc/stable/c/shim.h +++ b/torch/csrc/stable/c/shim.h @@ -165,8 +165,9 @@ AOTI_TORCH_EXPORT int32_t torch_dtype_float8_e8m0fnu(); AOTI_TORCH_EXPORT int32_t torch_dtype_float4_e2m1fn_x2(); // Creates a tensor from an existing data blob with an optional deleter. -// The deleter is called with the data pointer when the tensor's storage -// is deallocated. +// The deleter receives both the data pointer and a caller-supplied context +// pointer, which allows passing capturing lambdas across the C ABI boundary +// by heap-allocating the callable and passing it as deleter_ctx. 
AOTI_TORCH_EXPORT AOTITorchError torch_from_blob( void* data, int64_t ndim, @@ -180,7 +181,8 @@ AOTI_TORCH_EXPORT AOTITorchError torch_from_blob( int32_t layout, const uint8_t* opaque_metadata, int64_t opaque_metadata_size, - void (*deleter)(void*)); + void (*deleter)(void* data, void* ctx), + void* deleter_ctx); #endif // TORCH_FEATURE_VERSION >= TORCH_VERSION_2_11_0 diff --git a/torch/csrc/stable/ops.h b/torch/csrc/stable/ops.h index 19c109404cb5b..dbede4faba49e 100644 --- a/torch/csrc/stable/ops.h +++ b/torch/csrc/stable/ops.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -722,26 +723,28 @@ inline torch::stable::Tensor from_blob( /// /// This is the same as the from_blob function above, but allows specifying a /// custom deleter function that will be called when the tensor's storage is -/// deallocated. +/// deallocated. Accepts both plain function pointers and capturing lambdas. +/// /// Minimum compatible version: PyTorch 2.11. /// +/// @tparam F The callable type. Must be invocable with (void*). /// @param data Pointer to the data buffer. /// @param sizes The size of each dimension of the tensor. /// @param strides The stride for each dimension. /// @param device The device where the data resides. /// @param dtype The scalar type of the data. -/// @param deleter Function to call when the tensor is deallocated. May be -/// nullptr if no cleanup is needed. +/// @param deleter Callable to invoke when the tensor is deallocated. /// @param storage_offset The offset into the data buffer. Defaults to 0. /// @param layout The memory layout. Defaults to Strided. /// @return A tensor backed by the provided data. 
+template , int> = 0> inline torch::stable::Tensor from_blob( void* data, torch::headeronly::IntHeaderOnlyArrayRef sizes, torch::headeronly::IntHeaderOnlyArrayRef strides, torch::stable::Device device, torch::headeronly::ScalarType dtype, - DeleterFnPtr deleter, + F deleter, int64_t storage_offset = 0, torch::headeronly::Layout layout = torch::headeronly::Layout::Strided) { auto shim_dtype = @@ -750,21 +753,53 @@ inline torch::stable::Tensor from_blob( torch::stable::detail::from(device.type())); auto shim_layout = torch::stable::detail::to(torch::stable::detail::from(layout)); + AtenTensorHandle ath; - TORCH_ERROR_CODE_CHECK(torch_from_blob( - data, - sizes.size(), - sizes.data(), - strides.data(), - storage_offset, - shim_dtype, - shim_device_type, - device.index(), - &ath, - shim_layout, - nullptr, - 0, - deleter)); + if constexpr (std::is_convertible_v) { + // Simple function pointer: pass it as ctx, no heap allocation. + auto deleter_callback = [](void* data, void* ctx) { + auto fn = reinterpret_cast(ctx); + fn(data); + }; + TORCH_ERROR_CODE_CHECK(torch_from_blob( + data, + sizes.size(), + sizes.data(), + strides.data(), + storage_offset, + shim_dtype, + shim_device_type, + device.index(), + &ath, + shim_layout, + nullptr, + 0, + deleter_callback, + reinterpret_cast(static_cast(deleter)))); + } else { + // Capturing lambda: heap-allocate and type-erase. 
+ F* heap_allocated_deleter = new F(std::move(deleter)); + auto deleter_callback = [](void* data, void* ctx) { + F* func = static_cast(ctx); + (*func)(data); + delete func; + }; + TORCH_ERROR_CODE_CHECK(torch_from_blob( + data, + sizes.size(), + sizes.data(), + strides.data(), + storage_offset, + shim_dtype, + shim_device_type, + device.index(), + &ath, + shim_layout, + nullptr, + 0, + deleter_callback, + static_cast(heap_allocated_deleter))); + } return torch::stable::Tensor(ath); } #endif // TORCH_FEATURE_VERSION >= TORCH_VERSION_2_11_0 From fa384de31efe6548e694758d47ff295f2c2edb57 Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Wed, 11 Mar 2026 15:03:16 -0700 Subject: [PATCH 37/87] [Inductor][MPS] Fix half-precision type mismatches in Metal shader codegen (#176436) (#177193) Metal Shading Language rejects implicit float-to-bfloat conversions, so bare float literals like `0.0` in generated shaders cause compilation failures when the target variable is `bfloat` (or `half`). Three codegen methods were affected: - `constant()` ignored its `dtype` parameter and returned raw literals. - `masked()` assigned a bare literal in the else-branch (`} else tmp = 0.0;`). - `where()` passed a bare literal through the ternary without casting. All three now emit `static_cast(...)` / `static_cast(...)` where needed. Tests added for half-precision constants, reductions, and conditionals. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/176436 Approved by: https://github.com/malfet Test plan: Run `python -c "import torch;F=torch.nn.functional;print(torch.compile(lambda x: F.pad(F.gelu(x), [1, 0]))(torch.randn(4, device='mps', dtype=torch.bfloat16)))"` (cherry picked from commit 3b161e7a756798e6eb1ab096f4ef1232d163a68d) Co-authored-by: Mergen Nachin --- torch/_inductor/codegen/mps.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/torch/_inductor/codegen/mps.py b/torch/_inductor/codegen/mps.py index 05d0e84c681ad..4e409238d0b72 100644 --- a/torch/_inductor/codegen/mps.py +++ b/torch/_inductor/codegen/mps.py @@ -240,13 +240,17 @@ def masked(mask: CSEVariable, body: sympy.Expr, other: CSEVariable) -> str: ) with V.kernel.compute.indent(): V.kernel.compute.splice(scoped_body) - V.kernel.compute.writeline(f"{var} = {rc};") - V.kernel.compute.writeline(f"}} else {var} = {other_str};") + V.kernel.compute.writeline( + f"{var} = static_cast({rc});" + ) + V.kernel.compute.writeline( + f"}} else {var} = static_cast({other_str});" + ) return var @staticmethod def where(a: OpVarT, b: OpVarT, c: OpVarT) -> str: - return f"{a} ? {b} : {value_to_metal(c)}" + return f"{a} ? {b} : static_cast({value_to_metal(c)})" @staticmethod def remainder(a: OpVarT, b: OpVarT) -> str: From 76fd07897dd9126df160e9723d97511b79888087 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 12 Mar 2026 15:59:41 -0400 Subject: [PATCH 38/87] [release-only] Fix libtorch builds. 
Fix lint (#177299) --- .../workflows/generated-linux-binary-libtorch-nightly.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml index db8ed62b924ef..88152f2cf92dd 100644 --- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml +++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml @@ -156,7 +156,7 @@ jobs: build_name: libtorch-cuda12_6-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda12_6-shared-with-deps-release-upload: # Uploading @@ -225,7 +225,7 @@ jobs: build_name: libtorch-cuda12_8-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda12_8-shared-with-deps-release-upload: # Uploading @@ -294,7 +294,7 @@ jobs: build_name: libtorch-cuda12_9-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading @@ -363,7 +363,7 @@ jobs: build_name: libtorch-cuda13_0-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + runs_on: 
linux.g4dn.4xlarge.nvidia.gpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda13_0-shared-with-deps-release-upload: # Uploading From 7f2cdeb75b76bf07bb73776444bbb94456adbfa0 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Thu, 12 Mar 2026 20:43:15 -0700 Subject: [PATCH 39/87] [windows][smoke test] Add an option to install cuda if required cuda/cudnn on windows AMI do not match (#177369) [windows][smoke test] Add an option to install cuda if required cuda/cudnn on windows AMI do not match (#177273) Followup fix for https://github.com/pytorch/pytorch/issues/167242 After https://github.com/pytorch/pytorch/pull/175547 Windows AMI and intended version of CUDNN do not match. Hence follow-up fixes are required. With this approach we are flexible to update windows CUDNN without actually updating the Windows AMI. Windows AMI update can be done later. Fixes failure during test: https://github.com/pytorch/pytorch/actions/runs/22979336872/job/66724249070 ``` RuntimeError: cuDNN version incompatibility: PyTorch was compiled against (9, 19, 0) but found runtime version (9, 10, 2). PyTorch already comes bundled with cuDNN. One option to resolving this error is to ensure PyTorch can find the bundled cuDNN. ``` Please note: The cuda/cudnn version will not be updated if the right version is already installed.
Test Plan: In CI via ciflow/binaries Pull Request resolved: https://github.com/pytorch/pytorch/pull/177273 Approved by: https://github.com/malfet, https://github.com/albanD (cherry picked from commit e55da9f31ef3ca0a5e0bb0ba29c6d2a6b1352f52) Co-authored-by: atalman --- .ci/pytorch/windows/internal/cuda_install.bat | 35 +++++++++++++------ .ci/pytorch/windows/internal/smoke_test.bat | 4 +++ 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/.ci/pytorch/windows/internal/cuda_install.bat b/.ci/pytorch/windows/internal/cuda_install.bat index 456b53183f186..3538c7aa2d323 100644 --- a/.ci/pytorch/windows/internal/cuda_install.bat +++ b/.ci/pytorch/windows/internal/cuda_install.bat @@ -17,8 +17,8 @@ set /a CUDA_VER=%CUDA_VERSION% set CUDA_VER_MAJOR=%CUDA_VERSION:~0,-1% set CUDA_VER_MINOR=%CUDA_VERSION:~-1,1% set CUDA_VERSION_STR=%CUDA_VER_MAJOR%.%CUDA_VER_MINOR% -set CUDNN_FOLDER="cuda" -set CUDNN_LIB_FOLDER="lib\x64" +set CUDNN_FOLDER=cuda +set CUDNN_LIB_FOLDER=lib\x64 :: If CUDA is already installed, skip CUDA installation but still verify cuDNN if exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" goto check_cudnn @@ -59,7 +59,7 @@ goto cuda_download :: Common download logic for CUDA toolkit, cuDNN, and ZLIB :cuda_download -set CUDNN_LIB_FOLDER="lib" +set CUDNN_LIB_FOLDER=lib set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( @@ -126,9 +126,12 @@ if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_ echo Installing cuDNN... 
7z x %CUDNN_SETUP_FILE% -o"%SRC_DIR%\temp_build\cudnn" - xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin" - xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\%CUDNN_LIB_FOLDER%\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\lib\x64" - xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\include\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include" + xcopy /Y /S "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\" + if exist "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\x64\*.*" ( + xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\x64\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\" + ) + xcopy /Y /S "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\%CUDNN_LIB_FOLDER%\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\lib\x64\" + xcopy /Y /S "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\include\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include\" echo Installing GPU driver DLLs 7z x %SRC_DIR%\temp_build\gpu_driver_dlls.zip -o"C:\Windows\System32" @@ -152,7 +155,7 @@ goto set_cuda_env_vars :: When CUDA is pre-installed on the AMI, cuDNN may still be missing. :: Set the correct cuDNN variables for the CUDA version, then install if needed. -set CUDNN_LIB_FOLDER="lib" +set CUDNN_LIB_FOLDER=lib if %CUDA_VER% EQU 126 ( set CUDNN_FOLDER=cudnn-windows-x86_64-9.10.2.21_cuda12-archive set EXPECTED_CUDNN_VERSION=9.10.2 @@ -190,6 +193,11 @@ if "%INSTALLED_CUDNN_VERSION%" == "%EXPECTED_CUDNN_VERSION%" ( echo cuDNN version mismatch: installed %INSTALLED_CUDNN_VERSION%, expected %EXPECTED_CUDNN_VERSION%. Reinstalling... +:: Remove old cuDNN DLLs so they don't shadow the new version at runtime. 
+:: AMI-installed cuDNN places DLLs directly in bin\, while newer archives +:: use bin\x64\. Without cleanup the old DLLs in bin\ are found first. +del /Q "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\cudnn*.dll" 2>nul + :install_cudnn if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build" @@ -202,9 +210,16 @@ if errorlevel 1 ( echo Failed to extract cuDNN archive %CUDNN_INSTALL_ZIP% exit /b 1 ) -xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin" -xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\%CUDNN_LIB_FOLDER%\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\lib\x64" -xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\include\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include" +echo Listing extracted cuDNN archive contents: +dir /S /B "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%" +xcopy /Y /S "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\" +:: Newer cuDNN archives place DLLs under bin\x64\. Flatten them into bin\ +:: so they are found via PATH (which only includes bin\, not bin\x64\). 
+if exist "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\x64\*.*" ( + xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\x64\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\" +) +xcopy /Y /S "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\%CUDNN_LIB_FOLDER%\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\lib\x64\" +xcopy /Y /S "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\include\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include\" call :install_zlib diff --git a/.ci/pytorch/windows/internal/smoke_test.bat b/.ci/pytorch/windows/internal/smoke_test.bat index f671a9d0e0abb..c920dc2aeb165 100644 --- a/.ci/pytorch/windows/internal/smoke_test.bat +++ b/.ci/pytorch/windows/internal/smoke_test.bat @@ -5,6 +5,10 @@ pushd %SRC_DIR%\.. if not "%CUDA_VERSION%" == "cpu" if not "%CUDA_VERSION%" == "xpu" call internal\driver_update.bat if errorlevel 1 exit /b 1 +echo "Check if CUDA and CUDNN versions need to be updated" +call internal\cuda_install.bat +if errorlevel 1 exit /b 1 + if "%CUDA_VERSION%" == "xpu" ( call internal\xpu_install.bat if errorlevel 1 exit /b 1 From 483b55d84c74b92b3c2c67be4b9b7c7359ec2bbc Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Tue, 17 Mar 2026 11:03:09 -0700 Subject: [PATCH 40/87] Update pytorch_sphinx_theme2 version to 0.4.6 (#177616) Update pytorch_sphinx_theme2 version to 0.4.6 (#177562) Changelog here: https://github.com/pytorch/pytorch_sphinx_theme/blob/pytorch_sphinx_theme2/CHANGELOG.md Pull Request resolved: https://github.com/pytorch/pytorch/pull/177562 Approved by: https://github.com/AlannaBurke, https://github.com/albanD (cherry picked from commit c5dcefde3fe9bd88f25e626aa8201c54b4143f87) Co-authored-by: Svetlana Karslioglu --- .ci/docker/requirements-docs.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt index 7f3e0b5cc9215..7e556a80e0025 100644 --- 
a/.ci/docker/requirements-docs.txt +++ b/.ci/docker/requirements-docs.txt @@ -2,9 +2,9 @@ sphinx==7.2.6 #Description: This is used to generate PyTorch docs #Pinned versions: 7.2.6 -pytorch_sphinx_theme2==0.4.3 +pytorch_sphinx_theme2==0.4.6 #Description: This is needed to generate PyTorch docs -#Pinned versions: 0.4.3 +#Pinned versions: 0.4.6 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering # but it doesn't seem to work and hangs around idly. The initial thought that it is probably From db741c72097871e384b22ee6fff1d6083adf23cc Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Fri, 20 Mar 2026 20:04:26 -0700 Subject: [PATCH 41/87] [MPS] fix compiling of SDPA producing nan results (#178009) [MPS] fix compiling of SDPA producing nan results (#175481) Fixes #171764 Took me a while to figure out wth was going wrong. Mini reproducer: ```python import torch # (uint / 65536) % non_power of 2, gives wrong result lib = torch.mps.compile_shader(''' kernel void func(device int* out, uint idx [[thread_position_in_grid]]) { out[idx] = (idx / 65536) % 6; } ''') out = torch.empty(128, device='mps', dtype=torch.int32) lib.func(out) # Every value should be 0 since xindex/65536 == 0 for xindex in [0,127] for i in [0, 5, 6, 7, 63, 64]: print(f"{i=} got {out[i].item()}") ``` Same purely in swift ```swift import Metal let device = MTLCreateSystemDefaultDevice()! let queue = device.makeCommandQueue()! let shaderSource = """ kernel void func(device int* out [[buffer(0)]], uint idx [[thread_position_in_grid]]) { out[idx] = (idx / 65536) % 6; } """ let library = try device.makeLibrary(source: shaderSource, options: nil) let function = library.makeFunction(name: "func")! let pipeline = try device.makeComputePipelineState(function: function) let count = 128 let buffer = device.makeBuffer(length: count * MemoryLayout<Int32>.stride, options: .storageModeShared)! let cmdBuf = queue.makeCommandBuffer()! let encoder = cmdBuf.makeComputeCommandEncoder()!
encoder.setComputePipelineState(pipeline) encoder.setBuffer(buffer, offset: 0, index: 0) encoder.dispatchThreads( MTLSizeMake(count, 1, 1), threadsPerThreadgroup: MTLSizeMake(min(count, pipeline.maxTotalThreadsPerThreadgroup), 1, 1) ) encoder.endEncoding() cmdBuf.commit() cmdBuf.waitUntilCompleted() let ptr = buffer.contents().bindMemory(to: Int32.self, capacity: count) for i in [0, 5, 6, 7, 63, 64] { print("i=\(i) got \(ptr[i])") } ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/175481 Approved by: https://github.com/malfet (cherry picked from commit 3a9554c6436a1636d98db225af699d7e40c3bf12) Co-authored-by: Isalia20 --- c10/metal/utils.h | 12 ++++++++++++ test/inductor/test_mps_basic.py | 19 +++++++++++++++++++ test/test_mps.py | 16 ++++++++++++++++ torch/_inductor/codegen/mps.py | 5 +++++ 4 files changed, 52 insertions(+) diff --git a/c10/metal/utils.h b/c10/metal/utils.h index 8d58d0dfdd1f2..cc946e4fc4aa4 100644 --- a/c10/metal/utils.h +++ b/c10/metal/utils.h @@ -189,6 +189,18 @@ inline common_dtype floor_divide(T x, U y) { return ::metal::floor(x / y); } +// Workaround for Metal compiler bug: the compiler produces wrong results +// when optimizing fused (x / A) % B expressions for integral types. +template < + typename T, + typename U, + ::metal::enable_if_t< + is_scalar_integral_v && is_scalar_integral_v, + bool> = true> +inline common_dtype safe_mod(volatile T x, U y) { + return x % y; +} + // fmod template < typename T, diff --git a/test/inductor/test_mps_basic.py b/test/inductor/test_mps_basic.py index 5c1691e581ccc..49d25247a8e32 100644 --- a/test/inductor/test_mps_basic.py +++ b/test/inductor/test_mps_basic.py @@ -191,6 +191,25 @@ def fn(a, b): ), ) + def test_sdpa_split_qkv(self): + # regression test for metal compiler bug where fused (x / A) % B + # produces wrong results, causing incorrect reads from non-contiguous. 
+ n_head, n_embd, seq_len = 6, 384, 1024 + x = torch.randn(16, seq_len, n_embd, device="mps") + c_attn = torch.nn.Linear(n_embd, 3 * n_embd).to("mps").eval() + qkv = c_attn(x) + q, k, v = qkv.split(n_embd, dim=2) + q = q.view(16, seq_len, n_head, n_embd // n_head).transpose(1, 2) + k = k.view(16, seq_len, n_head, n_embd // n_head).transpose(1, 2) + v = v.view(16, seq_len, n_head, n_embd // n_head).transpose(1, 2) + + def fn(q, k, v): + return torch.nn.functional.scaled_dot_product_attention( + q, k, v, is_causal=True + ) + + self.common(fn, (q, k, v), atol=1e-4, rtol=1e-4, check_lowp=False) + class MPSBasicTestsAOTI(TestCase): def check_model(self, m, inp, dynamic_shapes=None): diff --git a/test/test_mps.py b/test/test_mps.py index 02e291017582e..31a2e0161c2e6 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -13313,6 +13313,22 @@ def test_metal_error_buffer(self): with self.assertRaisesRegex(RuntimeError, "Index .* exceeds limit"): torch.mps.synchronize() + def test_metal_compiler_bug_workaround(self): + # (uint / 65536) % non_power of 2, gives wrong result without safe_mod + lib = torch.mps.compile_shader(''' + #include + + kernel void func(device int* out, uint idx [[thread_position_in_grid]]) { + out[idx] = c10::metal::safe_mod((idx / 65536), 6); + } + ''') + out = torch.empty(128, device='mps', dtype=torch.int32) + lib.func(out) + # Every value should be 0 since xindex/65536 == 0 for xindex in [0,127] + for i in [0, 5, 6, 7, 63, 64]: + self.assertEqual(out[i], 0) + + # TODO: Actually instantiate that test for the "mps" device to better reflect what it is doing. 
# This requires mps to be properly registered in the device generic test framework which is not the diff --git a/torch/_inductor/codegen/mps.py b/torch/_inductor/codegen/mps.py index 4e409238d0b72..b413a6f43f636 100644 --- a/torch/_inductor/codegen/mps.py +++ b/torch/_inductor/codegen/mps.py @@ -80,6 +80,9 @@ def _print_FloorDiv(self, expr: sympy.Expr) -> str: def _print_ModularIndexing(self, expr: sympy.Expr) -> str: x, div, mod = expr.args + # Workaround for Metal compiler bug with fused (x / A) % B, see PR 175481 + use_safe_mod = div == 65536 and (mod & (mod - 1)) != 0 + x = self.doprint(x) if div != 1: div = self.doprint(div) @@ -88,6 +91,8 @@ def _print_ModularIndexing(self, expr: sympy.Expr) -> str: else: x = f"metal::floor({x}) / ({div})" mod = self.doprint(mod) + if use_safe_mod: + return f"c10::metal::safe_mod({x}, {mod})" return f"({x}) % ({mod})" def _print_Min(self, expr: sympy.Expr) -> str: From 3e05c5a9ca8aacd0d137541876f8bf4cfca7e940 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Fri, 20 Mar 2026 20:05:16 -0700 Subject: [PATCH 42/87] [MPS] Properly handle conjugated tensors in bmm (#178010) [MPS] Properly handle conjugated tensors in bmm (#177522) Both `bmm` and `addmm` lacked proper handling for conjugated inputs for some of their arguments - Add regression tests - Fixes `test_noncontiguous_samples_linalg_svd_complex64` Fixes https://github.com/pytorch/pytorch/issues/177474 Pull Request resolved: https://github.com/pytorch/pytorch/pull/177522 Approved by: https://github.com/Skylion007, https://github.com/kurtamohler (cherry picked from commit bd1afa6b33a9f933a6da464b2f688b042bb5f275) Co-authored-by: Nikita Shulga --- .../native/mps/operations/LinearAlgebra.mm | 24 +++++++++++-------- test/test_mps.py | 21 ++++++++++++++++ .../_internal/common_methods_invocations.py | 8 ------- .../_internal/opinfo/definitions/linalg.py | 11 --------- 4 files changed, 35 insertions(+), 29 deletions(-) diff --git a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm
b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm index d98134469ec04..fa3796e59b969 100644 --- a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm +++ b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm @@ -888,7 +888,8 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const std::string key = "addmm_out_mps_impl" + getTensorsStringKey({self, other, *bias_}) + ":" + std::to_string(beta.toDouble()) + ":" + std::to_string(alpha.toDouble()); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - MPSGraphTensor* biasTensor = mpsGraphRankedPlaceHolder(mpsGraph, *bias_); + auto biasTensor = mpsGraphRankedPlaceHolder(mpsGraph, *bias_); + auto biasTensor_ = bias_->is_conj() ? [mpsGraph conjugateWithTensor:biasTensor name:nil] : biasTensor; // TODO: Use alpha and beta here with fill_.Scalar and mul auto [selfTensor, otherTensor, productTensor] = do_mm(mpsGraph, self, other); @@ -901,11 +902,11 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const secondaryTensor:alphaTensor name:@"MM/alpha*(mat1@mat2)"]; } - auto biasTimesBetaTensor = biasTensor; + auto biasTimesBetaTensor = biasTensor_; if (is_beta_non_zero && beta.toDouble() != 1.0) { auto betaTensor = [mpsGraph constantWithScalar:beta.toDouble() dataType:getMPSScalarType((*bias_).scalar_type())]; - biasTimesBetaTensor = [mpsGraph multiplicationWithPrimaryTensor:biasTensor + biasTimesBetaTensor = [mpsGraph multiplicationWithPrimaryTensor:biasTensor_ secondaryTensor:betaTensor name:@"MM/beta*input"]; } @@ -1112,7 +1113,8 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const // Call tiled implementation if the number of elements exceeds 2^32 uint64_t resultSize = batch1.size(0) * batch1.size(1) * batch2.size(2); if (resultSize > pow(2, 32)) { - result = tiled_bmm_out_mps_impl(batch1, batch2, result); + // Tiled path uses MPSNDArray directly, so resolve conjugate views upfront + result = 
tiled_bmm_out_mps_impl(batch1.resolve_conj(), batch2.resolve_conj(), result); return result; } @@ -1130,16 +1132,18 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const std::to_string(doTranspose); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - MPSGraphTensor* batch1Tensor = mps::mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(batch1.scalar_type())); - MPSGraphTensor* batch2Tensor = mps::mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(batch2.scalar_type())); - MPSGraphTensor* batch2TensorTranspose = batch2Tensor; + auto batch1Tensor = mps::mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(batch1.scalar_type())); + auto batch2Tensor = mps::mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(batch2.scalar_type())); + + auto batch1TensorOp = batch1.is_conj() ? [mpsGraph conjugateWithTensor:batch1Tensor name:nil] : batch1Tensor; + auto batch2TensorOp = batch2.is_conj() ? [mpsGraph conjugateWithTensor:batch2Tensor name:nil] : batch2Tensor; if (doTranspose) { - batch2TensorTranspose = [mpsGraph transposeTensor:batch2Tensor dimension:-1 withDimension:-2 name:nil]; + batch2TensorOp = [mpsGraph transposeTensor:batch2TensorOp dimension:-1 withDimension:-2 name:nil]; } - MPSGraphTensor* productTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:batch1Tensor - secondaryTensor:batch2TensorTranspose + MPSGraphTensor* productTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:batch1TensorOp + secondaryTensor:batch2TensorOp name:@"MM/(batch1@batch2)"]; newCachedGraph->batch1Tensor_ = batch1Tensor; diff --git a/test/test_mps.py b/test/test_mps.py index 31a2e0161c2e6..5ca9f2ea484b8 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -1207,6 +1207,27 @@ def test_bmm(self): self.assertEqual(output_cpu, output_mps) self.assertEqual(output_cpu.size(), output_mps.size()) + def test_bmm_conj(self): + # bmm must respect the conjugate bit on input tensors. 
+ # See https://github.com/pytorch/pytorch/issues/177474 + a = torch.randn(4, 3, 5, dtype=torch.complex64, device="mps") + b = torch.randn(4, 5, 2, dtype=torch.complex64, device="mps") + result_mps = torch.bmm(a, b.conj()) + result_cpu = torch.bmm(a.cpu(), b.cpu().conj()) + self.assertEqual(result_cpu, result_mps) + result_mps = torch.bmm(a.conj(), b) + result_cpu = torch.bmm(a.cpu().conj(), b.cpu()) + self.assertEqual(result_cpu, result_mps) + + def test_addmm_conj(self): + # Regression test: addmm must respect the conjugate bit on the bias tensor. + bias = torch.randn(3, 2, dtype=torch.complex64, device="mps") + a = torch.randn(3, 5, dtype=torch.complex64, device="mps") + b = torch.randn(5, 2, dtype=torch.complex64, device="mps") + result_mps = torch.addmm(bias.conj(), a, b) + result_cpu = torch.addmm(bias.cpu().conj(), a.cpu(), b.cpu()) + self.assertEqual(result_cpu, result_mps) + @xfailIf(MACOS_VERSION < 15.0) @parametrize("dtype", [torch.float16, torch.bfloat16]) def test_large_bmm(self, dtype): diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index e3ac2e1c4aade..a019f5f1bfdce 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -19251,14 +19251,6 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): device_type='mps', dtypes=[torch.float32]), # The operator 'aten::take' is not currently implemented for the MPS device DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning', device_type='mps'), - # RuntimeError: svd_backward: The singular vectors in the complex - # case are specified up to multiplication by e^{i phi}. The - # specified loss function depends on this phase term, making it - # ill-defined. 
- DecorateInfo( - unittest.expectedFailure, 'TestCommon', 'test_noncontiguous_samples', - device_type='mps', dtypes=(torch.complex64,) - ), )), OpInfo('svd_lowrank', op=lambda *args, **kwargs: wrapper_set_seed( diff --git a/torch/testing/_internal/opinfo/definitions/linalg.py b/torch/testing/_internal/opinfo/definitions/linalg.py index 70c650dc327d8..b1b68d744751f 100644 --- a/torch/testing/_internal/opinfo/definitions/linalg.py +++ b/torch/testing/_internal/opinfo/definitions/linalg.py @@ -2728,17 +2728,6 @@ def make_input(): "test_out_warning", device_type="mps", ), - # MPS: RuntimeError: svd_backward: The singular vectors in the - # complex case are specified up to multiplication by e^{i phi}. The - # specified loss function depends on this phase term, making it - # ill-defined. - DecorateInfo( - unittest.expectedFailure, - "TestCommon", - "test_noncontiguous_samples", - device_type="mps", - dtypes=(torch.complex64,), - ), ), ), OpInfo( From 70d99e998b4955e0049d13a98d77ae1b14db1f45 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 20 Mar 2026 23:11:45 -0400 Subject: [PATCH 43/87] [release only] Increase timeout for rocm libtorch and manywheel builds (#178006) --- .../linux_binary_build_workflow.yml.j2 | 2 +- ...enerated-linux-binary-libtorch-nightly.yml | 4 +-- ...nerated-linux-binary-manywheel-nightly.yml | 28 +++++++++---------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index 3f41256728e52..f1b85e8a8bf65 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -79,7 +79,7 @@ jobs: timeout-minutes: 420 {%- elif config["gpu_arch_type"] == "rocm" %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 + timeout-minutes: 420 {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %} runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" runs_on: linux.24xlarge.ephemeral diff --git a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml index 88152f2cf92dd..b05b969362f13 100644 --- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml +++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml @@ -406,7 +406,7 @@ jobs: LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 + timeout-minutes: 420 build_name: libtorch-rocm7_1-shared-with-deps-release build_environment: linux-binary-libtorch secrets: @@ -524,7 +524,7 @@ jobs: LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 + timeout-minutes: 420 build_name: libtorch-rocm7_2-shared-with-deps-release build_environment: linux-binary-libtorch secrets: diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index 3dd2d544fd7f0..08e238832f474 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -394,7 +394,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm7.1 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 + timeout-minutes: 420 build_name: manywheel-py3_10-rocm7_1 build_environment: linux-binary-manywheel secrets: @@ -509,7 +509,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm7.2 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 + timeout-minutes: 420 build_name: manywheel-py3_10-rocm7_2 build_environment: linux-binary-manywheel secrets: @@ -1058,7 +1058,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm7.1 DESIRED_PYTHON: "3.11" runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 + timeout-minutes: 420 build_name: manywheel-py3_11-rocm7_1 build_environment: linux-binary-manywheel secrets: @@ -1173,7 +1173,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm7.2 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 + timeout-minutes: 420 build_name: manywheel-py3_11-rocm7_2 build_environment: linux-binary-manywheel secrets: @@ -1722,7 +1722,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm7.1 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 + timeout-minutes: 420 build_name: manywheel-py3_12-rocm7_1 build_environment: linux-binary-manywheel secrets: @@ -1837,7 +1837,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm7.2 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 + timeout-minutes: 420 build_name: manywheel-py3_12-rocm7_2 build_environment: linux-binary-manywheel secrets: @@ -2386,7 +2386,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm7.1 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 + timeout-minutes: 420 build_name: manywheel-py3_13-rocm7_1 build_environment: linux-binary-manywheel secrets: @@ -2501,7 +2501,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm7.2 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 + timeout-minutes: 420 build_name: manywheel-py3_13-rocm7_2 build_environment: linux-binary-manywheel secrets: @@ -3050,7 +3050,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm7.1 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 + timeout-minutes: 420 build_name: manywheel-py3_13t-rocm7_1 build_environment: linux-binary-manywheel secrets: @@ -3165,7 +3165,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm7.2 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 + timeout-minutes: 420 build_name: manywheel-py3_13t-rocm7_2 build_environment: linux-binary-manywheel secrets: @@ -3714,7 +3714,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm7.1 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 + timeout-minutes: 420 build_name: manywheel-py3_14-rocm7_1 build_environment: linux-binary-manywheel secrets: @@ -3829,7 +3829,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm7.2 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 + timeout-minutes: 420 build_name: manywheel-py3_14-rocm7_2 build_environment: linux-binary-manywheel secrets: @@ -4378,7 +4378,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm7.1 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 + timeout-minutes: 420 build_name: manywheel-py3_14t-rocm7_1 build_environment: linux-binary-manywheel secrets: @@ -4493,7 +4493,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm7.2 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 + timeout-minutes: 420 build_name: manywheel-py3_14t-rocm7_2 build_environment: linux-binary-manywheel secrets: From 4e45bcd8746bd77118ebfe853cbeda940a540b83 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 16 Jul 2025 03:43:38 +0000 Subject: [PATCH 44/87] [release/2.8] Upgrade numpy versions; Use different package versions for py3.9; upgrade tensorboard compatible with numpy 2 Co-authored-by: Ethan Wee (cherry picked from commit e867a3de4b0196621e8e53d5338a8bb8bb62e828) (cherry picked from commit c7a1e32fbcf9e0a458d959a453de65c27c51452c) (cherry picked from commit 2a215e4a2115c999e4bb058956d888aed67787d1) (cherry picked from commit 866cc1dbb9c93f807af1ef59801c645062cbb95e) (cherry picked from commit 4b46310999bc247e0a5b97ea90a96a44b8579d09) --- .ci/docker/requirements-ci.txt | 23 
+++++++++++------------ requirements.txt | 3 +++ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 9a033b90fcb46..0e59119e3c4f7 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -117,8 +117,10 @@ ninja==1.11.1.4 #Pinned versions: 1.11.1.4 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py -numba==0.57.1 ; python_version == "3.10" and platform_machine != "s390x" -numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x" +numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x" +numba==0.60.0 ; python_version == "3.9" and platform_machine != "s390x" +numba==0.61.2 ; python_version > "3.9" and platform_machine != "s390x" + #Description: Just-In-Time Compiler for Numerical Functions #Pinned versions: 0.55.2, 0.60.0 #test that import: test_numba_integration.py @@ -136,13 +138,10 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x" #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py, #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py, #test_binary_ufuncs.py -numpy==1.23.2; python_version == "3.10" -numpy==1.26.2; python_version == "3.11" or python_version == "3.12" -numpy==2.1.2; python_version >= "3.13" and python_version < "3.14" -numpy==2.3.4; python_version >= "3.14" +numpy==2.0.2 ; python_version == "3.9" +numpy==2.1.2 ; python_version > "3.9" -pandas==2.0.3; python_version < "3.12" -pandas==2.2.3; python_version >= "3.12" and python_version < "3.14" +pandas==2.2.3; python_version < "3.14" pandas==2.3.3; python_version >= "3.14" #onnxruntime @@ -254,9 +253,10 @@ scikit-image==0.22.0 #Pinned versions: 0.20.3 #test that import: -scipy==1.10.1 ; python_version <= "3.11" -scipy==1.14.1 ; python_version > "3.11" and python_version < "3.14" +scipy==1.13.1 ; python_version == "3.9" +scipy==1.14.1 ; python_version > "3.9" and 
python_version < "3.14" scipy==1.16.2 ; python_version >= "3.14" + # Pin SciPy because of failing distribution tests (see #60347) #Description: scientific python #Pinned versions: 1.10.1 @@ -316,8 +316,7 @@ z3-solver==4.15.1.0 ; platform_machine != "s390x" #Pinned versions: #test that import: -tensorboard==2.13.0 ; python_version < "3.13" -tensorboard==2.18.0 ; python_version >= "3.13" +tensorboard==2.18.0 #Description: Also included in .ci/docker/requirements-docs.txt #Pinned versions: #test that import: test_tensorboard diff --git a/requirements.txt b/requirements.txt index ae7f335c883cf..4559c2ad331f0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,6 +12,9 @@ hypothesis jinja2 lintrunner ; platform_machine != "s390x" networkx>=2.5.1 +ninja +numpy==2.0.2 ; python_version == "3.9" +numpy==2.1.2 ; python_version > "3.9" optree>=0.13.0 psutil spin From 3482eafe33d25e146394b685e23e61758122002c Mon Sep 17 00:00:00 2001 From: Ramya Ramineni <62723901+rraminen@users.noreply.github.com> Date: Mon, 14 Jul 2025 12:23:45 -0500 Subject: [PATCH 45/87] Clean up CUDA state between tests (#2335) This PR fixes the unit test, test/test_cuda.py::TestCuda::test_set_per_process_memory_fraction FAILED [0.1163s] ``` Traceback (most recent call last): File "/var/lib/jenkins/pytorch/test/test_cuda.py", line 471, in test_set_per_process_memory_fraction tmp_tensor = torch.empty(application, dtype=torch.int8, device="cuda") RuntimeError: Trying to create tensor with negative dimension -5681285432: [-5681285432] ``` This error occurs only on gfx1101 arch. This error is coming from an integer overflow when another unit test, test/test_cuda.py::TestCuda::test_randint_generation_for_large_numel creates a tensor with a huge numel, which overflows into a higher torch.cuda.max_memory_reserved() when you call test/test_cuda.py::TestCuda::test_set_per_process_memory_fraction afterward. 
To avoid this we introduced torch.cuda.empty_cache() and torch.cuda.reset_peak_memory_stats() to clean up CUDA states. JIRA: https://ontrack-internal.amd.com/browse/SWDEV-535295 (cherry picked from commit f86d18439897232a374504c36b40da99c14ade1a) (cherry picked from commit 1b442282359fd69384634c882051c18565a5f744) --- test/test_cuda.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/test_cuda.py b/test/test_cuda.py index 72a4e5e1296a6..bfbec8c308985 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -460,6 +460,9 @@ def test_out_of_memory_retry(self): IS_JETSON, "oom reporting has issues on jetson igx due to partial nvml support" ) def test_set_per_process_memory_fraction(self): + if torch.version.hip and ('gfx1101' in torch.cuda.get_device_properties(0).gcnArchName): + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() orig = torch.cuda.get_per_process_memory_fraction(0) torch.cuda.reset_peak_memory_stats(0) try: From 55881ef8485b9929394af12bda963617c6490d2b Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Mon, 17 Nov 2025 16:34:42 -0800 Subject: [PATCH 46/87] reset per process memory fraction in test_cuda.py test_mempool_limited_memory_with_allocator (#2811) Use try/finally block. This follows a similar pattern elsewhere in test_cuda.py. Fixes https://github.com/ROCm/TheRock/issues/2118.
--- test/test_cuda.py | 109 +++++++++++++++++++++++----------------------- 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/test/test_cuda.py b/test/test_cuda.py index bfbec8c308985..df317f1f01cbf 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -6058,67 +6058,68 @@ def test_mempool_limited_memory_with_allocator(self): nelem_1mb = 1024 * 1024 // 4 self._setup_mempool_limited_memory_test(80) - # remaining free mem: 80 mb - # mempool_use [] 0 mb - # mempool_do_not_use [] 0 mb - # default pool [] 0 mb - with torch.cuda.use_mem_pool(pool_do_not_use): - a = torch.randn(40 * nelem_1mb, device="cuda") - with torch.cuda.use_mem_pool(pool_use): - b = torch.randn(40 * nelem_1mb, device="cuda") - a_dataptr = a.data_ptr() - b_dataptr = b.data_ptr() - # remaining free mem: 0 mb - # mempool_do_not_use [aaaa] 40 mb - # mempool_use [bbbb] 40 mb - # default pool [] 0 mb - with self.assertRaises(torch.OutOfMemoryError): - # out of memory - c = torch.randn(40 * nelem_1mb, device="cuda") - - del a, b - # remaining free mem: 0 mb - # mempool_do_not_use [____] 40 mb - # mempool_use [____] 40 mb - # default pool [] 0 mb - - # c should not oom and instead can use mempool_use as fallback - c = torch.randn(30 * nelem_1mb, device="cuda") - c_dataptr = c.data_ptr() - # remaining free mem: 0 mb - # mempool_do_not_use [____] 40 mb - # mempool_use [ccc_] 40 mb - # default pool [] 0 mb - with self.assertRaises(torch.OutOfMemoryError): - # out of memory since can't use mempool_do_not_use - d = torch.randn(30 * nelem_1mb, device="cuda") + try: + # remaining free mem: 80 mb + # mempool_use [] 0 mb + # mempool_do_not_use [] 0 mb + # default pool [] 0 mb + with torch.cuda.use_mem_pool(pool_do_not_use): + a = torch.randn(40 * nelem_1mb, device="cuda") + with torch.cuda.use_mem_pool(pool_use): + b = torch.randn(40 * nelem_1mb, device="cuda") + a_dataptr = a.data_ptr() + b_dataptr = b.data_ptr() + # remaining free mem: 0 mb + # mempool_do_not_use [aaaa] 40 mb + # mempool_use [bbbb] 
40 mb + # default pool [] 0 mb + with self.assertRaises(torch.OutOfMemoryError): + # out of memory + c = torch.randn(40 * nelem_1mb, device="cuda") - del c - # remaining free mem: 0 mb - # mempool_do_not_use [____] 40 mb - # mempool_use [____] 40 mb - # default pool [] 0 mb + del a, b + # remaining free mem: 0 mb + # mempool_do_not_use [____] 40 mb + # mempool_use [____] 40 mb + # default pool [] 0 mb + + # c should not oom and instead can use mempool_use as fallback + c = torch.randn(30 * nelem_1mb, device="cuda") + c_dataptr = c.data_ptr() + # remaining free mem: 0 mb + # mempool_do_not_use [____] 40 mb + # mempool_use [ccc_] 40 mb + # default pool [] 0 mb + with self.assertRaises(torch.OutOfMemoryError): + # out of memory since can't use mempool_do_not_use + d = torch.randn(30 * nelem_1mb, device="cuda") - # expect that we used same memory address for both a and c - self.assertEqual(b_dataptr, c_dataptr) + del c + # remaining free mem: 0 mb + # mempool_do_not_use [____] 40 mb + # mempool_use [____] 40 mb + # default pool [] 0 mb - # make sure we can still use mempool_use as intended after c is deleted - with torch.cuda.use_mem_pool(pool_use): - e = torch.randn(20 * nelem_1mb, device="cuda") - # remaining free mem: 0 mb - # mempool_do_not_use [____] 40 mb - # mempool_use [ee__] 40 mb - # default pool [] 0 mb + # expect that we used same memory address for both a and c + self.assertEqual(b_dataptr, c_dataptr) - e_dataptr = e.data_ptr() - del e + # make sure we can still use mempool_use as intended after c is deleted + with torch.cuda.use_mem_pool(pool_use): + e = torch.randn(20 * nelem_1mb, device="cuda") + # remaining free mem: 0 mb + # mempool_do_not_use [____] 40 mb + # mempool_use [ee__] 40 mb + # default pool [] 0 mb - self.assertEqual(e_dataptr, c_dataptr) + e_dataptr = e.data_ptr() + del e - # pool's destructor calls emptyCache() - del pool_use, pool_do_not_use + self.assertEqual(e_dataptr, c_dataptr) - self._teardown_mempool_limited_memory_test() + # 
pool's destructor calls emptyCache() + del pool_use, pool_do_not_use + finally: + self._teardown_mempool_limited_memory_test() @serialTest() def test_mempool_no_split(self): From 54750ed7ba6aed71b473c44a2f0cb9e82b46ef57 Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Sun, 10 Aug 2025 19:41:41 -0500 Subject: [PATCH 47/87] [AUTOGENERATED] [release/2.8] [SWDEV-539215] - Autotune support for persistent reduction and no_x_dim removal (#2454) Cherry-pick of https://github.com/ROCm/pytorch/pull/2417 Need to resolve conflicts --------- Co-authored-by: Jack Taylor <108682042+jataylo@users.noreply.github.com> (cherry picked from commit eb4715850bcdab5abb35de94bef8981153a1f0fe) [release/2.9][ROCm][inductor] Add ROCm specific persistent reduction config. (#2861) In support of [SWDEV-566103](https://ontrack-internal.amd.com/browse/SWDEV-566103) [release/2.10] Fix Inductor Triton Heuristics (#2931) The ROCm release/2.10 branch was created by applying 15 commits to upstream release/2.10 branch. (See https://github.com/pytorch/pytorch/compare/release/2.10...ROCm:pytorch:release/2.10) This PR fixes the issue with the missing disable_pointwise_autotuning function. There are three commits in this PR: First commit is a revert: 1c96f23e68227384c34f3fe3191de44902ddd159 - Autotuning support for persistent reduction since it is already available in upstream release/2.10 and is not needed. (It reintroduced disable_pointwise_autotuning function.) The second commit (b9facd069668dad33f9bb550f85e1773b937bb91) is needed for provenance, so I can apply the third commit: e5eee742f431738f97a03ba8ff7c69e4541577e3 - Heuristics improvements for reduction kernels which was reverted last minute before the release/2.10 cutoff and then re-landed shortly afterwards the cutoff date but with a minor change. 
--------- Co-authored-by: Pandya, Vivek Vasudevbhai --- torch/_inductor/runtime/triton_heuristics.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 2a1447fbf0bda..8108636663b90 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -3691,6 +3691,17 @@ def _persistent_reduction_configs( if conf not in configs: configs.append(conf) + # Additional custom configs in support of customer workloads + configs.append( + triton_config_reduction( + size_hints, + 1, + rnumel, + num_stages=3, + num_warps=2, + ) # 18% improvement + ) + for c in configs: # we don't need Rn_BLOCK for persistent reduction for prefix in size_hints: From 0bfe1e3c39ecc644a620eecffc596d21f471d263 Mon Sep 17 00:00:00 2001 From: Chinmay Kuchinad Date: Wed, 17 Dec 2025 07:05:17 +0000 Subject: [PATCH 48/87] Update version to 2.11.0 --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index f925b7d0ce58a..46b81d815a23b 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -2.11.0a0 +2.11.0 From 03639f4d63fb3999058c24acea028616772acc43 Mon Sep 17 00:00:00 2001 From: "rocm-repo-management-api[bot]" <189310625+rocm-repo-management-api[bot]@users.noreply.github.com> Date: Tue, 10 Mar 2026 08:16:02 -0700 Subject: [PATCH 49/87] [AUTOGENERATED] [release/2.11] Move getenv to main thread to avoid NCCL race condition (#3054) Cherry-pick of https://github.com/ROCm/pytorch/pull/3043 Co-authored-by: tom.jen Co-authored-by: Jeff Daily --- torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp | 4 +++- torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp | 13 +++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index 827a8c1b13db7..ae9a73c01e189 100644 --- 
a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -922,6 +922,7 @@ ProcessGroupNCCL::ProcessGroupNCCL( PrefixStore* prefixStore = dynamic_cast(store_.get()); globalStore_ = prefixStore ? prefixStore->getUnderlyingNonPrefixStore() : store_; + debugInfoPipeFile_ = getCvarString({"TORCH_NCCL_DEBUG_INFO_PIPE_FILE"}, ""); auto desyncDebug = getCvarBool(TORCH_NCCL_DESYNC_DEBUG, false) || (dist_debug_level_ >= DebugLevel::Detail); #ifdef ENABLE_NCCL_ERROR_CHECKING @@ -1778,7 +1779,8 @@ void ProcessGroupNCCL::HeartbeatMonitor::runLoop() { // DumpPipe is one per-trainer process, and its convenient to name them // after 'global' ranks in the system, So we assume processgroup (uid)==0 is // the global PG and has globally unique rank ids across trainers. - dumpPipe.emplace(pg_->globalRank()); + dumpPipe.emplace( + pg_->globalRank(), pg_->debugInfoPipeFile_, pg_->traceBufferSize_); } while (true) { // This won't have any lock since this lock is only used here. diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index 12aeb49660f6c..c5d3eec1a03db 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -195,16 +195,10 @@ static std::vector TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK = #if defined(__linux__) struct DumpPipe { - DumpPipe(int rank) { - std::string fileStem = - getCvarString({"TORCH_NCCL_DEBUG_INFO_PIPE_FILE"}, ""); - // NOTE: This default value (2000) is duplicated in FlightRecorder.hpp. - // Keep in sync. See FlightRecorder.hpp for details. 
- if (fileStem.empty() || - getCvarInt({"TORCH_NCCL_TRACE_BUFFER_SIZE"}, 2000) <= 0) { + DumpPipe(int rank, const std::string& fileStem, int traceBufferSize) { + if (fileStem.empty() || traceBufferSize <= 0) { return; } - TORCH_CHECK(!fileStem.empty(), "TORCH_NCCL_DEBUG_INFO_PIPE_FILE is empty"); std::string filename = c10::str(fileStem, rank, ".pipe"); TORCH_CHECK( unlink(filename.c_str()) != -1 || errno == ENOENT, @@ -1357,6 +1351,9 @@ class TORCH_API ProcessGroupNCCL : public Backend { // Size of ring buffer where we store NCCL Traces for debugging. int traceBufferSize_; + // Stores TORCH_NCCL_DEBUG_INFO_PIPE_FILE + std::string debugInfoPipeFile_; + // We gate the cudaEventCache so that we can roll it out gradually. std::atomic cudaEventCacheEnabled_; From 41a0f89e53b36a3dd95fe906ab266dc2168cd2c0 Mon Sep 17 00:00:00 2001 From: Anatoliy Litvinenko Date: Thu, 12 Mar 2026 10:04:48 -0500 Subject: [PATCH 50/87] [Release/2.11] No-fence in normalization kernel (#175286) (#3057) Removing need for fences in normalization kernel by converting the stores into atomics+return. This is crucial for perf in architectures with split caches (e.g. MI300), where fences are inherently costly. This change speeds up the `batch_norm_stats` function for tensors in `channels_last` format.
### Performance result on MI300: batchnorm_latency_comparison **Particular Example:** Before: Avg time for shape (20, 896, 59, 91): **1102.39 us** After: Avg time for shape (20, 896, 59, 91): **122.94 us** Reproducer: ``` import torch shapes = [(20, 896, 59, 91)] eps = 1e-5 for shape in shapes: x = torch.randn(shape, device='cuda', dtype=torch.bfloat16) x = x.to(memory_format=torch.channels_last) for _ in range(20): _ = torch.batch_norm_stats(x, eps) torch.cuda.synchronize() start_evt = torch.cuda.Event(enable_timing=True) end_evt = torch.cuda.Event(enable_timing=True) start_evt.record() for _ in range(100): _ = torch.batch_norm_stats(x, eps) end_evt.record() torch.cuda.synchronize() print(f"Avg time for shape {shape}: {start_evt.elapsed_time(end_evt) / 100 * 1e3:.2f} us") ``` Related fix which is released: https://github.com/pytorch/pytorch/pull/161180 Pull Request resolved: https://github.com/pytorch/pytorch/pull/175286 Approved by: https://github.com/amd-hhashemi, https://github.com/jerrymannil, https://github.com/jeffdaily --- aten/src/ATen/native/cuda/KernelUtils.cuh | 15 ++++++++++----- aten/src/ATen/native/cuda/Normalization.cuh | 20 ++++++++++++++++++++ 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/cuda/KernelUtils.cuh b/aten/src/ATen/native/cuda/KernelUtils.cuh index 9c100ca206adf..12feeb6d63af3 100644 --- a/aten/src/ATen/native/cuda/KernelUtils.cuh +++ b/aten/src/ATen/native/cuda/KernelUtils.cuh @@ -228,7 +228,9 @@ __device__ __forceinline__ void fastAtomicAdd( // This function implements a committed store. // Upon returning, the store is committed to global memory. // This is useful in avoiding the need for fences. -template +// If multiple stores are done in a row there is option to skip +// waiting for commit for all but the last store. 
+template __device__ inline void cmtdStore(void* address, T value) { int constexpr num_long_per_val = sizeof(value)/sizeof(long); int constexpr num_int_per_val = sizeof(value)/sizeof(int); @@ -252,13 +254,16 @@ __device__ inline void cmtdStore(void* address, T value) { else if constexpr (num_char_per_val*sizeof(char) == sizeof(value)) for (int i=0; i(address)+i, _pnr.c[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - __atomic_signal_fence(__ATOMIC_SEQ_CST); + if constexpr (wait_for_commit) + { + __atomic_signal_fence(__ATOMIC_SEQ_CST); #ifdef __gfx1250__ - asm volatile("s_wait_loadcnt(0)" ::: "memory"); + asm volatile("s_wait_loadcnt(0)" ::: "memory"); #else - asm volatile("s_waitcnt vmcnt(0)" ::: "memory"); + asm volatile("s_waitcnt vmcnt(0)" ::: "memory"); #endif - __atomic_signal_fence(__ATOMIC_SEQ_CST); + __atomic_signal_fence(__ATOMIC_SEQ_CST); + } } #endif diff --git a/aten/src/ATen/native/cuda/Normalization.cuh b/aten/src/ATen/native/cuda/Normalization.cuh index bbd65419bbb92..8e31f8fa9a694 100644 --- a/aten/src/ATen/native/cuda/Normalization.cuh +++ b/aten/src/ATen/native/cuda/Normalization.cuh @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -1063,12 +1064,22 @@ batch_norm_collect_statistics_channels_last_kernel( address_base = c_offset + blockIdx.y * stride; // write data to staging_data; if (threadIdx.y == 0 && c_offset < stride) { +#ifndef USE_ROCM staging_mean[address_base] = mean_th; staging_m2n[address_base] = m2_th; staging_count[address_base] = count_th; +#else + // In architectures with split caches, global fences are costly. + // Here we preempt need for fences by committing stores to global memory. 
+ cmtdStore((void*)&staging_mean[address_base], mean_th); + cmtdStore((void*)&staging_m2n[address_base], m2_th); + cmtdStore((void*)&staging_count[address_base], count_th); +#endif } +#ifndef USE_ROCM __threadfence(); +#endif __syncthreads(); // ensuring writes to staging_ is visible to all blocks __shared__ bool is_last_block_done; @@ -1288,11 +1299,20 @@ __global__ void batch_norm_backward_reduce_channels_last_kernel( address_base = c_offset + blockIdx.y * stride; // write data to staging_data; if (threadIdx.y == 0 && c_offset < stride) { +#ifndef USE_ROCM staging_sum_dy[address_base] = sum_dy_th; staging_sum_dy_xmu[address_base] = sum_dy_xmu_th; +#else + // In architectures with split caches, global fences are costly. + // Here we preempt need for fences by committing stores to global memory. + cmtdStore((void*)&staging_sum_dy[address_base], sum_dy_th); + cmtdStore((void*)&staging_sum_dy_xmu[address_base], sum_dy_xmu_th); +#endif } +#ifndef USE_ROCM __threadfence(); +#endif __syncthreads(); // ensuring writes to staging_ is visible to all blocks __shared__ bool is_last_block_done; From be321e462bfbcf5319669a96be19c65e0ff71f48 Mon Sep 17 00:00:00 2001 From: Milica Stankovic Date: Tue, 17 Mar 2026 11:51:54 +0100 Subject: [PATCH 51/87] Prefer cublas when TORCH_BLAS_PREFER_CUBLASLT is false (#https://github.com/pytorch/pytorch/pull/174377) (#3077) ### Summary Set blas_preferred_backend = at::BlasBackend::Cublas when TORCH_BLAS_PREFER_CUBLASLT / TORCH_BLAS_PREFER_HIPBLASLT is explicitly set to false. For ROCm, if a gfx arch is in the list returned by getHipblasltPreferredArchs() hipBLASLt will be set as blas_preferred_backend by default regardless of TORCH_BLAS_PREFER_HIPBLASLT setting. This PR enables users to explicitly select cublas/rocblas and makes this env. variable behave like a binary toggle. ### Changes - Modified checks for TORCH_BLAS_PREFER_CUBLASLT/TORCH_BLAS_PREFER_HIPBLASLT env. 
variables - Updated test_preferred_blas_library_settings Pull Request resolved: https://github.com/pytorch/pytorch/pull/174377 Approved by: https://github.com/jeffdaily, https://github.com/nikitaved Co-authored-by: Filip Jankovic --- aten/src/ATen/Context.h | 6 +++++- test/test_cuda.py | 18 ++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 9f24ea32245ec..de6a7dda66d73 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -502,7 +502,11 @@ class TORCH_API Context { (c10::utils::check_env("TORCH_BLAS_PREFER_CUBLASLT") == true || c10::utils::check_env("TORCH_BLAS_PREFER_HIPBLASLT") == true) // alias ? at::BlasBackend::Cublaslt - : at::BlasBackend::Default; + : ((c10::utils::check_env("TORCH_BLAS_PREFER_CUBLASLT") == false || + c10::utils::check_env("TORCH_BLAS_PREFER_HIPBLASLT") == + false) // alias + ? at::BlasBackend::Cublas + : at::BlasBackend::Default); at::ROCmFABackend rocm_fa_preferred_backend = c10::utils::check_env("TORCH_ROCM_FA_PREFER_CK") == true ? 
at::ROCmFABackend::Ck diff --git a/test/test_cuda.py b/test/test_cuda.py index df317f1f01cbf..2a3e27dab814e 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -660,11 +660,13 @@ def _check_default(): torch.backends.cuda.preferred_blas_library(1.0) # check env var override custom_envs = [ - {"TORCH_BLAS_PREFER_CUBLASLT": "1"}, - {"TORCH_BLAS_PREFER_HIPBLASLT": "1"}, + ({"TORCH_BLAS_PREFER_CUBLASLT": "1"}, "_BlasBackend.Cublaslt"), + ({"TORCH_BLAS_PREFER_HIPBLASLT": "1"}, "_BlasBackend.Cublaslt"), + ({"TORCH_BLAS_PREFER_CUBLASLT": "0"}, "_BlasBackend.Cublas"), + ({"TORCH_BLAS_PREFER_HIPBLASLT": "0"}, "_BlasBackend.Cublas"), ] test_script = "import torch;print(torch.backends.cuda.preferred_blas_library())" - for env_config in custom_envs: + for env_config, expected in custom_envs: env = os.environ.copy() for key, value in env_config.items(): env[key] = value @@ -673,7 +675,15 @@ def _check_default(): .decode("ascii") .strip() ) - self.assertEqual("_BlasBackend.Cublaslt", r) + self.assertEqual(expected, r) + + # explicitly check default when no env vars are set + if not any( + os.environ.get(v) + for v in ("TORCH_BLAS_PREFER_CUBLASLT", "TORCH_BLAS_PREFER_HIPBLASLT") + ): + torch.backends.cuda.preferred_blas_library("default") + _check_default() @unittest.skipIf(TEST_CUDAMALLOCASYNC, "temporarily disabled for async") @setBlasBackendsToDefaultFinally From cc97152722937113b932e4e1c39e11a4484635a8 Mon Sep 17 00:00:00 2001 From: Anatoliy Litvinenko Date: Thu, 19 Mar 2026 21:55:09 -0500 Subject: [PATCH 52/87] Increase precision for golden solution in transformer tests. (#3087) # Overview Force FP32 precision for "golden" solution computation when TF32 is set for compute test solution. # Rationale The `test/test_transformers.py` testing suite calculates the numerical tolerance by comparing output tensors from the same precision ("reference") and higher precision ("golden"), both calculated by `SDPBackend.MATH`. 
However, if the golden output is calculated with TF32 rather than FP32, it is in fact less accurate than the FA/ME backend when they use IEEE rather than TF32 for their accumulation. The loss of precision causes false negatives in SDPA tests like `TestSDPACudaOnlyCUDA.test_flash_attention_vs_math_ref_grads_batch_size_8_seq_len_q_143_seq_len_k_4_head_dim_203_is_causal_False_dropout_p_0_22_float16_scale_l1_enable_gqa_True_n_heads1_cuda_float16`, at least on ROCM platform. The false negative disappears after forcing `higher_precision_dtype = torch.float64` # Major Changes To restore the precision of golden output, a wrapper function is used where golden solution is calculated. This function sets FP32 precision in scope of calculation of golden solution and its gradient. This is based on PR https://github.com/pytorch/pytorch/pull/167157 Upstream PR: https://github.com/pytorch/pytorch/pull/169694 --- test/test_transformers.py | 74 ++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 29 deletions(-) diff --git a/test/test_transformers.py b/test/test_transformers.py index 777b85cb173d3..284eb5ad64704 100644 --- a/test/test_transformers.py +++ b/test/test_transformers.py @@ -53,6 +53,7 @@ PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, PLATFORM_SUPPORTS_FUSED_ATTENTION, PLATFORM_SUPPORTS_CUDNN_ATTENTION, + tf32_off, tf32_on_and_off, tf32_enabled, ) @@ -3575,8 +3576,9 @@ def _get_mem_eff_drop_mask(batch_size, n_heads, q_len, kv_len, p, seed, offset, if dropout_p == 0.0: with sdpa_kernel(backends=[SDPBackend.MATH]): # High Precision Math Reference - out_ref = F.scaled_dot_product_attention(query_ref, key_ref, value_ref, - dropout_p=dropout_p, is_causal=is_causal, scale=scale) + with tf32_off(): + out_ref = F.scaled_dot_product_attention(query_ref, key_ref, value_ref, + dropout_p=dropout_p, is_causal=is_causal, scale=scale) # Low Precision Math Reference out_lp_ref = F.scaled_dot_product_attention(query, key, value, dropout_p=dropout_p, is_causal=is_causal,
scale=scale) @@ -3587,8 +3589,9 @@ def _get_mem_eff_drop_mask(batch_size, n_heads, q_len, kv_len, p, seed, offset, torch.manual_seed(seed) dropout_mask = _get_mem_eff_drop_mask(batch_size, n_heads, seq_len_q, seq_len_k, dropout_p, seed, 0, device=device) # High Precision Math Reference - out_ref = torch.ops.aten._scaled_dot_product_attention_math( - query_ref, key_ref, value_ref, dropout_p=dropout_p, is_causal=is_causal, scale=scale, dropout_mask=dropout_mask)[0] + with tf32_off(): + out_ref = torch.ops.aten._scaled_dot_product_attention_math( + query_ref, key_ref, value_ref, dropout_p=dropout_p, is_causal=is_causal, scale=scale, dropout_mask=dropout_mask)[0] # Low Precision Math Reference out_lp_ref = torch.ops.aten._scaled_dot_product_attention_math( query, key, value, dropout_p=dropout_p, is_causal=is_causal, scale=scale, @@ -3598,7 +3601,8 @@ def _get_mem_eff_drop_mask(batch_size, n_heads, q_len, kv_len, p, seed, offset, grads = torch.autograd.grad(out, (query, key, value), upstream_grad) grads_ref_lp = torch.autograd.grad(out_lp_ref, (query, key, value), upstream_grad) - grads_ref = torch.autograd.grad(out_ref, (query_ref, key_ref, value_ref), upstream_grad) + with tf32_off(): + grads_ref = torch.autograd.grad(out_ref, (query_ref, key_ref, value_ref), upstream_grad) fudge_factors = { 'out': 3.0 , @@ -3695,8 +3699,9 @@ def _get_mem_eff_drop_mask(batch_size, n_heads, q_len, kv_len, p, seed, offset, if dropout_p == 0.0: with sdpa_kernel(backends=[SDPBackend.MATH]): # High Precision Math Reference - out_ref = F.scaled_dot_product_attention(query_ref, key_ref, value_ref, attn_mask_ref, - dropout_p=dropout_p, is_causal=is_causal, scale=scale) + with tf32_off(): + out_ref = F.scaled_dot_product_attention(query_ref, key_ref, value_ref, attn_mask_ref, + dropout_p=dropout_p, is_causal=is_causal, scale=scale) # Low Precision Math Reference out_lp_ref = F.scaled_dot_product_attention(query, key, value, attn_mask, dropout_p=dropout_p, is_causal=is_causal, scale=scale) @@ 
-3708,9 +3713,10 @@ def _get_mem_eff_drop_mask(batch_size, n_heads, q_len, kv_len, p, seed, offset, dropout_mask = _get_mem_eff_drop_mask(batch_size, n_heads, seq_len_q, seq_len_k, dropout_p, seed, 0, device=device) # High Precision Math Reference - out_ref = torch.ops.aten._scaled_dot_product_attention_math( - query_ref, key_ref, value_ref, attn_mask_ref, dropout_p=dropout_p, is_causal=is_causal, - scale=scale, dropout_mask=dropout_mask)[0] + with tf32_off(): + out_ref = torch.ops.aten._scaled_dot_product_attention_math( + query_ref, key_ref, value_ref, attn_mask_ref, dropout_p=dropout_p, is_causal=is_causal, + scale=scale, dropout_mask=dropout_mask)[0] # Low Precision Math Reference out_lp_ref = torch.ops.aten._scaled_dot_product_attention_math( query, key, value, attn_mask, @@ -3721,7 +3727,8 @@ def _get_mem_eff_drop_mask(batch_size, n_heads, q_len, kv_len, p, seed, offset, grads = torch.autograd.grad(out, (query, key, value, attn_mask), upstream_grad) grads_ref_lp = torch.autograd.grad(out_lp_ref, (query, key, value, attn_mask), upstream_grad) - grads_ref = torch.autograd.grad(out_ref, (query_ref, key_ref, value_ref, attn_mask_ref), upstream_grad) + with tf32_off(): + grads_ref = torch.autograd.grad(out_ref, (query_ref, key_ref, value_ref, attn_mask_ref), upstream_grad) fudge_factors = { "out": 4, @@ -3825,8 +3832,9 @@ def test_flash_attention_vs_math_ref_grads(self, device, batch_size: int, seq_le query, key, value, dropout_p=dropout_p, is_causal=is_causal, scale=scale, enable_gqa=enable_gqa) with sdpa_kernel(backends=[SDPBackend.MATH]): # High Precision Math Reference - out_ref = F.scaled_dot_product_attention( - query_ref, key_ref, value_ref, is_causal=is_causal, scale=scale, enable_gqa=enable_gqa) + with tf32_off(): + out_ref = F.scaled_dot_product_attention( + query_ref, key_ref, value_ref, is_causal=is_causal, scale=scale, enable_gqa=enable_gqa) # Low Precision Math Reference out_lp_ref = F.scaled_dot_product_attention( query, key, value, 
is_causal=is_causal, scale=scale, enable_gqa=enable_gqa) @@ -3855,9 +3863,10 @@ def test_flash_attention_vs_math_ref_grads(self, device, batch_size: int, seq_le causal=is_causal)[:, :, :seq_len_q, :seq_len_k] dropout_mask = softmax_mask >= 0 # High Precision Math Reference - out_ref = torch.ops.aten._scaled_dot_product_attention_math( - query_ref, key_ref, value_ref, dropout_p=dropout_p, is_causal=is_causal, - scale=scale, dropout_mask=dropout_mask, enable_gqa=enable_gqa)[0] + with tf32_off(): + out_ref = torch.ops.aten._scaled_dot_product_attention_math( + query_ref, key_ref, value_ref, dropout_p=dropout_p, is_causal=is_causal, + scale=scale, dropout_mask=dropout_mask, enable_gqa=enable_gqa)[0] # Low Precision Math Reference out_lp_ref = torch.ops.aten._scaled_dot_product_attention_math( query, key, value, dropout_p=dropout_p, is_causal=is_causal, scale=scale, @@ -3872,7 +3881,8 @@ def test_flash_attention_vs_math_ref_grads(self, device, batch_size: int, seq_le grads = torch.autograd.grad(out, (query, key, value), upstream_grad) grads_ref_lp = torch.autograd.grad(out_lp_ref, (query, key, value), upstream_grad) - grads_ref = torch.autograd.grad(out_ref, (query_ref, key_ref, value_ref), upstream_grad) + with tf32_off(): + grads_ref = torch.autograd.grad(out_ref, (query_ref, key_ref, value_ref), upstream_grad) fudge_factors = { 'out': 4, @@ -4034,8 +4044,9 @@ def get_dropout_mask(output, fused_kernel, batch_size, n_heads, q_len, kv_len, d with sdpa_kernel(backends=[SDPBackend.MATH]): if dropout_p == 0.0: # High Precision Math Reference - out_ref = F.scaled_dot_product_attention(query_ref, key_ref, value_ref, - dropout_p=dropout_p, is_causal=is_causal) + with tf32_off(): + out_ref = F.scaled_dot_product_attention(query_ref, key_ref, value_ref, + dropout_p=dropout_p, is_causal=is_causal) # Low Precision Math Reference out_lp_ref = F.scaled_dot_product_attention(query, key, value, dropout_p=dropout_p, is_causal=is_causal) @@ -4045,9 +4056,10 @@ def 
get_dropout_mask(output, fused_kernel, batch_size, n_heads, q_len, kv_len, d dropout_mask = get_dropout_mask(output_tuple, fused_kernel, batch_size, n_heads, seq_len_q, seq_len_k, dropout_p, device) # High Precision Math Reference - out_ref = torch.ops.aten._scaled_dot_product_attention_math( - query_ref, key_ref, value_ref, dropout_p=dropout_p, is_causal=is_causal, - dropout_mask=dropout_mask)[0] + with tf32_off(): + out_ref = torch.ops.aten._scaled_dot_product_attention_math( + query_ref, key_ref, value_ref, dropout_p=dropout_p, is_causal=is_causal, + dropout_mask=dropout_mask)[0] # Low Precision Math Reference out_lp_ref = torch.ops.aten._scaled_dot_product_attention_math( query, key, value, dropout_p=dropout_p, is_causal=is_causal, @@ -4059,7 +4071,8 @@ def get_dropout_mask(output, fused_kernel, batch_size, n_heads, q_len, kv_len, d g1.replay() if fused_kernel != SDPBackend.CUDNN_ATTENTION or dropout_p == 0.0: grads_ref_lp = torch.autograd.grad(out_lp_ref, (query, key, value), upstream_grad) - grads_ref = torch.autograd.grad(out_ref, (query_ref, key_ref, value_ref), upstream_grad) + with tf32_off(): + grads_ref = torch.autograd.grad(out_ref, (query_ref, key_ref, value_ref), upstream_grad) fudge_factors = { 'out': 3.0, @@ -4283,8 +4296,9 @@ def rand_nt(sequence_list, num_heads, head_dim): out = F.scaled_dot_product_attention(query, key, value, dropout_p=dropout_p, is_causal=is_causal, scale=scale) with sdpa_kernel(backends=[SDPBackend.MATH]): # High Precision Math Reference - out_ref = F.scaled_dot_product_attention( - query_ref, key_ref, value_ref, is_causal=is_causal, scale=scale) + with tf32_off(): + out_ref = F.scaled_dot_product_attention( + query_ref, key_ref, value_ref, is_causal=is_causal, scale=scale) # Low Precision Math Reference out_lp_ref = F.scaled_dot_product_attention( query_ref_lp, key_ref_lp, value_ref_lp, is_causal=is_causal, scale=scale) @@ -4319,9 +4333,10 @@ def rand_nt(sequence_list, num_heads, head_dim): 
nt_stack.append(torch.cat(batch_stack)) nested_dropout_mask = torch.nested.nested_tensor(nt_stack) # High Precision Math Reference - out_ref = torch.ops.aten._scaled_dot_product_attention_math( - query_ref, key_ref, value_ref, dropout_p=dropout_p, - is_causal=is_causal, scale=scale, dropout_mask=nested_dropout_mask)[0] + with tf32_off(): + out_ref = torch.ops.aten._scaled_dot_product_attention_math( + query_ref, key_ref, value_ref, dropout_p=dropout_p, + is_causal=is_causal, scale=scale, dropout_mask=nested_dropout_mask)[0] # Low Precision Math Reference out_lp_ref = torch.ops.aten._scaled_dot_product_attention_math( query_ref_lp, key_ref_lp, value_ref_lp, dropout_p=dropout_p, is_causal=is_causal, scale=scale, @@ -4330,7 +4345,8 @@ def rand_nt(sequence_list, num_heads, head_dim): upstream_grad = out.detach().clone().contiguous() out.backward(upstream_grad) - out_ref.backward(upstream_grad.to(out_ref.dtype)) + with tf32_off(): + out_ref.backward(upstream_grad.to(out_ref.dtype)) out_lp_ref.backward(upstream_grad.to(out_lp_ref.dtype)) dropout_fudge_factor = 1.0 if dropout_p == 0.0 else 2.0 From dfff4e13b4a95f6f5e3e4c50aaf402e76c5cdedb Mon Sep 17 00:00:00 2001 From: Milica Stankovic Date: Fri, 20 Mar 2026 11:29:15 +0100 Subject: [PATCH 53/87] [ROCm][CI] Fix failing FP8 tests on RDNA4 (#174873) (#3090) ## Summary This PR fixes FP8 inductor test failures that occur on AMD RDNA4 GPUs when testing matrix multiplications with small M dimensions (M < 16). ## Problem On gfx120x GPUs, FP8 scaled matrix multiplication tests fail with: - 92.4% NaN outputs when M < BLOCK_M (typically 16) - Large numerical mismatches between eager and compiled results - Only occurs in `max-autotune` mode **Root cause:** Autotuned Triton kernels on gfx120x generate incorrect tensor indexing for small M values, using partial indices instead of full computed indices in load/store operations. 
## Solution - Added GPU-specific compile mode selection for small M values - gfx120x with M < 16: use `compile_mode="default"` - All other cases: use `compile_mode="max-autotune"` Pull Request resolved: https://github.com/pytorch/pytorch/pull/174873 Approved by: https://github.com/jeffdaily (cherry picked from commit d667ffef1f48caafc745b5c4266d1e1f23be1d5a) --- test/inductor/test_fp8.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/test/inductor/test_fp8.py b/test/inductor/test_fp8.py index aab220511c374..1d540ffca635e 100644 --- a/test/inductor/test_fp8.py +++ b/test/inductor/test_fp8.py @@ -1045,9 +1045,20 @@ def linear(x_fp8, x_inverse_scale, w_t_fp8, w_inverse_scale, bias): w_inverse_scale, bias, ) + + # On gfx120x, autotuned kernels have issues with small M + compile_mode = "max-autotune" + if ( + torch.version.hip is not None + and M < 16 + and torch.cuda.is_available() + and "gfx120" in torch.cuda.get_device_properties(0).gcnArchName + ): + compile_mode = "default" + with config.patch({"triton.enable_persistent_tma_matmul": persistent_matmul}): linear_compiled = torch.compile( - linear, backend="inductor", mode="max-autotune" + linear, backend="inductor", mode=compile_mode ) y_compiled = linear_compiled( x_fp8, @@ -1344,9 +1355,20 @@ def linear(x_fp8, x_inverse_scale, w_t_fp8, w_inverse_scale, bias): w_inverse_scale, bias, ) + + # On gfx120x, autotuned kernels have issues with small M + compile_mode = "max-autotune" + if ( + torch.version.hip is not None + and M < 16 + and torch.cuda.is_available() + and "gfx120" in torch.cuda.get_device_properties(0).gcnArchName + ): + compile_mode = "default" + with config.patch({"triton.enable_persistent_tma_matmul": persistent_matmul}): linear_compiled = torch.compile( - linear, backend="inductor", mode="max-autotune" + linear, backend="inductor", mode=compile_mode ) y_compiled = linear_compiled( x_fp8, From 83524a4c7d748e5de89a7a93cd72dab4bd1fa42d Mon Sep 17 00:00:00 2001 
From: Ethan Wee Date: Wed, 25 Mar 2026 13:12:51 -0700 Subject: [PATCH 54/87] [CI][release/2.11]Pin all Python dependency versions in requirements files (#3098) Pin dependencies in release/2.11. Tested and installed with python 3.10, 3.11, 3.12, 3.13. Build failed because triton requires cmake less than 4 which we fixed in release/2.10 with https://github.com/ROCm/triton/commit/8edc6c4f11ac73ec145e9a0dbe311b83d58d54d7 ``` #63 35.94 writing manifest file 'python/triton.egg-info/SOURCES.txt' #63 35.98 #63 35.98 ERROR Missing dependencies: #63 35.98 cmake<4.0,>=3.20 #63 35.98 ERROR conda.cli.main_run:execute(142): `conda run python -m build --wheel --no-isolation` failed. (See above for error) #63 ERROR: process "/bin/sh -c if [ -n \"${TRITON}\" ]; then bash ./install_triton.sh; fi" did not complete successfully: exit code: 1 ------ > [stage-0 55/62] RUN if [ -n "yes" ]; then bash ./install_triton.sh; fi: 35.92 writing top-level names to python/triton.egg-info/top_level.txt 35.92 writing manifest file 'python/triton.egg-info/SOURCES.txt' 35.93 reading manifest file 'python/triton.egg-info/SOURCES.txt' 35.93 reading manifest template 'MANIFEST.in' 35.94 adding license file 'LICENSE' 35.94 writing manifest file 'python/triton.egg-info/SOURCES.txt' 35.98 35.98 ERROR Missing dependencies: 35.98 cmake<4.0,>=3.20 35.98 ERROR conda.cli.main_run:execute(142): `conda run python -m build --wheel --no-isolation` failed. 
(See above for error) ------ Dockerfile:122 -------------------- 120 | COPY ci_commit_pins/triton.txt triton.txt 121 | COPY triton_version.txt triton_version.txt 122 | >>> RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi 123 | RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt ``` --- .ci/docker/requirements-ci.txt | 41 +++++++++++++++++----------------- requirements-build.txt | 23 ++++++++++--------- requirements.txt | 29 ++++++++++++------------ 3 files changed, 48 insertions(+), 45 deletions(-) diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 0e59119e3c4f7..24be093b31e7e 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -15,13 +15,13 @@ build==1.3.0 #Pinned versions: 1.3.0 #test that import: -click +click==8.3.1 #Description: Command Line Interface Creation Kit -#Pinned versions: +#Pinned versions: 8.3.1 #test that import: coremltools==5.0b5 ; python_version < "3.12" -coremltools==8.3 ; python_version == "3.12" +coremltools==8.3.0 ; python_version == "3.12" #Description: Apple framework for ML integration #Pinned versions: 5.0b5 #test that import: @@ -68,7 +68,7 @@ lark==0.12.0 #Pinned versions: 0.12.0 #test that import: -librosa>=0.6.2 ; python_version < "3.11" and platform_machine != "s390x" +librosa==0.10.2 ; python_version < "3.11" and platform_machine != "s390x" librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x" #Description: A python package for music and audio analysis #Pinned versions: >=0.6.2 @@ -149,9 +149,9 @@ pandas==2.3.3; python_version >= "3.14" #Pinned versions: 1.9.0 #test that import: -opt-einsum==3.3 +opt-einsum==3.3.0 #Description: Python library to optimize tensor contraction order, used in einsum -#Pinned versions: 3.3 +#Pinned versions: 3.3.0 #test that import: test_linalg.py optree==0.13.0 ; python_version < "3.14" @@ -178,9 +178,9 @@ protobuf==6.33.5 #Pinned versions: 6.33.2 #test that import: test_tensorboard.py, 
test/onnx/* -psutil +psutil==7.2.2 #Description: information on running processes and system utilization -#Pinned versions: +#Pinned versions: 7.2.2 #test that import: test_profiler.py, test_openmp.py, test_dataloader.py pytest==7.3.2 @@ -198,9 +198,9 @@ pytest-flakefinder==1.1.0 #Pinned versions: 1.1.0 #test that import: -pytest-rerunfailures>=10.3 +pytest-rerunfailures==14.0 #Description: plugin for rerunning failure tests in pytest -#Pinned versions: +#Pinned versions: 14.0 #test that import: pytest-subtests==0.13.1 @@ -270,8 +270,7 @@ scipy==1.16.2 ; python_version >= "3.14" #test that import: # needed by torchgen utils -typing-extensions==4.12.2 ; python_version < "3.14" -typing-extensions==4.15.0 ; python_version >= "3.14" +typing-extensions==4.15.0 #Description: type hints for python #Pinned versions: #test that import: @@ -281,7 +280,7 @@ typing-extensions==4.15.0 ; python_version >= "3.14" #Pinned versions: #test that import: -unittest-xml-reporting<=3.2.0,>=2.0.0 +unittest-xml-reporting==3.2.0 #Description: saves unit test results to xml #Pinned versions: #test that import: @@ -292,7 +291,7 @@ lintrunner==0.12.11 #Pinned versions: 0.12.11 #test that import: -redis>=4.0.0 +redis==7.4.0 #Description: redis database #test that import: anything that tests OSS caching/mocking (inductor/test_codecache.py, inductor/test_max_autotune.py) @@ -370,10 +369,10 @@ pwlf==2.2.1 # To build PyTorch itself pyyaml==6.0.3 -pyzstd -setuptools==78.1.1 -packaging==24.0 -six +pyzstd==0.16.2 +setuptools==79.0.1 +packaging==25.0 +six==1.17.0 scons==4.5.2 ; platform_machine == "aarch64" @@ -387,7 +386,7 @@ dataclasses_json==0.6.7 #Pinned versions: 0.6.7 #test that import: -cmake==3.31.6 +cmake==4.0.0 #Description: required for building tlparse==0.4.0 @@ -396,7 +395,7 @@ tlparse==0.4.0 filelock==3.20.3 #Description: required for inductor testing -cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x" and platform_system != "Darwin" +cuda-bindings==12.9.6 ; platform_machine != 
"s390x" and platform_system != "Darwin" #Description: required for testing CUDAGraph::raw_cuda_graph(). See https://nvidia.github.io/cuda-python/cuda-bindings/latest/support.html for how this version was chosen. Note "Any fix in the latest bindings would be backported to the prior major version" means that only the newest version of cuda-bindings will get fixes. Depending on the latest version of 12.x is okay because all 12.y versions will be supported via "CUDA minor version compatibility". Pytorch builds against 13.z versions of cuda toolkit work with 12.x versions of cuda-bindings as well because newer drivers work with old toolkits. #test that import: test_cuda.py @@ -406,7 +405,7 @@ pyre-extensions==0.0.32 tabulate==0.9.0 #Description: These package are needed to build FBGEMM and torchrec on PyTorch CI -tqdm>=4.66.0 +tqdm==4.67.3 #Description: progress bar library required for dynamo benchmarks #test that import: benchmarks/dynamo/* diff --git a/requirements-build.txt b/requirements-build.txt index 863bc9f921d8d..88a80dfaf1b30 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,11 +1,14 @@ # Build System requirements -setuptools>=70.1.0,<82 -cmake>=3.27 -ninja -numpy -packaging -pyyaml -requests -six # dependency chain: NNPACK -> PeachPy -> six -typing-extensions>=4.15.0 -pip # not technically needed, but this makes setup.py invocation work +# setuptools and cmake pinned to match rocm/pytorch release/2.10: +# https://github.com/ROCm/pytorch/blob/0b21eac93ff682d862b257770fff5f9fc069b30a/requirements-build.txt +setuptools==79.0.1 +cmake==4.0.0 +ninja==1.11.1.4 +numpy==2.0.2 ; python_version == "3.9" +numpy==2.1.2 ; python_version > "3.9" +packaging==25.0 +pyyaml==6.0.3 +requests==2.32.5 +six==1.17.0 # dependency chain: NNPACK -> PeachPy -> six +typing_extensions==4.15.0 +pip==26.0.1 # not technically needed, but this makes setup.py invocation work diff --git a/requirements.txt b/requirements.txt index 4559c2ad331f0..f8b6ebfd25ce1 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -4,19 +4,20 @@ --requirement requirements-build.txt # Install / Development extra requirements -build[uv] # for building sdist and wheel -expecttest>=0.3.0 -filelock -fsspec>=0.8.5 -hypothesis -jinja2 -lintrunner ; platform_machine != "s390x" -networkx>=2.5.1 -ninja +build[uv]==1.3.0 # for building sdist and wheel +expecttest==0.3.0 +filelock==3.20.3 +fsspec==2026.2.0 +hypothesis==6.56.4 +jinja2==3.1.6 +lintrunner==0.12.11 ; platform_machine != "s390x" +networkx==2.8.8 +ninja==1.11.1.4 numpy==2.0.2 ; python_version == "3.9" numpy==2.1.2 ; python_version > "3.9" -optree>=0.13.0 -psutil -spin -sympy>=1.13.3 -wheel +optree==0.13.0 ; python_version < "3.14" +optree==0.17.0 ; python_version >= "3.14" +psutil==7.2.2 +spin==0.17 +sympy==1.13.3 +wheel==0.46.3 From bb7978b519536a4f38b7f4988a36af8a4e606869 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 25 Mar 2026 21:50:11 +0000 Subject: [PATCH 55/87] Update triton pin to tip of https://github.com/ROCm/triton/commits/release/internal/3.6.x --- .ci/docker/ci_commit_pins/triton.txt | 2 +- .ci/docker/common/install_triton.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index 23407b4d540c4..0a2a5f707f24f 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -9844da955a9db14ec69c9aac828ee9803085e288 +b31789602ee0e40b06a1fbc6e63dfae6df7e131d diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh index 1b68e3c247839..b2fdebdcc4747 100755 --- a/.ci/docker/common/install_triton.sh +++ b/.ci/docker/common/install_triton.sh @@ -21,7 +21,7 @@ elif [ -n "${TRITON_CPU}" ]; then TRITON_REPO="https://github.com/triton-lang/triton-cpu" TRITON_TEXT_FILE="triton-cpu" else - TRITON_REPO="https://github.com/triton-lang/triton" + TRITON_REPO="https://github.com/ROCm/triton" TRITON_TEXT_FILE="triton" fi From 
752cc24c376a7e329963ff3a5dce11ce6f480c02 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 25 Mar 2026 21:52:30 +0000 Subject: [PATCH 56/87] Skip nccl_device.h header include for ROCm (causes build failures in theRock nightly builds); changes ported from https://github.com/pytorch/pytorch/pull/175443 --- torch/csrc/distributed/c10d/symm_mem/nccl_dev_cap.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch/csrc/distributed/c10d/symm_mem/nccl_dev_cap.hpp b/torch/csrc/distributed/c10d/symm_mem/nccl_dev_cap.hpp index 63b3d452b7e4f..fbf0cf6b50c3e 100644 --- a/torch/csrc/distributed/c10d/symm_mem/nccl_dev_cap.hpp +++ b/torch/csrc/distributed/c10d/symm_mem/nccl_dev_cap.hpp @@ -10,9 +10,11 @@ #endif #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 28, 0) +#if !defined(USE_ROCM) #define NCCL_HAS_SYMMEM_DEVICE_SUPPORT #include #endif +#endif #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 29, 0) #define NCCL_HAS_ONE_SIDED_API From 2fea1465481a731ee40accce0b0a420edfbd22d3 Mon Sep 17 00:00:00 2001 From: Prachi Gupta Date: Fri, 27 Mar 2026 16:39:30 -0500 Subject: [PATCH 57/87] [ROCm] Reland: Enable expandable segments (#173330) (#177974) (#3106) Summary: Original pull request: https://github.com/pytorch/pytorch/pull/173330 Fixes https://github.com/pytorch/pytorch/issues/168737. Fixes https://github.com/pytorch/pytorch/issues/168736. The original diff enabled expandable segments for ROCm by adding `#ifdef USE_ROCM` guards throughout CUDACachingAllocator.cpp to use HIP APIs (hipMemAddressReserve, hipMemCreate, hipMemMap, etc.) instead of CUDA driver APIs when building for ROCm. Root cause: In HIP/ROCm 6.2.1, the field name for memory allocation properties is `requestedHandleType` (singular), not `requestedHandleTypes` (plural) as in CUDA. Additionally, `hipMemHandleTypeFabric` does not exist in HIP, so the `CU_MEM_HANDLE_TYPE_FABRIC` assignment must be skipped on ROCm. 
Fix applied on top of the original diff (from D96652342): - Use `prop.requestedHandleType = hipMemHandleTypePosixFileDescriptor` under `#ifdef USE_ROCM` (singular field name, HIP constant) - Use `prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR` for CUDA (plural field name, CUDA constant) - Skip the `CU_MEM_HANDLE_TYPE_FABRIC` assignment entirely on ROCm under `#ifndef USE_ROCM`, as `hipMemHandleTypeFabric` does not exist in HIP Co-authored-by: Prachi Gupta prachi.gupta@amd.com Co-authored-by: Jeff Daily jeff.daily@amd.com Co-authored-by: moonshadow-25 moonshadow-25@users.noreply.github.com Co-authored-by: Vighanesh Sharma vighaneshsharma@gmail.com Test Plan: ``` fbpkg build //aps_models/ads/ecosystem/eval/cogwheel_tests/amd:cogwheel_aps_ads_icvr_kd_eval_amd_test_harness --build-remote ``` https://www.internalfb.com/sandcastle/workflow/1049338713192153464 Differential Revision: D97211385 Pull Request resolved: https://github.com/pytorch/pytorch/pull/177974 Approved by: https://github.com/jeffdaily, https://github.com/echen4096 (cherry picked from commit 57927012e4360a14acf8f48801a1f4f2c49a32ad) ## Motivation ## Technical Details ## Test Plan ## Test Result ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. 
Co-authored-by: Haoyu Zhang --- c10/cuda/CUDAAllocatorConfig.h | 2 +- c10/cuda/CUDACachingAllocator.cpp | 122 +++++++++++++++++++++++- test/distributed/test_cupy_as_tensor.py | 6 +- test/test_cuda.py | 9 +- test/test_cuda_expandable_segments.py | 9 +- torch/_C/__init__.pyi.in | 1 + torch/_dynamo/trace_rules.py | 1 + torch/csrc/DeviceAccelerator.cpp | 4 + 8 files changed, 142 insertions(+), 12 deletions(-) diff --git a/c10/cuda/CUDAAllocatorConfig.h b/c10/cuda/CUDAAllocatorConfig.h index 4e6097a406bc2..cd9c9b86285c4 100644 --- a/c10/cuda/CUDAAllocatorConfig.h +++ b/c10/cuda/CUDAAllocatorConfig.h @@ -34,7 +34,7 @@ class C10_CUDA_API CUDAAllocatorConfig { static bool expandable_segments() { bool enabled = c10::CachingAllocator::AcceleratorAllocatorConfig:: use_expandable_segments(); -#ifndef PYTORCH_C10_DRIVER_API_SUPPORTED +#if !defined(PYTORCH_C10_DRIVER_API_SUPPORTED) && !defined(USE_ROCM) if (enabled) { TORCH_WARN_ONCE("expandable_segments not supported on this platform") } diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 2ab4effc7853d..7a053b8134ef7 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -17,11 +17,17 @@ #include #include -#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) +#if defined(PYTORCH_C10_DRIVER_API_SUPPORTED) || defined(USE_ROCM) +#if defined(PYTORCH_C10_DRIVER_API_SUPPORTED) #include +#endif +#ifndef _WIN32 #include #include #include +#else +#include +#endif #endif #include @@ -269,7 +275,8 @@ struct SegmentRange { SegmentRange(void* p, size_t s) : ptr(static_cast(p)), size(s) {} }; -#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) +#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) || \ + defined(USE_ROCM) /* Note [Expandable Segments] @@ -383,8 +390,13 @@ struct ExpandableSegment { // This allows for some cases where we have to unmap pages earlier in the // segment to put them at the end. 
max_handles_ = numSegments(prop.totalGlobalMem + prop.totalGlobalMem / 8); +#ifdef USE_ROCM + C10_CUDA_CHECK(hipMemAddressReserve( + &ptr_, segment_size_ * max_handles_, 0ULL, 0, 0ULL)); +#else C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemAddressReserve_( &ptr_, segment_size_ * max_handles_, 0ULL, 0, 0ULL)); +#endif } ExpandableSegment(const ExpandableSegment&) = delete; ExpandableSegment(ExpandableSegment&&) = delete; @@ -408,12 +420,14 @@ struct ExpandableSegment { // if it fails, use posix file handle if (CUDAAllocatorConfig::expandable_segments_handle_type() == Expandable_Segments_Handle_Type::UNSPECIFIED) { +#ifndef USE_ROCM CUDAAllocatorConfig::set_expandable_segments_handle_type( Expandable_Segments_Handle_Type::FABRIC_HANDLE); auto output = map(range); if (output.ptr != nullptr) { return output; } +#endif // if fabric handle is not supported, use posix file handle. CUDAAllocatorConfig::set_expandable_segments_handle_type( Expandable_Segments_Handle_Type::POSIX_FD); @@ -445,33 +459,60 @@ struct ExpandableSegment { if (enable_ipc_handles) { if (CUDAAllocatorConfig::expandable_segments_handle_type() != Expandable_Segments_Handle_Type::FABRIC_HANDLE) { +#ifdef USE_ROCM + prop.requestedHandleType = hipMemHandleTypePosixFileDescriptor; +#else prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; +#endif } else { +#ifndef USE_ROCM prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; +#endif } } int flag = 0; +#ifndef USE_ROCM C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuDeviceGetAttribute_( &flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, device_)); +#endif if (flag) prop.allocFlags.gpuDirectRDMACapable = 1; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; // NOLINTNEXTLINE(bugprone-signed-char-misuse) prop.location.id = static_cast(device_); +#ifdef USE_ROCM + auto status = hipMemCreate(&handle, segment_size_, &prop, 0); +#else auto status = DriverAPI::get()->cuMemCreate_(&handle, segment_size_, &prop, 0); +#endif if (status != 
CUDA_SUCCESS) { if (status == CUDA_ERROR_OUT_OF_MEMORY) { +#ifdef USE_ROCM + // hipMemCreate above returned hipErrorOutOfMemory and treated it + // like a sticky runtime error. Which means we need to clear it. + // Unlike the corresponding CUDA Driver API. + (void)hipGetLastError(); +#endif for (auto j : c10::irange(begin, i)) { // NOLINTNEXTLINE(bugprone-unchecked-optional-access) auto h = handles_.at(j).value(); handles_.at(j) = std::nullopt; +#ifdef USE_ROCM + C10_CUDA_CHECK(hipMemRelease(h.handle)); +#else C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemRelease_(h.handle)); +#endif } trimHandles(); return rangeFromHandles(begin, begin); +#ifdef USE_ROCM + } else { + C10_CUDA_CHECK(status); + } +#else } else if ( CUDAAllocatorConfig::expandable_segments_handle_type() == Expandable_Segments_Handle_Type::FABRIC_HANDLE) { @@ -487,6 +528,7 @@ struct ExpandableSegment { } else { C10_CUDA_DRIVER_CHECK(status); } +#endif } handles_.at(i) = Handle{handle, std::nullopt}; } @@ -522,7 +564,11 @@ struct ExpandableSegment { // thereby ensuring that the handle can be correctly matched in // ipcMemHandle_to_devptr. 
ShareHeader header{}; +#ifdef _WIN32 + header.pid = _getpid(); +#else header.pid = getpid(); +#endif header.segment_size = segment_size_; header.num_handles = end - begin; @@ -534,8 +580,13 @@ struct ExpandableSegment { Expandable_Segments_Handle_Type::FABRIC_HANDLE) { if (!handle.shareable_handle) { int fd = 0; +#ifdef USE_ROCM + C10_CUDA_CHECK(hipMemExportToShareableHandle( + &fd, handle.handle, hipMemHandleTypePosixFileDescriptor, 0)); +#else C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemExportToShareableHandle_( &fd, handle.handle, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0)); +#endif handle.shareable_handle = fd; LOG(INFO) << "use posix fd to share expandable segments."; } @@ -546,6 +597,10 @@ struct ExpandableSegment { reinterpret_cast(&*handle.shareable_handle), sizeof(int)); } else { +#ifdef USE_ROCM + TORCH_INTERNAL_ASSERT( + false, "expandable segment with fabric handle not supported"); +#else if (!handle.shareable_handle) { CUmemFabricHandle fabric_handle; C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemExportToShareableHandle_( @@ -559,6 +614,7 @@ struct ExpandableSegment { buf.write( reinterpret_cast(&*handle.shareable_handle), sizeof(CUmemFabricHandle)); +#endif } } return rangeFromHandles(begin, end); @@ -574,14 +630,20 @@ struct ExpandableSegment { device, std::nullopt, header.segment_size, std::move(peers)); // older build setups (e.g. multiwheels) do not have this syscall, added 2020 // but the kernel on the system might still support it. 
+#ifndef _WIN32 #ifndef SYS_pidfd_open #define SYS_pidfd_open 434 #endif #ifndef SYS_pidfd_getfd #define SYS_pidfd_getfd 438 #endif +#endif // !_WIN32 if (CUDAAllocatorConfig::expandable_segments_handle_type() != Expandable_Segments_Handle_Type::FABRIC_HANDLE) { +#ifdef _WIN32 + TORCH_CHECK( + false, "IPC expandable segments are not supported on Windows"); +#else auto pidfd = syscall(SYS_pidfd_open, header.pid, 0); TORCH_CHECK( pidfd != -1 || errno != ENOSYS, @@ -597,9 +659,13 @@ struct ExpandableSegment { auto err = errno; close(static_cast(pidfd)); for (auto& h : segment->handles_) { +#ifdef USE_ROCM + C10_CUDA_CHECK(hipMemRelease(h.value().handle)); +#else C10_CUDA_DRIVER_CHECK( // NOLINTNEXTLINE(bugprone-unchecked-optional-access) DriverAPI::get()->cuMemRelease_(h.value().handle)); +#endif h = std::nullopt; } TORCH_CHECK( @@ -609,17 +675,33 @@ struct ExpandableSegment { TORCH_CHECK(false, "pidfd_getfd: ", c10::utils::str_error(err)); } CUmemGenericAllocationHandle handle = 0; +#ifdef USE_ROCM +#if ROCM_VERSION >= 70100 + void* myfd_handle = + reinterpret_cast(static_cast(myfd)); +#else + void* myfd_handle = (void*)(uintptr_t)&myfd; +#endif + C10_CUDA_CHECK(hipMemImportFromShareableHandle( + &handle, myfd_handle, hipMemHandleTypePosixFileDescriptor)); +#else C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemImportFromShareableHandle_( &handle, // NOLINTNEXTLINE(performance-no-int-to-ptr) (void*)(uintptr_t)myfd, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); +#endif LOG(INFO) << "use posix fd to import expandable segments."; close(static_cast(myfd)); segment->handles_.emplace_back(Handle{handle, std::nullopt}); } close(static_cast(pidfd)); +#endif // !_WIN32 } else { +#ifdef USE_ROCM + TORCH_INTERNAL_ASSERT( + false, "expandable segment with fabric handle not supported"); +#else for (auto i : c10::irange(header.num_handles)) { (void)i; CUmemFabricHandle fabric_handle; @@ -634,6 +716,7 @@ struct ExpandableSegment { LOG(INFO) << "use fabric handle to import expandable 
segments."; segment->handles_.emplace_back(Handle{handle, std::nullopt}); } +#endif } segment->mapAndSetAccess(0, header.num_handles); return segment; @@ -669,8 +752,12 @@ struct ExpandableSegment { ~ExpandableSegment() { forEachAllocatedRange( [&](size_t begin, size_t end) { unmapHandles(begin, end); }); +#ifdef USE_ROCM + C10_CUDA_CHECK(hipMemAddressFree(ptr_, segment_size_ * max_handles_)); +#else C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemAddressFree_( ptr_, segment_size_ * max_handles_)); +#endif } private: @@ -680,12 +767,28 @@ struct ExpandableSegment { // NOLINTNEXTLINE(bugprone-signed-char-misuse) desc.location.id = static_cast(device); desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; +#ifdef USE_ROCM + C10_CUDA_CHECK(hipMemSetAccess( + ptr() + begin * segment_size_, + (end - begin) * segment_size_, + &desc, + 1)); +#else C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemSetAccess_( ptr_ + begin * segment_size_, (end - begin) * segment_size_, &desc, 1)); +#endif } void mapAndSetAccess(size_t begin, size_t end) { for (auto i : c10::irange(begin, end)) { +#ifdef USE_ROCM + C10_CUDA_CHECK(hipMemMap( + ptr() + i * segment_size_, + segment_size_, + 0, + handles_.at(i).value().handle, + 0ULL)); +#else C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemMap_( ptr_ + i * segment_size_, segment_size_, @@ -693,6 +796,7 @@ struct ExpandableSegment { // NOLINTNEXTLINE(bugprone-unchecked-optional-access) handles_.at(i).value().handle, 0ULL)); +#endif } mapped_size_ += (end - begin) * segment_size_; setAccess(device_, begin, end); @@ -719,12 +823,22 @@ struct ExpandableSegment { // NOLINTNEXTLINE(bugprone-unchecked-optional-access) Handle h = handles_.at(i).value(); handles_.at(i) = std::nullopt; +#ifdef USE_ROCM + C10_CUDA_CHECK(hipMemUnmap(ptr() + segment_size_ * i, segment_size_)); +#else C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemUnmap_( ptr_ + segment_size_ * i, segment_size_)); +#endif if (h.shareable_handle) { +#ifndef _WIN32 close(std::get(*h.shareable_handle)); +#endif 
} +#ifdef USE_ROCM + C10_CUDA_CHECK(hipMemRelease(h.handle)); +#else C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemRelease_(h.handle)); +#endif } trimHandles(); } @@ -770,7 +884,11 @@ struct ExpandableSegment { std::optional> shareable_handle; }; struct ShareHeader { +#ifdef _WIN32 + int pid; +#else pid_t pid; +#endif size_t segment_size; size_t num_handles; }; diff --git a/test/distributed/test_cupy_as_tensor.py b/test/distributed/test_cupy_as_tensor.py index e0a98ae960426..57b44ff496adf 100644 --- a/test/distributed/test_cupy_as_tensor.py +++ b/test/distributed/test_cupy_as_tensor.py @@ -8,7 +8,10 @@ import torch from torch.multiprocessing.reductions import reduce_tensor from torch.testing._internal.common_cuda import SM100OrLater -from torch.testing._internal.common_distributed import MultiProcContinuousTest +from torch.testing._internal.common_distributed import ( + MultiProcContinuousTest, + skip_if_rocm_multiprocess, +) from torch.testing._internal.common_utils import ( requires_cuda_p2p_access, run_tests, @@ -64,6 +67,7 @@ def _init_device(self) -> None: def device(self) -> torch.device: return torch.device(device_type, self.rank) + @skip_if_rocm_multiprocess # RuntimeError: pidfd_getfd Operation not permitted" @skip_but_pass_in_sandcastle_if( SM100OrLater, "Fails if ran in docker environment without privileged access (https://github.com/pytorch/pytorch/issues/165170)", diff --git a/test/test_cuda.py b/test/test_cuda.py index 2a3e27dab814e..df9bdd5b0be11 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -4990,6 +4990,14 @@ def cb(device, alloc, device_alloc, device_free): def test_allocator_fuzz(self): # fuzz + if ( + torch.version.hip + and "expandable_segments:True" + in torch._C._accelerator_getAllocatorSettings() + ): + raise unittest.SkipTest( + "ROCm needs https://github.com/ROCm/rocm-systems/pull/3023" + ) state = random.getstate() random.seed(123) N = 10000 @@ -6448,7 +6456,6 @@ def test_graph_capture_reclaim_4_streams(self): 
"graph_capture_record_stream_reuse:False" ) - @skipIfRocm(msg="expandable_segments mode is not supported on ROCm") @unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, "Load_inline doesn't work in fbcode") def test_mempool_expandable(self): torch.cuda.empty_cache() diff --git a/test/test_cuda_expandable_segments.py b/test/test_cuda_expandable_segments.py index 78e4cddab84ed..f22b50c64313e 100644 --- a/test/test_cuda_expandable_segments.py +++ b/test/test_cuda_expandable_segments.py @@ -12,7 +12,7 @@ import torch from torch.testing._internal.common_cuda import IS_JETSON, IS_WINDOWS -from torch.testing._internal.common_utils import run_tests, TEST_WITH_ROCM +from torch.testing._internal.common_utils import run_tests REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent @@ -25,12 +25,7 @@ sys.path.remove(str(REPO_ROOT)) if __name__ == "__main__": - if ( - torch.cuda.is_available() - and not IS_JETSON - and not IS_WINDOWS - and not TEST_WITH_ROCM - ): + if torch.cuda.is_available() and not IS_JETSON and not IS_WINDOWS: get_disabled_tests(".") torch.cuda.memory._set_allocator_settings("expandable_segments:True") diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 565a2035fc663..14c5e699cf4c1 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -2598,6 +2598,7 @@ def _accelerator_getDeviceStats(device_index: _int) -> dict[str, Any]: ... def _accelerator_resetAccumulatedStats(device_index: _int) -> None: ... def _accelerator_resetPeakStats(device_index: _int) -> None: ... def _accelerator_getMemoryInfo(device_index: _int) -> tuple[_int, _int]: ... +def _accelerator_getAllocatorSettings() -> str: ... def _accelerator_setAllocatorSettings(env: str) -> None: ... 
# Defined in torch/csrc/jit/python/python_tracer.cpp diff --git a/torch/_dynamo/trace_rules.py b/torch/_dynamo/trace_rules.py index 17cd668bb8eaa..797bbcb2c79db 100644 --- a/torch/_dynamo/trace_rules.py +++ b/torch/_dynamo/trace_rules.py @@ -463,6 +463,7 @@ "torch._C._accelerator_getAccelerator", "torch._C._accelerator_getDeviceIndex", "torch._C._accelerator_getStream", + "torch._C._accelerator_getAllocatorSettings", "torch._C._accelerator_setAllocatorSettings", "torch._C._accelerator_setStream", "torch._C._accelerator_synchronizeDevice", diff --git a/torch/csrc/DeviceAccelerator.cpp b/torch/csrc/DeviceAccelerator.cpp index c6ffa893d95ae..c75643e2fa129 100644 --- a/torch/csrc/DeviceAccelerator.cpp +++ b/torch/csrc/DeviceAccelerator.cpp @@ -164,6 +164,10 @@ void initModule(PyObject* module) { return at::accelerator::getMemoryInfo(device_index); }); + m.def("_accelerator_getAllocatorSettings", []() { + return c10::CachingAllocator::getAllocatorSettings(); + }); + m.def("_accelerator_setAllocatorSettings", [](std::string env) { c10::CachingAllocator::setAllocatorSettings(env); }); From 5f21bad281c1046eb6017ffa513cf2a815ea82f2 Mon Sep 17 00:00:00 2001 From: Ethan Wee Date: Tue, 31 Mar 2026 11:58:40 -0700 Subject: [PATCH 58/87] [release/2.11] Enable wheels (#3111) http://rocm-ci.amd.com/job/pytorch2.11-manylinux-wheels_rel-7.2/ Enabling wheel build for release/2.11 --- .circleci/scripts/binary_populate_env.sh | 7 ++++- .github/scripts/build_triton_wheel.py | 36 +++++++++++++++++++++++- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index 74ad225db933b..c25e351768607 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -5,7 +5,9 @@ export TZ=UTC tagged_version() { GIT_DIR="${workdir}/pytorch/.git" GIT_DESCRIBE="git --git-dir ${GIT_DIR} describe --tags --match v[0-9]*.[0-9]*.[0-9]*" - if [[ ! 
-d "${GIT_DIR}" ]]; then + if [[ -n "${CIRCLE_TAG:-}" ]]; then + echo "${CIRCLE_TAG}" + elif [[ ! -d "${GIT_DIR}" ]]; then echo "Abort, abort! Git dir ${GIT_DIR} does not exists!" kill $$ elif ${GIT_DESCRIBE} --exact >/dev/null; then @@ -69,6 +71,8 @@ fi export PYTORCH_BUILD_NUMBER=1 +# This part is done in the builder scripts so commenting the duplicate code +: <<'BLOCK_COMMENT' # Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) TRITON_CONSTRAINT="platform_system == 'Linux'" @@ -110,6 +114,7 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}" fi fi +BLOCK_COMMENT USE_GLOO_WITH_OPENSSL="ON" if [[ "$GPU_ARCH_TYPE" =~ .*aarch64.* ]]; then diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index 34c6c3549f9c7..fa5f81bf06d9b 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import os +import re import shutil import sys from pathlib import Path @@ -51,6 +52,31 @@ def patch_init_py( f.write(orig) +def get_rocm_version() -> str: + rocm_path = os.environ.get('ROCM_HOME') or os.environ.get('ROCM_PATH') or "/opt/rocm" + rocm_version = "0.0.0" + rocm_version_h = f"{rocm_path}/include/rocm-core/rocm_version.h" + if not os.path.isfile(rocm_version_h): + rocm_version_h = f"{rocm_path}/include/rocm_version.h" + if os.path.isfile(rocm_version_h): + RE_MAJOR = re.compile(r"#define\s+ROCM_VERSION_MAJOR\s+(\d+)") + RE_MINOR = re.compile(r"#define\s+ROCM_VERSION_MINOR\s+(\d+)") + RE_PATCH = re.compile(r"#define\s+ROCM_VERSION_PATCH\s+(\d+)") + major, minor, patch = 0, 0, 0 + for line in open(rocm_version_h): + match = RE_MAJOR.search(line) + if match: + major = int(match.group(1)) + match = RE_MINOR.search(line) + if match: + minor = 
int(match.group(1)) + match = RE_PATCH.search(line) + if match: + patch = int(match.group(1)) + rocm_version = str(major) + "." + str(minor) + "." + str(patch) + return rocm_version + + def build_triton( *, version: str, @@ -65,13 +91,20 @@ def build_triton( max_jobs = os.cpu_count() or 1 env["MAX_JOBS"] = str(max_jobs) + version_suffix = "" + if not release: + rocm_version = get_rocm_version() + version_suffix = f"+rocm{rocm_version}.git{commit_hash[:8]}" + version += version_suffix + with TemporaryDirectory() as tmpdir: triton_basedir = Path(tmpdir) / "triton" triton_pythondir = triton_basedir / "python" triton_repo = "https://github.com/openai/triton" if device == "rocm": - triton_pkg_name = "triton-rocm" + triton_repo = "https://github.com/ROCm/triton" + triton_pkg_name = "triton" elif device == "xpu": triton_pkg_name = "triton-xpu" triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton" @@ -89,6 +122,7 @@ def build_triton( # change built wheel name and version env["TRITON_WHEEL_NAME"] = triton_pkg_name + env["TRITON_WHEEL_VERSION_SUFFIX"] = version_suffix if with_clang_ldd: env["TRITON_BUILD_WITH_CLANG_LLD"] = "1" From c8e635b1be2fcb9d8a7aa7ebdd0f8128afc5e28f Mon Sep 17 00:00:00 2001 From: Ethan Wee Date: Tue, 31 Mar 2026 12:24:33 -0700 Subject: [PATCH 59/87] [release/2.11] Only skip linalg.eig assertion in test_torch_return_types_returns (#3097) --- test/functorch/test_vmap.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py index 28572e528ebbd..7033b41c90f95 100644 --- a/test/functorch/test_vmap.py +++ b/test/functorch/test_vmap.py @@ -71,6 +71,7 @@ run_tests, skipIfTorchDynamo, subtest, + TEST_WITH_ROCM, TEST_WITH_TORCHDYNAMO, TestCase, unMarkDynamoStrictTest, @@ -5064,9 +5065,12 @@ def test_torch_return_types_returns(self, device): vmap(torch.topk, (0, None, None))(t, 1, 0), torch.return_types.topk ) ) - self.assertTrue( - isinstance(vmap(torch.linalg.eig, 
(0))(t), torch.return_types.linalg_eig) - ) + if not (TEST_WITH_ROCM and not torch.cuda.has_magma): + self.assertTrue( + isinstance( + vmap(torch.linalg.eig, (0))(t), torch.return_types.linalg_eig + ) + ) def test_namedtuple_returns(self, device): Point = namedtuple("Point", ["x", "y"]) From 8ecd5da996b21f178791f2ada27409d5ac6f3257 Mon Sep 17 00:00:00 2001 From: Arash Pakbin Date: Fri, 27 Feb 2026 19:21:06 +0000 Subject: [PATCH 60/87] [ROCm] Optimize RadixSelect synchronization overhead (#174837) **Summary:** This PR optimizes the `radixSelect` kernel on ROCm by reducing synchronization overhead when aggregating radix counts across warps. The previous implementation used 3 block-level `__syncthreads()` calls plus atomic operations on 4 radix buckets (contended by all warps). The new implementation uses 2 `__syncthreads()` calls with no atomic contention, reducing synchronization overhead and improving performance. **Background:** The `radixSelect` algorithm finds the k-th element by iteratively uncovering its bit pattern through multiple passes over the data. Each pass determines 2 bits of the top-k value's bitmap (up to 16 passes for float32). Each iteration involves: 1. Counting input elements that match the already uncovered pattern 2. Grouping them by radix bucket (4 buckets per iteration) 3. Aggregating counts across all warps 4. Broadcasting the aggregated counts back to all threads **Previous Implementation:** The original sequence for each iteration was: ```cpp initialize smem[RadixSize] to 0 __syncthreads() // Sync 1 count within warp if (lane_id == 0) { atomicAdd(&smem[i], counts[i]) // Atomic contention on 4 buckets } __syncthreads() // Sync 2 read back total counts from smem __syncthreads() // Sync 3 ``` This involved **3 synchronizations** and **atomic contention** on 4 buckets from all warps. 
**Changes:** * **Warp-level reduction without atomics:** - Each warp's lane 0 writes its counts to a dedicated location in shared memory - Warp 0's lanes perform parallel reduction: each lane reduces one radix bin across all warps - This eliminates atomic contention while maintaining correctness * **Double-buffering for concurrent iterations:** - Observation: Due to block-level synchronization, at most two consecutive iterations can be in flight simultaneously - When threads are in "section 2" (post-sync) of iteration `i`, other threads can only reach "section 1" (pre-sync) of iteration `i+1` and wait there - We use `buffer_index` (0 or 1) to alternate between two shared memory segments, allowing safe concurrent execution - This enables removing the first and last `__syncthreads()` calls, reducing from 3 to 2 synchronizations per iteration (2 = 3 - 2 + 1, where the +1 is required for the new warp-level aggregation step that replaces atomics) **Performance:** Measured on AMD MI350 (gfx950) using the single-block TopK operator, where RadixSelect accounts for ~80% of total latency for typical workloads.
- **Smaller datatypes (bfloat16, float16):** 4-5% improvement on smaller inputs, ~1% on larger inputs - **float32:** Similar improvements, slightly less pronounced - **Average improvement:** ~2% (weighted by larger input latencies) **Testing:** - Verified correctness across multiple data types (float32, float16, bfloat16) and input shapes - Tested with various K values to ensure correct behavior across all iteration counts - Performance benchmarks included below topk_latency_comparison **Testing code:** - benchmark code: [code](https://github.com/user-attachments/files/24484540/benchmark.py) Pull Request resolved: https://github.com/pytorch/pytorch/pull/174837 Approved by: https://github.com/jeffdaily --- aten/src/ATen/native/cuda/Sorting.cu | 14 +++ .../ATen/native/cuda/SortingRadixSelect.cuh | 117 +++++++++++++----- aten/src/ATen/native/cuda/TensorTopK.cu | 6 +- 3 files changed, 107 insertions(+), 30 deletions(-) diff --git a/aten/src/ATen/native/cuda/Sorting.cu b/aten/src/ATen/native/cuda/Sorting.cu index 1c4f06fe262e6..717e08140226e 100644 --- a/aten/src/ATen/native/cuda/Sorting.cu +++ b/aten/src/ATen/native/cuda/Sorting.cu @@ -31,7 +31,14 @@ __global__ void gatherKthValue( cuda::detail::TensorInfo indices) { // Indices are limited to integer fp precision, so counts can fit in // int32, regardless of index_t +#ifndef USE_ROCM __shared__ int smem[C10_WARP_SIZE]; // one per each warp, up to warp limit +#else + // Maximum shared memory size for radix select (used in countRadixAggregateCounts): NUM_BUFFERS * MAX_WARPS * RADIX_SIZE. + // HIP workgroups have at most 1024 threads. Warp size is at least 32 (can be 64 on some + // architectures), so we use 32 for safety: 2 buffers * (1024/32) warps * 4 radix bins = 256. + __shared__ int smem[256]; +#endif index_t slice = getLinearBlockId(); if (slice >= numInputSlices) { @@ -108,7 +115,14 @@ __global__ void gatherMedian( bool ignore_nan) { // Shared memory for the subroutine RadixSelect. 
Note that RadixSelect converts the // floating point type to int with the same relative ordering. +#ifndef USE_ROCM __shared__ int smem[C10_WARP_SIZE]; // one per each warp, up to warp limit +#else + // Maximum shared memory size for radix select (used in countRadixAggregateCounts): NUM_BUFFERS * MAX_WARPS * RADIX_SIZE. + // HIP workgroups have at most 1024 threads. Warp size is at least 32 (can be 64 on some + // architectures), so we use 32 for safety: 2 buffers * (1024/32) warps * 4 radix bins = 256. + __shared__ int smem[256]; +#endif index_t slice = getLinearBlockId(); if (slice >= numInputSlices) { diff --git a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh index 977fc76b295be..a82cff9f227d6 100644 --- a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh +++ b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh @@ -436,6 +436,76 @@ __device__ __forceinline__ void countRadixLoop( } } +// Aggregates radix matches across all warps and distributes results back to all threads. +// Uses double-buffering via buffer_index (0 or 1) to alternate between two smem segments, +// preventing race conditions between concurrent iterations. Since countRadixUsingMaskDataSmem +// performs __syncthreads() internally, at most two loop iterations can be in flight +// simultaneously, so two buffers are sufficient. buffer_index is toggled after each +// countRadixUsingMaskDataSmem invocation. +template < + typename CountType, + int RadixSize, + int RadixBits> +__device__ __forceinline__ void countRadixAggregateCounts( + CountType counts[RadixSize], // counts[i] will be the number of matching + // elements ((val & desiredMask) == desired) + // that have the digits [radixDigitPos, + // radixDigitPos+RADIX_BITS-1] set to i. + CountType* smem, // shared memory for inter-warp reduction of counts. + int buffer_index){ // buffer index for smem. + + // Maximum number of warps per workgroup. HIP workgroups have at most 1024 threads. 
+ // Warp size is at least 32 (can be 64 on some architectures), so we use 32 for safety. + // This sizes shared memory buffers to accommodate all possible warps: 1024/32 = 32. + constexpr uint MAX_WARPS = 1024/32; + const int buffer_offset = buffer_index * MAX_WARPS * RadixSize; // offset of the buffer in smem. + const uint WARP_BITS = __builtin_ctz(warpSize); + + const uint num_warps = blockDim.x >> WARP_BITS; // Actual number of warps in this block + const uint warp_id = threadIdx.x >> WARP_BITS; // = threadIdx.x / warpSize + const int lane_id = at::cuda::getLaneId(); // = threadIdx.x % warpSize + + // Stage 1: Each warp's lane 0 stores its counts in smem. + // Layout after Stage 1: [warp0: all radix bins], [warp1: all radix bins], ... + // this layout starts from index buffer_offset. + if (lane_id == 0) { +#pragma unroll + for (uint32_t i = 0; i < RadixSize; ++i) { + smem[ + buffer_offset + + warp_id * RadixSize + + i + ] = counts[i]; + } + } + + __syncthreads(); // wait for all warps to finish storing their counts to smem. + + // Stage 2: Warp0 performs reduction for all bins. + // Layout after Stage 2: [final radix0 sum], [final radix1 sum], ..., [final radix(RadixSize-1) sum] + // this layout starts from index buffer_offset. + if (warp_id == 0 && lane_id < RadixSize) { + CountType sum = 0; +#pragma unroll + for (int w = 0; w < num_warps; ++w) { + sum += smem[ + buffer_offset + + w * RadixSize + + lane_id + ]; + } + smem[buffer_offset + lane_id] = sum; + } + + __syncthreads(); // Wait for warp 0 to finish reduction. + + // Stage 3: Each thread reads the final counts from smem. +#pragma unroll + for (uint32_t i = 0; i < RadixSize; ++i) { + counts[i] = smem[buffer_offset + i]; + } +} + // This function counts the distribution of all input values in a // slice we are selecting by radix digit at `radixDigitPos`, but only // those that pass the filter `((v & desiredMask) == desired)`. 
@@ -457,6 +527,7 @@ __device__ void countRadixUsingMaskDataSmem( // digits [radixDigitPos, radixDigitPos+RADIX_BITS-1] // set to i in the warp. CountType* smem, // shared memory for inter-warp reduction of counts. + int buffer_index, // buffer index for smem. bitwise_t desired, // combined with desiredMask to filter relevant elements. An // element is relevant if ((val & desiredMask) == desired). @@ -479,14 +550,6 @@ __device__ void countRadixUsingMaskDataSmem( counts[i] = 0; // initialize counts to 0. } - // initialize smem to 0. This is for reduction of counts across all warps. - if (threadIdx.x < RadixSize) { - smem[threadIdx.x] = 0; - } - - __syncthreads(); // wait for all threads in the block to finish initializing - // smem. - // count the distribution of the bits in the radix digit at `radixDigitPos` to // `radixDigitPos`+RADIX_BITS-1 for values that match the desired pattern // ((val & desiredMask) == desired). counts[] will hold the results for the @@ -512,27 +575,11 @@ __device__ void countRadixUsingMaskDataSmem( }); } - // accumulate the counts across all warps. - // sum for each warp is added to smem by thread 0 in the warp. - if (at::cuda::getLaneId() == 0) { -#pragma unroll - for (uint32_t i = 0; i < RadixSize; ++i) { - gpuAtomicAddNoReturn( - &smem[i], - counts[i]); // thread0 in warp atomically adds the counts to smem. - } - } - - __syncthreads(); // wait for all warps to finish adding their counts to smem. - -// each thread reads the final counts from smem. -#pragma unroll - for (uint32_t i = 0; i < RadixSize; ++i) { - counts[i] = smem[i]; - } - - __syncthreads(); // wait for all threads in the block to finish reading the - // counts. + // aggregate counts across all warps and distribute results back to all threads. 
+ countRadixAggregateCounts( + counts, + smem, + buffer_index); } // This is the main loop of the findPattern function that finds the unique value @@ -849,6 +896,14 @@ __device__ void radixSelect( __syncthreads(); // so the initialization is visible to all threads in the // blocks. + // buffer index for smem. We use two segments of smem for inter-warp communication of counts. + // Given the counting operation in countRadixUsingMaskDataSmem performs __syncthreads() internally, + // we need to alternate between the at most two segments of smem to avoid race conditions. + // No more than two iterations of the loop will be "in flight" at any given time because + // of the __syncthreads() in countRadixUsingMaskDataSmem. + // buffer_index is either 0 or 1. It is toggled after each countRadixUsingMaskDataSmem invocation. + int buffer_index = 0; + #endif // We only consider elements x such that (x & desiredMask) == desired @@ -895,6 +950,7 @@ __device__ void radixSelect( RADIX_BITS>( counts, smem, + buffer_index, desired, desiredMask, digitPos, @@ -903,6 +959,9 @@ __device__ void radixSelect( data, dataSmem, dataSmemSize); + + buffer_index ^= 1; // toggle buffer index. + #else countRadixUsingMask< scalar_t, diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu index e3025bace508e..1b8585ff3beab 100644 --- a/aten/src/ATen/native/cuda/TensorTopK.cu +++ b/aten/src/ATen/native/cuda/TensorTopK.cu @@ -257,7 +257,11 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo inpu // Indices are limited to integer fp precision, so counts can fit in // int32, regardless of IndexType - __shared__ int smem[64]; + + // Maximum shared memory size for radix select (used in countRadixAggregateCounts): NUM_BUFFERS * MAX_WARPS * RADIX_SIZE. + // HIP workgroups have at most 1024 threads. Warp size is at least 32 (can be 64 on some + // architectures), so we use 32 for safety: 2 buffers * (1024/32) warps * 4 radix bins = 256. 
+ __shared__ int smem[256]; __shared__ int writeIndexStart; // index to track where to write results. This is shared by all threads in the block. Increases atomically. IndexType slice = getLinearBlockId(); From e0fbdce0b47d00092fd9c84d9e9c3fecba998a93 Mon Sep 17 00:00:00 2001 From: Arash Pakbin Date: Tue, 24 Feb 2026 16:54:35 +0000 Subject: [PATCH 61/87] [ROCm] RadixSelect: Remove loop padding and make prefetching conditional (#174897) **Summary:** This PR optimizes the `radixSelect` kernel on ROCm by removing unnecessary loop padding and making prefetching conditional. The previous implementation padded loop bounds to ensure all threads participate in warp-level operations, which added overhead. This does not seem necessary as other parts in PyTorch have not been doing it (see [example](https://github.com/pytorch/pytorch/blob/c8062c4fe279e840407ebf9e2457573498bee464/aten/src/ATen/native/cuda/TensorTopK.cu#L120)). Additionally, prefetching was always enabled even when accessing shared memory, where it provides no benefit and hurts performance by adding long dependency chains within the loop. This PR makes prefetching a compile-time conditional feature and removes the padding overhead. **Previous Implementation:** - Loop bounds were padded: `i < round_up(loopBound, warpSize)` - Work was guarded: `if (i < loopBound) { ... 
}` - Prefetching in `countRadixLoop` was always enabled, even for shared memory access **Changes:** * **Removed loop padding:** - Changed loop bounds from `round_up(loopBound, warpSize)` to `loopBound` - Removed `if (i < loopBound)` guards since padding is no longer needed * **Conditional prefetching:** - Added `bool prefetch` template parameter to `countRadixLoop` function - Prefetching is enabled (`prefetch = true`) only for global memory access - Prefetching is disabled (`prefetch = false`) for shared memory access, where it hurts performance - Uses `if constexpr` for compile-time optimization, ensuring zero runtime overhead **Performance:** Measured on AMD MI350 (gfx950) using single-block TopK operator. - **Overall average improvement:** ~2.4% across all tested configurations - **By data type:** - float16: ~3.5% average improvement (best gains) - bfloat16: ~2.0% average improvement - float32: ~1.6% average improvement - **By input size:** - Small inputs (10K and 100K elements): ~4.6% improvement - Medium inputs (1M elements): ~2.4% improvement - Large inputs (10M and 100M elements): ~0.2% improvement - The optimization provides the greatest benefit on smaller inputs and half-precision types, where the overhead of padding and unnecessary prefetching has a larger relative impact. Although some regressions occur (primarily on float32 inputs), the overall impact remains positive across all data types, with float32 still achieving a 1.6% average improvement. topk_latency_comparison **Testing:** - Verified correctness across multiple data types (float32, float16, bfloat16) and input shapes - Tested with various K values and input sizes to ensure correct behavior - Confirmed that warp-level operations (`WARP_BALLOT`) work correctly without padding **Benchmark code:** See [benchmark.py](https://github.com/user-attachments/files/24484540/benchmark.py) for the performance measurement script. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/174897 Approved by: https://github.com/jerrymannil, https://github.com/jeffdaily --- .../ATen/native/cuda/SortingRadixSelect.cuh | 77 ++++++++++--------- 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh index a82cff9f227d6..afde242cc98b7 100644 --- a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh +++ b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh @@ -318,6 +318,7 @@ template < typename CountType, int RadixSize, int RadixBits, + bool prefetch, typename DataAccessor> __device__ __forceinline__ void countRadixLoop( CountType counts[RadixSize], // counts[i] will be the number of matching @@ -395,32 +396,38 @@ __device__ __forceinline__ void countRadixLoop( // phase 2: processing 1 element at an iteration. - // prefetching. This is specifically useful for global memory access. - scalar_t v = unroll_segment + threadIdx.x < loopBound - ? getData(unroll_segment + threadIdx.x) - : static_cast(0); - // we pad loopbound to round_up(loopbound, warpSize) to make sure all threads - // in the warp participate in the ballot. + // prefetching pattern if prefetch is true. + // prefetching pattern is only useful for global memory access. + scalar_t v_curr; + if constexpr (prefetch) { + v_curr = unroll_segment + threadIdx.x < loopBound + ? getData(unroll_segment + threadIdx.x) + : static_cast(0); + } for (index_t i = unroll_segment + threadIdx.x; - i < round_up( - static_cast(loopBound), static_cast(warpSize)); + i < loopBound; i += blockDim.x) { - // prefetch the next element. - scalar_t v_next = i + blockDim.x < loopBound ? getData(i + blockDim.x) - : static_cast(0); - - bool hasVal = false; - bitwise_t digitInRadix = static_cast(0); - if (i < loopBound) { - bitwise_t val = TopKTypeConfig::convert(v); - // check if bit pattern matches the pattern we have already discovered for - // topk value v. 
- hasVal = ((val & desiredMask) == desired); - // get the bits [radixDigitPos, radixDigitPos+RADIX_BITS-1] of the value - // v. - digitInRadix = at::cuda::Bitfield::getBitfield( - val, radixDigitPos, RadixBits); - } + scalar_t v_local; // the current element. + scalar_t v_next; // the next element. Used for prefetching. + + if constexpr (prefetch) { + // prefetch the next element. + v_local = v_curr; + v_next = i + blockDim.x < loopBound ? getData(i + blockDim.x) + : static_cast(0); + } + else { + v_local = getData(i); // if no prefetching, just get the current element. + } + + bitwise_t val = TopKTypeConfig::convert(v_local); + // check if bit pattern matches the pattern we have already discovered for + // topk value v. + bool hasVal = ((val & desiredMask) == desired); + // get the bits [radixDigitPos, radixDigitPos+RADIX_BITS-1] of the value + // v. + bitwise_t digitInRadix = at::cuda::Bitfield::getBitfield( + val, radixDigitPos, RadixBits); // counting across the warp. #pragma unroll @@ -432,7 +439,9 @@ __device__ __forceinline__ void countRadixLoop( counts[j] += __popcll(WARP_BALLOT(vote)); } - v = v_next; // closing the prefetching loop. + if constexpr (prefetch) { + v_curr = v_next; // closing the prefetching loop. + } } } @@ -556,7 +565,7 @@ __device__ void countRadixUsingMaskDataSmem( // current warp. if (dataSmemSize > 0) { // if shared memory is filled, use dataSmem as the input data. - countRadixLoop( + countRadixLoop( counts, desired, desiredMask, @@ -564,7 +573,7 @@ __device__ void countRadixUsingMaskDataSmem( dataSmemSize, [&](index_t i) -> scalar_t { return dataSmem[i]; }); } else { // if shared memory is not filled, fall back to global memory. - countRadixLoop( + countRadixLoop( counts, desired, desiredMask, @@ -787,21 +796,15 @@ __device__ __forceinline__ void fillDataSmem( scalar_t v = threadIdx.x < sliceSize ? 
doLdg(&data[threadIdx.x * withinSliceStride]) : static_cast(0); - // we pad sliceSize to round_up(sliceSize, warpSize) to make sure all - // threads in the warp participate in the ballot. - for (index_t i = threadIdx.x; i < - round_up(static_cast(sliceSize), - static_cast(warpSize)); + + for (index_t i = threadIdx.x; i < sliceSize; i += blockDim.x) { scalar_t v_next = (i + blockDim.x) < sliceSize ? doLdg(&data[(i + blockDim.x) * withinSliceStride]) : static_cast(0); - bool match = false; - if (i < sliceSize) { - match = - ((TopKTypeConfig::convert(v) & desiredMask) == desired); - } + bool match = + (TopKTypeConfig::convert(v) & desiredMask) == desired; // Warp-level ballot uint64_t ballot = WARP_BALLOT( From ccba6b0ee03e1b194bc3ffd0fd4d5a4a2a55a843 Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Thu, 12 Mar 2026 02:59:57 +0000 Subject: [PATCH 62/87] [ROCm] fix radixselect (#177149) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: https://github.com/pytorch/pytorch/pull/174837 (D94770109) introduced a race condition situation. ## symptom running it in reference service under high qps, the service will crash on such error: `HSA_STATUS_ERROR_EXCEPTION: An HSAIL operation resulted in a hardware exception. code: 0x1016` ## The Race After countRadixAggregateCounts Stage 3 (line 514): counts[i] = smem[buffer_offset + i]; // ALL threads read, buffer_offset could be 0 // NO __syncthreads() — function returns immediately Then in the radixSelect main loop: buffer_index ^= 1; // line 966, toggle // ... bucket loop evaluates counts[] ... found_unique(i, count) fires → findPatternDataSmem((scalar_t*)smem) // line 1014 smem[0] = (scalar_t)0; // line 689 — WRITE to smem[0] smem[1] = (scalar_t)0; // line 690 — WRITE to smem[1] __syncthreads(); // line 693 When buffer_index = 0 was used for counting, Stage 3 reads from smem[0..3]. Then findPatternDataSmem writes smem[0] and smem[1] (cast to scalar_t*, same physical memory). 
There is no __syncthreads() between these reads and writes. Since warps execute independently, warp 0 (containing thread 0) can reach line 689 and write smem[0] while a lagging warp is still at line 514 reading smem[0]. ## Why This Can Be Dangerous The corruption writes scalar_t(0) (bit pattern 0x00000000 for float) over the int count values. If the lagging warp reads a corrupted counts[0] or counts[1] (now 0 instead of the real count): 1. Divergent control flow: The lagging warp's found_unique(i, count) check uses the corrupted count. If the corrupted bucket was the one that should have triggered found_unique (count was 1, now reads as 0), that warp skips it and falls through the bucket loop without returning. 2. Mismatched __syncthreads(): Warp 0 is inside findPatternLoop which has __syncthreads() at lines 646 and 652. The lagging warp is NOT in findPatternLoop — it continued to the next digitPos iteration, hitting __syncthreads() inside fillDataSmem or countRadixAggregateCounts. Divergent __syncthreads() is undefined behavior on GPUs and can cause hangs or crashes. 3. Incorrect kth_value: Even if it doesn't hang, the lagging warp may compute a different kth_value entirely. When the kernel later uses this for the gather phases, it could: - Collect the wrong number of elements → CUDA_KERNEL_ASSERT(write_index < k) → ASSERT_TRAP (exactly matching the error in D94938108's description) - Or produce silently incorrect TopK results When Can This Happen?
┌───────────────────────────────────────────────┬───────────────────────────────────────────────────────────────────────────┐ │ Condition │ Likelihood │ ├───────────────────────────────────────────────┼───────────────────────────────────────────────────────────────────────────┤ │ buffer_index = 0 during final counting │ 50% per radixSelect call │ ├───────────────────────────────────────────────┼───────────────────────────────────────────────────────────────────────────┤ │ found_unique fires (count == 1, kToFind == 1) │ Common — happens in final radix iterations │ ├───────────────────────────────────────────────┼───────────────────────────────────────────────────────────────────────────┤ │ Sufficient warp skew (~100+ instructions) │ Rare normally, but increases under GPU memory pressure / high utilization │ └───────────────────────────────────────────────┴───────────────────────────────────────────────────────────────────────────┘ Test Plan: hard to repro in single test case. we run it in a service under high qps, it will crash after the qps reaches some high number. with this fix, the job can succeed - https://www.internalfb.com/vanguard/serving_test_cases/909207988628161 Differential Revision: D96015407 Pull Request resolved: https://github.com/pytorch/pytorch/pull/177149 Approved by: https://github.com/jeanschmidt --- aten/src/ATen/native/cuda/SortingRadixSelect.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh index afde242cc98b7..faaf1b06d7989 100644 --- a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh +++ b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh @@ -513,6 +513,7 @@ __device__ __forceinline__ void countRadixAggregateCounts( for (uint32_t i = 0; i < RadixSize; ++i) { counts[i] = smem[buffer_offset + i]; } + __syncthreads(); // Wait for all threads to finish reading the final counts. 
} // This function counts the distribution of all input values in a From 7a10d22bd2662cd9113bb2cf88166875464035ff Mon Sep 17 00:00:00 2001 From: Arash Pakbin Date: Tue, 31 Mar 2026 12:25:41 +0000 Subject: [PATCH 63/87] [ROCm] Reduce RadixSelect sync overhead by moving __syncthreads to findPatternDataSmem (#178188) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary PR #177149 fixed a race condition introduced by #174837: after `countRadixAggregateCounts` Stage 3 reads counts from smem, warp 0 may get ahead of lagging warps still in Stage 3 and call `findPatternDataSmem`, overwriting `smem[0]`/`smem[1]` while lagging warps are still reading `smem[buffer_offset + i]` (which overlaps with `smem[0]`/`smem[1]` when `buffer_offset == 0`). The fix placed a `__syncthreads()` at the end of Stage 3, which runs on every iteration of the radix digit loop, negating part of the synchronization overhead that #174837 worked to eliminate. This patch moves that sync to the **beginning of `findPatternDataSmem`** instead. ## Why this is correct 1. All threads evaluate the same `counts[]` values and all reach `found_unique()` together, so `__syncthreads()` inside `findPatternDataSmem` is collectively reachable by all threads in the block. 2. By the time any thread enters `findPatternDataSmem`, every thread has already finished reading Stage 3 (they all had to evaluate the bucket loop to get here), so syncing before the `smem[0]`/`smem[1]` writes is sufficient to prevent the race. ## Performance `findPatternDataSmem` is called **at most once** per `radixSelect` invocation — only when `count == 1` (a unique element is identified), at which point the function returns immediately. The removed sync ran on every radix digit iteration (up to 16 times for float32). This saves up to 15 `__syncthreads()` calls in the common case. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/178188 Approved by: https://github.com/jeffdaily, https://github.com/jeanschmidt --- aten/src/ATen/native/cuda/SortingRadixSelect.cuh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh index faaf1b06d7989..c6e5f299a0bb3 100644 --- a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh +++ b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh @@ -513,7 +513,6 @@ __device__ __forceinline__ void countRadixAggregateCounts( for (uint32_t i = 0; i < RadixSize; ++i) { counts[i] = smem[buffer_offset + i]; } - __syncthreads(); // Wait for all threads to finish reading the final counts. } // This function counts the distribution of all input values in a @@ -684,6 +683,15 @@ __device__ scalar_t findPatternDataSmem( const scalar_t* dataSmem, // input data stored in shared memory. index_t dataSmemSize) { // input data size stored in shared memory. + // Ensure all threads have finished reading from smem before overwriting it. + // countRadixAggregateCounts Stage 3 reads from smem[buffer_offset + i]; + // when buffer_offset == 0, those locations overlap with smem[0]/smem[1] + // written below. Warp 0 (which writes smem[0]/smem[1]) may get ahead of + // lagging warps still in Stage 3. Syncing here (rather than at the end of + // Stage 3) is cheaper because findPatternDataSmem is called at most once per + // radixSelect invocation, only when a unique element is found (count == 1). + __syncthreads(); + // initialize smem to 0. // smem[0] is a flag to indicate if a value has been found. // smem[1] is the found value. From f5a3aa1b70625cfa410b8f37bbe06bde199ca46d Mon Sep 17 00:00:00 2001 From: Ethan Wee Date: Fri, 3 Apr 2026 07:14:29 -0700 Subject: [PATCH 64/87] [release/2.11] Use triton with updated pyproject.toml to use cmake 4 (#3124) Fixes internal CI build failures on release/2.11 due to triton build. 
Build was able to pass the point where triton failed previously. e.g. https://ml-ci-internal.amd.com/blue/organizations/jenkins/pytorch%2Fpytorch-ci-pipeline/detail/release%2F2.10/31/pipeline With our change to triton pin: https://ml-ci-internal.amd.com/job/pytorch/job/pytorch-ci-pipeline/job/PR-3124/2/pipeline-overview/ --- .ci/docker/ci_commit_pins/triton.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index 0a2a5f707f24f..f0849cc7d8f63 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -b31789602ee0e40b06a1fbc6e63dfae6df7e131d +4ed888920c5a0871957f1cf912e557bc79fbe56c From a5c71cc3527efe43e3b3914ef3c2223be444f5b5 Mon Sep 17 00:00:00 2001 From: Xinya Zhang Date: Fri, 3 Apr 2026 11:57:51 -0500 Subject: [PATCH 65/87] [release/2.11] [ROCm] Fix test/dynamo/test_repros.py::ReproTestsDeviceCUDA::test_flash_attn_backward_mixed_strides_cuda#179086 (#3127) `dv` tensor should be created with `empty_like(v)` rather than `empty_like(k)`. 
This fixes #168540, #168541, and supersedes #178499 This is cherry-picked from upstream PR https://github.com/pytorch/pytorch/pull/179086 --- .../ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip | 2 +- test/dynamo/test_repros.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip b/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip index b96e80d5e5a9e..e809f23e61def 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip @@ -635,7 +635,7 @@ mha_bwd_aot(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x hea TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension"); CHECK_SHAPE(dv, batch_size, seqlen_k, num_heads_k, head_size); } else { - dv = at::empty_like(k); + dv = at::empty_like(v); } auto [needs_swa, window_left, window_right] = calculate_swa(window_size_left, diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py index 65881c21b93c6..747fc7a03308f 100644 --- a/test/dynamo/test_repros.py +++ b/test/dynamo/test_repros.py @@ -7990,7 +7990,7 @@ def f(): @skipIfHpu @unittest.skipIf( - TEST_WITH_ROCM or not PLATFORM_SUPPORTS_FLASH_ATTENTION, + not PLATFORM_SUPPORTS_FLASH_ATTENTION, "flash attention not supported", ) def test_flash_attn_backward_mixed_strides(self, device): From 8f4963dd3b6edc4f0e710cf4bdc2bd19fdc7a8a0 Mon Sep 17 00:00:00 2001 From: Ethan Wee Date: Fri, 3 Apr 2026 12:31:59 -0700 Subject: [PATCH 66/87] [release/2.11][CI] Add related_commits file (#3131) Build validation: http://rocm-ci.amd.com/job/pytorch2.11-manylinux-wheels_rel-7.2/7/ : Connection issues https://github.com/ROCm/TheRock/actions/runs/23953043418/job/69864879059 : Build succeeded --------- Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> --- related_commits | 10 ++++++++++ 1 file changed, 10 
insertions(+) create mode 100644 related_commits diff --git a/related_commits b/related_commits new file mode 100644 index 0000000000000..cbc1b48c2931b --- /dev/null +++ b/related_commits @@ -0,0 +1,10 @@ +ubuntu|pytorch|apex|release/1.11.0|4fe55b966de2458e4591bed2b0c0f990ffcca683|https://github.com/ROCm/apex +centos|pytorch|apex|release/1.11.0|4fe55b966de2458e4591bed2b0c0f990ffcca683|https://github.com/ROCm/apex +ubuntu|pytorch|torchvision|release/0.26|336d36e8db990a905498c73933e35231876e28bc|https://github.com/pytorch/vision +centos|pytorch|torchvision|release/0.26|336d36e8db990a905498c73933e35231876e28bc|https://github.com/pytorch/vision +ubuntu|pytorch|torchdata|release/0.11|377e64c1be69a9be6649d14c9e3664070323e464|https://github.com/pytorch/data +centos|pytorch|torchdata|release/0.11|377e64c1be69a9be6649d14c9e3664070323e464|https://github.com/pytorch/data +ubuntu|pytorch|torchaudio|release/2.11|34c52a67e8941bbd8e6adaca0eb0b9eabec11d78|https://github.com/pytorch/audio +centos|pytorch|torchaudio|release/2.11|34c52a67e8941bbd8e6adaca0eb0b9eabec11d78|https://github.com/pytorch/audio +ubuntu|pytorch|ao|release/0.17.0|afb2844be99514f0d5ff42badd9c3ed0d1811d73|https://github.com/pytorch/ao +centos|pytorch|ao|release/0.17.0|afb2844be99514f0d5ff42badd9c3ed0d1811d73|https://github.com/pytorch/ao From 4e323059f79a8dd73a4770b9d6f9f234f865e64e Mon Sep 17 00:00:00 2001 From: "Nichols A. 
Romero" Date: Tue, 24 Mar 2026 23:05:41 +0000 Subject: [PATCH 67/87] [ROCm] Require rocm_smi package (#175648) Fixes #158725 This is essentially @AngryLoki patch: https://github.com/gentoo/gentoo/blob/8cdbe88fa388ce264d1d70047222fcad190fec3d/sci-ml/caffe2/files/caffe2-2.9.0-rocm-distributed-link.patch Pull Request resolved: https://github.com/pytorch/pytorch/pull/175648 Approved by: https://github.com/jeffdaily, https://github.com/mlazos (cherry picked from commit 9bff6e149a649234c146fdae8058fb035bfb43b7) --- cmake/Dependencies.cmake | 7 +++++++ cmake/public/LoadHIP.cmake | 1 + 2 files changed, 8 insertions(+) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index dac0d1f41c3bd..203cdc7c029db 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1089,6 +1089,13 @@ if(USE_ROCM) ) endif() + # ROCM-SMI needed to support symmetric memory + if(USE_DISTRIBUTED AND UNIX) + list(APPEND Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS + rocm_smi64 + ) + endif() + # ---[ Kernel asserts # Kernel asserts is disabled for ROCm by default. # It can be turned on by turning on the env USE_ROCM_KERNEL_ASSERT to the build system. diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake index 3ff7b3d2c1b36..78b8acfe9db9a 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake @@ -198,6 +198,7 @@ if(HIP_FOUND) if(UNIX) find_package_and_print_version(rccl) find_package_and_print_version(hsa-runtime64 REQUIRED) + find_package_and_print_version(rocm_smi REQUIRED) endif() # Optional components. 
From 0446f7ba2fdcc4ffd2921949bf20f86d79677544 Mon Sep 17 00:00:00 2001 From: Yanyao Wang Date: Thu, 9 Apr 2026 11:42:50 -0500 Subject: [PATCH 68/87] [release/2.11] Fix numpy compatibility for Python 3.14 (#3100) (#3143) ## Motivation Fix numpy compatibility for Python 3.14 for release/2.11 ## Technical Details - `numpy==2.1.2` has no cp314 wheels on PyPI, causing Python 3.14 builds in TheRock CI to fail with a meson/sccache error when pip falls back to building numpy from source - Add `python_version` markers to use `numpy==2.4.3` for Python 3.14+, while keeping the existing `numpy==2.1.2` pin for older Python versions ## Submission Checklist - [x] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. Co-authored-by: Subodh Dubey --- requirements-build.txt | 3 ++- requirements.txt | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements-build.txt b/requirements-build.txt index 88a80dfaf1b30..7ca3c2cb1cb9c 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -5,7 +5,8 @@ setuptools==79.0.1 cmake==4.0.0 ninja==1.11.1.4 numpy==2.0.2 ; python_version == "3.9" -numpy==2.1.2 ; python_version > "3.9" +numpy==2.1.2 ; python_version > "3.9" and python_version < "3.14" +numpy==2.4.3 ; python_version >= "3.14" packaging==25.0 pyyaml==6.0.3 requests==2.32.5 diff --git a/requirements.txt b/requirements.txt index f8b6ebfd25ce1..ceb41d722e320 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,8 @@ lintrunner==0.12.11 ; platform_machine != "s390x" networkx==2.8.8 ninja==1.11.1.4 numpy==2.0.2 ; python_version == "3.9" -numpy==2.1.2 ; python_version > "3.9" +numpy==2.1.2 ; python_version > "3.9" and python_version < "3.14" +numpy==2.4.3 ; python_version >= "3.14" optree==0.13.0 ; python_version < "3.14" optree==0.17.0 ; python_version >= "3.14" psutil==7.2.2 From 8543095e3275db694084a6679bd5b61f7d2ece76 Mon Sep 17 00:00:00 2001 From: Ken Date: Thu, 16 Apr 2026 11:38:05 
-0500 Subject: [PATCH 69/87] [ROCm][CI][release/2.11] Backport checking existence of /etc/rocm_env.sh before sourcing (#3163) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Fixes the `pytorch_ut` failure introduced in PyTorch 2.11 where `test.sh` exits immediately with code 1 before any tests run. **Root cause:** PR pytorch/pytorch#168377 added `source /etc/rocm_env.sh` to `.ci/pytorch/common.sh` targeting AMD's internal Jenkins CI, which provisions this file. When cherry-picked into `release/2.11`, this line breaks all TheRock Docker-based CI environments that do **not** provision `/etc/rocm_env.sh`. Since `set -e` is active in `test.sh`, the script exits before a single test runs — causing 0-pass, 1-fail on every host. **The fix:** Add a `[[ -f /etc/rocm_env.sh ]]` existence check so environments without the file skip sourcing it gracefully, while Jenkins CI (which does provision the file) continues working as before. This matches the fix already present on `pytorch/pytorch main`. 
```bash # Before (broken): if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then source /etc/rocm_env.sh fi # After (fixed): if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]] && [[ -f /etc/rocm_env.sh ]]; then source /etc/rocm_env.sh fi ``` **Impact without this fix:** - 86/97 `pytorch_ut` runs failed on TheRock build 7.13.0-1208 - Affects all GFX variants and Python versions (3.11, 3.12, 3.13) - PyTorch 2.10 is unaffected (does not have `source /etc/rocm_env.sh`) **References:** - Jira: ROCM-21809 - Upstream issue: pytorch/pytorch#170983 - Regression introduced by: pytorch/pytorch#168377 --- .ci/pytorch/common.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/pytorch/common.sh b/.ci/pytorch/common.sh index 072b8da9b10c6..eae12816fe71e 100644 --- a/.ci/pytorch/common.sh +++ b/.ci/pytorch/common.sh @@ -6,7 +6,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" set -ex -o pipefail # for ROCm environment variables -if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then +if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]] && [[ -f /etc/rocm_env.sh ]]; then # shellcheck disable=SC1091 source /etc/rocm_env.sh fi From 520641b7cdcabd7dca4c3301fd054c5948c7ffae Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Mon, 20 Apr 2026 09:45:04 -0700 Subject: [PATCH 70/87] [release/2.11] Fix int4mm device memcpy error on Windows (#175410) (#3164) On Windows with HIP/ROCm, std::memcpy is a __host__ function and cannot be called from __device__ code. Use raw memcpy (which the HIP compiler provides as a device builtin) when building on Windows. This will allow builds for of pytorch for gfx942 on Windows. gfx950 is yet to be tested but it should likely build as well. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/175410 Approved by: https://github.com/jeffdaily Co-authored-by: Aaryaman Vasishta --- aten/src/ATen/native/cuda/int4mm.cu | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/aten/src/ATen/native/cuda/int4mm.cu b/aten/src/ATen/native/cuda/int4mm.cu index ca00c944b3259..8765bed83345a 100644 --- a/aten/src/ATen/native/cuda/int4mm.cu +++ b/aten/src/ATen/native/cuda/int4mm.cu @@ -576,7 +576,14 @@ struct BLayout_TC_int4 { // type pun, the __nv_bfloat162 value in bf16x2x4 is a struct and // can't be used as a 32-bit asm register argument for `mma` static_assert(sizeof(bf16x2x4) == sizeof(out[0][0]), ""); + // On Windows with ROCm, std::memcpy resolves to a __host__-only + // function and cannot be called from __device__ code. Use the raw + // memcpy which the HIP compiler provides as a __device__ builtin. +#if defined(_WIN32) && defined(USE_ROCM) + memcpy(&out[i][j], &v, sizeof(bf16x2x4_u32)); +#else std::memcpy(&out[i][j], &v, sizeof(bf16x2x4_u32)); +#endif } } } From 141ba657575b42e5d0869002b509af4a75899edc Mon Sep 17 00:00:00 2001 From: "Nichols A. Romero" <165712832+naromero77amd@users.noreply.github.com> Date: Tue, 21 Apr 2026 10:06:00 -0500 Subject: [PATCH 71/87] [UP][release/2.11] [ROCm][TunableOp] Support FP64 on hipBLASLt (#178195) (#3169) Cherry-pick of upstream https://github.com/pytorch/pytorch/pull/178195 into `release/2.11`. Related PR: - https://github.com/ROCm/pytorch/pull/3168 ## Motivation For MI350, FP64 is supported in hipBLASLt. This PR enables FP64 on hipBLASLt in TunableOp and re-enables the FP64 unit test on MI350. ## Technical Details - Map `double` GEMM to `HIPBLAS_COMPUTE_64F` via a new `HipBlasComputeTypeFor()` helper (defaults to `HIPBLAS_COMPUTE_32F`, specialized to `HIPBLAS_COMPUTE_64F` for `double`). - Use `at::opmath_type`-typed `alpha` / `beta` in the hipBLASLt path so FP64 tuning and execution use consistent compute semantics. 
- Set the matmul descriptor scale type with `HipDataTypeFor()`. - Guard the TF32 override with `if constexpr (std::is_same_v)` so FP64 doesn't get downgraded. - Removes the MI350 skip on `test_matmul_small_brute_force_tunableop_cuda_float64`. The cherry-pick applied cleanly (no conflicts). ## Test Plan Build PyTorch on MI350 with ROCm, then run: \`\`\` PYTORCH_TEST_WITH_ROCM=1 python test/test_linalg.py -v -k tunableop \`\`\` ## Test Result \`\`\` Ran 69 tests in 156.726s OK (skipped=42) \`\`\` All tunableop tests pass. Skipped tests are CPU-only variants and gfx942-only variants (FP8/TF32). Upstream PR: https://github.com/pytorch/pytorch/pull/178195 Upstream commit: 0550897ab3dcb3627dba1cfa43fd238fa4358418 Made with [Cursor](https://cursor.com) --- aten/src/ATen/cuda/tunable/GemmHipblaslt.h | 56 ++++++++++++++-------- test/test_linalg.py | 3 -- 2 files changed, 36 insertions(+), 23 deletions(-) diff --git a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h index 29affa2d21ff1..29c15720f4a66 100644 --- a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h +++ b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h @@ -94,6 +94,16 @@ constexpr hipDataType HipDataTypeFor() { #endif } +template +constexpr hipblasComputeType_t HipBlasComputeTypeFor() { + return HIPBLAS_COMPUTE_32F; +} + +template <> +constexpr hipblasComputeType_t HipBlasComputeTypeFor() { + return HIPBLAS_COMPUTE_64F; +} + template int GetBatchFromParams(const GemmParams* params) { return 1; @@ -175,43 +185,43 @@ int GetStrideCFromParams(const ScaledGemmParams* params) { } template -float GetAlphaFromParams(const GemmParams* params) { +at::opmath_type GetAlphaFromParams(const GemmParams* params) { return params->alpha; } template -float GetAlphaFromParams(const GemmAndBiasParams* params) { +at::opmath_type GetAlphaFromParams(const GemmAndBiasParams* params) { return params->alpha; } template -float GetAlphaFromParams(const GemmStridedBatchedParams* params) { +at::opmath_type 
GetAlphaFromParams(const GemmStridedBatchedParams* params) { return params->alpha; } template -float GetAlphaFromParams(const ScaledGemmParams* params) { - return 1.0; +at::opmath_type GetAlphaFromParams(const ScaledGemmParams* params) { + return at::opmath_type{1.0}; } template -float GetBetaFromParams(const GemmParams* params) { +at::opmath_type GetBetaFromParams(const GemmParams* params) { return params->beta; } template -float GetBetaFromParams(const GemmAndBiasParams* params) { - return 0.0; +at::opmath_type GetBetaFromParams(const GemmAndBiasParams* params) { + return at::opmath_type{0.0}; } template -float GetBetaFromParams(const GemmStridedBatchedParams* params) { +at::opmath_type GetBetaFromParams(const GemmStridedBatchedParams* params) { return params->beta; } template -float GetBetaFromParams(const ScaledGemmParams* params) { - return 0.0; +at::opmath_type GetBetaFromParams(const ScaledGemmParams* params) { + return at::opmath_type{0.0}; } template @@ -467,8 +477,9 @@ class HipblasltGemmOp : public Callable { TORCH_CHECK(transa_outer == opa && transb_outer == opb, "trans mismatch, shouldn't happen"); - float alpha = GetAlphaFromParams(params); - float beta = GetBetaFromParams(params); + using opmath_t = at::opmath_type; + opmath_t alpha = GetAlphaFromParams(params); + opmath_t beta = GetBetaFromParams(params); hipblasLtMatrixLayout_t mat_a, mat_b, mat_c; if (opa == HIPBLAS_OP_N) { @@ -505,11 +516,14 @@ class HipblasltGemmOp : public Callable { mat_c, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_c, sizeof(stride_c))); } - hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F; - if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { - computeType = HIPBLAS_COMPUTE_32F_FAST_TF32; + hipblasComputeType_t computeType = HipBlasComputeTypeFor(); + if constexpr (std::is_same_v) { + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == 
at::Float32Precision::TF32) { + computeType = HIPBLAS_COMPUTE_32F_FAST_TF32; + } } - HipBlasLtMatmulDescriptor matmul(computeType, HIP_R_32F); + auto scale_type = HipDataTypeFor(); + HipBlasLtMatmulDescriptor matmul(computeType, scale_type); matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSA, opa); matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSB, opb); @@ -630,9 +644,11 @@ auto GetHipBlasLtTypeStringAndOps() { } #endif - hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F; - if (at::globalContext().allowTF32CuBLAS()) { - computeType = HIPBLAS_COMPUTE_32F_FAST_TF32; + hipblasComputeType_t computeType = HipBlasComputeTypeFor(); + if constexpr (std::is_same_v) { + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { + computeType = HIPBLAS_COMPUTE_32F_FAST_TF32; + } } hipblasLtHandle_t handle; diff --git a/test/test_linalg.py b/test/test_linalg.py index 346a6c0204479..045da268fd99c 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -5116,9 +5116,6 @@ def test_matmul_small_brute_force_tunableop(self, device, dtype): # We set the TunableOp numerical check environment variable here because it is # possible to hit some invalid numerical solutions due to the small matrix sizes. - if torch.version.hip and isRocmArchAnyOf(MI350_ARCH) and dtype is torch.double: - self.skipTest("Currently hangs on rocm mi350") - with self._tunableop_ctx(): torch.cuda.tunable.set_rotating_buffer_size(0) # Numerical check adds significant overhead, unsure if this is needed From 50bfde7c08dc92b69b71d2b76d3b2d3709cf28f6 Mon Sep 17 00:00:00 2001 From: "Nichols A. Romero" <165712832+naromero77amd@users.noreply.github.com> Date: Wed, 22 Apr 2026 11:10:27 -0500 Subject: [PATCH 72/87] [release/2.11][ROCm][inductor] Additional GEMM, pointwise and reduction configs. (#3145) New Inductor configs in support of a customer request. 
See https://amd-hub.atlassian.net/browse/AIPYTORCH-373 --- torch/_inductor/runtime/triton_heuristics.py | 7 +++++++ torch/_inductor/template_heuristics/triton.py | 3 +++ 2 files changed, 10 insertions(+) diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 8108636663b90..16bc308839b99 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -2897,6 +2897,12 @@ def pointwise( num_stages=2, waves_per_eu=1, # 20% improvement ), + triton_config_with_settings( + size_hints, + 512, + num_warps=4, + num_stages=4, # 30% improvement + ), ] ) if inductor_meta.get("atomic_add_found"): @@ -3257,6 +3263,7 @@ def outer_config_opt(): [ make_config(1024, 8, num_warps=4, num_stages=1, waves_per_eu=2), make_config(512, 8, num_warps=4, num_stages=1, waves_per_eu=1), + make_config(32, 128, num_warps=1, num_stages=1), # 30% improvement ] ) diff --git a/torch/_inductor/template_heuristics/triton.py b/torch/_inductor/template_heuristics/triton.py index 96457817ff5e3..e15ff07ee5a4f 100644 --- a/torch/_inductor/template_heuristics/triton.py +++ b/torch/_inductor/template_heuristics/triton.py @@ -1436,6 +1436,9 @@ def __init__(self) -> None: ), ROCmGemmConfig(256, 128, 32, self.default_num_stages, 8, group_m=16), ROCmGemmConfig(256, 128, 64, self.default_num_stages, 8, group_m=4), + ROCmGemmConfig(256, 128, 64, self.default_num_stages, 8, group_m=16, matrix_instr_nonkdim=0), + ROCmGemmConfig(256, 128, 64, self.default_num_stages, 8, group_m=8, matrix_instr_nonkdim=0), + ROCmGemmConfig(128, 128, 64, self.default_num_stages, 8, group_m=4, matrix_instr_nonkdim=0), ROCmGemmConfig(256, 256, 64, self.default_num_stages, 8, group_m=4), ] From 0320cc5b2fbba866c7ac1aa5deb8c14dd9a37b95 Mon Sep 17 00:00:00 2001 From: sohbodas <144367600+sohbodas@users.noreply.github.com> Date: Thu, 23 Apr 2026 10:17:51 -0400 Subject: [PATCH 73/87] [release/2.11] Update Numba version constraints to support 
Python 3.14 (#3148) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - This PR updates the Numba version constraints to correctly handle Python 3.14 and aligns the platform conditions with Numba’s current support matrix. - Add a new rule selecting numba==0.64.0 for Python ≥ 3.14 --------- Co-authored-by: sohbodas Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> --- .ci/docker/requirements-ci.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 24be093b31e7e..3a97ddf174e2a 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -119,8 +119,8 @@ ninja==1.11.1.4 numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x" numba==0.60.0 ; python_version == "3.9" and platform_machine != "s390x" -numba==0.61.2 ; python_version > "3.9" and platform_machine != "s390x" - +numba==0.61.2 ; python_version >= "3.10" and python_version < "3.14" and platform_machine != "s390x" +numba==0.64.0 ; python_version >= "3.14" and platform_machine != "s390x" #Description: Just-In-Time Compiler for Numerical Functions #Pinned versions: 0.55.2, 0.60.0 #test that import: test_numba_integration.py From 3aaa914af1e6fb268b242bfb871e614fbdb6c1bc Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Fri, 24 Apr 2026 17:53:00 -0700 Subject: [PATCH 74/87] [release/2.11] Fix MIOpen CTC loss crash on Windows (#179264) (#3181) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

Fix MIOpen CTC loss access violation on Windows discrete GPUs

Problem

A unit test on Windows started failing a couple of weeks ago. A missing #include was added in [pytorch/pytorch#178284](https://github.com/pytorch/pytorch/pull/178284), but CI on TheRock kept failing. That fix was tested on gfx1151 (APU), where the test passed, but CI still showed failures on gfx1100.

test_CTCLoss_no_batch_dim (and any code path hitting miopen_ctc_loss) crashes with a fatal access violation on Windows systems with discrete AMD GPUs:

Windows fatal exception: access violation
Exception Code: 0xC0000005
#0 miopen::CTCLossDescriptor::GetCTCLossWorkspaceSize (MIOpen.dll+0x14fde4)
#1 miopenGetCTCLossWorkspaceSize (MIOpen.dll+0x150912)
#2 at::native::miopen_ctc_loss (torch_hip.dll)

Root Cause

miopenGetCTCLossWorkspaceSize and miopenCTCLoss read the labels, label_lengths, and input_lengths arrays on the host side to plan the computation and calculate workspace requirements. The existing code copies these arrays to GPU memory and passes device pointers:

Tensor labels_gpu = targets_t.to(Device(at::kCUDA), at::kInt);
// ... hipMemcpy to GPU ...
MIOPEN_CHECK(miopenGetCTCLossWorkspaceSize(...,
    labels_gpu.data_ptr<int>(),          // device pointer
    label_lengths_gpu.data_ptr<int>(),   // device pointer
    input_lengths_gpu.data_ptr<int>()    // device pointer
));

This works on:

  • Linux — HSA (Heterogeneous System Architecture) maps GPU allocations into the process virtual address space, making device pointers host-readable
  • Windows APUs — CPU and iGPU share system RAM, so device pointers point to host-accessible memory

This crashes on:

  • Windows dGPUs — GPU has dedicated VRAM across PCIe; device pointers are opaque handles that cannot be dereferenced from host code

Verification

Tested on gfx1201:

| Check | Result |
| --- | --- |
| hipDeviceAttributeIntegrated | 0 (discrete GPU) |
| hipDeviceAttributeCanUseHostPointerForRegisteredMem | 0 |
| hipDeviceAttributeManagedMemory | 0x7FFFFFFF (unsupported) |
| hipDeviceAttributeUnifiedAddressing | 0x7FFFFFFF (unsupported) |
| Host read of hipMalloc pointer via ctypes | Access violation |
| CTC loss with CPU pointers | Pass (forward + backward) |

Fix

Pass host pointers for the labels, label lengths, and input lengths, since that is what MIOpen expects.

Testing

Run all existing CTCLoss unit tests.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/179264 Approved by: https://github.com/jeffdaily Co-authored-by: Milica Stankovic --- .../src/ATen/native/miopen/LossCTC_miopen.cpp | 33 ++++--------------- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/aten/src/ATen/native/miopen/LossCTC_miopen.cpp b/aten/src/ATen/native/miopen/LossCTC_miopen.cpp index 21797e7537d59..9c9ee2687aa99 100644 --- a/aten/src/ATen/native/miopen/LossCTC_miopen.cpp +++ b/aten/src/ATen/native/miopen/LossCTC_miopen.cpp @@ -206,35 +206,16 @@ std::tuple miopen_ctc_loss( Tensor costs = at::empty({batch_size}, log_probs->options()); Tensor grad = at::empty_like(log_probs_t, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - // MIOpen requires labels and lengths on GPU - Tensor labels_gpu = targets_t.to(Device(at::kCUDA), at::kInt); - Tensor label_lengths_gpu = at::empty( - {static_cast(target_lengths.size())}, - at::TensorOptions().dtype(at::kInt).device(at::kCUDA)); - Tensor input_lengths_gpu = at::empty( - {static_cast(input_lengths.size())}, - at::TensorOptions().dtype(at::kInt).device(at::kCUDA)); - - C10_CUDA_CHECK(hipMemcpy( - label_lengths_gpu.data_ptr(), - target_lengths.data(), - target_lengths.size() * sizeof(int), - hipMemcpyHostToDevice)); - C10_CUDA_CHECK(hipMemcpy( - input_lengths_gpu.data_ptr(), - input_lengths.data(), - input_lengths.size() * sizeof(int), - hipMemcpyHostToDevice)); - + // MIOpen reads labels/lengths on the host. 
size_t workspace_size; (void)deterministic; // MIOpen only supports deterministic algorithm MIOPEN_CHECK(miopenGetCTCLossWorkspaceSize( handle, probs_desc, grads_desc, - labels_gpu.data_ptr(), - label_lengths_gpu.data_ptr(), - input_lengths_gpu.data_ptr(), + targets_t.data_ptr(), + target_lengths.data(), + input_lengths.data(), MIOPEN_CTC_LOSS_ALGO_DETERMINISTIC, ctc_desc, &workspace_size)); @@ -245,9 +226,9 @@ std::tuple miopen_ctc_loss( handle, probs_desc, log_probs_t.data_ptr(), - labels_gpu.data_ptr(), - label_lengths_gpu.data_ptr(), - input_lengths_gpu.data_ptr(), + targets_t.data_ptr(), + target_lengths.data(), + input_lengths.data(), costs.data_ptr(), grads_desc, grad.data_ptr(), From 48211a7882d719c26bbeb9c3cca5c60a936bdc34 Mon Sep 17 00:00:00 2001 From: Tijana Vukovic <127323445+tvukovic-amd@users.noreply.github.com> Date: Mon, 27 Apr 2026 09:03:15 -0700 Subject: [PATCH 75/87] [release/2.11] Fix Windows access violation in MIOpen CTC loss dispatch (#3161) Cherry pick of https://github.com/pytorch/pytorch/pull/178284 Fixes https://github.com/ROCm/TheRock/issues/3987 Co-authored-by: Milica Stankovic --- aten/src/ATen/native/miopen/LossCTC_miopen.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/aten/src/ATen/native/miopen/LossCTC_miopen.cpp b/aten/src/ATen/native/miopen/LossCTC_miopen.cpp index 9c9ee2687aa99..6200f7ede7df5 100644 --- a/aten/src/ATen/native/miopen/LossCTC_miopen.cpp +++ b/aten/src/ATen/native/miopen/LossCTC_miopen.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #endif // TODO: Remove the condition on AT_ROCM_ENABLED entirely, From e16e349eb30bac8fd72b5c34ab220527fea5c58c Mon Sep 17 00:00:00 2001 From: Tijana Vukovic <127323445+tvukovic-amd@users.noreply.github.com> Date: Mon, 27 Apr 2026 09:04:26 -0700 Subject: [PATCH 76/87] [release/2.11] Fix missing native header includes causing DLL export (#3160) Cherry pick of https://github.com/pytorch/pytorch/pull/179138 Fixes: https://github.com/ROCm/TheRock/issues/4086 
https://github.com/ROCm/rocm-libraries/issues/5205 https://github.com/ROCm/TheRock/issues/4079 Co-authored-by: Stefan Sokolovic --- aten/src/ATen/native/cuda/Blas.cpp | 1 + aten/src/ATen/native/cuda/GroupedBlas.cpp | 1 + aten/src/ATen/native/cuda/ScaledBlas.cpp | 1 + 3 files changed, 3 insertions(+) diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 4a03faa02ef56..74d650463e8d2 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -34,6 +34,7 @@ #else #include #include +#include #include #include #include diff --git a/aten/src/ATen/native/cuda/GroupedBlas.cpp b/aten/src/ATen/native/cuda/GroupedBlas.cpp index 5875c9a805724..70c33e27aa0a3 100644 --- a/aten/src/ATen/native/cuda/GroupedBlas.cpp +++ b/aten/src/ATen/native/cuda/GroupedBlas.cpp @@ -37,6 +37,7 @@ #else #include #include +#include #include #include #include diff --git a/aten/src/ATen/native/cuda/ScaledBlas.cpp b/aten/src/ATen/native/cuda/ScaledBlas.cpp index cafcc28b3d2c1..223f10c53a318 100644 --- a/aten/src/ATen/native/cuda/ScaledBlas.cpp +++ b/aten/src/ATen/native/cuda/ScaledBlas.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include From 1a6ad28e29f7825ae7e0367f1b0793d579c66d29 Mon Sep 17 00:00:00 2001 From: Tijana Vukovic <127323445+tvukovic-amd@users.noreply.github.com> Date: Tue, 28 Apr 2026 08:54:05 -0700 Subject: [PATCH 77/87] [release/2.11] Windows specific test fixes (#176024) (#3182) Cherry pick of https://github.com/pytorch/pytorch/pull/176024 Co-authored-by: nkhasbag Co-authored-by: PyTorch MergeBot Co-authored-by: Nikita Shulga --- test/cpp_extensions/setup.py | 6 +- test/export/test_export_opinfo.py | 10 ++ test/export/test_serialize.py | 5 + test/test_cuda.py | 234 ++++++++++++++++++++++++++- test/test_fx.py | 25 ++- test/torch_np/test_nep50_examples.py | 34 +++- 6 files changed, 301 insertions(+), 13 deletions(-) diff --git a/test/cpp_extensions/setup.py 
b/test/cpp_extensions/setup.py index 35da0b4391884..2fd8f1b2667ff 100644 --- a/test/cpp_extensions/setup.py +++ b/test/cpp_extensions/setup.py @@ -42,6 +42,8 @@ ), ] +NVCC_FLAGS = ["-O2"] + (["-DUSE_CUDA"] if IS_WINDOWS else []) + if torch.cuda.is_available() and (CUDA_HOME is not None or ROCM_HOME is not None): extension = CUDAExtension( "torch_test_cpp_extension.cuda", @@ -50,7 +52,7 @@ "cuda_extension_kernel.cu", "cuda_extension_kernel2.cu", ], - extra_compile_args={"cxx": CXX_FLAGS, "nvcc": ["-O2"]}, + extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS}, ) ext_modules.append(extension) @@ -58,7 +60,7 @@ extension = CUDAExtension( "torch_test_cpp_extension.torch_library", ["torch_library.cu"], - extra_compile_args={"cxx": CXX_FLAGS, "nvcc": ["-O2"]}, + extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS}, ) ext_modules.append(extension) diff --git a/test/export/test_export_opinfo.py b/test/export/test_export_opinfo.py index 361674a69c7ae..b33aeb45438a3 100644 --- a/test/export/test_export_opinfo.py +++ b/test/export/test_export_opinfo.py @@ -22,6 +22,7 @@ ) from torch.testing._internal.common_utils import ( IS_FBCODE, + IS_WINDOWS, run_tests, skipIfRocm, TestCase, @@ -152,6 +153,11 @@ class TestExportOnFakeCuda(TestCase): # We set CUDA_VISIBLE_DEVICES="" to simulate a CPU machine with cuda build # Running this on all ops in op_db is too slow, so we only run on a selected subset @onlyCUDA + @unittest.skipIf( + IS_WINDOWS, + 'Subprocess with CUDA_VISIBLE_DEVICES="" imports op_db which triggers ' + "get_device_capability(); 0 devices raises Invalid device id on Windows.", + ) @ops(selected_op_db, allowed_dtypes=(torch.float,)) def test_fake_export(self, device, dtype, op): test_script = f"""\ @@ -218,6 +224,10 @@ def forward(self, *args): self.assertEqual(r, "") @unittest.skipIf(not torch.backends.cuda.is_built(), "requires CUDA build") + @unittest.skipIf( + IS_WINDOWS, + "Failing on Windows, device_count() changes from 0 to 1 ", + ) def 
test_preserve_original_behavior(self): test_script = f"""\ import torch diff --git a/test/export/test_serialize.py b/test/export/test_serialize.py index 6e4d41fc1937a..7df7867e3613e 100644 --- a/test/export/test_serialize.py +++ b/test/export/test_serialize.py @@ -24,6 +24,11 @@ from torch.library import wrap_triton from torch.utils._triton import has_triton +else: + + def has_triton(): + return False + import torch import torch._dynamo as torchdynamo diff --git a/test/test_cuda.py b/test/test_cuda.py index df9bdd5b0be11..56625d716244c 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -3,6 +3,7 @@ import contextlib import ctypes +import functools import gc import json import os @@ -38,6 +39,7 @@ _get_torch_cuda_version, PLATFORM_SUPPORTS_GREEN_CONTEXT, SM70OrLater, + SM89OrLater, TEST_CUDNN, TEST_MULTIGPU, tf32_on_and_off, @@ -105,7 +107,8 @@ load_tests = load_tests # noqa: PLW0127 try: - # import torchvision.models # noqa: F401 + import torchvision.models # noqa: F401 + # from torchvision.models import resnet18 # noqa: F401 HAS_TORCHVISION = True @@ -130,6 +133,50 @@ _cycles_per_ms = None +_wait_for_cpu_kernel = None + + +def skip_background_threads_on_windows(f): + @functools.wraps(f) + def wrapped(self, **kwargs): + if IS_WINDOWS and SM89OrLater and kwargs.get("use_background_threads"): + raise unittest.SkipTest("using background threads fails on Windows") + return f(self, **kwargs) + + return wrapped + + +def get_wait_for_cpu_kernel(): + """Returns a compiled CUDA spin-wait kernel that blocks the GPU stream until + the host sets a pinned int32 flag to non-zero. Requires SM70+. 
+ + Usage:: + + kernel = get_wait_for_cpu_kernel() + flag = torch.zeros(1, dtype=torch.int32, device="cpu").pin_memory() + with torch.cuda.stream(s): + kernel(grid=(1, 1, 1), block=(1, 1, 1), args=[flag]) + # stream s is now blocked until: + flag[0] = 1 + """ + global _wait_for_cpu_kernel + if _wait_for_cpu_kernel is None: + from torch.cuda import _compile_kernel + + _wait_for_cpu_kernel = _compile_kernel( + r""" + __global__ void wait_for_cpu(int *pinned_cpu_flag) { + int flag = 0; + do { + asm volatile("ld.relaxed.sys.global.s32 %0, [%1];" : "=r"(flag) : "l"(pinned_cpu_flag) : "memory"); + } while (flag == 0); + } + """, + "wait_for_cpu", + ) + return _wait_for_cpu_kernel + + @unittest.skipIf(not TEST_CUDA, "CUDA not available, skipping tests") @torch.testing._internal.common_utils.markDynamoStrictTest class TestCuda(TestCase): @@ -290,6 +337,9 @@ def test_pinned_memory_empty_cache(self): "pinned_use_cuda_host_register:False" ) + # Pinned allocator background thread does not shut down cleanly on Windows + # Python process hangs + @unittest.skipIf(IS_WINDOWS and SM89OrLater, "Fails on windows with SM89+") def test_pinned_memory_use_background_threads(self): script = """ import torch @@ -432,6 +482,9 @@ def test_out_of_memory(self): tensor.fill_(1) self.assertTrue((tensor == 1).all()) + # CUDA memory allocations on windows do not OOM on rtx even when they cross allowed memory + # Skip test until this is investigated + @unittest.skipIf(IS_WINDOWS and SM89OrLater, "Fails on windows with SM89+") @unittest.skipIf( TEST_CUDAMALLOCASYNC or IS_JETSON, "Segmentation fault (core dumped)" ) @@ -616,6 +669,9 @@ def test_serialization_array_with_storage(self): q_copy[1].fill_(10) self.assertEqual(q_copy[3], torch.cuda.IntStorage(10).fill_(10)) + @unittest.skipIf( + IS_WINDOWS and SM89OrLater, "preferred_blas_library not supported on Windows" + ) @unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, "Does not work in fbcode yet") @setBlasBackendsToDefaultFinally def 
test_preferred_blas_library_settings(self): @@ -685,6 +741,9 @@ def _check_default(): torch.backends.cuda.preferred_blas_library("default") _check_default() + @unittest.skipIf( + IS_WINDOWS and SM89OrLater, "preferred_blas_library not supported on Windows" + ) @unittest.skipIf(TEST_CUDAMALLOCASYNC, "temporarily disabled for async") @setBlasBackendsToDefaultFinally def test_cublas_workspace_explicit_allocation(self): @@ -4086,6 +4145,9 @@ def test_gds_fails_in_ci(self): with self.assertRaisesRegex(RuntimeError, error_msg): torch.cuda.gds.GdsFile(f, os.O_CREAT | os.O_RDWR) + @unittest.skipIf( + IS_WINDOWS, "test relies on fork; Windows multiprocessing uses spawn" + ) def test_is_pinned_no_context(self): test_script = """\ import torch @@ -5043,7 +5105,11 @@ def test_temperature(self): @unittest.skipIf(not TEST_PYNVML, "pynvml/amdsmi is not available") def test_device_memory_used(self): """ - Verify used device memory in bytes + Verify used device memory in bytes. + On Windows the NVML used value has been observed not to increase after + a CUDA allocation (delta 0); we only assert API sanity there (non-negative, + non-decreasing after alloc, <= total memory). 
Need to investigate expected behavior + with Windows WDDM """ torch.cuda.synchronize() gc.collect() @@ -5054,9 +5120,20 @@ def test_device_memory_used(self): torch.cuda.synchronize() torch.cuda.empty_cache() b = torch.cuda.device_memory_used() - mem_bytes = b - a - # test the order of magnitude - self.assertTrue(num_bytes // 32 <= mem_bytes <= num_bytes * 32) + if IS_WINDOWS: + # NVML used memory does not reflect CUDA allocations on WDDM; only check API sanity + self.assertGreaterEqual(a, 0, "device_memory_used should be non-negative") + self.assertGreaterEqual(b, 0, "device_memory_used should be non-negative") + self.assertGreaterEqual( + b, a, "used memory should not decrease after allocation" + ) + total = torch.cuda.get_device_properties(0).total_memory + self.assertLessEqual(a, total, "used should not exceed total device memory") + self.assertLessEqual(b, total, "used should not exceed total device memory") + else: + mem_bytes = b - a + # test the order of magnitude + self.assertTrue(num_bytes // 32 <= mem_bytes <= num_bytes * 32) @unittest.skipIf(not TEST_PYNVML, "pynvml/amdsmi is not available") def test_power_draw(self): @@ -5748,6 +5825,7 @@ def test_pin_memory_use(self, use_cuda_host_register): "use_memory, delete_memory", [(True, True), (True, False), (False, True), (False, False)], ) + @skip_background_threads_on_windows def test_two_graphs( self, use_background_threads, use_cuda_host_register, use_memory, delete_memory ): @@ -6456,6 +6534,152 @@ def test_graph_capture_reclaim_4_streams(self): "graph_capture_record_stream_reuse:False" ) + + @unittest.skipIf( + not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs" + ) + def test_graph_capture_reclaim_shared_pool(self): + torch.cuda.memory._set_allocator_settings( + "graph_capture_record_stream_reuse:True" + ) + torch.cuda.empty_cache() + torch.cuda.synchronize() + + shared_pool = torch.cuda.graph_pool_handle() + cap_stream = torch.cuda.Stream() + side_stream = torch.cuda.Stream() + + g1 
= torch.cuda.CUDAGraph() + g2 = torch.cuda.CUDAGraph() + + numel = (8 * 1024 * 1024) // 4 + + with torch.cuda.stream(cap_stream): + g1.capture_begin(pool=shared_pool) + data = torch.empty(numel, device="cuda") + data_ptr = data.data_ptr() + + side_stream.wait_stream(cap_stream) + with torch.cuda.stream(side_stream): + data.add_(1.0) + data.record_stream(side_stream) + + cap_stream.wait_stream(side_stream) + + del data + g1.capture_end() + + torch.cuda.current_stream().wait_stream(cap_stream) + torch.cuda.synchronize() + + with torch.cuda.stream(cap_stream): + g2.capture_begin(pool=shared_pool) + data2 = torch.empty(numel, device="cuda") + data2.fill_(42.0) + data2_ptr = data2.data_ptr() + g2.capture_end() + + torch.cuda.current_stream().wait_stream(cap_stream) + torch.cuda.synchronize() + + self.assertEqual(data_ptr, data2_ptr) + + torch.cuda.memory._set_allocator_settings( + "graph_capture_record_stream_reuse:False" + ) + + @unittest.skipIf( + not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs" + ) + @unittest.skipIf(TEST_WITH_ROCM, "ROCM does not support nvrtc") + @unittest.skipIf( + not SM70OrLater, "Compute capability >= SM70 required for relaxed ptx flag" + ) + def test_graph_capture_pre_capture_stream_use(self): + # Tests that a block with pre-capture stream uses is correctly handled + # when freed during a subsequent capture on the same pool. + # Exercises the insert_events path in endAllocateToPool. 
+ spin_wait_kernel = get_wait_for_cpu_kernel() + + torch.cuda.memory._set_allocator_settings( + "graph_capture_record_stream_reuse:True" + ) + torch.cuda.empty_cache() + torch.cuda.synchronize() + + shared_pool = torch.cuda.graph_pool_handle() + cap_stream = torch.cuda.Stream() + side_stream = torch.cuda.Stream() + flag_cpu = torch.zeros(1, dtype=torch.int32, device="cpu").pin_memory() + + g1 = torch.cuda.CUDAGraph() + g2 = torch.cuda.CUDAGraph() + g3 = torch.cuda.CUDAGraph() + g4 = torch.cuda.CUDAGraph() + + numel = (8 * 1024 * 1024) // 4 + + # First capture: allocate data in the shared pool, keep it alive. + with torch.cuda.stream(cap_stream): + g1.capture_begin(pool=shared_pool) + data = torch.empty(numel, device="cuda") + data_ptr = data.data_ptr() + g1.capture_end() + + torch.cuda.synchronize() + + # Between captures: block side_stream with a spin-wait kernel + # (pre-capture stream use). The kernel holds the stream busy until + # we explicitly set the flag from the host. + with torch.cuda.stream(side_stream): + spin_wait_kernel(grid=(1, 1, 1), block=(1, 1, 1), args=[flag_cpu]) + data.record_stream(side_stream) + + # Second capture: free data during capture. + with torch.cuda.stream(cap_stream): + g2.capture_begin(pool=shared_pool) + del data + g2.capture_end() + + # Trigger process_events. The spin kernel is still holding side_stream, + # so cudaEventQuery returns cudaErrorNotReady and the block stays pending. + torch.empty(1, device="cuda") + + # Allocate from the same pool: block must NOT be reused yet. + with torch.cuda.stream(cap_stream): + g3.capture_begin(pool=shared_pool) + not_reused = torch.empty(numel, device="cuda") + not_reused_ptr = not_reused.data_ptr() + g3.capture_end() + + self.assertNotEqual(data_ptr, not_reused_ptr) + + # Release the spin kernel so side_stream can finish. + flag_cpu[0] = 1 + torch.cuda.synchronize() + + # Trigger process_events to reclaim the block. 
+ torch.empty(1, device="cuda") + + # Fourth capture: the block should now be reusable. + with torch.cuda.stream(cap_stream): + g4.capture_begin(pool=shared_pool) + reused = torch.empty(numel, device="cuda") + reused_ptr = reused.data_ptr() + g4.capture_end() + + self.assertEqual(data_ptr, reused_ptr) + + torch.cuda.memory._set_allocator_settings( + "graph_capture_record_stream_reuse:False" + ) + + # expandable_segments not supported (PYTORCH_C10_DRIVER_API_SUPPORTED not defined for windows builds) + @unittest.skipIf( + IS_WINDOWS and SM89OrLater, + "expandable_segments not supported (PYTORCH_C10_DRIVER_API_SUPPORTED not defined for windows builds)", + ) + @skipIfRocm(msg="expandable_segments mode is not supported on ROCm") @unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, "Load_inline doesn't work in fbcode") def test_mempool_expandable(self): torch.cuda.empty_cache() diff --git a/test/test_fx.py b/test/test_fx.py index a9bbf8c22d699..36abe391485b1 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -4476,8 +4476,29 @@ def forward(self, x): else: kernel_event = "cudaLaunchKernel" kernel_event_relu = "cudaLaunchKernel" - - expected = f"""\ + if IS_WINDOWS: + expected = f"""\ +event=aten::t node=t stack_trace=return F.linear(input, self.weight, self.bias) +event=aten::transpose node=t stack_trace=return F.linear(input, self.weight, self.bias) +event=aten::as_strided node=t stack_trace=return F.linear(input, self.weight, self.bias) +event=aten::addmm node=addmm stack_trace=return F.linear(input, self.weight, self.bias) +event=aten::expand node=addmm stack_trace=return F.linear(input, self.weight, self.bias) +event=aten::as_strided node=addmm stack_trace=return F.linear(input, self.weight, self.bias) +event={kernel_event} node=addmm stack_trace=return F.linear(input, self.weight, self.bias) +event={kernel_event} node=addmm stack_trace=return F.linear(input, self.weight, self.bias) +event=aten::relu node=relu stack_trace=return F.relu(input, inplace=self.inplace) 
+event=aten::clamp_min node=relu stack_trace=return F.relu(input, inplace=self.inplace) +event={kernel_event_relu} node=relu stack_trace=return F.relu(input, inplace=self.inplace) +event=aten::t node=t_1 stack_trace=return F.linear(input, self.weight, self.bias) +event=aten::transpose node=t_1 stack_trace=return F.linear(input, self.weight, self.bias) +event=aten::as_strided node=t_1 stack_trace=return F.linear(input, self.weight, self.bias) +event=aten::addmm node=addmm_1 stack_trace=return F.linear(input, self.weight, self.bias) +event=aten::expand node=addmm_1 stack_trace=return F.linear(input, self.weight, self.bias) +event=aten::as_strided node=addmm_1 stack_trace=return F.linear(input, self.weight, self.bias) +event={kernel_event} node=addmm_1 stack_trace=return F.linear(input, self.weight, self.bias) +event={kernel_event} node=addmm_1 stack_trace=return F.linear(input, self.weight, self.bias)""" + else: + expected = f"""\ event=aten::t node=t stack_trace=x = self.linear1(x) event=aten::transpose node=t stack_trace=x = self.linear1(x) event=aten::as_strided node=t stack_trace=x = self.linear1(x) diff --git a/test/torch_np/test_nep50_examples.py b/test/torch_np/test_nep50_examples.py index a3ad346bf9f1c..964683bd74c81 100644 --- a/test/torch_np/test_nep50_examples.py +++ b/test/torch_np/test_nep50_examples.py @@ -31,6 +31,7 @@ from torch._numpy.testing import assert_allclose from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, + IS_WINDOWS, parametrize, run_tests, TestCase, @@ -216,11 +217,36 @@ def test_compare_ufuncs(self, name, scalar, array): # TypeError: ufunc 'hypot' not supported for the input types result_numpy = None + type_mismatch = False + expected_numpy_dtype = None + expected_torch_dtype = None + if result is not None and result_numpy is not None: - if result.tensor.numpy().dtype != result_numpy.dtype: - raise AssertionError( - f"Expected result dtype == {result_numpy.dtype}, got {result.tensor.numpy().dtype}" - ) + 
expected_numpy_dtype = result_numpy.dtype + expected_torch_dtype = result.tensor.numpy().dtype + if IS_WINDOWS: + if ( + array.tensor.numpy().dtype != _np.bool_ + and result.tensor.numpy().dtype != result_numpy.dtype + ): + type_mismatch = True + + if ( + array.tensor.numpy().dtype == _np.bool_ + and result_numpy.dtype == _np.int32 + and result.tensor.numpy().dtype != _np.int64 + ): + expected_numpy_dtype = _np.int32 + expected_torch_dtype = tnp.int64 + type_mismatch = True + else: + if result.tensor.numpy().dtype != result_numpy.dtype: + type_mismatch = True + + if type_mismatch: + raise AssertionError( + f"Expected result numpy dtype == {expected_numpy_dtype}, torch dtype == {expected_torch_dtype}" + ) finally: _np._set_promotion_state(state) From 9413e9b96bcbeb8af1aa0280a3a9bc7dd048857e Mon Sep 17 00:00:00 2001 From: "rocm-repo-management-api-6[bot]" <212817015+rocm-repo-management-api-6[bot]@users.noreply.github.com> Date: Tue, 28 Apr 2026 08:57:11 -0700 Subject: [PATCH 78/87] [AUTOGENERATED] [release/2.11] [UP][UT][ROCm][TunableOp] Fix test_call_count_tunableop to correctly extract kernel names for RDNA (#3185) Cherry-pick of https://github.com/ROCm/pytorch/pull/2954 Co-authored-by: Uros Markovic --- test/test_linalg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_linalg.py b/test/test_linalg.py index 045da268fd99c..25a157343db15 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -6427,9 +6427,9 @@ def test_call_count_tunableop(self, device, dtype): # launched per PyTorch API. 
The kernels have string # that always starts with `Cijk*` mm_key = 'Cijk' - events = prof.key_averages() + events = prof.events() for evt in events: - if mm_key in evt.key: + if mm_key in evt.name: self.assertEqual(evt.count, 1) kernel_count = kernel_count + 1 From 7d37be22a5b0fa41bbedc7394d38de3df0a61a30 Mon Sep 17 00:00:00 2001 From: Harkirat Gill Date: Tue, 28 Apr 2026 18:18:50 -0400 Subject: [PATCH 79/87] [release/2.11] Update composable_kernel submodule with gfx1033 support (#3144) ## Motivation - Enabling gfx103X-all wheels in TheRock is currently blocked due to PyTorch CI failures caused by a lack of `gfx1033` support in CK. https://github.com/ROCm/rocm-libraries/pull/5141 resolves these issues. ## Technical Details - The aforementioned fix has been cherrypicked into the `pytorch/release/2.11/` branch of ROCm/composable_kernel - this PR bumps the `third_party/composable_kernel` branch to pick up these changes. ## Test Plan - Trigger a build and verify it passes ## Test Result - Build succeeds for `cherrypick-gfx1033-CK-support-torch2.11` branch. https://github.com/ROCm/TheRock/actions/runs/24195531659/job/70624339554 - Testing Pasting offline comments from @harkgill-amd > In https://github.com/ROCm/TheRock/actions/runs/24906345786/job/72942139688 Pytorch 3.10 + release/2.11 -> Pass Pytorch 3.11 + release/2.11 -> TestNN.test_Embedding_discontiguous_cuda failed but this seems to be a known flaky test and will be disabled with https://github.com/ROCm/TheRock/pull/4775 Pytorch 3.12 + release/2.11 -> Pass Pytorch 3.13 + release/2.11 -> Pass In https://github.com/ROCm/TheRock/actions/runs/25002732513/job/73225027260 Pytorch 3.14 + release/2.11 -> The failing tests here all share the same miopenStatusUnknownError message. 
These are the same failures as seen in the main branch run here https://github.com/ROCm/TheRock/actions/runs/24985367049 so they aren't related to my PR ## Submission Checklist - [X] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. --- third_party/composable_kernel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/composable_kernel b/third_party/composable_kernel index fcc9372c009c8..7182f6d1391ed 160000 --- a/third_party/composable_kernel +++ b/third_party/composable_kernel @@ -1 +1 @@ -Subproject commit fcc9372c009c8e0a23fece77b582da83b04a654f +Subproject commit 7182f6d1391ed75fe0a9dd1328f2b2683a12d041 From 345ca6fbeb70f3a6edda9f13b8aafcadccbedd4e Mon Sep 17 00:00:00 2001 From: zichguan-amd Date: Tue, 28 Apr 2026 18:37:21 -0400 Subject: [PATCH 80/87] Cleanup custom op polluting global state for subsequent tests (#3170) `my_lib` in `test_storage_preserve_nonhermetic_in_hermetic_context` leaks into global op space after the test ends and affects subsequent tests in the same process using dynamo. Without the fix, running any checkpoint/compile or other dynamo-related test after `test_storage_preserve_nonhermetic_in_hermetic_context` fails with ``` torch._dynamo.exc.BackendCompilerFailed: backend='aot_eager' raised: TypeError: 'CustomDecompTable' object is not a mapping ``` e.g.
`python -m pytest -v pytorch/test/test_torch.py::TestTorch::test_storage_preserve_nonhermetic_in_hermetic_context pytorch/test/test_autograd.py::TestAutograd::test_checkpoint_compile_no_recompile` Upstream PR: https://github.com/pytorch/pytorch/pull/180998 Signed-off-by: zichguan-amd --- test/test_torch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_torch.py b/test/test_torch.py index fa254541d1ece..54a756b5940cf 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -10335,6 +10335,7 @@ def test_storage_preserve_nonhermetic_in_hermetic_context(self): global _my_storage my_lib = Library("my_lib", "DEF") # noqa: TOR901 + self.addCleanup(my_lib._destroy) my_lib.define('my_func() -> None') a = torch.tensor([1.]) From 443606eb94430d90554ab4c21202494576afedce Mon Sep 17 00:00:00 2001 From: "rocm-repo-management-api-6[bot]" <212817015+rocm-repo-management-api-6[bot]@users.noreply.github.com> Date: Wed, 29 Apr 2026 09:55:15 -0700 Subject: [PATCH 81/87] [AUTOGENERATED] [release/2.11] Fix SIGSEGV on AMD RDNA due to reduction mask optimization #176269 (#3156) Cherry-pick of https://github.com/ROCm/pytorch/pull/3055 Co-authored-by: Strahinja Stamenkovic --- test/inductor/test_torchinductor.py | 35 ++++++++++++++++++++++++ torch/_inductor/codegen/triton.py | 12 +++++++- torch/_inductor/runtime/triton_compat.py | 7 ++--- 3 files changed, 49 insertions(+), 5 deletions(-) diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index 6c4f1b3f92890..60561f9dc812d 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -16528,6 +16528,41 @@ def test_has_constant_mask(self, block_multiple, ynumel_exceed_ygrid_size): self.assertTrue("ymask = yindex < ynumel" in code) self.assertTrue("xmask = xindex < xnumel" in code) + @parametrize( + "rnumel", + [16, 32], + ) + @config.patch("triton.persistent_reductions", True) + def test_has_constant_mask_small_persistent_reduction(self, rnumel): + from 
torch._inductor.runtime.hints import DeviceProperties + + def fn(x): + return x.sum(dim=-1) + + x = torch.randn(1024, rnumel, device=GPU_TYPE) + opt_fn = torch.compile(fn) + code = run_and_get_triton_code(opt_fn, x) + + device = torch.device(GPU_TYPE, 0) + warp_size = DeviceProperties.create(device).warp_size or 32 + + rblock = 1 + while rblock < rnumel: + rblock *= 2 + + if rblock < warp_size: + self.assertTrue( + "r0_index < r0_numel" in code or "rindex < rnumel" in code, + f"Expected dynamic reduction mask for RBLOCK={rblock} < warp_size={warp_size}", + ) + else: + self.assertTrue( + "r0_mask = tl.full" in code or "rmask = tl.full" in code, + f"Expected constant reduction mask for RBLOCK={rblock} >= warp_size={warp_size}", + ) + + self.assertEqual(fn(x), opt_fn(x)) + @config.patch("triton.native_matmul", False) def test_kernel_names_descriptive(self): @torch.compile(backend="inductor") diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index 039b53ee1f2fc..1fb7b7fdf3a7b 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -5886,12 +5886,22 @@ def _has_constant_mask(self, tree: IterationRangesRoot) -> bool: return True elif not self.is_combo_kernel: if V.graph.sizevars.statically_known_equals(tree.numel, 1): - return True + if not (tree.is_reduction and self.persistent_reduction): + return True # Masks are superfluous if numel is a multiple of BLOCK # (We use the fact that BLOCK is required by triton to be a power of 2) if tree.is_reduction and self.persistent_reduction: max_block = self._get_persistent_RBLOCK(tree.numel) + # Triton's auto-tuner can map a full hardware warp along the + # reduction axis. When RBLOCK < warp_size the excess lanes + # would execute out-of-bounds global loads. This results in + # faults on AMD hardware. Keep the dynamic mask so that all + # hardware stays correct. 
+ device = V.graph.get_current_device_or_throw() + warp_size = DeviceProperties.create(device).warp_size or 32 + if isinstance(max_block, int) and max_block < warp_size: + return False elif tree.prefix == "x" and self.no_x_dim: max_block = 1 else: diff --git a/torch/_inductor/runtime/triton_compat.py b/torch/_inductor/runtime/triton_compat.py index 49ceacb50bc3d..d237350a667c8 100644 --- a/torch/_inductor/runtime/triton_compat.py +++ b/torch/_inductor/runtime/triton_compat.py @@ -140,11 +140,10 @@ class JITFunction: # type: ignore[no-redef] def cc_warp_size(cc: str | int) -> int: if torch.version.hip: - cc_str = str(cc) - if "gfx10" in cc_str or "gfx11" in cc_str: - return 32 - else: + if "gfx9" in str(cc): return 64 + else: + return 32 else: return 32 From 5223630054ce5ecd7b774d0ea31f2a1b472fb9b3 Mon Sep 17 00:00:00 2001 From: "tom.jen" Date: Thu, 7 May 2026 21:51:03 +0800 Subject: [PATCH 82/87] [Inductor] Fix ReinterpretView stride mismatch in TritonTemplateKernel (#3191) Fixes a bug where FlexibleLayout on a ReinterpretView incorrectly returns underlying physical buffer strides (e.g., 4D) instead of logical view strides (3D). This patch skips speculative layout and constraint tracking for ReinterpretView nodes, forcing the use of node.get_stride() to prevent Illegal Memory Access (IMA) on ROCm. Manual backport from PyTorch 2.12. Ref commit: https://github.com/pytorch/pytorch/commit/0e1f56285ea65c0fc960ea110cc4088e92eab453 ## Motivation ## Technical Details ## Test Plan ## Test Result ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. 
--- torch/_inductor/select_algorithm.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/torch/_inductor/select_algorithm.py b/torch/_inductor/select_algorithm.py index 9c02233204dbb..2fb2852eef785 100644 --- a/torch/_inductor/select_algorithm.py +++ b/torch/_inductor/select_algorithm.py @@ -1580,7 +1580,13 @@ def get_stride_and_maybe_freeze_layout(self, node) -> list[int]: layout = node.data.layout node_name = node.get_name() - if isinstance(layout, ir.FlexibleLayout): + # For ReinterpretView, the view's strides are already determined by its layout. + # We skip constraint tracking because node.get_name() returns the underlying + # buffer name, not the view's identity, so constraints would be incorrectly + # associated with the underlying buffer rather than the view. + if isinstance(layout, ir.FlexibleLayout) and not isinstance( + node, ir.ReinterpretView + ): if not use_aten_gemm_kernels(): # No ExternKernel fallback available, freeze immediately node.data.freeze_layout() From 96bfee122869125d32aa4ec9acc8c3597059188b Mon Sep 17 00:00:00 2001 From: Jack Taylor <108682042+jataylo@users.noreply.github.com> Date: Tue, 12 May 2026 16:58:07 +0100 Subject: [PATCH 83/87] Dynamo/fix ignore logging functions (#178506) (#3206) ## PR Summary Fixes #178455 ignore_logger_methods was renamed to ignore_logging_functions in torch 2.11 but wasn't added to blocklist in _get_dynamo_config_for_logging() ## Repro ``` import torch import torch._dynamo.config import torch._dynamo.utils torch._dynamo.config.ignore_logging_functions.add(print) torch._dynamo.utils._get_dynamo_config_for_logging() ``` ## Changes * Exclude `ignore_logging_functions` from `_get_dynamo_config_for_logging()` (consistent with existing `ignore_logger_methods`) * Add a regression test to ensure no crash when logging config includes builtin functions * Added a test that: * Inserts `print` into `ignore_logging_functions` * Verifies `_get_dynamo_config_for_logging()` returns valid JSON without
errors related issue: #178455 Pull Request resolved: https://github.com/pytorch/pytorch/pull/178506 Approved by: https://github.com/Lucaskabela (cherry picked from commit 7eea8eacbb4457a503467a46eae47d93e39e49e8) ## Motivation ## Technical Details ## Test Plan ## Test Result ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. Co-authored-by: vvvdwbvvv Co-authored-by: PyTorch MergeBot --- test/dynamo/test_utils.py | 9 +++++++++ torch/_dynamo/utils.py | 1 + 2 files changed, 10 insertions(+) diff --git a/test/dynamo/test_utils.py b/test/dynamo/test_utils.py index 22c3d26207ef1..b0f3d4a079d44 100644 --- a/test/dynamo/test_utils.py +++ b/test/dynamo/test_utils.py @@ -1,5 +1,6 @@ # Owner(s): ["module: dynamo"] import dataclasses +import json import os import pprint import sys @@ -298,6 +299,14 @@ def test_reinplace_counters_use_trigger_name_not_enum_value(self): "Should not use enum value (integer) in key, should use trigger.name instead", ) + def test_get_dynamo_config_for_logging_ignores_logging_functions(self): + with dynamo_config.patch(ignore_logging_functions={print}): + result = utils._get_dynamo_config_for_logging() + parsed = json.loads(result) + + self.assertIsInstance(parsed, dict) + self.assertNotIn("ignore_logging_functions", parsed) + class TestModel(torch.nn.Module): def __init__(self): diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py index 210e289be2ff1..51837a6fa283d 100644 --- a/torch/_dynamo/utils.py +++ b/torch/_dynamo/utils.py @@ -1649,6 +1649,7 @@ def clean_for_json(d: dict[str, Any]) -> dict[str, Any]: "_autograd_backward_strict_mode_banned_ops", "reorderable_logging_functions", "ignore_logger_methods", + "ignore_logging_functions", "traceable_tensor_subclasses", "nontraceable_tensor_subclasses", "_custom_ops_profile", From 0c210c5f20ff65125cf0c618dc5d76c6dc603238 Mon Sep 17 00:00:00 2001 From: zichguan-amd Date: Tue, 19 May 2026 11:02:45 -0400 
Subject: [PATCH 84/87] Cleanup custom op polluting global state for subsequent tests (#180998) (#3221) Cherry pick to 2.11 release `my_lib` in `test_storage_preserve_nonhermetic_in_hermetic_context` leaks into global op space after the test ends and affects subsequent tests in the same process using dynamo. Without the fix, running any checkpoint/compile or other dynamo-related test after `test_storage_preserve_nonhermetic_in_hermetic_context` fails with ``` torch._dynamo.exc.BackendCompilerFailed: backend='aot_eager' raised: TypeError: 'CustomDecompTable' object is not a mapping ``` e.g. `python -m pytest -v pytorch/test/test_torch.py::TestTorch::test_storage_preserve_nonhermetic_in_hermetic_context pytorch/test/test_autograd.py::TestAutograd::test_checkpoint_compile_no_recompile` Pull Request resolved: https://github.com/pytorch/pytorch/pull/180998 Approved by: https://github.com/albanD, https://github.com/ezyang --------- Co-authored-by: Claude Opus 4 --- test/dynamo/test_compiler_bisector.py | 3 +- test/dynamo/test_decorators.py | 6 +- test/functorch/test_aotdispatch.py | 207 +++++++++++++------------- test/test_fake_tensor.py | 8 +- test/test_fx_passes.py | 2 +- test/test_proxy_tensor.py | 51 ++++--- test/test_torch.py | 39 +++-- 7 files changed, 154 insertions(+), 162 deletions(-) diff --git a/test/dynamo/test_compiler_bisector.py b/test/dynamo/test_compiler_bisector.py index c1bc667a02041..24116f41809d5 100644 --- a/test/dynamo/test_compiler_bisector.py +++ b/test/dynamo/test_compiler_bisector.py @@ -29,8 +29,7 @@ def tearDown(self): if hasattr(torch.ops, self.test_ns): delattr(torch.ops, self.test_ns) if hasattr(self, "lib"): - del self.lib.m - del self.lib + self.lib._destroy() def get_op(self, name): return getattr(getattr(torch.ops, self.test_ns), name).default diff --git a/test/dynamo/test_decorators.py b/test/dynamo/test_decorators.py index 58f5127499562..6f9b0c2765659 100644 --- a/test/dynamo/test_decorators.py +++ b/test/dynamo/test_decorators.py
@@ -48,10 +48,8 @@ def fn(a): def test_disable_for_custom_op(self): import torch.library - from torch.library import Library - foo = Library("foo", "DEF") # noqa: TOR901 - try: + with torch.library._scoped_library("foo", "DEF") as foo: foo.define("custom(Tensor self) -> Tensor") # Dynamic shape data dependent operator. For static shape compilation, Dynamo @@ -81,8 +79,6 @@ def fn(x): self.assertEqual(ref, res) finally: torch.ops.foo.custom = orig_custom - finally: - foo._destroy() def test_disable_ignores_outer_wraps(self): def orig_inner(): diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py index 2375778a29e43..e523c671dc5fb 100644 --- a/test/functorch/test_aotdispatch.py +++ b/test/functorch/test_aotdispatch.py @@ -8183,122 +8183,125 @@ def unpack_cpu(x): @unittest.skipIf(not torch.cuda.is_available(), "CUDA is unavailable") @unittest.skipIf(not SM80OrLater, "bfloat16, float8") def test_saved_tensors_hooks_params(self): - lib = torch.library.Library("_test_aotdispatch_lib", "FRAGMENT") - logged_shapes = [] - logged_dtypes = [] - lib.define("log(Tensor x) -> Tensor") - - def log_impl(x): - logged_shapes.append(list(x.shape)) - logged_dtypes.append(x.dtype) - return x.clone() + with torch.library._scoped_library("_test_aotdispatch_lib", "FRAGMENT") as lib: + logged_shapes = [] + logged_dtypes = [] + lib.define("log(Tensor x) -> Tensor") + + def log_impl(x): + logged_shapes.append(list(x.shape)) + logged_dtypes.append(x.dtype) + return x.clone() - def log_meta(x): - return x.clone() + def log_meta(x): + return x.clone() - for backend in ["CPU", "CUDA"]: - lib.impl( - "log", - log_impl, - backend, - ) - lib.impl("log", log_meta, "Meta") + for backend in ["CPU", "CUDA"]: + lib.impl( + "log", + log_impl, + backend, + ) + lib.impl("log", log_meta, "Meta") - def pack_fp8_with_scale_and_log(x): - torch.ops._test_aotdispatch_lib.log(x) - return _pack_fp8_with_scale_wrap(x) + def pack_fp8_with_scale_and_log(x): + 
torch.ops._test_aotdispatch_lib.log(x) + return _pack_fp8_with_scale_wrap(x) - def unpack_fp8_with_scale_and_log(packed): - return _unpack_fp8_with_scale_wrap(packed) + def unpack_fp8_with_scale_and_log(packed): + return _unpack_fp8_with_scale_wrap(packed) - def m_inp_fn(): - x = torch.ones( - 2, 2, 2, device=device, dtype=torch.float64, requires_grad=True - ) - torch._dynamo.mark_dynamic(x, 0) - torch._dynamo.mark_dynamic(x, 1) - return (x,) + def m_inp_fn(): + x = torch.ones( + 2, 2, 2, device=device, dtype=torch.float64, requires_grad=True + ) + torch._dynamo.mark_dynamic(x, 0) + torch._dynamo.mark_dynamic(x, 1) + return (x,) - class SAF0(torch.autograd.Function): - @staticmethod - def forward(ctx, x): - ctx.save_for_backward(x) - return x + class SAF0(torch.autograd.Function): + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return x - @staticmethod - def backward(ctx, gx): - (saved_x,) = ctx.saved_tensors - return gx + saved_x + @staticmethod + def backward(ctx, gx): + (saved_x,) = ctx.saved_tensors + return gx + saved_x - class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.fc1 = nn.Linear(2, 2) - self.relu = nn.ReLU() - self.fc2 = nn.Linear(2, 2) + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.fc1 = nn.Linear(2, 2) + self.relu = nn.ReLU() + self.fc2 = nn.Linear(2, 2) - def forward(self, x): - x = SAF0.apply(x) - x = x.to(dtype=torch.float32) - x = self.fc1(x) - x = self.relu(x) - x = self.fc2(x) - return x + def forward(self, x): + x = SAF0.apply(x) + x = x.to(dtype=torch.float32) + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + return x - def _reset_logged(): - logged_shapes.clear() - logged_dtypes.clear() + def _reset_logged(): + logged_shapes.clear() + logged_dtypes.clear() - device = torch.device("cuda:0") - m = M().to(device=device) + device = torch.device("cuda:0") + m = M().to(device=device) - def _test_m(): - self._test_pack_hooks( - m, - m_inp_fn, - [ - ( + def 
_test_m(): + self._test_pack_hooks( + m, + m_inp_fn, + [ ( - pack_fp8_with_scale_and_log, - unpack_fp8_with_scale_and_log, - ), - True, - ) - ], - pre_compile_fn=_reset_logged, - backend="aot_eager", - ) - - with patch( - "torch._functorch.config.saved_tensors_hooks_filtering_mode", "donated" - ): - _reset_logged() - _test_m() - # Check that hooks were not applied to Parameters - # parameters excluded - self.assertFalse([2, 2] in logged_shapes) - self.assertTrue([2, 2, 2] in logged_shapes) - # input excluded - self.assertFalse(torch.float64 in logged_dtypes) + ( + pack_fp8_with_scale_and_log, + unpack_fp8_with_scale_and_log, + ), + True, + ) + ], + pre_compile_fn=_reset_logged, + backend="aot_eager", + ) - with patch( - "torch._functorch.config.saved_tensors_hooks_filtering_mode", "no_static" - ): - _reset_logged() - _test_m() - # Check that hooks were not applied to Parameters - # parameters excluded - self.assertFalse([2, 2] in logged_shapes) - self.assertTrue([2, 2, 2] in logged_shapes) - self.assertTrue(torch.float64 in logged_dtypes) - - with patch("torch._functorch.config.saved_tensors_hooks_filtering_mode", "all"): - _reset_logged() - _test_m() - # Check that hooks were applied to all saved tensors - self.assertTrue([2, 2] in logged_shapes) - self.assertTrue([2, 2, 2] in logged_shapes) - self.assertTrue(torch.float64 in logged_dtypes) + with patch( + "torch._functorch.config.saved_tensors_hooks_filtering_mode", "donated" + ): + _reset_logged() + _test_m() + # Check that hooks were not applied to Parameters + # parameters excluded + self.assertFalse([2, 2] in logged_shapes) + self.assertTrue([2, 2, 2] in logged_shapes) + # input excluded + self.assertFalse(torch.float64 in logged_dtypes) + + with patch( + "torch._functorch.config.saved_tensors_hooks_filtering_mode", + "no_static", + ): + _reset_logged() + _test_m() + # Check that hooks were not applied to Parameters + # parameters excluded + self.assertFalse([2, 2] in logged_shapes) + self.assertTrue([2, 2, 
2] in logged_shapes) + self.assertTrue(torch.float64 in logged_dtypes) + + with patch( + "torch._functorch.config.saved_tensors_hooks_filtering_mode", "all" + ): + _reset_logged() + _test_m() + # Check that hooks were applied to all saved tensors + self.assertTrue([2, 2] in logged_shapes) + self.assertTrue([2, 2, 2] in logged_shapes) + self.assertTrue(torch.float64 in logged_dtypes) @unittest.skipIf(not torch.cuda.is_available(), "CUDA is unavailable") @unittest.skipIf(not SM80OrLater, "bfloat16, float8") diff --git a/test/test_fake_tensor.py b/test/test_fake_tensor.py index 1ee0668c9dc63..a7f31ce07be77 100644 --- a/test/test_fake_tensor.py +++ b/test/test_fake_tensor.py @@ -115,10 +115,9 @@ def test_basic(self): self.assertTrue(isinstance(z, FakeTensor)) def test_custom_op_fallback(self): - from torch.library import impl, Library + from torch.library import _scoped_library, impl - try: - test_lib = Library("my_test_op", "DEF") # noqa: TOR901 + with _scoped_library("my_test_op", "DEF") as test_lib: test_lib.define("foo(Tensor self) -> Tensor") @impl(test_lib, "foo", "CPU") @@ -133,9 +132,6 @@ def foo_impl(self): x = mode.from_tensor(x) torch.ops.my_test_op.foo(x) - finally: - test_lib._destroy() - def test_parameter_instantiation(self): with FakeTensorMode(): x = torch.rand([4]) diff --git a/test/test_fx_passes.py b/test/test_fx_passes.py index 4a48d0cd966aa..2f551338d7e63 100644 --- a/test/test_fx_passes.py +++ b/test/test_fx_passes.py @@ -855,7 +855,7 @@ def setup(cls): @classmethod def tearDown(cls): - del cls.quantization + cls.quantization._destroy() @staticmethod def forward(self, arg0_1, arg1_1): diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py index 8296f386f0977..b2c4a000fb980 100644 --- a/test/test_proxy_tensor.py +++ b/test/test_proxy_tensor.py @@ -1017,40 +1017,39 @@ def _test_dynamic(self, fn, trace_inputs, test_inputs, assert_eq=True): def test_debug_interpreter(self): - import torch.library - from torch.library import Library + from 
torch.library import _scoped_library - foo = Library("foo", "DEF") # noqa: TOR901 - foo.define("foo(Tensor self) -> Tensor") + with _scoped_library("foo", "DEF") as foo: + foo.define("foo(Tensor self) -> Tensor") - # Operator where meta and cpu disagree on strides - @torch.library.impl(foo, "foo", "CPU") - def foo_cpu(x): - return x.clone().T + # Operator where meta and cpu disagree on strides + @torch.library.impl(foo, "foo", "CPU") + def foo_cpu(x): + return x.clone().T - @torch.library.impl(foo, "foo", "Meta") - def foo_meta(x): - return x.clone() + @torch.library.impl(foo, "foo", "Meta") + def foo_meta(x): + return x.clone() - def f(x): - return torch.ops.foo.foo.default(x) + def f(x): + return torch.ops.foo.foo.default(x) - gm = make_fx(f, tracing_mode="symbolic")(torch.randn(2, 2)) - from torch._functorch.compilers import DebugInterpreter + gm = make_fx(f, tracing_mode="symbolic")(torch.randn(2, 2)) + from torch._functorch.compilers import DebugInterpreter - interp = DebugInterpreter(gm) + interp = DebugInterpreter(gm) - # input mismatch is caught (indicates guard problem) - self.assertRaisesRegex( - AssertionError, r"3 != 1", - lambda: interp.run(torch.randn(3, 3).T), - ) + # input mismatch is caught (indicates guard problem) + self.assertRaisesRegex( + AssertionError, r"3 != 1", + lambda: interp.run(torch.randn(3, 3).T), + ) - # Catch the incorrect meta - self.assertRaisesRegex( - AssertionError, r"\(3, 1\) != \(1, 3\)", - lambda: interp.run(torch.randn(3, 3)) - ) + # Catch the incorrect meta + self.assertRaisesRegex( + AssertionError, r"\(3, 1\) != \(1, 3\)", + lambda: interp.run(torch.randn(3, 3)) + ) def test_int_input(self): def f(x, y): diff --git a/test/test_torch.py b/test/test_torch.py index 54a756b5940cf..66b021920594f 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -10331,33 +10331,32 @@ def __del__(self): @skipIfTorchDynamo("Not a suitable test for TorchDynamo") def test_storage_preserve_nonhermetic_in_hermetic_context(self): - from 
torch.library import Library, impl + from torch.library import _scoped_library, impl global _my_storage - my_lib = Library("my_lib", "DEF") # noqa: TOR901 - self.addCleanup(my_lib._destroy) - my_lib.define('my_func() -> None') + with _scoped_library("my_lib", "DEF") as my_lib: + my_lib.define('my_func() -> None') - a = torch.tensor([1.]) - _my_storage = a.untyped_storage() + a = torch.tensor([1.]) + _my_storage = a.untyped_storage() - m, t = Tracker.make() - _my_storage._tracker = t - del t + m, t = Tracker.make() + _my_storage._tracker = t + del t - @impl(my_lib, 'my_func', '') - def my_func(): - global _my_storage - del _my_storage + @impl(my_lib, 'my_func', '') + def my_func(): + global _my_storage + del _my_storage - self.assertFalse(m[0]) - torch.ops.my_lib.my_func() - self.assertFalse(m[0]) + self.assertFalse(m[0]) + torch.ops.my_lib.my_func() + self.assertFalse(m[0]) - s = a.untyped_storage() - del a - del s - self.assertTrue(m[0]) + s = a.untyped_storage() + del a + del s + self.assertTrue(m[0]) # FIXME: move to test_autograd? @skipIfTorchDynamo("TorchDynamo does not work well with hooks") From 5f7b013bcacb1d6848bc1397450c7acc04d155e0 Mon Sep 17 00:00:00 2001 From: "Nichols A. Romero" <165712832+naromero77amd@users.noreply.github.com> Date: Tue, 19 May 2026 17:29:51 -0500 Subject: [PATCH 85/87] [release/2.11][UP][ROCm][inductor] Use hipModuleLoadData in StaticCudaLauncher (#3238) ## Summary - Backports upstream PyTorch PR pytorch/pytorch#183926 to ROCm release/2.11. - Uses `hipModuleLoadData` for ROCm static launcher module loading to avoid retaining open HSACO file descriptors. - Leaves the CUDA/NVIDIA path unchanged. 
- Resolves Jira https://amd-hub.atlassian.net/browse/ROCM-24659, https://amd-hub.atlassian.net/browse/ROCM-24664 Made with [Cursor](https://cursor.com) Co-authored-by: PyTorch MergeBot --- torch/csrc/inductor/static_launcher/cuda.cpp | 21 +++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/torch/csrc/inductor/static_launcher/cuda.cpp b/torch/csrc/inductor/static_launcher/cuda.cpp index a2378b7c1a248..9687a57346106 100644 --- a/torch/csrc/inductor/static_launcher/cuda.cpp +++ b/torch/csrc/inductor/static_launcher/cuda.cpp @@ -12,6 +12,9 @@ #if defined(USE_ROCM) #include +#include +#include +#include #endif /** @@ -101,6 +104,19 @@ CUdeviceptr getPointer(PyObject* obj) { #define SHARED_MEM_STATIC_MAX 49152 // 48 KB +#if defined(USE_ROCM) +std::vector readKernelImage(const std::string& filePath) { + std::ifstream file(filePath, std::ios::binary); + TORCH_CHECK(file, "Failed to open kernel image: ", filePath); + + auto begin = std::istreambuf_iterator(file); + auto end = std::istreambuf_iterator(); + std::vector image(begin, end); + TORCH_CHECK(!image.empty(), "Kernel image is empty: ", filePath); + return image; +} +#endif + CUfunction loadKernel( std::string filePath, const std::string& funcName, @@ -116,7 +132,10 @@ CUfunction loadKernel( CUfunction func = nullptr; #if defined(USE_ROCM) - AT_CUDA_DRIVER_CHECK(hipModuleLoad(&mod, filePath.c_str())); + // Unlike cuModuleLoad, hipModuleLoad keeps a file descriptor for the loaded + // HSACO. Load from memory to avoid retaining one FD per static launcher. 
+ auto image = readKernelImage(filePath); + AT_CUDA_DRIVER_CHECK(hipModuleLoadData(&mod, image.data())); AT_CUDA_DRIVER_CHECK(hipModuleGetFunction(&func, mod, funcName.c_str())); int shared_optin = 0; AT_CUDA_DRIVER_CHECK(hipDeviceGetAttribute( From 6c683dd613ed22b748d513c35082ba94de91cd49 Mon Sep 17 00:00:00 2001 From: Umesh Chand Date: Wed, 20 May 2026 15:48:26 -0700 Subject: [PATCH 86/87] =?UTF-8?q?[Inductor]=20Fix=20flaky=20epilogue=20fus?= =?UTF-8?q?ion=20tests=20by=20adding=20missing=20tearDown=E2=80=A6=20(#324?= =?UTF-8?q?5)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …Class (#180736) TestPrologueFusion and TestEpilogueFusionStaticAnalysis both use ExitStack in setUpClass to apply config.patch(), but neither defined tearDownClass to close the stack. When TestPrologueFusion runs before TestEpilogueFusionStaticAnalysis in the same process, config values like max_autotune_gemm_backends="TRITON" leak through, removing the aten kernel choice from autotuning and causing test failures. Fixes #179693 Pull Request resolved: https://github.com/pytorch/pytorch/pull/180736 Approved by: https://github.com/Skylion007 ## Motivation ## Technical Details ## Test Plan ## Test Result ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. 
Co-authored-by: NikhilAPatel --- test/inductor/test_max_autotune.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py index d94dd81b4673a..5d3855aa73e93 100644 --- a/test/inductor/test_max_autotune.py +++ b/test/inductor/test_max_autotune.py @@ -3450,6 +3450,11 @@ def setUpClass(cls): ) ) + @classmethod + def tearDownClass(cls): + cls._stack.close() + super().tearDownClass() + def check_code(self, code_str, num_kernels, num_allocs, num_deallocs): FileCheck().check(get_func_call()).check_count( get_kernel_launch(), @@ -3943,6 +3948,11 @@ def setUpClass(cls): ) ) + @classmethod + def tearDownClass(cls): + cls._stack.close() + super().tearDownClass() + @contextlib.contextmanager def get_common_patches(self, async_compile: bool, persistent_tma: bool): common_patches = ( From 53679bf172c72d6ebb8135203930df6a199b8f79 Mon Sep 17 00:00:00 2001 From: Darren Lao Date: Thu, 21 May 2026 11:27:07 -0400 Subject: [PATCH 87/87] [release/2.11] Free deferred record_stream blocks at graph capture end (#175817) (#3241) ## Motivation Aimed as a fix for test `TestMemPool.test_graph_capture_reclaim_shared_pool` failing in TheRock wheels: https://github.com/ROCm/TheRock/issues/4925 The test was brought into `release/2.11` by the cherry-pick of upstream pytorch/pytorch#176024 in #3182, but the allocator fix from upstream pytorch/pytorch#175817 was not. Without this fix, `endAllocateToPool` (called from `CUDAGraph::capture_end`) does not reclaim `record_stream`-deferred blocks, so a second graph capture into the same shared pool cannot reuse the block freed in the first capture. ## Technical Details Cherry-pick of upstream pytorch/pytorch#175817 (commit `b55e5314fb72f1ea782f72a6c9728a40c12678ea`) on top of `release/2.11`. ## Test Plan - Build PyTorch wheels from this branch and verify that the test `TestMemPool.test_graph_capture_reclaim_shared_pool` is now passing. 
## Test Result - `TestMemPool.test_graph_capture_reclaim_shared_pool` passed for torch 2.11: https://github.com/ROCm/TheRock/actions/runs/26116907093/job/76816330885 ## Submission Checklist - [x] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. Co-authored-by: Frank Lin --- c10/cuda/CUDACachingAllocator.cpp | 66 +++++++++++++++++++++++++------ test/test_cuda.py | 13 +----- 2 files changed, 56 insertions(+), 23 deletions(-) diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 7a053b8134ef7..268d5fb70fdbc 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -2774,18 +2774,62 @@ class DeviceCachingAllocator { void endAllocateToPool(MempoolId_t mempool_id) { std::lock_guard lock(mutex); - if (CUDAAllocatorConfig::graph_capture_record_stream_reuse() && - !graph_reuse_context.empty()) { - auto capture_id = mempool_to_capture_id[mempool_id]; - auto graph_context = graph_reuse_context[capture_id]; - for (auto& [stream, _] : graph_context.visited) { - TORCH_INTERNAL_ASSERT( - stream_get_capture_info(stream).status == - cudaStreamCaptureStatusNone, - "This stream should not be capturing when the capture is ended"); + if (CUDAAllocatorConfig::graph_capture_record_stream_reuse()) { + // Remove stream reuse context and mapping for this capture, if present. + if (!graph_reuse_context.empty()) { + auto capture_id = mempool_to_capture_id[mempool_id]; + auto graph_context = graph_reuse_context[capture_id]; + for (auto& [stream, _] : graph_context.visited) { + TORCH_INTERNAL_ASSERT( + stream_get_capture_info(stream).status == + cudaStreamCaptureStatusNone, + "This stream should not be capturing when the capture is ended"); + } + graph_reuse_context.erase(capture_id); + mempool_to_capture_id.erase(mempool_id); + } + + // Free deferred blocks associated with the ended pool, if any. 
+ // cudaStreamEndCapture would have failed if any stream used during + // capture hadn't been joined back, so all stream uses on these + // blocks are known to be complete and we can safely clear them. + if (!deferred_blocks.empty()) { + auto pool_it = graph_pools.find(mempool_id); + if (pool_it != graph_pools.end()) { + auto* private_pool = pool_it->second.get(); + auto context = maybeGatherContext(RecordContext::ALL); + std::vector blocks_to_erase; + for (auto& [block, markers] : deferred_blocks) { + if (block->pool->owner_PrivatePool == private_pool) { + // At capture end, handle blocks associated with non-capturing + // streams. Remove only stream uses introduced during capture + // (guaranteed complete), and for any leftover pre-capture uses, + // insert events to track their completion. This aligns with + // insert_events_deferred_until_no_capture semantics. + remove_cudagraph_stream_uses(block); + if (block->stream_uses.empty()) { + free_block(block, context); + } else { + // Pre-capture stream uses remain; record events so + // process_events can free the block once they complete. + insert_events(block); + // block->event_count should likely be non-zero here since + // block->stream_uses is not empty. Defensive: still free if + // event_count is zero, but this should be rare. + if (block->event_count == 0) { + free_block(block, context); + } + } + // Must erase from deferred_blocks regardless of which branch we + // took. 
+ blocks_to_erase.push_back(block); + } + } + for (auto* b : blocks_to_erase) { + deferred_blocks.erase(b); + } + } } - graph_reuse_context.erase(capture_id); - mempool_to_capture_id.erase(mempool_id); } for (auto it = captures_underway.begin(); it != captures_underway.end(); diff --git a/test/test_cuda.py b/test/test_cuda.py index 56625d716244c..5dd2a7346c79b 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -132,7 +132,6 @@ _cycles_per_ms = None - _wait_for_cpu_kernel = None @@ -8379,17 +8378,7 @@ class TestCudaDeviceParametrized(TestCase): def test_graph_external_wait_and_record(self): torch.cuda.empty_cache() - kernel_source = r""" - __global__ void wait_for_cpu(int *pinned_cpu_flag) { - int flag = 0; - do { - asm volatile("ld.relaxed.sys.global.s32 %0, [%1];" : "=r"(flag) : "l"(pinned_cpu_flag) : "memory"); - } while (flag == 0); - } - """ - from torch.cuda import _compile_kernel - - spin_wait_kernel = _compile_kernel(kernel_source, "wait_for_cpu") + spin_wait_kernel = get_wait_for_cpu_kernel() x = torch.ones(4, device="cuda") x_cpu = torch.zeros(x.shape, device="cpu").pin_memory()