From af5c4ec030daf123c0260a0223388420e1479fa6 Mon Sep 17 00:00:00 2001
From: Andrey Talman <atalman@fb.com>
Date: Mon, 16 Feb 2026 13:13:36 -0500
Subject: [PATCH 01/87] [RELEASE 2.11] Release only changes (#175091)

* [RELEASE 2.11] Release only changes

* remove_file

* Trigger rebuild
---
 .ci/pytorch/common_utils.sh                   |   2 +-
 .github/ci_commit_pins/xla.txt                |   2 +-
 .github/scripts/filter_test_configs.py        |   6 +-
 .github/templates/common.yml.j2               |   2 +-
 .../linux_binary_build_workflow.yml.j2        |  18 +-
 .../macos_binary_build_workflow.yml.j2        |   2 +-
 .../windows_binary_build_workflow.yml.j2      |   6 +-
 .github/workflows/_bazel-build-test.yml       |  14 +-
 ...nary-build-flash-attention-wheel-linux.yml |  10 +-
 ...ry-build-flash-attention-wheel-windows.yml |   6 +-
 .github/workflows/_binary-build-linux.yml     |  13 +-
 .github/workflows/_binary-test-linux.yml      |  13 +-
 .github/workflows/_binary-upload.yml          |   2 +-
 .github/workflows/_docs.yml                   |  10 +-
 .github/workflows/_link_check.yml             |   4 +-
 .github/workflows/_linux-build.yml            |  10 +-
 .github/workflows/_linux-test-stable-fa3.yml  |   8 +-
 .github/workflows/_linux-test.yml             |  14 +-
 .github/workflows/_mac-build.yml              |   8 +-
 .github/workflows/_mac-test.yml               |  10 +-
 .github/workflows/_rocm-test.yml              |   8 +-
 .github/workflows/_runner-determinator.yml    |   2 +-
 .github/workflows/_vllm-benchmark.yml         |   4 +-
 .github/workflows/_win-build.yml              |   4 +-
 .github/workflows/_win-test.yml               |   4 +-
 .github/workflows/_xpu-test.yml               |   8 +-
 .github/workflows/b200-distributed.yml        |   2 +-
 .github/workflows/b200-symm-mem.yml           |   2 +-
 .github/workflows/build-almalinux-images.yml  |   2 +-
 .github/workflows/build-libtorch-images.yml   |   4 +-
 .../build-manywheel-images-s390x.yml          |   2 +-
 .github/workflows/build-manywheel-images.yml  |   4 +-
 .github/workflows/build-triton-wheel.yml      |  14 +-
 .github/workflows/build-vllm-wheel.yml        |   6 +-
 .github/workflows/claude-code.yml             |   2 +-
 .github/workflows/claude-issue-triage-run.yml |   2 +-
 .../close-nonexistent-disable-issues.yml      |   2 +-
 .github/workflows/create_release.yml          |   2 +-
 .github/workflows/docker-builds.yml           |  10 +-
 .github/workflows/docker-cache-rocm.yml       |   4 +-
 .github/workflows/docker-release.yml          |  10 +-
 .github/workflows/dynamo-unittest.yml         |   2 +-
 ...linux-aarch64-binary-manywheel-nightly.yml |   2 +-
 ...enerated-linux-binary-libtorch-nightly.yml |  12 +-
 ...nerated-linux-binary-manywheel-nightly.yml | 135 +++++------
 ...d-linux-s390x-binary-manywheel-nightly.yml |   2 +-
 ...-arm64-binary-libtorch-release-nightly.yml |   1 -
 ...rated-macos-arm64-binary-wheel-nightly.yml |   7 -
 ...ws-arm64-binary-libtorch-debug-nightly.yml |   2 +-
 ...-arm64-binary-libtorch-release-nightly.yml |   2 +-
 ...ted-windows-arm64-binary-wheel-nightly.yml |   2 +-
 ...-windows-binary-libtorch-debug-nightly.yml |  26 +--
 ...indows-binary-libtorch-release-nightly.yml |  26 +--
 ...generated-windows-binary-wheel-nightly.yml | 212 ++++++------------
 .github/workflows/h100-cutlass-backend.yml    |   2 +-
 .github/workflows/h100-distributed.yml        |   2 +-
 .github/workflows/h100-symm-mem.yml           |   2 +-
 .../workflows/inductor-micro-benchmark.yml    |   2 +-
 .github/workflows/inductor-nightly.yml        |   2 +-
 .github/workflows/inductor-pallas.yml         |   2 +-
 .github/workflows/inductor-perf-compare.yml   |   2 +-
 .github/workflows/inductor-perf-test-b200.yml |   2 +-
 .../inductor-perf-test-nightly-aarch64.yml    |   2 +-
 .../inductor-perf-test-nightly-h100.yml       |   2 +-
 .../inductor-perf-test-nightly-rocm-mi300.yml |   2 +-
 .../inductor-perf-test-nightly-rocm-mi355.yml |   2 +-
 .../inductor-perf-test-nightly-x86-zen.yml    |   2 +-
 .../inductor-perf-test-nightly-x86.yml        |   2 +-
 .../inductor-perf-test-nightly-xpu.yml        |   2 +-
 .../workflows/inductor-perf-test-nightly.yml  |   2 +-
 .github/workflows/inductor-periodic.yml       |   2 +-
 .github/workflows/inductor-rocm-mi200.yml     |   2 +-
 .github/workflows/inductor-rocm-mi300.yml     |   2 +-
 .github/workflows/inductor-rocm-mi355.yml     |   2 +-
 .github/workflows/inductor-unittest.yml       |   2 +-
 .github/workflows/inductor.yml                |   2 +-
 .github/workflows/lint-autoformat.yml         |   2 +-
 .github/workflows/lint-bc.yml                 |   2 +-
 .github/workflows/lint.yml                    |  23 +-
 .github/workflows/linux-aarch64.yml           |   2 +-
 .github/workflows/llm_td_retrieval.yml        |   4 +-
 .github/workflows/nightly-s3-uploads.yml      |   2 +-
 .github/workflows/nightly.yml                 |   4 +-
 .github/workflows/nitpicker.yml               |   2 +-
 .github/workflows/operator_microbenchmark.yml |   2 +-
 .github/workflows/periodic-rocm-mi200.yml     |   2 +-
 .github/workflows/periodic-rocm-mi300.yml     |   2 +-
 .github/workflows/periodic-rocm-mi355.yml     |   2 +-
 .github/workflows/periodic.yml                |   2 +-
 .github/workflows/pull.yml                    |   2 +-
 .github/workflows/quantization-periodic.yml   |   2 +-
 .github/workflows/rocm-mi200.yml              |   2 +-
 .github/workflows/rocm-mi300.yml              |   2 +-
 .github/workflows/rocm-mi355.yml              |   2 +-
 .github/workflows/rocm-navi31.yml             |   2 +-
 .github/workflows/rocm-nightly.yml            |   2 +-
 .github/workflows/slow-rocm-mi200.yml         |   2 +-
 .github/workflows/slow.yml                    |   2 +-
 .../target-determination-indexer.yml          |  10 +-
 .github/workflows/target_determination.yml    |   4 +-
 .github/workflows/test-b200.yml               |   2 +-
 .github/workflows/test-check-binary.yml       |   4 +-
 .github/workflows/test-h100.yml               |   2 +-
 .github/workflows/tools-unit-tests.yml        |   4 +-
 .github/workflows/torchbench.yml              |   2 +-
 .github/workflows/trunk-rocm-sandbox.yml      |   2 +-
 .github/workflows/trunk.yml                   |   2 +-
 .github/workflows/unstable.yml                |   2 +-
 .github/workflows/update-viablestrict.yml     |   2 +-
 .github/workflows/update_pytorch_labels.yml   |   2 +-
 .../upload-test-stats-while-running.yml       |   2 +-
 .github/workflows/upload-test-stats.yml       |   2 +-
 .../upload-torch-dynamo-perf-stats.yml        |   2 +-
 .../upload_test_stats_intermediate.yml        |   2 +-
 .github/workflows/vllm-benchmark.yml          |   4 +-
 .github/workflows/weekly.yml                  |   2 +-
 .github/workflows/xpu.yml                     |   2 +-
 tools/stats/import_test_stats.py              |   2 +-
 118 files changed, 370 insertions(+), 488 deletions(-)

diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh
index c4a92c997561e..ad45622823b4c 100644
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@@ -290,7 +290,7 @@ function install_torchrec_and_fbgemm() {
 
 function clone_pytorch_xla() {
   if [[ ! -d ./xla ]]; then
-    git clone --recursive --quiet https://github.com/pytorch/xla.git
+    git clone --recursive -b r2.11 https://github.com/pytorch/xla.git
     pushd xla
     # pin the xla hash so that we don't get broken by changes to xla
     git checkout "$(cat ../.github/ci_commit_pins/xla.txt)"
diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index 7d3a8548d145b..9939a0505c6fc 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-c04e61c3424142c0eebcc9e59984b9d8fced18c0
+r2.11
diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py
index 182a75f13cad6..087a59be348c4 100755
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@@ -41,10 +41,10 @@ def is_cuda_or_rocm_job(job_name: Optional[str]) -> bool:
     "rerun_disabled_tests": lambda job_name: True,
 }
 
-# The link to the published list of disabled jobs
-DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json"
+# The link to the published list of disabled jobs (pinned to a fixed S3 versionId)
+DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=EdtXb8H1wC3KKKfSV9z7QtgG3FngDv3B"
 # and unstable jobs
-UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json"
+UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=iafjJg17T2MK7wQiJ0qx32zIPMqqwZqv"
 
 # Some constants used to handle disabled and unstable jobs
 JOB_NAME_SEP = "/"
diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2
index 064eea7592230..201415632b7e1 100644
--- a/.github/templates/common.yml.j2
+++ b/.github/templates/common.yml.j2
@@ -32,7 +32,7 @@ concurrency:
 {%- macro setup_ec2_windows() -%}
       !{{ display_ec2_information() }}
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2
index b9d3a51354e06..e110f33d8ce39 100644
--- a/.github/templates/linux_binary_build_workflow.yml.j2
+++ b/.github/templates/linux_binary_build_workflow.yml.j2
@@ -56,7 +56,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@@ -137,18 +137,18 @@ jobs:
       contents: read
     steps:
       - name: Setup XPU
-        uses: pytorch/pytorch/.github/actions/setup-xpu@main
+        uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.11
       - name: Login to ECR
-        uses: pytorch/pytorch/.github/actions/ecr-login@main
+        uses: pytorch/pytorch/.github/actions/ecr-login@release/2.11
       - uses: !{{ common.download_artifact_action }}
         name: Download Build Artifacts
         with:
           name: !{{ config["build_name"] }}
           path: "${{ runner.temp }}/artifacts/"
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: !{{ config["container_image"] }}
@@ -156,7 +156,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -180,7 +180,7 @@ jobs:
         with:
           name: !{{ config["build_name"] }}
           path: "${{ runner.temp }}/artifacts/"
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
       - name: ROCm set GPU_FLAG
         run: |
           echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
@@ -194,7 +194,7 @@ jobs:
           role-duration-seconds: 18000
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: !{{ config["container_image"] }}
@@ -202,7 +202,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2
index 958c6b85902c2..0414de214f123 100644
--- a/.github/templates/macos_binary_build_workflow.yml.j2
+++ b/.github/templates/macos_binary_build_workflow.yml.j2
@@ -71,7 +71,7 @@ jobs:
     steps:
       !{{ set_runner_specific_vars() }}
       !{{ setup_python(config.get("python_version", "3.10")) }}
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
       - name: Populate binary env
         run: |
           "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
diff --git a/.github/templates/windows_binary_build_workflow.yml.j2 b/.github/templates/windows_binary_build_workflow.yml.j2
index 34c148270c6bc..10153574304d7 100644
--- a/.github/templates/windows_binary_build_workflow.yml.j2
+++ b/.github/templates/windows_binary_build_workflow.yml.j2
@@ -64,7 +64,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@@ -135,7 +135,7 @@ jobs:
 {%- else %}
       !{{ set_runner_specific_vars() }}
       !{{ common.setup_ec2_windows() }}
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
 {%- endif %}
       - name: Populate binary env
         shell: bash
@@ -211,7 +211,7 @@ jobs:
           "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
 {%- else %}
       !{{ common.setup_ec2_windows() }}
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
       !{{ set_runner_specific_vars() }}
 {%- endif %}
       - uses: !{{ common.download_artifact_action }}
diff --git a/.github/workflows/_bazel-build-test.yml b/.github/workflows/_bazel-build-test.yml
index eaebce92ba898..21508ce0d7f21 100644
--- a/.github/workflows/_bazel-build-test.yml
+++ b/.github/workflows/_bazel-build-test.yml
@@ -47,7 +47,7 @@ jobs:
       reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }}
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           fetch-depth: 1
           submodules: false
@@ -69,13 +69,13 @@ jobs:
     runs-on: ${{ matrix.runner }}
     steps:
       - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
 
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
 
       - name: Setup Linux
         uses: ./.github/actions/setup-linux
@@ -85,12 +85,12 @@ jobs:
 
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-image-name: ${{ inputs.docker-image-name }}
 
       - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
 
@@ -100,7 +100,7 @@ jobs:
         run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
 
       - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
-        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
+        uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.11
 
       - name: Output disk space left
         run: |
@@ -211,5 +211,5 @@ jobs:
           file-suffix: bazel-${{ github.job }}_${{ steps.get-job-id.outputs.job-id }}
 
       - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11
         if: always()
diff --git a/.github/workflows/_binary-build-flash-attention-wheel-linux.yml b/.github/workflows/_binary-build-flash-attention-wheel-linux.yml
index 3fdc1dc4175c9..68244dc98829d 100644
--- a/.github/workflows/_binary-build-flash-attention-wheel-linux.yml
+++ b/.github/workflows/_binary-build-flash-attention-wheel-linux.yml
@@ -23,7 +23,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@@ -83,13 +83,13 @@ jobs:
       TORCH_VERSION: "2.10.0"
     steps:
       - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
           fail-silently: false
 
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           submodules: true
 
@@ -97,7 +97,7 @@ jobs:
         uses: ./.github/actions/setup-linux
 
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ env.DOCKER_IMAGE }}
 
@@ -138,5 +138,5 @@ jobs:
           path: ${{ runner.temp }}/artifacts/*.whl
 
       - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11
         if: always()
diff --git a/.github/workflows/_binary-build-flash-attention-wheel-windows.yml b/.github/workflows/_binary-build-flash-attention-wheel-windows.yml
index 4fc1dc8a53367..642045c0da492 100644
--- a/.github/workflows/_binary-build-flash-attention-wheel-windows.yml
+++ b/.github/workflows/_binary-build-flash-attention-wheel-windows.yml
@@ -22,7 +22,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@@ -71,12 +71,12 @@ jobs:
           git config --global core.ignorecase false
           git config --global core.fsmonitor false
       - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           no-sudo: true
           submodules: true
diff --git a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml
index 115f6572441a1..c7d55a2e929b8 100644
--- a/.github/workflows/_binary-build-linux.yml
+++ b/.github/workflows/_binary-build-linux.yml
@@ -142,13 +142,13 @@ jobs:
 
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
         if: inputs.build_environment != 'linux-s390x-binary-manywheel'
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.github-token }}
 
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }}
 
@@ -179,7 +179,6 @@ jobs:
       - name: Checkout PyTorch to pytorch dir
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -214,9 +213,9 @@ jobs:
       - name: Calculate docker image
         id: calculate-docker-image
         if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
-          # If doing this in main or release branch, use docker.io. Otherwise
+          # If doing this on the release/2.11 branch, use docker.io. Otherwise
           # use ECR
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: ${{ inputs.DOCKER_IMAGE }}
@@ -228,7 +227,7 @@ jobs:
 
       - name: Pull Docker image
         if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
 
@@ -281,7 +280,7 @@ jobs:
 
       - name: Teardown Linux
         if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel'
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11
 
       - name: Chown workspace
         if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel'
diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml
index 3dd8235a4c4f2..ed7738ecbdcc2 100644
--- a/.github/workflows/_binary-test-linux.yml
+++ b/.github/workflows/_binary-test-linux.yml
@@ -125,14 +125,14 @@ jobs:
 
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
         if: inputs.build_environment != 'linux-s390x-binary-manywheel'
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.github-token }}
 
         # Setup the environment
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }}
 
@@ -153,7 +153,6 @@ jobs:
       - name: Checkout PyTorch to pytorch dir
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           show-progress: false
           path: pytorch
@@ -185,7 +184,7 @@ jobs:
 
       - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
         id: install-nvidia-driver
-        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
+        uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.11
         if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' }}
 
       - name: configure aws credentials
@@ -200,7 +199,7 @@ jobs:
       - name: Calculate docker image
         id: calculate-docker-image
         if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: ${{ inputs.DOCKER_IMAGE }}
@@ -210,7 +209,7 @@ jobs:
 
       - name: Pull Docker image
         if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
 
@@ -222,7 +221,7 @@ jobs:
 
       - name: Teardown Linux
         if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel'
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11
 
       - name: Chown workspace
         if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel'
diff --git a/.github/workflows/_binary-upload.yml b/.github/workflows/_binary-upload.yml
index 636b76d42931a..ce6c61e930620 100644
--- a/.github/workflows/_binary-upload.yml
+++ b/.github/workflows/_binary-upload.yml
@@ -81,7 +81,7 @@ jobs:
       SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           no-sudo: true
 
diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml
index f5cb186c6f189..ff21b561c760a 100644
--- a/.github/workflows/_docs.yml
+++ b/.github/workflows/_docs.yml
@@ -80,7 +80,7 @@ jobs:
     name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }}
     steps:
       - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
           instructions: |
@@ -91,7 +91,7 @@ jobs:
 
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
 
       - name: Setup Linux
         uses: ./.github/actions/setup-linux
@@ -103,12 +103,12 @@ jobs:
 
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-image-name: ${{ inputs.docker-image }}
 
       - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
 
@@ -213,5 +213,5 @@ jobs:
           echo "https://docs-preview.pytorch.org/pytorch/pytorch/nightly-${{ github.sha }}/cppdocs/index.html"
 
       - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11
         if: always()
diff --git a/.github/workflows/_link_check.yml b/.github/workflows/_link_check.yml
index 014e6106b0730..b45837ab110d6 100644
--- a/.github/workflows/_link_check.yml
+++ b/.github/workflows/_link_check.yml
@@ -11,7 +11,7 @@ on:
 jobs:
   lint-urls:
     if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-url-lint') }}
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11
     with:
       job-name: lint-urls
       timeout: 120
@@ -37,7 +37,7 @@ jobs:
 
   lint-xrefs:
     if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-xref-lint') }}
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11
     with:
       job-name: lint-xrefs
       timeout: 60
diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml
index 25501b59c7851..7696674ab2717 100644
--- a/.github/workflows/_linux-build.yml
+++ b/.github/workflows/_linux-build.yml
@@ -138,7 +138,7 @@ jobs:
       build-environment: ${{ inputs.build-environment }}
     steps:
       - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         if: inputs.build-environment != 'linux-s390x-binary-manywheel'
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -151,7 +151,7 @@ jobs:
       # checkout because when we run this action we don't *have* a local
       # checkout. In other cases you should prefer a local checkout.
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           no-sudo: true
 
@@ -185,7 +185,7 @@ jobs:
 
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         if: inputs.build-environment != 'linux-s390x-binary-manywheel'
         with:
           docker-image-name: ${{ inputs.docker-image-name }}
@@ -201,7 +201,7 @@ jobs:
           echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
 
       - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true'
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
@@ -464,7 +464,7 @@ jobs:
           artifact_prefix: usage_log_build_${{ steps.get-job-id.outputs.job-id }}
 
       - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11
         if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel'
 
       - name: Cleanup docker
diff --git a/.github/workflows/_linux-test-stable-fa3.yml b/.github/workflows/_linux-test-stable-fa3.yml
index f2e16712ff447..ffd5845adc473 100644
--- a/.github/workflows/_linux-test-stable-fa3.yml
+++ b/.github/workflows/_linux-test-stable-fa3.yml
@@ -60,7 +60,7 @@ jobs:
       contents: read
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           no-sudo: true
 
@@ -78,7 +78,7 @@ jobs:
 
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-image-name: ${{ inputs.docker-image }}
 
@@ -92,7 +92,7 @@ jobs:
           echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
 
       - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
 
@@ -254,5 +254,5 @@ jobs:
           workflow_attempt: ${{github.run_attempt}}
 
       - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11
         if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'
diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml
index 979b7a8e55a08..3e9a03befbddd 100644
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@@ -104,7 +104,7 @@ jobs:
       contents: read
     steps:
       - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         if: ${{ !contains(matrix.runner, 'b200') && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -113,7 +113,7 @@ jobs:
               docker exec -it $(docker container ps --format '{{.ID}}') bash
 
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           no-sudo: true
 
@@ -147,7 +147,7 @@ jobs:
 
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         if: inputs.build-environment != 'linux-s390x-binary-manywheel'
         with:
           docker-image-name: ${{ inputs.docker-image }}
@@ -163,7 +163,7 @@ jobs:
           echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
 
       - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         if: inputs.build-environment != 'linux-s390x-binary-manywheel'
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
@@ -175,7 +175,7 @@ jobs:
 
       - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
         id: install-nvidia-driver
-        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
+        uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.11
         with:
           driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }}
         if: ${{ !contains(matrix.runner, 'b200') }}
@@ -512,7 +512,7 @@ jobs:
           aws-region: us-east-1
 
       - name: Upload the benchmark results
-        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
+        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.11
         if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.check-tpu.outputs.has_tpu != 'true'
         with:
           benchmark-results-dir: test/test-reports
@@ -570,7 +570,7 @@ jobs:
           workflow_attempt: ${{github.run_attempt}}
 
       - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11
         if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'
 
       - name: Cleanup docker
diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml
index 4fd7874ee0c4d..351ab9376a416 100644
--- a/.github/workflows/_mac-build.yml
+++ b/.github/workflows/_mac-build.yml
@@ -71,11 +71,11 @@ jobs:
       build-environment: ${{ inputs.build-environment }}
     steps:
       - name: Clean up disk space before running MacOS workflow
-        uses: pytorch/test-infra/.github/actions/check-disk-space@main
+        uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.11
 
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
 
       - name: Set xcode version
         env:
@@ -86,7 +86,7 @@ jobs:
           fi
 
       - name: Setup Python
-        uses: pytorch/test-infra/.github/actions/setup-python@main
+        uses: pytorch/test-infra/.github/actions/setup-python@release/2.11
         with:
           python-version: ${{ inputs.python-version }}
           pip-requirements-file: .ci/docker/requirements-ci.txt
@@ -192,4 +192,4 @@ jobs:
       - name: Clean up disk space
         if: always()
         continue-on-error: true
-        uses: pytorch/test-infra/.github/actions/check-disk-space@main
+        uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.11
diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml
index 82eb3c4bf2c75..67a7320e08edc 100644
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@@ -105,11 +105,11 @@ jobs:
           done
 
       - name: Clean up disk space before running MacOS workflow
-        uses: pytorch/test-infra/.github/actions/check-disk-space@main
+        uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.11
 
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
 
       - name: Get workflow job id
         id: get-job-id
@@ -119,7 +119,7 @@ jobs:
           github-token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Setup Python
-        uses: pytorch/test-infra/.github/actions/setup-python@main
+        uses: pytorch/test-infra/.github/actions/setup-python@release/2.11
         with:
           python-version: ${{ inputs.python-version }}
           pip-requirements-file: .ci/docker/requirements-ci.txt
@@ -257,7 +257,7 @@ jobs:
           file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
 
       - name: Upload the benchmark results
-        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
+        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.11
         with:
           benchmark-results-dir: test/test-reports
           dry-run: false
@@ -287,4 +287,4 @@ jobs:
       - name: Clean up disk space
         if: always()
         continue-on-error: true
-        uses: pytorch/test-infra/.github/actions/check-disk-space@main
+        uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.11
diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml
index 38f3bff66c14e..f07fb9ee8b71e 100644
--- a/.github/workflows/_rocm-test.yml
+++ b/.github/workflows/_rocm-test.yml
@@ -85,7 +85,7 @@ jobs:
     timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           no-sudo: true
 
@@ -104,12 +104,12 @@ jobs:
 
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-image-name: ${{ inputs.docker-image }}
 
       - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
 
@@ -332,7 +332,7 @@ jobs:
           aws-region: us-east-1
 
       - name: Upload the benchmark results
-        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
+        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.11
         with:
           benchmark-results-dir: test/test-reports
           dry-run: false
diff --git a/.github/workflows/_runner-determinator.yml b/.github/workflows/_runner-determinator.yml
index 0d674f044ec42..b127c82266561 100644
--- a/.github/workflows/_runner-determinator.yml
+++ b/.github/workflows/_runner-determinator.yml
@@ -59,7 +59,7 @@ jobs:
       PR_NUMBER: ${{ github.event.pull_request.number }}
     steps:
       # - name: Checkout PyTorch
-      #   uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+      #   uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
       #   with:
       #     fetch-depth: 1
       #     submodules: true
diff --git a/.github/workflows/_vllm-benchmark.yml b/.github/workflows/_vllm-benchmark.yml
index d25c3c80767b1..d5aa61a6341c7 100644
--- a/.github/workflows/_vllm-benchmark.yml
+++ b/.github/workflows/_vllm-benchmark.yml
@@ -84,7 +84,7 @@ jobs:
           name: ${{ inputs.build_environment }}
           s3-bucket: gha-artifacts
 
-      - uses: pytorch/test-infra/.github/actions/setup-uv@main
+      - uses: pytorch/test-infra/.github/actions/setup-uv@release/2.11
         with:
           python-version: "3.12"
           activate-environment: "true"
@@ -219,7 +219,7 @@ jobs:
           aws-region: us-east-1
 
       - name: Upload the benchmark results to OSS benchmark database for the dashboard
-        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
+        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.11
         with:
           benchmark-results-dir: vllm-project/vllm/benchmarks/results
           benchmark-name: 'PyTorch x vLLM benchmark'
diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml
index 005d68ece857d..034a308054361 100644
--- a/.github/workflows/_win-build.yml
+++ b/.github/workflows/_win-build.yml
@@ -89,7 +89,7 @@ jobs:
           git config --global core.fsmonitor false
 
       - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
           instructions: |
@@ -104,7 +104,7 @@ jobs:
 
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           no-sudo: true
 
diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml
index 3d2fe8a4b3fac..59bfc6a25e4ca 100644
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@@ -78,7 +78,7 @@ jobs:
           git config --global core.fsmonitor false
 
       - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
           instructions: |
@@ -94,7 +94,7 @@ jobs:
 
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           no-sudo: true
 
diff --git a/.github/workflows/_xpu-test.yml b/.github/workflows/_xpu-test.yml
index 5724403e6de44..75c033789a313 100644
--- a/.github/workflows/_xpu-test.yml
+++ b/.github/workflows/_xpu-test.yml
@@ -86,7 +86,7 @@ jobs:
     steps:
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
 
       - name: Setup XPU
         uses: ./.github/actions/setup-xpu
@@ -96,7 +96,7 @@ jobs:
 
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-image-name: ${{ inputs.docker-image }}
 
@@ -110,7 +110,7 @@ jobs:
           echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
 
       - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
 
@@ -345,7 +345,7 @@ jobs:
           aws-region: us-east-1
 
       - name: Upload the benchmark results
-        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
+        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.11
         with:
           benchmark-results-dir: test/test-reports
           dry-run: false
diff --git a/.github/workflows/b200-distributed.yml b/.github/workflows/b200-distributed.yml
index e52c7a4b5f5c5..9ba6839858027 100644
--- a/.github/workflows/b200-distributed.yml
+++ b/.github/workflows/b200-distributed.yml
@@ -24,7 +24,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
diff --git a/.github/workflows/b200-symm-mem.yml b/.github/workflows/b200-symm-mem.yml
index 62367b61b07b9..6ca86affacc0f 100644
--- a/.github/workflows/b200-symm-mem.yml
+++ b/.github/workflows/b200-symm-mem.yml
@@ -24,7 +24,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
diff --git a/.github/workflows/build-almalinux-images.yml b/.github/workflows/build-almalinux-images.yml
index 9090fba0a9773..f0ba752891cff 100644
--- a/.github/workflows/build-almalinux-images.yml
+++ b/.github/workflows/build-almalinux-images.yml
@@ -39,7 +39,7 @@ jobs:
         tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm7.0", "rocm7.1", "rocm7.2", "cpu"]
     steps:
       - name: Build docker image
-        uses: pytorch/pytorch/.github/actions/binary-docker-build@main
+        uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.11
         with:
           docker-image-name: almalinux-builder
           custom-tag-prefix: ${{matrix.tag}}
diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml
index 47bf15e1db3ab..bdc81c0fc4a3f 100644
--- a/.github/workflows/build-libtorch-images.yml
+++ b/.github/workflows/build-libtorch-images.yml
@@ -32,7 +32,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@@ -59,7 +59,7 @@ jobs:
         ]
     steps:
       - name: Build docker image
-        uses: pytorch/pytorch/.github/actions/binary-docker-build@main
+        uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.11
         with:
           docker-image-name: libtorch-cxx11-builder
           custom-tag-prefix: ${{ matrix.tag }}
diff --git a/.github/workflows/build-manywheel-images-s390x.yml b/.github/workflows/build-manywheel-images-s390x.yml
index c498e169f1aa5..f13d0be04c81b 100644
--- a/.github/workflows/build-manywheel-images-s390x.yml
+++ b/.github/workflows/build-manywheel-images-s390x.yml
@@ -25,7 +25,7 @@ jobs:
     runs-on: linux.s390x
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           submodules: false
           no-sudo: true
diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml
index f86cefd7c7a1a..beabc976f30ad 100644
--- a/.github/workflows/build-manywheel-images.yml
+++ b/.github/workflows/build-manywheel-images.yml
@@ -32,7 +32,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@@ -65,7 +65,7 @@ jobs:
     name: ${{ matrix.name }}:${{ matrix.tag }}
     steps:
       - name: Build docker image
-        uses: pytorch/pytorch/.github/actions/binary-docker-build@main
+        uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.11
         with:
           docker-image-name: ${{ matrix.name }}
           custom-tag-prefix: ${{ matrix.tag }}
diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml
index 263745ff0fe23..60175432e1557 100644
--- a/.github/workflows/build-triton-wheel.yml
+++ b/.github/workflows/build-triton-wheel.yml
@@ -3,7 +3,7 @@ name: Build Triton wheels
 on:
   push:
     branches:
-      - main
+      - release/2.11
     tags:
       # NOTE: Binary build pipelines should only get triggered on release candidate builds
       # Release candidate tags look like: v1.11.0-rc1
@@ -36,7 +36,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@@ -74,12 +74,12 @@ jobs:
       PLATFORM: 'manylinux_2_28_x86_64'
     steps:
       - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           submodules: false
 
@@ -90,7 +90,7 @@ jobs:
         uses: ./.github/actions/ecr-login
 
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ env.DOCKER_IMAGE }}
 
@@ -179,7 +179,7 @@ jobs:
           path: ${{ runner.temp }}/artifacts/wheelhouse/*
 
       - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11
         if: always()
 
   build-wheel-win:
@@ -212,7 +212,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/build-vllm-wheel.yml b/.github/workflows/build-vllm-wheel.yml
index 1ef0684688218..9fea4a06c60a2 100644
--- a/.github/workflows/build-vllm-wheel.yml
+++ b/.github/workflows/build-vllm-wheel.yml
@@ -65,12 +65,12 @@ jobs:
       BUILD_DEVICE: ${{ matrix.device }}
     steps:
       - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           submodules: false
 
@@ -167,7 +167,7 @@ jobs:
           path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl
 
       - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11
         if: always()
 
   # Copied from build-triton-wheel workflow (mostly)
diff --git a/.github/workflows/claude-code.yml b/.github/workflows/claude-code.yml
index 6ba4f6c26165e..06b6c5a63fb7a 100644
--- a/.github/workflows/claude-code.yml
+++ b/.github/workflows/claude-code.yml
@@ -75,4 +75,4 @@ jobs:
 
       - name: Upload usage metrics
         if: always()
-        uses: pytorch/test-infra/.github/actions/upload-claude-usage@main
+        uses: pytorch/test-infra/.github/actions/upload-claude-usage@release/2.11
diff --git a/.github/workflows/claude-issue-triage-run.yml b/.github/workflows/claude-issue-triage-run.yml
index 6d63695b97ecf..655ede7b4ee89 100644
--- a/.github/workflows/claude-issue-triage-run.yml
+++ b/.github/workflows/claude-issue-triage-run.yml
@@ -106,4 +106,4 @@ jobs:
           fi
 
       - name: Upload usage metrics
-        uses: pytorch/test-infra/.github/actions/upload-claude-usage@main
+        uses: pytorch/test-infra/.github/actions/upload-claude-usage@release/2.11
diff --git a/.github/workflows/close-nonexistent-disable-issues.yml b/.github/workflows/close-nonexistent-disable-issues.yml
index bef3d8797149c..256cd8e9ec0d0 100644
--- a/.github/workflows/close-nonexistent-disable-issues.yml
+++ b/.github/workflows/close-nonexistent-disable-issues.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           submodules: false
           fetch-depth: 1
diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml
index d5e0d96fe19f2..4932631f2d2eb 100644
--- a/.github/workflows/create_release.yml
+++ b/.github/workflows/create_release.yml
@@ -19,7 +19,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index c4c989af980e6..a0df8bccc8df9 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -33,7 +33,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@@ -106,7 +106,7 @@ jobs:
       # [see note: pytorch repo ref]
       # deep clone (fetch-depth 0) required for git merge-base
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
 
       - name: Setup Linux
         uses: ./.github/actions/setup-linux
@@ -116,14 +116,14 @@ jobs:
 
       - name: Build docker image
         id: build-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-image-name: ci-image:${{ matrix.docker-image-name }}
           always-rebuild: true
           push: true
 
       - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.build-docker-image.outputs.docker-image }}
 
@@ -170,5 +170,5 @@ jobs:
         if: always()
 
       - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11
         if: always()
diff --git a/.github/workflows/docker-cache-rocm.yml b/.github/workflows/docker-cache-rocm.yml
index 2c2a9ba16647f..c53969b40e70c 100644
--- a/.github/workflows/docker-cache-rocm.yml
+++ b/.github/workflows/docker-cache-rocm.yml
@@ -71,7 +71,7 @@ jobs:
           echo "Outputs of download-docker-builds-artifacts job: ${JSON_STRINGIFIED}"
 
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           no-sudo: true
 
@@ -86,7 +86,7 @@ jobs:
             echo "ghcr_image=${ghcr_image}" >> "$GITHUB_OUTPUT"
 
       - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.ghcr-io-tag.outputs.ghcr_image }}
 
diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml
index 577a8acb5203f..38db8612698f8 100644
--- a/.github/workflows/docker-release.yml
+++ b/.github/workflows/docker-release.yml
@@ -38,7 +38,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@@ -53,7 +53,7 @@ jobs:
       matrix: ${{ steps.generate-matrix.outputs.matrix }}
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           fetch-depth: 1
           submodules: true
@@ -83,7 +83,7 @@ jobs:
       CUDNN_VERSION: ${{ matrix.cudnn_version }}
     steps:
       - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
       # [see note: pytorch repo ref]
@@ -169,13 +169,13 @@ jobs:
           fi
 
       - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11
         if: always()
 
   validate:
     needs: build
     if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v')) }}
-    uses: pytorch/test-infra/.github/workflows/validate-docker-images.yml@main
+    uses: pytorch/test-infra/.github/workflows/validate-docker-images.yml@release/2.11
     with:
       channel: nightly
       ref: main
diff --git a/.github/workflows/dynamo-unittest.yml b/.github/workflows/dynamo-unittest.yml
index f7eea350b5644..ac1b8684de9ba 100644
--- a/.github/workflows/dynamo-unittest.yml
+++ b/.github/workflows/dynamo-unittest.yml
@@ -21,7 +21,7 @@ permissions:
 jobs:
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
index e4a8e5e96a88a..0dc6a42e77d24 100644
--- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
@@ -41,7 +41,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
diff --git a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml
index 771681dc123d0..db8ed62b924ef 100644
--- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml
@@ -41,7 +41,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@@ -446,7 +446,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -468,7 +467,7 @@ jobs:
           role-duration-seconds: 18000
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: libtorch-cxx11-builder
@@ -476,7 +475,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -565,7 +564,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -587,7 +585,7 @@ jobs:
           role-duration-seconds: 18000
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: libtorch-cxx11-builder
@@ -595,7 +593,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
index 4e49ac2223ada..fcd006886abed 100644
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@@ -41,7 +41,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@@ -433,7 +433,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -455,7 +454,7 @@ jobs:
           role-duration-seconds: 18000
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -463,7 +462,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -549,7 +548,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -571,7 +569,7 @@ jobs:
           role-duration-seconds: 18000
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -579,7 +577,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -654,9 +652,9 @@ jobs:
       contents: read
     steps:
       - name: Setup XPU
-        uses: pytorch/pytorch/.github/actions/setup-xpu@main
+        uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.11
       - name: Login to ECR
-        uses: pytorch/pytorch/.github/actions/ecr-login@main
+        uses: pytorch/pytorch/.github/actions/ecr-login@release/2.11
       - uses: actions/download-artifact@v4.1.7
         name: Download Build Artifacts
         with:
@@ -665,7 +663,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -676,7 +673,7 @@ jobs:
         working-directory: pytorch
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -684,7 +681,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -1100,7 +1097,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -1122,7 +1118,7 @@ jobs:
           role-duration-seconds: 18000
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -1130,7 +1126,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -1216,7 +1212,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -1238,7 +1233,7 @@ jobs:
           role-duration-seconds: 18000
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -1246,7 +1241,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -1321,9 +1316,9 @@ jobs:
       contents: read
     steps:
       - name: Setup XPU
-        uses: pytorch/pytorch/.github/actions/setup-xpu@main
+        uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.11
       - name: Login to ECR
-        uses: pytorch/pytorch/.github/actions/ecr-login@main
+        uses: pytorch/pytorch/.github/actions/ecr-login@release/2.11
       - uses: actions/download-artifact@v4.1.7
         name: Download Build Artifacts
         with:
@@ -1332,7 +1327,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -1343,7 +1337,7 @@ jobs:
         working-directory: pytorch
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -1351,7 +1345,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -1767,7 +1761,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -1789,7 +1782,7 @@ jobs:
           role-duration-seconds: 18000
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -1797,7 +1790,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -1883,7 +1876,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -1905,7 +1897,7 @@ jobs:
           role-duration-seconds: 18000
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -1913,7 +1905,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -1988,9 +1980,9 @@ jobs:
       contents: read
     steps:
       - name: Setup XPU
-        uses: pytorch/pytorch/.github/actions/setup-xpu@main
+        uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.11
       - name: Login to ECR
-        uses: pytorch/pytorch/.github/actions/ecr-login@main
+        uses: pytorch/pytorch/.github/actions/ecr-login@release/2.11
       - uses: actions/download-artifact@v4.1.7
         name: Download Build Artifacts
         with:
@@ -1999,7 +1991,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -2010,7 +2001,7 @@ jobs:
         working-directory: pytorch
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -2018,7 +2009,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -2434,7 +2425,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -2456,7 +2446,7 @@ jobs:
           role-duration-seconds: 18000
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -2464,7 +2454,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -2550,7 +2540,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -2572,7 +2561,7 @@ jobs:
           role-duration-seconds: 18000
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -2580,7 +2569,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -2655,9 +2644,9 @@ jobs:
       contents: read
     steps:
       - name: Setup XPU
-        uses: pytorch/pytorch/.github/actions/setup-xpu@main
+        uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.11
       - name: Login to ECR
-        uses: pytorch/pytorch/.github/actions/ecr-login@main
+        uses: pytorch/pytorch/.github/actions/ecr-login@release/2.11
       - uses: actions/download-artifact@v4.1.7
         name: Download Build Artifacts
         with:
@@ -2666,7 +2655,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -2677,7 +2665,7 @@ jobs:
         working-directory: pytorch
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -2685,7 +2673,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -3101,7 +3089,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -3123,7 +3110,7 @@ jobs:
           role-duration-seconds: 18000
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -3131,7 +3118,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -3217,7 +3204,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -3239,7 +3225,7 @@ jobs:
           role-duration-seconds: 18000
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -3247,7 +3233,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -3322,9 +3308,9 @@ jobs:
       contents: read
     steps:
       - name: Setup XPU
-        uses: pytorch/pytorch/.github/actions/setup-xpu@main
+        uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.11
       - name: Login to ECR
-        uses: pytorch/pytorch/.github/actions/ecr-login@main
+        uses: pytorch/pytorch/.github/actions/ecr-login@release/2.11
       - uses: actions/download-artifact@v4.1.7
         name: Download Build Artifacts
         with:
@@ -3333,7 +3319,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -3344,7 +3329,7 @@ jobs:
         working-directory: pytorch
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -3352,7 +3337,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -3768,7 +3753,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -3790,7 +3774,7 @@ jobs:
           role-duration-seconds: 18000
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -3798,7 +3782,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -3884,7 +3868,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -3906,7 +3889,7 @@ jobs:
           role-duration-seconds: 18000
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -3914,7 +3897,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -3989,9 +3972,9 @@ jobs:
       contents: read
     steps:
       - name: Setup XPU
-        uses: pytorch/pytorch/.github/actions/setup-xpu@main
+        uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.11
       - name: Login to ECR
-        uses: pytorch/pytorch/.github/actions/ecr-login@main
+        uses: pytorch/pytorch/.github/actions/ecr-login@release/2.11
       - uses: actions/download-artifact@v4.1.7
         name: Download Build Artifacts
         with:
@@ -4000,7 +3983,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -4011,7 +3993,7 @@ jobs:
         working-directory: pytorch
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -4019,7 +4001,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -4435,7 +4417,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -4457,7 +4438,7 @@ jobs:
           role-duration-seconds: 18000
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -4465,7 +4446,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -4551,7 +4532,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -4573,7 +4553,7 @@ jobs:
           role-duration-seconds: 18000
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -4581,7 +4561,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
@@ -4656,9 +4636,9 @@ jobs:
       contents: read
     steps:
       - name: Setup XPU
-        uses: pytorch/pytorch/.github/actions/setup-xpu@main
+        uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.11
       - name: Login to ECR
-        uses: pytorch/pytorch/.github/actions/ecr-login@main
+        uses: pytorch/pytorch/.github/actions/ecr-login@release/2.11
       - uses: actions/download-artifact@v4.1.7
         name: Download Build Artifacts
         with:
@@ -4667,7 +4647,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -4678,7 +4657,7 @@ jobs:
         working-directory: pytorch
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
           docker-image-name: manylinux2_28-builder
@@ -4686,7 +4665,7 @@ jobs:
           docker-build-dir: .ci/docker
           working-directory: pytorch
       - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
       - name: Test Pytorch binary
diff --git a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml
index 2314dfa31db47..9c31484e78781 100644
--- a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml
@@ -42,7 +42,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
diff --git a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml
index 2d47b83e5822e..ed793bbc8a1f7 100644
--- a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml
+++ b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml
@@ -69,7 +69,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
index 70d210c7eabb6..4814baa851180 100644
--- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
@@ -65,7 +65,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -175,7 +174,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -285,7 +283,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -395,7 +392,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -505,7 +501,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -615,7 +610,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -725,7 +719,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
diff --git a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml
index 7c26dbc3b9eea..1f607424c2a23 100644
--- a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml
+++ b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml
@@ -41,7 +41,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
diff --git a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml
index 5e30b66183840..49dab1a56f13f 100644
--- a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml
+++ b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml
@@ -41,7 +41,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
diff --git a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml
index 1368bc942350e..3a56869f3343b 100644
--- a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml
@@ -41,7 +41,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml
index 3ca3364e5de88..92b4185acdae2 100644
--- a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml
+++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml
@@ -35,7 +35,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@@ -84,7 +84,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -116,7 +116,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -190,7 +189,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -222,7 +221,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -332,7 +330,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -364,7 +362,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -439,7 +436,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -471,7 +468,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -582,7 +578,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -614,7 +610,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -689,7 +684,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -721,7 +716,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -832,7 +826,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -864,7 +858,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -939,7 +932,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -971,7 +964,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
diff --git a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml
index c6d1e2cf3b017..16075a8568e35 100644
--- a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml
+++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml
@@ -35,7 +35,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@@ -84,7 +84,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -116,7 +116,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -190,7 +189,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -222,7 +221,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -332,7 +330,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -364,7 +362,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -439,7 +436,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -471,7 +468,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -582,7 +578,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -614,7 +610,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -689,7 +684,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -721,7 +716,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -832,7 +826,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -864,7 +858,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -939,7 +932,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -971,7 +964,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
diff --git a/.github/workflows/generated-windows-binary-wheel-nightly.yml b/.github/workflows/generated-windows-binary-wheel-nightly.yml
index e23118631d3a9..8a322048744cd 100644
--- a/.github/workflows/generated-windows-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml
@@ -35,7 +35,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@@ -80,7 +80,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -112,7 +112,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -182,7 +181,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -214,7 +213,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -316,7 +314,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -348,7 +346,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -419,7 +416,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -451,7 +448,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -554,7 +550,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -586,7 +582,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -657,7 +652,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -689,7 +684,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -792,7 +786,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -824,7 +818,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -895,7 +888,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -927,7 +920,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -1030,7 +1022,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -1062,7 +1054,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -1132,7 +1123,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -1164,7 +1155,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -1265,7 +1255,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -1297,7 +1287,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -1367,7 +1356,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -1399,7 +1388,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -1501,7 +1489,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -1533,7 +1521,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -1604,7 +1591,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -1636,7 +1623,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -1739,7 +1725,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -1771,7 +1757,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -1842,7 +1827,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -1874,7 +1859,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -1977,7 +1961,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -2009,7 +1993,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -2080,7 +2063,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -2112,7 +2095,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -2215,7 +2197,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -2247,7 +2229,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -2317,7 +2298,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -2349,7 +2330,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -2450,7 +2430,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -2482,7 +2462,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -2552,7 +2531,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -2584,7 +2563,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -2686,7 +2664,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -2718,7 +2696,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -2789,7 +2766,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -2821,7 +2798,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -2924,7 +2900,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -2956,7 +2932,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -3027,7 +3002,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -3059,7 +3034,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -3162,7 +3136,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -3194,7 +3168,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -3265,7 +3238,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -3297,7 +3270,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -3400,7 +3372,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -3432,7 +3404,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -3502,7 +3473,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -3534,7 +3505,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -3635,7 +3605,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -3667,7 +3637,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -3737,7 +3706,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -3769,7 +3738,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -3871,7 +3839,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -3903,7 +3871,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -3974,7 +3941,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -4006,7 +3973,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -4109,7 +4075,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -4141,7 +4107,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -4212,7 +4177,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -4244,7 +4209,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -4347,7 +4311,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -4379,7 +4343,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -4450,7 +4413,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -4482,7 +4445,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -4585,7 +4547,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -4617,7 +4579,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -4687,7 +4648,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -4719,7 +4680,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -4820,7 +4780,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -4852,7 +4812,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -4922,7 +4881,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -4954,7 +4913,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -5056,7 +5014,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -5088,7 +5046,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -5159,7 +5116,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -5191,7 +5148,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -5294,7 +5250,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -5326,7 +5282,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -5397,7 +5352,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -5429,7 +5384,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -5532,7 +5486,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -5564,7 +5518,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -5635,7 +5588,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -5667,7 +5620,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -5770,7 +5722,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -5802,7 +5754,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -5872,7 +5823,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -5904,7 +5855,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -6005,7 +5955,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -6037,7 +5987,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -6107,7 +6056,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -6139,7 +6088,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -6241,7 +6189,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -6273,7 +6221,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -6344,7 +6291,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -6376,7 +6323,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -6479,7 +6425,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -6511,7 +6457,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -6582,7 +6527,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -6614,7 +6559,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -6717,7 +6661,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -6749,7 +6693,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -6820,7 +6763,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -6852,7 +6795,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -6955,7 +6897,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -6987,7 +6929,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -7057,7 +6998,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -7089,7 +7030,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -7190,7 +7130,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -7222,7 +7162,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -7292,7 +7231,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -7324,7 +7263,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -7426,7 +7364,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -7458,7 +7396,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -7529,7 +7466,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -7561,7 +7498,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -7664,7 +7600,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -7696,7 +7632,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -7767,7 +7702,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -7799,7 +7734,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -7902,7 +7836,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -7934,7 +7868,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -8005,7 +7938,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -8037,7 +7970,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -8140,7 +8072,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -8172,7 +8104,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
@@ -8242,7 +8173,7 @@ jobs:
           echo "instance-type: $(get_ec2_metadata instance-type)"
           echo "system info $(uname -a)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.11
         continue-on-error: true
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
@@ -8274,7 +8205,6 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           show-progress: false
diff --git a/.github/workflows/h100-cutlass-backend.yml b/.github/workflows/h100-cutlass-backend.yml
index e5406f7600133..9f770549f40a3 100644
--- a/.github/workflows/h100-cutlass-backend.yml
+++ b/.github/workflows/h100-cutlass-backend.yml
@@ -27,7 +27,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
diff --git a/.github/workflows/h100-distributed.yml b/.github/workflows/h100-distributed.yml
index 0e5370a51c160..e566df60728e6 100644
--- a/.github/workflows/h100-distributed.yml
+++ b/.github/workflows/h100-distributed.yml
@@ -24,7 +24,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
diff --git a/.github/workflows/h100-symm-mem.yml b/.github/workflows/h100-symm-mem.yml
index 09c362a546024..a9f8d3ff31270 100644
--- a/.github/workflows/h100-symm-mem.yml
+++ b/.github/workflows/h100-symm-mem.yml
@@ -24,7 +24,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
diff --git a/.github/workflows/inductor-micro-benchmark.yml b/.github/workflows/inductor-micro-benchmark.yml
index 5813aa28365e7..35a1a4ef972a5 100644
--- a/.github/workflows/inductor-micro-benchmark.yml
+++ b/.github/workflows/inductor-micro-benchmark.yml
@@ -20,7 +20,7 @@ permissions:
 jobs:
   get-default-label-prefix:
     name: get-default-label-prefix
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/inductor-nightly.yml b/.github/workflows/inductor-nightly.yml
index 4258e8fdb0c84..0e8ef5e9ea2bc 100644
--- a/.github/workflows/inductor-nightly.yml
+++ b/.github/workflows/inductor-nightly.yml
@@ -23,7 +23,7 @@ permissions:
 jobs:
   get-default-label-prefix:
     name: get-default-label-prefix
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/inductor-pallas.yml b/.github/workflows/inductor-pallas.yml
index 8676434d0e580..e0bb5731ec4b9 100644
--- a/.github/workflows/inductor-pallas.yml
+++ b/.github/workflows/inductor-pallas.yml
@@ -20,7 +20,7 @@ permissions:
 jobs:
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/inductor-perf-compare.yml b/.github/workflows/inductor-perf-compare.yml
index 17b265500d47e..6235d02970849 100644
--- a/.github/workflows/inductor-perf-compare.yml
+++ b/.github/workflows/inductor-perf-compare.yml
@@ -18,7 +18,7 @@ jobs:
   get-default-label-prefix:
     if: github.repository_owner == 'pytorch'
     name: get-default-label-prefix
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
diff --git a/.github/workflows/inductor-perf-test-b200.yml b/.github/workflows/inductor-perf-test-b200.yml
index 07c7a0cf987d7..003f27476bcb9 100644
--- a/.github/workflows/inductor-perf-test-b200.yml
+++ b/.github/workflows/inductor-perf-test-b200.yml
@@ -68,7 +68,7 @@ permissions:
 jobs:
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/inductor-perf-test-nightly-aarch64.yml b/.github/workflows/inductor-perf-test-nightly-aarch64.yml
index f7b3517dccc06..855e87ce6ca5d 100644
--- a/.github/workflows/inductor-perf-test-nightly-aarch64.yml
+++ b/.github/workflows/inductor-perf-test-nightly-aarch64.yml
@@ -55,7 +55,7 @@ permissions:
 jobs:
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/inductor-perf-test-nightly-h100.yml b/.github/workflows/inductor-perf-test-nightly-h100.yml
index 155b995e3c8a2..a929475355888 100644
--- a/.github/workflows/inductor-perf-test-nightly-h100.yml
+++ b/.github/workflows/inductor-perf-test-nightly-h100.yml
@@ -73,7 +73,7 @@ permissions:
 jobs:
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/inductor-perf-test-nightly-rocm-mi300.yml b/.github/workflows/inductor-perf-test-nightly-rocm-mi300.yml
index c556c6b455783..4f4c8461cf994 100644
--- a/.github/workflows/inductor-perf-test-nightly-rocm-mi300.yml
+++ b/.github/workflows/inductor-perf-test-nightly-rocm-mi300.yml
@@ -68,7 +68,7 @@ permissions: read-all
 jobs:
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/inductor-perf-test-nightly-rocm-mi355.yml b/.github/workflows/inductor-perf-test-nightly-rocm-mi355.yml
index e6fd83193202c..22ef17f000455 100644
--- a/.github/workflows/inductor-perf-test-nightly-rocm-mi355.yml
+++ b/.github/workflows/inductor-perf-test-nightly-rocm-mi355.yml
@@ -68,7 +68,7 @@ permissions: read-all
 jobs:
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml
index eee51b7ff8889..9d43549fae06a 100644
--- a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml
+++ b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml
@@ -65,7 +65,7 @@ permissions:
 jobs:
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/inductor-perf-test-nightly-x86.yml b/.github/workflows/inductor-perf-test-nightly-x86.yml
index 87875831e2a0b..7239952de60f2 100644
--- a/.github/workflows/inductor-perf-test-nightly-x86.yml
+++ b/.github/workflows/inductor-perf-test-nightly-x86.yml
@@ -65,7 +65,7 @@ permissions:
 jobs:
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/inductor-perf-test-nightly-xpu.yml b/.github/workflows/inductor-perf-test-nightly-xpu.yml
index b51795c663957..981720537f3f0 100644
--- a/.github/workflows/inductor-perf-test-nightly-xpu.yml
+++ b/.github/workflows/inductor-perf-test-nightly-xpu.yml
@@ -68,7 +68,7 @@ permissions: read-all
 jobs:
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/inductor-perf-test-nightly.yml b/.github/workflows/inductor-perf-test-nightly.yml
index 2a8e29278b8bc..6539a81f7c196 100644
--- a/.github/workflows/inductor-perf-test-nightly.yml
+++ b/.github/workflows/inductor-perf-test-nightly.yml
@@ -68,7 +68,7 @@ permissions:
 jobs:
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/inductor-periodic.yml b/.github/workflows/inductor-periodic.yml
index 18d7c7189f38e..1e87adc965c74 100644
--- a/.github/workflows/inductor-periodic.yml
+++ b/.github/workflows/inductor-periodic.yml
@@ -22,7 +22,7 @@ permissions:
 jobs:
   get-default-label-prefix:
     name: get-default-label-prefix
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/inductor-rocm-mi200.yml b/.github/workflows/inductor-rocm-mi200.yml
index 1698eb4fc85fb..e9218f3acaa86 100644
--- a/.github/workflows/inductor-rocm-mi200.yml
+++ b/.github/workflows/inductor-rocm-mi200.yml
@@ -21,7 +21,7 @@ permissions:
 jobs:
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/inductor-rocm-mi300.yml b/.github/workflows/inductor-rocm-mi300.yml
index 633386aba487b..5828b299590c7 100644
--- a/.github/workflows/inductor-rocm-mi300.yml
+++ b/.github/workflows/inductor-rocm-mi300.yml
@@ -29,7 +29,7 @@ jobs:
 
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/inductor-rocm-mi355.yml b/.github/workflows/inductor-rocm-mi355.yml
index 70ea41a6da698..396e1f8e65ae0 100644
--- a/.github/workflows/inductor-rocm-mi355.yml
+++ b/.github/workflows/inductor-rocm-mi355.yml
@@ -28,7 +28,7 @@ jobs:
 
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/inductor-unittest.yml b/.github/workflows/inductor-unittest.yml
index b5cea8ceb5265..ea6ce55dbd470 100644
--- a/.github/workflows/inductor-unittest.yml
+++ b/.github/workflows/inductor-unittest.yml
@@ -22,7 +22,7 @@ permissions:
 jobs:
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml
index acfa832c88ebb..3736415f11b74 100644
--- a/.github/workflows/inductor.yml
+++ b/.github/workflows/inductor.yml
@@ -35,7 +35,7 @@ jobs:
 
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/lint-autoformat.yml b/.github/workflows/lint-autoformat.yml
index b962970dc5b78..66acb3eab1f89 100644
--- a/.github/workflows/lint-autoformat.yml
+++ b/.github/workflows/lint-autoformat.yml
@@ -13,7 +13,7 @@ jobs:
     if: ${{ github.repository_owner == 'pytorch' && contains(github.event.pull_request.labels.*.name, 'autoformat') }}
     steps:
       - name: Checkout pytorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           submodules: true
           fetch-depth: 0
diff --git a/.github/workflows/lint-bc.yml b/.github/workflows/lint-bc.yml
index e0de9ede35084..43fac2dc9584d 100644
--- a/.github/workflows/lint-bc.yml
+++ b/.github/workflows/lint-bc.yml
@@ -20,7 +20,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Run BC Lint Action
-        uses: pytorch/test-infra/.github/actions/bc-lint@main
+        uses: pytorch/test-infra/.github/actions/bc-lint@release/2.11
         with:
           repo: ${{ github.event.pull_request.head.repo.full_name }}
           base_sha: ${{ github.event.pull_request.base.sha }}
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index e5d998e100a94..ec215e92e91f3 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -22,7 +22,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@@ -36,7 +36,7 @@ jobs:
       all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') || github.event_name == 'push' }}
 
   lintrunner-clang:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11
     # Needed to prevent deduping on HUD
     name: lintrunner-clang-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }}
     needs: [get-label-type, get-changed-files]
@@ -78,7 +78,7 @@ jobs:
   #       fails to find types when it should
   # NOTE: We should be able to disable this and consolidate with Pyrefly
   lintrunner-pyrefly:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11
     name: lintrunner-pyrefly-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }}
     needs: [get-label-type, get-changed-files]
     # Only run if there are changed files relevant to pyrefly
@@ -103,7 +103,7 @@ jobs:
         ADDITIONAL_LINTRUNNER_ARGS="--take PYREFLY --all-files" .github/scripts/lintrunner.sh
 
   lintrunner-noclang:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11
     name: lintrunner-noclang-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }}
     needs: [get-label-type, get-changed-files]
     with:
@@ -125,7 +125,7 @@ jobs:
         fi
 
   quick-checks:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11
     needs: get-label-type
     with:
       timeout: 120
@@ -165,7 +165,7 @@ jobs:
     if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'skip-pr-sanity-checks')
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           submodules: false
           fetch-depth: -1
@@ -178,7 +178,7 @@ jobs:
           bash .github/scripts/pr-sanity-check.sh
 
   workflow-checks:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11
     needs: get-label-type
     with:
       timeout: 120
@@ -189,6 +189,7 @@ jobs:
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         # Regenerate workflows
+        export RELEASE_VERSION_TAG=2.11
         .github/scripts/generate_ci_workflows.py
 
         RC=0
@@ -212,7 +213,7 @@ jobs:
         exit $RC
 
   toc:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11
     needs: get-label-type
     with:
       timeout: 120
@@ -248,7 +249,7 @@ jobs:
   test-tools:
     name: Test tools
     if: ${{ github.repository == 'pytorch/pytorch' }}
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11
     needs: get-label-type
     with:
       timeout: 120
@@ -268,7 +269,7 @@ jobs:
     runs-on: linux.24_04.4x
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           submodules: false
           fetch-depth: 1
@@ -305,7 +306,7 @@ jobs:
       # [see note: pytorch repo ref]
       # deep clone (fetch-depth 0) required, to allow us to use git log
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           submodules: false
           fetch-depth: 1
diff --git a/.github/workflows/linux-aarch64.yml b/.github/workflows/linux-aarch64.yml
index 2c6e6b6dac39c..3636c4e626c27 100644
--- a/.github/workflows/linux-aarch64.yml
+++ b/.github/workflows/linux-aarch64.yml
@@ -19,7 +19,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
diff --git a/.github/workflows/llm_td_retrieval.yml b/.github/workflows/llm_td_retrieval.yml
index 565a9b25df50f..ebe04611f0ecb 100644
--- a/.github/workflows/llm_td_retrieval.yml
+++ b/.github/workflows/llm_td_retrieval.yml
@@ -12,7 +12,7 @@ jobs:
     name: get-label-type
     # Don't run on forked repos
     if: github.repository_owner == 'pytorch'
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@@ -116,5 +116,5 @@ jobs:
           AWS_REGION: ""
 
       - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11
         if: always()
diff --git a/.github/workflows/nightly-s3-uploads.yml b/.github/workflows/nightly-s3-uploads.yml
index acf3504dec9ca..dd22ec672b2b1 100644
--- a/.github/workflows/nightly-s3-uploads.yml
+++ b/.github/workflows/nightly-s3-uploads.yml
@@ -23,7 +23,7 @@ jobs:
     environment: upload-stats
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           fetch-depth: 1
           submodules: false
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index c47b0c5763078..5202627ade876 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -21,7 +21,7 @@ concurrency:
 jobs:
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
@@ -94,7 +94,7 @@ jobs:
     if: github.repository_owner == 'pytorch' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
     steps:
       - name: "${{ matrix.repo-owner }}/${{ matrix.repo-name }} update-commit-hash"
-        uses: pytorch/test-infra/.github/actions/update-commit-hash@main
+        uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.11
         with:
           repo-owner: ${{ matrix.repo-owner }}
           repo-name: ${{ matrix.repo-name }}
diff --git a/.github/workflows/nitpicker.yml b/.github/workflows/nitpicker.yml
index 40bd245ce913f..ce7f51b9bfb29 100644
--- a/.github/workflows/nitpicker.yml
+++ b/.github/workflows/nitpicker.yml
@@ -19,7 +19,7 @@ jobs:
     if: ${{ github.event.pull_request.number != 26921 && github.repository_owner == 'pytorch' }}
     steps:
     - name: Checkout PyTorch
-      uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+      uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
     - uses: ethanis/nitpicker@v1
       with:
         nitpicks: '.github/nitpicks.yml'
diff --git a/.github/workflows/operator_microbenchmark.yml b/.github/workflows/operator_microbenchmark.yml
index 445cdcc4be04a..8fc1f9f319a45 100644
--- a/.github/workflows/operator_microbenchmark.yml
+++ b/.github/workflows/operator_microbenchmark.yml
@@ -21,7 +21,7 @@ permissions:
 jobs:
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/periodic-rocm-mi200.yml b/.github/workflows/periodic-rocm-mi200.yml
index 865d999623cbb..041bb5a1bf18f 100644
--- a/.github/workflows/periodic-rocm-mi200.yml
+++ b/.github/workflows/periodic-rocm-mi200.yml
@@ -37,7 +37,7 @@ jobs:
 
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch'
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/periodic-rocm-mi300.yml b/.github/workflows/periodic-rocm-mi300.yml
index 88da168926444..eaec4ab7aac40 100644
--- a/.github/workflows/periodic-rocm-mi300.yml
+++ b/.github/workflows/periodic-rocm-mi300.yml
@@ -35,7 +35,7 @@ jobs:
 
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch'
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/periodic-rocm-mi355.yml b/.github/workflows/periodic-rocm-mi355.yml
index 9885ffb2d3832..07135377fd44f 100644
--- a/.github/workflows/periodic-rocm-mi355.yml
+++ b/.github/workflows/periodic-rocm-mi355.yml
@@ -35,7 +35,7 @@ jobs:
 
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch'
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
index 16a706a7be6be..70403093e2568 100644
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@@ -43,7 +43,7 @@ jobs:
 
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch'
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 538e163fb84e1..22989263dd22f 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -61,7 +61,7 @@ jobs:
 
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/quantization-periodic.yml b/.github/workflows/quantization-periodic.yml
index 8dd97ff9308db..7ae408a021bf4 100644
--- a/.github/workflows/quantization-periodic.yml
+++ b/.github/workflows/quantization-periodic.yml
@@ -20,7 +20,7 @@ permissions:
 jobs:
   get-default-label-prefix:
     name: get-default-label-prefix
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/rocm-mi200.yml b/.github/workflows/rocm-mi200.yml
index 1bbb538527f13..e9ffdc91a18f5 100644
--- a/.github/workflows/rocm-mi200.yml
+++ b/.github/workflows/rocm-mi200.yml
@@ -28,7 +28,7 @@ jobs:
 
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/rocm-mi300.yml b/.github/workflows/rocm-mi300.yml
index 9c2bae06f32bd..06e43244d7d63 100644
--- a/.github/workflows/rocm-mi300.yml
+++ b/.github/workflows/rocm-mi300.yml
@@ -27,7 +27,7 @@ jobs:
 
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/rocm-mi355.yml b/.github/workflows/rocm-mi355.yml
index 5a77695011f3c..777667dda2372 100644
--- a/.github/workflows/rocm-mi355.yml
+++ b/.github/workflows/rocm-mi355.yml
@@ -25,7 +25,7 @@ jobs:
 
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/rocm-navi31.yml b/.github/workflows/rocm-navi31.yml
index bf1661b35e210..c04d5c2040d4b 100644
--- a/.github/workflows/rocm-navi31.yml
+++ b/.github/workflows/rocm-navi31.yml
@@ -28,7 +28,7 @@ jobs:
 
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/rocm-nightly.yml b/.github/workflows/rocm-nightly.yml
index 649de4ab2f689..9c58062c79b02 100644
--- a/.github/workflows/rocm-nightly.yml
+++ b/.github/workflows/rocm-nightly.yml
@@ -15,7 +15,7 @@ permissions: read-all
 jobs:
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/slow-rocm-mi200.yml b/.github/workflows/slow-rocm-mi200.yml
index 937f04980522e..3bd00e3d193db 100644
--- a/.github/workflows/slow-rocm-mi200.yml
+++ b/.github/workflows/slow-rocm-mi200.yml
@@ -41,7 +41,7 @@ jobs:
 
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml
index 8da9c9bd219d5..c73f6d2d48b22 100644
--- a/.github/workflows/slow.yml
+++ b/.github/workflows/slow.yml
@@ -41,7 +41,7 @@ jobs:
 
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/target-determination-indexer.yml b/.github/workflows/target-determination-indexer.yml
index 3438b1dd5ac57..5f3f3725e7577 100644
--- a/.github/workflows/target-determination-indexer.yml
+++ b/.github/workflows/target-determination-indexer.yml
@@ -13,7 +13,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@@ -38,7 +38,7 @@ jobs:
 
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
           working-directory: pytorch
@@ -53,13 +53,13 @@ jobs:
           echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
 
       - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.11
         with:
           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
 
       - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
         id: install-nvidia-driver
-        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
+        uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.11
 
       - name: Clone CodeLlama
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -152,7 +152,7 @@ jobs:
             "s3://target-determinator-assets/indexes/latest/${ZIP_NAME}"
 
       - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.11
         if: always()
 
 concurrency:
diff --git a/.github/workflows/target_determination.yml b/.github/workflows/target_determination.yml
index c712b11185a76..0c2e4d458c7ae 100644
--- a/.github/workflows/target_determination.yml
+++ b/.github/workflows/target_determination.yml
@@ -9,7 +9,7 @@ jobs:
     name: get-label-type
     # Don't run on forked repos
     if: github.repository_owner == 'pytorch'
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@@ -27,7 +27,7 @@ jobs:
       # checkout because when we run this action we don't *have* a local
       # checkout. In other cases you should prefer a local checkout.
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           submodules: false
 
diff --git a/.github/workflows/test-b200.yml b/.github/workflows/test-b200.yml
index 19dcb07c29844..e800ab6cf1c3d 100644
--- a/.github/workflows/test-b200.yml
+++ b/.github/workflows/test-b200.yml
@@ -41,7 +41,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
diff --git a/.github/workflows/test-check-binary.yml b/.github/workflows/test-check-binary.yml
index 883b2d253aa8f..bd0ac343ee04a 100644
--- a/.github/workflows/test-check-binary.yml
+++ b/.github/workflows/test-check-binary.yml
@@ -15,7 +15,7 @@ jobs:
   check_binary_linux_cpu:
     if: github.repository_owner == 'pytorch'
     name: Test check_binary.sh for Linux CPU
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11
     with:
       docker-image: python:3.11
       docker-build-dir: "skip-docker-build"
@@ -30,7 +30,7 @@ jobs:
   check_binary_linux_cuda:
     if: github.repository_owner == 'pytorch'
     name: Test check_binary.sh for Linux CUDA
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.11
     with:
       runner: linux.g4dn.4xlarge.nvidia.gpu
       docker-image: python:3.11
diff --git a/.github/workflows/test-h100.yml b/.github/workflows/test-h100.yml
index 4351b427b0b8a..7d75675ebcb78 100644
--- a/.github/workflows/test-h100.yml
+++ b/.github/workflows/test-h100.yml
@@ -28,7 +28,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
diff --git a/.github/workflows/tools-unit-tests.yml b/.github/workflows/tools-unit-tests.yml
index 4f87992eb5d72..6559b1852205e 100644
--- a/.github/workflows/tools-unit-tests.yml
+++ b/.github/workflows/tools-unit-tests.yml
@@ -25,7 +25,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout pytorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           submodules: true
           fetch-depth: 0
@@ -52,7 +52,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout pytorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           submodules: true
           fetch-depth: 0
diff --git a/.github/workflows/torchbench.yml b/.github/workflows/torchbench.yml
index 508c39a653600..a84ff38e72471 100644
--- a/.github/workflows/torchbench.yml
+++ b/.github/workflows/torchbench.yml
@@ -18,7 +18,7 @@ jobs:
   get-default-label-prefix:
     if: github.repository_owner == 'pytorch'
     name: get-default-label-prefix
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
diff --git a/.github/workflows/trunk-rocm-sandbox.yml b/.github/workflows/trunk-rocm-sandbox.yml
index aee6a5d87df09..200b071eb6693 100644
--- a/.github/workflows/trunk-rocm-sandbox.yml
+++ b/.github/workflows/trunk-rocm-sandbox.yml
@@ -36,7 +36,7 @@ jobs:
 
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 65bc108c842af..75966ef5e5c4c 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -58,7 +58,7 @@ jobs:
 
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/unstable.yml b/.github/workflows/unstable.yml
index b5955127d9fb3..916a0c2d342c1 100644
--- a/.github/workflows/unstable.yml
+++ b/.github/workflows/unstable.yml
@@ -46,7 +46,7 @@ jobs:
 
   get-label-type:
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
     with:
       triggering_actor: ${{ github.triggering_actor }}
diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml
index 1b4af0f274913..ca00236186c6f 100644
--- a/.github/workflows/update-viablestrict.yml
+++ b/.github/workflows/update-viablestrict.yml
@@ -18,7 +18,7 @@ jobs:
     environment: ${{ (github.event_name == 'schedule') && 'mergebot' || '' }}
     steps:
       - name: Update viable/strict
-        uses: pytorch/test-infra/.github/actions/update-viablestrict@main
+        uses: pytorch/test-infra/.github/actions/update-viablestrict@release/2.11
         id: update_viablestrict
         with:
           repository: pytorch/pytorch
diff --git a/.github/workflows/update_pytorch_labels.yml b/.github/workflows/update_pytorch_labels.yml
index a1b8c38141ae8..4bd09fbbfa5aa 100644
--- a/.github/workflows/update_pytorch_labels.yml
+++ b/.github/workflows/update_pytorch_labels.yml
@@ -17,7 +17,7 @@ jobs:
       contents: read
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           fetch-depth: 1
           submodules: false
diff --git a/.github/workflows/upload-test-stats-while-running.yml b/.github/workflows/upload-test-stats-while-running.yml
index 9aecaad0e068f..e7683c8af5eeb 100644
--- a/.github/workflows/upload-test-stats-while-running.yml
+++ b/.github/workflows/upload-test-stats-while-running.yml
@@ -16,7 +16,7 @@ jobs:
     runs-on: linux.2xlarge
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           fetch-depth: 1
           submodules: false
diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml
index 5dcbfe7fd65fa..a6cc99c1efb1c 100644
--- a/.github/workflows/upload-test-stats.yml
+++ b/.github/workflows/upload-test-stats.yml
@@ -66,7 +66,7 @@ jobs:
         run: echo "${TRIGGERING_WORKFLOW}"
 
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
 
       - name: Configure aws credentials
         uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
diff --git a/.github/workflows/upload-torch-dynamo-perf-stats.yml b/.github/workflows/upload-torch-dynamo-perf-stats.yml
index 07471619437a2..b3221fb0144d0 100644
--- a/.github/workflows/upload-torch-dynamo-perf-stats.yml
+++ b/.github/workflows/upload-torch-dynamo-perf-stats.yml
@@ -32,7 +32,7 @@ jobs:
     name: Upload dynamo performance stats for ${{ github.event.workflow_run.id }}, attempt ${{ github.event.workflow_run.run_attempt }}
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           submodules: false
           fetch-depth: 1
diff --git a/.github/workflows/upload_test_stats_intermediate.yml b/.github/workflows/upload_test_stats_intermediate.yml
index 5702562006055..856f6e7ccedce 100644
--- a/.github/workflows/upload_test_stats_intermediate.yml
+++ b/.github/workflows/upload_test_stats_intermediate.yml
@@ -17,7 +17,7 @@ jobs:
     environment: upload-stats
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.11
         with:
           fetch-depth: 1
           submodules: false
diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml
index 24c57119cae85..b15dbc7c2db2e 100644
--- a/.github/workflows/vllm-benchmark.yml
+++ b/.github/workflows/vllm-benchmark.yml
@@ -44,7 +44,7 @@ jobs:
       torch_cuda_arch_list: '8.0 8.9 9.0 10.0'
       build_environment: linux-jammy-cuda12.9-py3.12-gcc11
     steps:
-      - uses: pytorch/test-infra/.github/actions/setup-uv@main
+      - uses: pytorch/test-infra/.github/actions/setup-uv@release/2.11
         with:
           python-version: "3.12"
           activate-environment: "true"
@@ -82,7 +82,7 @@ jobs:
 
       - name: Calculate docker image
         id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           working-directory: pytorch/pytorch
           docker-image-name: ci-image:pytorch-linux-jammy-cuda12.9-cudnn9-py3.12-gcc11-vllm
diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml
index 7bed6c785d4db..b8aadb37fc528 100644
--- a/.github/workflows/weekly.yml
+++ b/.github/workflows/weekly.yml
@@ -22,7 +22,7 @@ jobs:
           fetch-depth: 0
       - name: update-xla-commit-hash
         continue-on-error: true
-        uses: pytorch/test-infra/.github/actions/update-commit-hash@main
+        uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.11
         with:
           repo-name: xla
           branch: master
diff --git a/.github/workflows/xpu.yml b/.github/workflows/xpu.yml
index 84b8aa3cd91d4..440580b475945 100644
--- a/.github/workflows/xpu.yml
+++ b/.github/workflows/xpu.yml
@@ -19,7 +19,7 @@ jobs:
   get-label-type:
     if: github.repository_owner == 'pytorch'
     name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
     with:
       triggering_actor: ${{ github.triggering_actor }}
       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
diff --git a/tools/stats/import_test_stats.py b/tools/stats/import_test_stats.py
index a7c661340d13e..fbf4b1d16dfd5 100644
--- a/tools/stats/import_test_stats.py
+++ b/tools/stats/import_test_stats.py
@@ -112,7 +112,7 @@ def process_disabled_test(the_response: dict[str, Any]) -> dict[str, Any]:
         return disabled_test_from_issues
 
     try:
-        url = "https://ossci-metrics.s3.amazonaws.com/disabled-tests-condensed.json"
+        url = "https://ossci-metrics.s3.amazonaws.com/disabled-tests-condensed.json?versionId=cnSTGFIe2xdODOeLj3qZMwi4tgoH6y67"
         return fetch_and_cache(dirpath, filename, url, process_disabled_test)
     except Exception:
         print("Couldn't download test skip set, leaving all tests enabled...")

From 8b1d03b6bf5032c71abf4bde80c4001f989ccd5a Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Mon, 16 Feb 2026 11:00:49 -0800
Subject: [PATCH 02/87] Update inductor expected accuracy files (#175096)

Update inductor expected accuracy files (#175041)

## Summary

This PR updates the expected accuracy CSV files for inductor benchmarks based on CI results from PyTorch commit 93dd7743c6577271a81f2fef0fdeafc5fe06e553.

These files serve as reference points for dynamo/inductor CI to track:
- Graph breaks
- Model accuracy

## Changes

- Updated CUDA expected accuracy files in `benchmarks/dynamo/ci_expected_accuracy/`
- Updated ROCm expected accuracy files in `benchmarks/dynamo/ci_expected_accuracy/rocm/`

## Test Plan

- [ ] Verify that the CI jobs pass with the updated expected accuracy files
- [ ] Review the diff to ensure changes are reasonable and expected
- [ ] Check that no unexpected regressions are being marked as "expected"

Pull Request resolved: https://github.com/pytorch/pytorch/pull/175041
Approved by: https://github.com/atalman

(cherry picked from commit f90c091c44cf4e8feffbf5d5afdebd20798a86fa)
---
 .../ci_expected_accuracy/cpu_inductor_torchbench_inference.csv  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv
index 0157149df8cb8..5fb09f9e69f1d 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv
@@ -186,7 +186,7 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
+pytorch_CycleGAN_and_pix2pix,eager_fail_to_run,0
 
 
From 148630ba62bb9b216eea3737d90a2c2d6d772ba2 Mon Sep 17 00:00:00 2001
From: Jeff Daily <jeff.daily@amd.com>
Date: Mon, 16 Feb 2026 12:29:29 -0800
Subject: [PATCH 03/87] Revert "[fix] DISABLED test_index
 (__main__.DistTensorOpsTest) (#172373)" (#175094)

This reverts commit 70726364e8565902d6f9ed9e47cd197caf544399.

Reverted https://github.com/pytorch/pytorch/pull/172373 on behalf of https://github.com/jeffdaily due to PR claims to fix ROCm DISABLED issue but it did not ([comment](https://github.com/pytorch/pytorch/pull/172373#issuecomment-3909564537))

Co-authored-by: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
---
 test/distributed/tensor/test_tensor_ops.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/test/distributed/tensor/test_tensor_ops.py b/test/distributed/tensor/test_tensor_ops.py
index c59c7aba540d5..692f074a91665 100644
--- a/test/distributed/tensor/test_tensor_ops.py
+++ b/test/distributed/tensor/test_tensor_ops.py
@@ -19,12 +19,7 @@
 from torch.distributed.tensor._sharding_prop import ShardingPropagator
 from torch.distributed.tensor.debug import CommDebugMode
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
-from torch.testing._internal.common_utils import (
-    MI200_ARCH,
-    run_tests,
-    serialTest,
-    skipIfRocmArch,
-)
+from torch.testing._internal.common_utils import MI200_ARCH, run_tests, skipIfRocmArch
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     create_local_tensor_test_class,
     DTensorConverter,
@@ -619,7 +614,6 @@ def test_gather(self):
 
     @skipIfRocmArch(MI200_ARCH)
     @with_comms
-    @serialTest()
     def test_index(self):
         meshes = [
             self.build_device_mesh(),  # 1D mesh

From 1b0497e1dc77f60d79fd365524684e83f19fcf80 Mon Sep 17 00:00:00 2001
From: Jeff Daily <jeff.daily@amd.com>
Date: Mon, 16 Feb 2026 12:30:16 -0800
Subject: [PATCH 04/87] Revert "[CI] Enable TIMM pretrained model caching on
 shared HF cache (#174596)" (#175095)

This reverts commit 781b5d1dc94544bcc3841ad7babcd1be783a5056.

Reverted https://github.com/pytorch/pytorch/pull/174596 on behalf of https://github.com/jeffdaily due to This broke ROCm dynamo benchmarks.  Lots of permission denied errors. ([comment](https://github.com/pytorch/pytorch/pull/174596#issuecomment-3909918521))

Co-authored-by: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
---
 .ci/pytorch/test.sh               |  6 ------
 .github/workflows/_linux-test.yml | 16 ----------------
 benchmarks/dynamo/common.py       | 32 +++----------------------------
 3 files changed, 3 insertions(+), 51 deletions(-)

diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh
index 7bc94541a7558..e3e71bd0d54fa 100755
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@@ -1937,12 +1937,6 @@ elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
   test_dynamo_benchmark huggingface "$id"
 elif [[ "${TEST_CONFIG}" == *timm* ]]; then
   install_torchvision
-  TIMM_PIN="$(< .ci/docker/ci_commit_pins/timm.txt)"
-  export HF_HOME="${HF_HOME}/timm_${TIMM_PIN}"
-  if [[ "${TRANSFORMERS_OFFLINE:-1}" == "0" ]]; then
-    python benchmarks/dynamo/timm_models.py --download-only \
-      && touch "${HF_HOME}/.timm_cache_complete"
-  fi
   id=$((SHARD_NUMBER-1))
   test_dynamo_benchmark timm_models "$id"
 elif [[ "${TEST_CONFIG}" == cachebench ]]; then
diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml
index 3e9a03befbddd..74d89aea3f869 100644
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@@ -391,22 +391,6 @@ jobs:
             export HF_DATASETS_OFFLINE=0
           fi
 
-          # For TIMM jobs, create a pin-specific cache directory and make it
-          # world-writable so the jenkins user inside docker can write to it.
-          # Only enable online mode if the cache hasn't been fully populated.
-          if [[ "${TEST_CONFIG}" == *timm* ]]; then
-            TIMM_PIN="$(< .ci/docker/ci_commit_pins/timm.txt)"
-            TIMM_CACHE_DIR="${HF_CACHE}/timm_${TIMM_PIN}"
-            if [[ ! -d "${TIMM_CACHE_DIR}" ]]; then
-              mkdir -p "${TIMM_CACHE_DIR}"
-              chmod -R a+rwX "${TIMM_CACHE_DIR}"
-            fi
-            if [[ ! -f "${TIMM_CACHE_DIR}/.timm_cache_complete" ]]; then
-              export TRANSFORMERS_OFFLINE=0
-              export HF_DATASETS_OFFLINE=0
-            fi
-          fi
-
           # detached container should get cleaned up by teardown_ec2_linux
           # TODO: Stop building test binaries as part of the build phase
           # Used for GPU_FLAG, SHM_OPTS, JENKINS_USER and DOCKER_SHELL_CMD since that doesn't play nice
diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index ee8a95f5c8459..3771ddd65e2e9 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -475,7 +475,6 @@ def output_signpost(data, args, suite, error=None):
         "log_conv_args",
         "recompile_profiler",
         "find_batch_sizes",
-        "download_only",
         # Redundant
         "batch_size",
         "batch_size_file",
@@ -3683,13 +3682,8 @@ def get_example_inputs(self):
         action="store_true",
         help="finds the largest batch size that could fit on GPUs",
     )
-    group.add_argument(
-        "--download-only",
-        action="store_true",
-        help="Download all models and exit without running benchmarks.",
-    )
 
-    mode_group = parser.add_mutually_exclusive_group(required=False)
+    mode_group = parser.add_mutually_exclusive_group(required=True)
     mode_group.add_argument(
         "--accuracy",
         action="store_true",
@@ -3703,7 +3697,7 @@ def get_example_inputs(self):
         action="store_true",
         help="extracts the tolerance for each model with small batch size and eval mode",
     )
-    run_mode_group = parser.add_mutually_exclusive_group(required=False)
+    run_mode_group = parser.add_mutually_exclusive_group(required=True)
     run_mode_group.add_argument(
         "--training",
         action="store_true",
@@ -3712,13 +3706,7 @@ def get_example_inputs(self):
     run_mode_group.add_argument(
         "--inference", action="store_true", help="Performs inference"
     )
-    args = parser.parse_args(args)
-    if not args.download_only:
-        if not any([args.accuracy, args.performance, args.tolerance]):
-            parser.error("one of --accuracy/--performance/--tolerance is required")
-        if not any([args.training, args.inference]):
-            parser.error("one of --training/--inference is required")
-    return args
+    return parser.parse_args(args)
 
 
 def process_caching_precompile():
@@ -4282,20 +4270,6 @@ def model_iter_fn_and_mark_step(*args, **kwargs):
             write_outputs(output_filename, [], [args.only, batch_size])
         return
 
-    if args.download_only:
-        model_names = list(runner.iter_model_names(args))
-        failed = []
-        for name in model_names:
-            try:
-                runner._download_model(name)
-                print(f"Downloaded: {name}")
-            except Exception as e:
-                print(f"Failed: {name}: {e}")
-                failed.append(name)
-        if failed:
-            sys.exit(1)
-        return
-
     should_profile_details = args.profile_details
     args.profile_details = {}
     if args.export_profiler_trace:

From 7afdbae2d55f95c6065101e727bd1e0cd2a72d34 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Mon, 16 Feb 2026 17:11:49 -0800
Subject: [PATCH 05/87] Fix macOS arm64 libtorch release upload failure
 (#175108)

Fix macOS arm64 libtorch release upload failure (#175100)

**Summary**

  Failures were introduced by the following PR: https://github.com/pytorch/pytorch/pull/173541

  The change from RENAME_WHEEL=true to RENAME_WHEEL=false as the default in
  build_wheel.sh (landed in the 2026-01-31 nightly) broke libtorch builds on
  macOS arm64. The elif branch at line 220 was missing a BUILD_PYTHONLESS
  guard, so libtorch builds (BUILD_PYTHONLESS=1) entered the wheel-copy path
  instead of the libtorch zip-packaging path. This caused the build to
  produce a .whl artifact instead of the expected .zip files, and the upload
  script then failed because it looks for *.zip files.

  The fix adds -z "$BUILD_PYTHONLESS" to the elif condition, matching the
  guard already present on the if branch.

  Failures can be seen here: https://hud.pytorch.org/hud/pytorch/pytorch/nightly/1?per_page=50&name_filter=macos-arm64-binary-libtorch-release%20%2F%20libtorch-cpu
  Failing run: https://github.com/pytorch/pytorch/actions/runs/21541142799/job/62076418921
  Successful run (previous nightly): https://github.com/pytorch/pytorch/actions/runs/21508411052/job/61971405484

**Test plan**
	In CI, run ciflow/binaries. Make sure the Rename/Copy log is the same as in the successful run above.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/175100
Approved by: https://github.com/huydhn, https://github.com/isuruf

(cherry picked from commit bad1df73015bd733d84be9fe90765fd17d30b89e)

Co-authored-by: atalman <atalman@fb.com>
---
 .ci/wheel/build_wheel.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/wheel/build_wheel.sh b/.ci/wheel/build_wheel.sh
index afd1faf2a5f7c..6fb63c361f018 100755
--- a/.ci/wheel/build_wheel.sh
+++ b/.ci/wheel/build_wheel.sh
@@ -217,7 +217,7 @@ if [[ -z "$BUILD_PYTHONLESS" && $RENAME_WHEEL == true  ]]; then
     # Copy the whl to a final destination before tests are run
     echo "Renaming Wheel file: $wheel_filename_gen to $wheel_filename_new"
     cp "$whl_tmp_dir/$wheel_filename_gen" "$PYTORCH_FINAL_PACKAGE_DIR/$wheel_filename_new"
-elif [[ $RENAME_WHEEL == false ]]; then
+elif [[ -z "$BUILD_PYTHONLESS" && $RENAME_WHEEL == false ]]; then
     echo "Copying Wheel file: $wheel_filename_gen to $PYTORCH_FINAL_PACKAGE_DIR"
     cp "$whl_tmp_dir/$wheel_filename_gen" "$PYTORCH_FINAL_PACKAGE_DIR/$wheel_filename_gen"
     if [[ "$VERIFY_WHEELNAME" == "true" && "$wheel_filename_gen" != "$wheel_filename_new" ]]; then

From 2ceed7407ca27576a06a64a95f0879ba9b9071cc Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Thu, 19 Feb 2026 05:57:22 -0800
Subject: [PATCH 06/87] [benchmark] Skip pytorch_CycleGAN_and_pix2pix from
 inductor benchmarks (#175299)

[benchmark] Skip pytorch_CycleGAN_and_pix2pix from inductor benchmarks (#175066)

## Summary

Skip the `pytorch_CycleGAN_and_pix2pix` benchmark model from the inductor benchmark suite.

This legacy 2017 model has been failing with `eager_fail_to_run` on 100%
of commits since mid-2025, providing zero CI signal while consuming
~5.3M GPU-seconds/week across 7+ benchmark jobs on CUDA, CPU, and ROCm.

**Estimated savings: ~310 GPU-hours/week (~1,240 GPU-hours/month)**

Skip it in `torchbench.yaml` and remove its entries from all 31 expected
accuracy CSV files. Also remove it from the `higher_fp16` tolerance list.

See P2188981399 for the full CI workflow analysis.

## Test Plan

- CI should pass with CycleGAN skipped (it was already failing 100% of the time)
- No other benchmark models affected

Pull Request resolved: https://github.com/pytorch/pytorch/pull/175066
Approved by: https://github.com/huydhn, https://github.com/malfet

(cherry picked from commit 688c943324013bfdf1e24aac37d02602c4bbf02a)

Co-authored-by: Eli Uriegas <eliuriegas@meta.com>
---
 .../ci_expected_accuracy/aot_eager_torchbench_inference.csv   | 4 ----
 .../ci_expected_accuracy/aot_eager_torchbench_training.csv    | 4 ----
 .../aot_inductor_torchbench_inference.csv                     | 4 ----
 .../cpu_aot_inductor_amp_freezing_torchbench_inference.csv    | 4 ----
 .../cpu_aot_inductor_freezing_torchbench_inference.csv        | 4 ----
 .../cpu_inductor_amp_freezing_torchbench_inference.csv        | 4 ----
 .../cpu_inductor_freezing_torchbench_inference.csv            | 4 ----
 .../cpu_inductor_torchbench_inference.csv                     | 4 ----
 .../dynamic_aot_eager_torchbench_inference.csv                | 4 ----
 .../dynamic_aot_eager_torchbench_training.csv                 | 4 ----
 ...mic_cpu_aot_inductor_amp_freezing_torchbench_inference.csv | 4 ----
 ...dynamic_cpu_aot_inductor_freezing_torchbench_inference.csv | 4 ----
 .../dynamic_cpu_inductor_torchbench_inference.csv             | 4 ----
 ...ax_autotune_inductor_amp_freezing_torchbench_inference.csv | 4 ----
 .../dynamic_inductor_torchbench_inference.csv                 | 4 ----
 .../dynamic_inductor_torchbench_training.csv                  | 4 ----
 .../dynamo_eager_torchbench_inference.csv                     | 4 ----
 .../ci_expected_accuracy/dynamo_eager_torchbench_training.csv | 4 ----
 .../ci_expected_accuracy/inductor_torchbench_inference.csv    | 4 ----
 .../ci_expected_accuracy/inductor_torchbench_training.csv     | 4 ----
 .../rocm/aot_eager_torchbench_inference.csv                   | 1 -
 .../rocm/aot_eager_torchbench_training.csv                    | 1 -
 .../rocm/aot_inductor_torchbench_inference.csv                | 1 -
 .../rocm/dynamic_aot_eager_torchbench_inference.csv           | 1 -
 .../rocm/dynamic_aot_eager_torchbench_training.csv            | 1 -
 .../rocm/dynamic_inductor_torchbench_inference.csv            | 1 -
 .../rocm/dynamic_inductor_torchbench_training.csv             | 1 -
 .../rocm/dynamo_eager_torchbench_inference.csv                | 1 -
 .../rocm/dynamo_eager_torchbench_training.csv                 | 1 -
 .../rocm/inductor_torchbench_inference.csv                    | 1 -
 .../rocm/inductor_torchbench_training.csv                     | 1 -
 benchmarks/dynamo/torchbench.yaml                             | 4 +++-
 32 files changed, 3 insertions(+), 92 deletions(-)

diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv
index bdf3313659b66..b0daa9a947ad4 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv
@@ -202,10 +202,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
-
-
-
 pytorch_stargan,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv
index ee7838505d67c..9ff11382f67ad 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv
@@ -130,10 +130,6 @@ phlippe_resnet,pass,6
 
 
-pytorch_CycleGAN_and_pix2pix,pass,6
-
-
-
 pytorch_stargan,pass,6
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv
index 6fdfc6e72bda3..5d9d432e644a7 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv
@@ -178,10 +178,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
-
-
-
 pytorch_stargan,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_amp_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_amp_freezing_torchbench_inference.csv
index 061727b22329c..c9a93b51d10c7 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_amp_freezing_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_amp_freezing_torchbench_inference.csv
@@ -162,10 +162,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
-
-
-
 pytorch_stargan,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_torchbench_inference.csv
index 061727b22329c..c9a93b51d10c7 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_torchbench_inference.csv
@@ -162,10 +162,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
-
-
-
 pytorch_stargan,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv
index fa64abf3c8246..0799b804bbf8c 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv
@@ -186,10 +186,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
-
-
-
 pytorch_stargan,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv
index 1d5a70739dd79..813520528261e 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv
@@ -186,10 +186,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
-
-
-
 pytorch_stargan,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv
index 5fb09f9e69f1d..0a51409c04ac3 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv
@@ -186,10 +186,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,eager_fail_to_run,0
-
-
-
 pytorch_stargan,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv
index 16034d6bdfe72..8d4895e4a1ccd 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv
@@ -202,10 +202,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
-
-
-
 pytorch_stargan,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv
index 54486aa44d557..6743f7c739ef1 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv
@@ -130,10 +130,6 @@ phlippe_resnet,pass,6
 
 
-pytorch_CycleGAN_and_pix2pix,pass,6
-
-
-
 pytorch_stargan,pass,6
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_amp_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_amp_freezing_torchbench_inference.csv
index 7deb0fbba56b5..f7df046d04cce 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_amp_freezing_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_amp_freezing_torchbench_inference.csv
@@ -146,10 +146,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
-
-
-
 pytorch_stargan,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_freezing_torchbench_inference.csv
index 7deb0fbba56b5..f7df046d04cce 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_freezing_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_freezing_torchbench_inference.csv
@@ -146,10 +146,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
-
-
-
 pytorch_stargan,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv
index 57ba90bd512bb..46d453de48c7d 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv
@@ -170,10 +170,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
-
-
-
 pytorch_stargan,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv
index 723ef7a272ea1..dfa70e34afc53 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv
@@ -190,10 +190,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
-
-
-
 pytorch_stargan,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv
index 1ea87d2648875..72fd3af5beeda 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv
@@ -202,10 +202,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
-
-
-
 pytorch_stargan,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv
index 24ad3a397e2cc..e4c20cfebf465 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv
@@ -130,10 +130,6 @@ phlippe_resnet,pass,6
 
 
-pytorch_CycleGAN_and_pix2pix,pass,6
-
-
-
 pytorch_stargan,pass,6
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv
index bdf3313659b66..b0daa9a947ad4 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv
@@ -202,10 +202,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
-
-
-
 pytorch_stargan,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv
index ee7838505d67c..9ff11382f67ad 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv
@@ -130,10 +130,6 @@ phlippe_resnet,pass,6
 
 
-pytorch_CycleGAN_and_pix2pix,pass,6
-
-
-
 pytorch_stargan,pass,6
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv
index d5395066bbb42..bc98e325ec784 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv
@@ -202,10 +202,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
-
-
-
 pytorch_stargan,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv
index 24ad3a397e2cc..e4c20cfebf465 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv
@@ -130,10 +130,6 @@ phlippe_resnet,pass,6
 
 
-pytorch_CycleGAN_and_pix2pix,pass,6
-
-
-
 pytorch_stargan,pass,6
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv
index 9e1d982030b13..5f486d36c45b0 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv
@@ -202,7 +202,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_training.csv
index fa52782561be2..d171c2e06bffa 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_training.csv
@@ -130,7 +130,6 @@ phlippe_resnet,pass,6
 
 
-pytorch_CycleGAN_and_pix2pix,pass,6
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_torchbench_inference.csv
index 6d4c4c0359a50..fb70c29d79fda 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_torchbench_inference.csv
@@ -178,7 +178,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv
index 61ffb7197f676..2dd7684e5bd4f 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv
@@ -202,7 +202,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_training.csv
index c9d8189dc8b74..4bb772b41d8f0 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_training.csv
@@ -130,7 +130,6 @@ phlippe_resnet,pass,6
 
 
-pytorch_CycleGAN_and_pix2pix,pass,6
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv
index a678afd25c32d..d1150f849e2ee 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv
@@ -202,7 +202,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_training.csv
index e059c66ca3d04..877277a5aa192 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_training.csv
@@ -130,7 +130,6 @@ phlippe_resnet,pass,6
 
 
-pytorch_CycleGAN_and_pix2pix,pass,6
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv
index 8dbc90cc8d2c7..a18f3b215ecc8 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv
@@ -202,7 +202,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_training.csv
index 12d994100dfe3..d7d0a9b0b3292 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_training.csv
@@ -130,7 +130,6 @@ phlippe_resnet,pass,6
 
 
-pytorch_CycleGAN_and_pix2pix,pass,6
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv
index ef7c4f2a01e9f..55457b2f60695 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv
@@ -210,7 +210,6 @@ pyhpc_turbulent_kinetic_energy,pass,0
 
 
-pytorch_CycleGAN_and_pix2pix,pass,0
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_training.csv
index 91e6df19ff02a..529710bbc21c1 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_training.csv
@@ -134,7 +134,6 @@ phlippe_resnet,pass,6
 
 
-pytorch_CycleGAN_and_pix2pix,pass,6
 
 
diff --git a/benchmarks/dynamo/torchbench.yaml b/benchmarks/dynamo/torchbench.yaml
index b7f6229bed1cd..9a1825bc3d290 100644
--- a/benchmarks/dynamo/torchbench.yaml
+++ b/benchmarks/dynamo/torchbench.yaml
@@ -43,7 +43,6 @@ tolerance:
     - doctr_reco_predictor
     - drq
     - phlippe_resnet
-    - pytorch_CycleGAN_and_pix2pix
 
   higher_bf16:
     - doctr_reco_predictor
@@ -219,6 +218,9 @@ skip:
     # Has never been working correctly
     # https://github.com/pytorch/pytorch/issues/172015#issuecomment-3730509098
     - modded_nanogpt
+    # Broken since mid-2025, eager_fail_to_run on all platforms.
+    # Legacy 2017 model providing zero CI signal.
+    - pytorch_CycleGAN_and_pix2pix
 
   device:
     cpu:

From d80a5844806105de9105da07e3d1836c293a46a6 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Thu, 19 Feb 2026 05:58:15 -0800
Subject: [PATCH 07/87] [CI] Move CUDA 12.8 GPU tests from per-commit trunk to
 periodic (#175300)

[CI] Move CUDA 12.8 GPU tests from per-commit trunk to periodic (#175067)

## Summary

Move CUDA 12.8 GPU tests from per-commit trunk CI to periodic (~3x/day on weekdays).

Both CUDA 12.8 and 13.0 are shipping wheel targets (nightly ships cu126, cu128, cu129, cu130), but their trunk CI test suites have **85-90% failure correlation** -- they almost always fail together. Over a 30-day analysis window covering 97 reverts and 38 significant regression events, **CUDA 12.8 never uniquely caught a regression that 13.0 missed**.

CUDA 13.0 is kept per-commit because:
- It is the **newest** shipping CUDA version
- Most likely to surface **novel breakage** from new CUDA runtime behavior
- Forward-looking CI should protect what's coming, not what's already stable

CUDA 12.8 is moved to periodic because:
- It is **mature and well-understood** -- breakage is less likely and less urgent
- The rare 12.8-only regression can tolerate the ~8-hour periodic detection window
- The 12.8 build job **remains in trunk** because `cross-compile-linux-test` depends on its artifacts

**Estimated savings: ~1,270 GPU-hours/week (~5,080 GPU-hours/month)**

This is the #2 savings opportunity from a broader CI workflow analysis (P2188981399) covering 128 PR+trunk jobs over 30 days. Combined with #175066 (CycleGAN skip, ~310 GPU-hours/week), total savings from this stack: **~1,580 GPU-hours/week (~6,320 GPU-hours/month)**.

### Changes
- `trunk.yml`: remove CUDA 12.8 test job (5 default + 3 distributed + 1 pr_time_benchmarks + 1 libtorch shards) and no-ops build
- `periodic.yml`: add default (5 GPU shards on g6.4xlarge) and distributed (3 multi-GPU shards on g4dn.12xlarge) to existing CUDA 12.8 periodic entry

## Test Plan

- CUDA 12.8 GPU tests continue to run in periodic (3x/day weekdays)
- CUDA 13.0 per-commit coverage is unchanged
- Cross-compile-linux-test continues to work (12.8 build job kept)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/175067
Approved by: https://github.com/malfet
ghstack dependencies: #175066

(cherry picked from commit ef0353feec832b691bfae4631f54475455c42e94)

Co-authored-by: Eli Uriegas <eliuriegas@meta.com>
---
 .github/workflows/periodic.yml | 13 +++++++++++-
 .github/workflows/trunk.yml    | 37 +++++-----------------------------
 2 files changed, 17 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
index 70403093e2568..2914329f1c48b 100644
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@@ -90,9 +90,20 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build-environment: linux-jammy-cuda12.8-py3.10-gcc11
       docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
-      cuda-arch-list: 8.6
+      # GPU default + distributed tests moved here from trunk.yml.
+      # CUDA 13.0 remains per-commit in trunk; 12.8 GPU tests run periodically.
+      # See P2188981399 for the full CI workflow analysis.
+      cuda-arch-list: '7.5 8.6 8.9'
       test-matrix: |
         { include: [
+          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
+          { config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
+          { config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
           { config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
           { config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
           { config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 75966ef5e5c4c..5b741eb67954a 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -112,21 +112,10 @@ jobs:
         ]}
     secrets: inherit
 
-  linux-jammy-cuda12_8-py3_10-gcc11-test:
-    if: ${{ needs.job-filter.outputs.jobs == '' || contains(needs.job-filter.outputs.jobs, ' linux-jammy-cuda12.8-py3.10-gcc11 ') }}
-    name: linux-jammy-cuda12.8-py3.10-gcc11
-    uses: ./.github/workflows/_linux-test.yml
-    needs:
-      - linux-jammy-cuda12_8-py3_10-gcc11-build
-      - target-determination
-      - job-filter
-    with:
-      timeout-minutes: 360
-      build-environment: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.build-environment }}
-      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }}
-      tests-to-include: ${{ github.event.inputs.tests-to-include || '' }}
-    secrets: inherit
+  # CUDA 12.8 GPU tests moved to periodic.yml to reduce per-commit compute.
+  # CUDA 13.0 (the newer shipping version) remains per-commit for forward-looking coverage.
+  # The 12.8 build job is kept because cross-compile-linux-test depends on it.
+  # See P2188981399 for the full CI workflow analysis.
 
   linux-jammy-cuda13_0-py3_10-gcc11-build:
     if: ${{ needs.job-filter.outputs.jobs == '' || contains(needs.job-filter.outputs.jobs, ' linux-jammy-cuda13.0-py3.10-gcc11 ') }}
@@ -171,23 +160,7 @@ jobs:
     secrets: inherit
 
   # no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated
-  linux-jammy-cuda12_8-py3_10-gcc11-no-ops-build:
-    if: ${{ needs.job-filter.outputs.jobs == '' || contains(needs.job-filter.outputs.jobs, ' linux-jammy-cuda12.8-py3.10-gcc11-no-ops ') }}
-    name: linux-jammy-cuda12.8-py3.10-gcc11-no-ops
-    uses: ./.github/workflows/_linux-build.yml
-    needs:
-      - get-label-type
-      - job-filter
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-no-ops
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 1 },
-        ]}
-    secrets: inherit
-
+  # CUDA 12.8 no-ops build moved to periodic (only 13.0 remains per-commit)
   linux-jammy-cuda13_0-py3_10-gcc11-no-ops-build:
     if: ${{ needs.job-filter.outputs.jobs == '' || contains(needs.job-filter.outputs.jobs, ' linux-jammy-cuda13.0-py3.10-gcc11-no-ops ') }}
     name: linux-jammy-cuda13.0-py3.10-gcc11-no-ops

From c649f1ae141e8970c86846dba846d7c32d135b8b Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Thu, 19 Feb 2026 15:50:48 -0800
Subject: [PATCH 08/87] [BE] Remove cuda 12.4 periodic tests (#175362)

[BE] Remove cuda 12.4 periodic tests (#175170)

These tests have been either timing out or failing for a couple of months now. There is no reason to keep them around:
https://hud.pytorch.org/hud/pytorch/pytorch/main/2?per_page=50&name_filter=12.4

Failures go back as far as 9.29.2025 : https://hud.pytorch.org/pytorch/pytorch/commit/efd7fd5ed5ac7ec03201a546a09fb19ec59de431
Pull Request resolved: https://github.com/pytorch/pytorch/pull/175170
Approved by: https://github.com/malfet

(cherry picked from commit 174157a10161342694636d7733bfff22b21bda8c)

Co-authored-by: atalman <atalman@fb.com>
---
 .github/workflows/periodic.yml | 31 -------------------------------
 1 file changed, 31 deletions(-)

diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
index 2914329f1c48b..16728fa24dd7f 100644
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@@ -51,37 +51,6 @@ jobs:
       curr_branch: ${{ github.head_ref || github.ref_name }}
       curr_ref_type: ${{ github.ref_type }}
 
-  linux-jammy-cuda12_4-py3_10-gcc11-build:
-    name: linux-jammy-cuda12.4-py3.10-gcc11
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.4-py3.10-gcc11
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11
-      cuda-arch-list: 7.5
-      test-matrix: |
-        { include: [
-          { config: "legacy_nvidia_driver", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
-          { config: "legacy_nvidia_driver", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
-          { config: "legacy_nvidia_driver", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
-          { config: "legacy_nvidia_driver", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
-          { config: "legacy_nvidia_driver", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
-        ]}
-    secrets: inherit
-
-  linux-jammy-cuda12_4-py3_10-gcc11-test:
-    name: linux-jammy-cuda12.4-py3.10-gcc11
-    uses: ./.github/workflows/_linux-test.yml
-    needs:
-      - linux-jammy-cuda12_4-py3_10-gcc11-build
-      - target-determination
-    with:
-      build-environment: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-build.outputs.build-environment }}
-      docker-image: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-build.outputs.test-matrix }}
-    secrets: inherit
-
   linux-jammy-cuda12_8-py3_10-gcc11-build:
     name: linux-jammy-cuda12.8-py3.10-gcc11
     uses: ./.github/workflows/_linux-build.yml

From 4b8a514606230b60bb8f27be5f11612f21b4aec1 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Thu, 19 Feb 2026 16:04:05 -0800
Subject: [PATCH 09/87] [CI] Add CUDA 13 periodic tests (#175380)

[CI] Add CUDA 13 periodic tests (#174850)

https://github.com/pytorch/pytorch/issues/173950
To prepare for moving CUDA 13 wheels to stable wheels, we need to add CUDA 13 periodic CUDA tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/174850
Approved by: https://github.com/atalman


(cherry picked from commit 7cdd4b16cad708e2083ea9ff2ec724876485cf90)

Co-authored-by: Ting Lu <tingl@nvidia.com>
Co-authored-by: Andrey Talman <atalman@fb.com>
---
 .github/workflows/periodic.yml | 93 +++++++++++-----------------------
 1 file changed, 29 insertions(+), 64 deletions(-)

diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
index 16728fa24dd7f..1a739986715c8 100644
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@@ -51,17 +51,14 @@ jobs:
       curr_branch: ${{ github.head_ref || github.ref_name }}
       curr_ref_type: ${{ github.ref_type }}
 
-  linux-jammy-cuda12_8-py3_10-gcc11-build:
-    name: linux-jammy-cuda12.8-py3.10-gcc11
+  linux-jammy-cuda13_0-py3_10-gcc11-build:
+    name: linux-jammy-cuda13.0-py3.10-gcc11
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
-      # GPU default + distributed tests moved here from trunk.yml.
-      # CUDA 13.0 remains per-commit in trunk; 12.8 GPU tests run periodically.
-      # See P2188981399 for the full CI workflow analysis.
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
       cuda-arch-list: '7.5 8.6 8.9'
       test-matrix: |
         { include: [
@@ -84,26 +81,26 @@ jobs:
         ]}
     secrets: inherit
 
-  linux-jammy-cuda12_8-py3_10-gcc11-test:
-    name: linux-jammy-cuda12.8-py3.10-gcc11
+  linux-jammy-cuda13_0-py3_10-gcc11-test:
+    name: linux-jammy-cuda13.0-py3.10-gcc11
     uses: ./.github/workflows/_linux-test.yml
     needs:
-      - linux-jammy-cuda12_8-py3_10-gcc11-build
+      - linux-jammy-cuda13_0-py3_10-gcc11-build
       - target-determination
     with:
-      build-environment: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.build-environment }}
-      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }}
+      build-environment: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.build-environment }}
+      docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
     secrets: inherit
 
-  linux-jammy-cuda12_8-py3_10-gcc11-debug-build:
-    name: linux-jammy-cuda12.8-py3.10-gcc11-debug
+  linux-jammy-cuda13_0-py3_10-gcc11-debug-build:
+    name: linux-jammy-cuda13.0-py3.10-gcc11-debug
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-debug
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-debug
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
       cuda-arch-list: 8.9
       test-matrix: |
         { include: [
@@ -117,58 +114,26 @@ jobs:
         ]}
     secrets: inherit
 
-  linux-jammy-cuda12_8-py3_10-gcc11-debug-test:
-    name: linux-jammy-cuda12.8-py3.10-gcc11-debug
+  linux-jammy-cuda13_0-py3_10-gcc11-debug-test:
+    name: linux-jammy-cuda13.0-py3.10-gcc11-debug
     uses: ./.github/workflows/_linux-test.yml
     needs:
-      - linux-jammy-cuda12_8-py3_10-gcc11-debug-build
+      - linux-jammy-cuda13_0-py3_10-gcc11-debug-build
       - target-determination
     with:
-      build-environment: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-debug-build.outputs.build-environment }}
-      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-debug-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-debug-build.outputs.test-matrix }}
+      build-environment: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-debug-build.outputs.build-environment }}
+      docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-debug-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-debug-build.outputs.test-matrix }}
     secrets: inherit
 
-  linux-jammy-cuda13_0-py3_10-gcc11-build:
-    name: linux-jammy-cuda13.0-py3.10-gcc11
+  linux-jammy-cuda13_0-py3-gcc11-slow-gradcheck-build:
+    name: linux-jammy-cuda13.0-py3-gcc11-slow-gradcheck
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      cuda-arch-list: 7.5
-      build-environment: linux-jammy-cuda13.0-py3.10-gcc11
+      build-environment: linux-jammy-cuda13.0-py3-gcc11-slow-gradcheck
       docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
-      test-matrix: |
-        { include: [
-          { config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-          { config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-          { config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-          { config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-          { config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
-        ]}
-    secrets: inherit
-
-  linux-jammy-cuda13_0-py3_10-gcc11-test:
-    name: linux-jammy-cuda13.0-py3.10-gcc11
-    uses: ./.github/workflows/_linux-test.yml
-    needs:
-      - linux-jammy-cuda13_0-py3_10-gcc11-build
-      - target-determination
-    with:
-      build-environment: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.build-environment }}
-      docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
-    secrets: inherit
-
-  linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build:
-    name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
       cuda-arch-list: 8.6
       test-matrix: |
         { include: [
@@ -183,15 +148,15 @@ jobs:
         ]}
     secrets: inherit
 
-  linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-test:
-    name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck
+  linux-jammy-cuda13_0-py3-gcc11-slow-gradcheck-test:
+    name: linux-jammy-cuda13.0-py3-gcc11-slow-gradcheck
     uses: ./.github/workflows/_linux-test.yml
     needs:
-      - linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build
+      - linux-jammy-cuda13_0-py3-gcc11-slow-gradcheck-build
       - target-determination
     with:
-      build-environment: ${{ needs.linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build.outputs.build-environment }}
-      docker-image: ${{ needs.linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build.outputs.test-matrix }}
+      build-environment: ${{ needs.linux-jammy-cuda13_0-py3-gcc11-slow-gradcheck-build.outputs.build-environment }}
+      docker-image: ${{ needs.linux-jammy-cuda13_0-py3-gcc11-slow-gradcheck-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda13_0-py3-gcc11-slow-gradcheck-build.outputs.test-matrix }}
       timeout-minutes: 300
     secrets: inherit

From 5decbe0079eb387c203ab654b2481e0e4676ee19 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Fri, 20 Feb 2026 07:53:48 -0800
Subject: [PATCH 10/87] [ROCm] forward fix #174087, take 4 (#175159)

[ROCm] forward fix #174087, take 4 (#175098)

vllm build broke due to missing getCurrentHIPStreamMasqueradingAsCUDA.

Although it existed in the header aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h, this header was not included directly or indirectly by vllm. PR #174087 subtly broke this even while trying to be backward compatible. Moving the declarations of these Masquerading functions into c10/cuda/CUDAStream.h (c10/hip/HIPStream.h when hipified) fixes the vllm build. External projects that had included HIPStreamMasqueradingAsCUDA.h are forwarded to c10/hip/HIPStream.h anyway.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/175098
Approved by: https://github.com/atalman

(cherry picked from commit e6d6f0465ae435b4b73757553d3aa4504dd92d7d)

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
---
 .../hip/impl/HIPStreamMasqueradingAsCUDA.h    | 27 -------------------
 c10/cuda/CUDAStream.h                         | 22 +++++++++++++++
 2 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h
index a6d24a55c7b0d..1c0047371d7b6 100644
--- a/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h
+++ b/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h
@@ -24,33 +24,6 @@ class HIPStreamMasqueradingAsCUDA final : public c10::cuda::CUDAStream {
   c10::cuda::CUDAStream hip_stream() const { return *this; }
 };
 
-HIPStreamMasqueradingAsCUDA
-inline getStreamFromPoolMasqueradingAsCUDA(const bool isHighPriority = false, DeviceIndex device = -1) {
-  return HIPStreamMasqueradingAsCUDA(c10::cuda::getStreamFromPool(isHighPriority, device));
-}
-
-HIPStreamMasqueradingAsCUDA
-inline getStreamFromPoolMasqueradingAsCUDA(const int priority, DeviceIndex device = -1) {
-  return HIPStreamMasqueradingAsCUDA(c10::cuda::getStreamFromPool(priority, device));
-}
-
-HIPStreamMasqueradingAsCUDA
-inline getStreamFromExternalMasqueradingAsCUDA(hipStream_t ext_stream, DeviceIndex device) {
-  return HIPStreamMasqueradingAsCUDA(c10::cuda::getStreamFromExternal(ext_stream, device));
-}
-
-inline HIPStreamMasqueradingAsCUDA getDefaultHIPStreamMasqueradingAsCUDA(DeviceIndex device_index = -1) {
-  return HIPStreamMasqueradingAsCUDA(c10::cuda::getDefaultCUDAStream(device_index));
-}
-
-inline HIPStreamMasqueradingAsCUDA getCurrentHIPStreamMasqueradingAsCUDA(DeviceIndex device_index = -1) {
-  return HIPStreamMasqueradingAsCUDA(c10::cuda::getCurrentCUDAStream(device_index));
-}
-
-inline void setCurrentHIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA stream) {
-  c10::cuda::setCurrentCUDAStream(stream.hip_stream());
-}
-
 inline std::ostream& operator<<(std::ostream& stream, const HIPStreamMasqueradingAsCUDA& s) {
   stream << s.hip_stream() << " (masquerading as CUDA)";
   return stream;
diff --git a/c10/cuda/CUDAStream.h b/c10/cuda/CUDAStream.h
index d3d1402593751..f27c7c9176631 100644
--- a/c10/cuda/CUDAStream.h
+++ b/c10/cuda/CUDAStream.h
@@ -256,6 +256,28 @@ inline c10::cuda::CUDAStream getCurrentHIPStream(
   return c10::cuda::getCurrentCUDAStream(device_index);
 }
 inline auto& setCurrentHIPStream = c10::cuda::setCurrentCUDAStream;
+inline c10::cuda::CUDAStream getStreamFromPoolMasqueradingAsCUDA(
+    const bool isHighPriority = false,
+    DeviceIndex device = -1) {
+  return c10::cuda::getStreamFromPool(isHighPriority, device);
+}
+inline c10::cuda::CUDAStream getStreamFromPoolMasqueradingAsCUDA(
+    const int priority,
+    DeviceIndex device = -1) {
+  return c10::cuda::getStreamFromPool(priority, device);
+}
+inline auto& getStreamFromExternalMasqueradingAsCUDA =
+    c10::cuda::getStreamFromExternal;
+inline c10::cuda::CUDAStream getDefaultHIPStreamMasqueradingAsCUDA(
+    DeviceIndex device_index = -1) {
+  return c10::cuda::getDefaultCUDAStream(device_index);
+}
+inline c10::cuda::CUDAStream getCurrentHIPStreamMasqueradingAsCUDA(
+    DeviceIndex device_index = -1) {
+  return c10::cuda::getCurrentCUDAStream(device_index);
+}
+inline auto& setCurrentHIPStreamMasqueradingAsCUDA =
+    c10::cuda::setCurrentCUDAStream;
 } // namespace c10::hip
 #endif
 

From a2813af6564cf10ae3ec37e568c53f91707d45c2 Mon Sep 17 00:00:00 2001
From: Andrey Talman <atalman@fb.com>
Date: Mon, 23 Feb 2026 17:29:28 -0500
Subject: [PATCH 11/87] [release-only] Remove +ptx from cuda 13.0 builds
 (#175567)

---
 .ci/manywheel/build_cuda.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/manywheel/build_cuda.sh b/.ci/manywheel/build_cuda.sh
index 94bf6a6b4b26c..bd952d5e08fd5 100644
--- a/.ci/manywheel/build_cuda.sh
+++ b/.ci/manywheel/build_cuda.sh
@@ -115,7 +115,7 @@ case ${CUDA_VERSION} in
         fi
         ;;
     13.0)
-        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};$([[ "$ARCH" == "aarch64" ]] && echo "11.0;" || echo "")12.0+PTX"
+        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};$([[ "$ARCH" == "aarch64" ]] && echo "11.0;" || echo "")12.0"
         export TORCH_NVCC_FLAGS="-compress-mode=size"
         export BUILD_BUNDLE_PTXAS=1
         ;;

From 98e35020c7423c304778a7044f6baa3c8a98ba6d Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Mon, 23 Feb 2026 17:01:30 -0800
Subject: [PATCH 12/87] [MPS] Fix 2-pass SDPA memory corruption by forcing
 float accumulators (#175580)

[MPS] Fix 2-pass SDPA memory corruption by forcing float accumulators (#174945)

Ensure `sums` and `maxs` buffers in `sdpa_vector_2pass_mps` are allocated as `kFloat` instead of inheriting the input dtype. This fixes out-of-bounds memory access and nondeterministic/corrupt results, as reported in #174861 (reproducible with bf16/fp16 and GQA, seq_len > 1023).

Adds a regression test covering bf16/fp16/fp32 and relaxes tolerance for bf16 to validate numerical correctness and determinism on MPS.

Fixes #174861
Pull Request resolved: https://github.com/pytorch/pytorch/pull/174945
Approved by: https://github.com/malfet

(cherry picked from commit c68a1d2c01dfba9da53a1bd495cc263d6b802150)

Co-authored-by: Roy Hvaara <roy@lightyear.no>
---
 .../ATen/native/mps/operations/Attention.mm   |  4 ++--
 test/test_mps.py                              | 22 +++++++++++++++++--
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Attention.mm b/aten/src/ATen/native/mps/operations/Attention.mm
index ce57174177885..26477fef2ed61 100644
--- a/aten/src/ATen/native/mps/operations/Attention.mm
+++ b/aten/src/ATen/native/mps/operations/Attention.mm
@@ -265,8 +265,8 @@
 
   auto out = at::empty({batchSize, num_heads, seq_len_q, headSize}, q_.options());
   auto intermediate = at::empty({batchSize, num_heads, seq_len_q, blocks, headSize}, q_.options());
-  auto sums = at::empty({batchSize, num_heads, seq_len_q, blocks}, q_.options());
-  auto maxs = at::empty({batchSize, num_heads, seq_len_q, blocks}, q_.options());
+  auto sums = at::empty({batchSize, num_heads, seq_len_q, blocks}, q_.options().dtype(kFloat));
+  auto maxs = at::empty({batchSize, num_heads, seq_len_q, blocks}, q_.options().dtype(kFloat));
 
   auto scale_factor = sdp::calculate_scale(orig_query, scale).expect_float();
   bool has_mask = mask_.has_value();
diff --git a/test/test_mps.py b/test/test_mps.py
index 482a163986c17..02e291017582e 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -9629,10 +9629,10 @@ def weight_int8pack_mm(a, b_int8pack, b_scales):
 
 
 class TestSDPA(TestCaseMPS):
-    def _compare_tensors(self, y, ref):
+    def _compare_tensors(self, y, ref, tol=0.01):
         denom = torch.maximum(ref.abs(), torch.tensor([1e-6], device=ref.device, dtype=ref.dtype))
         err = ((y - ref).abs() / denom).mean().item()
-        self.assertLess(err, 0.01)
+        self.assertLess(err, tol)
 
     def _test_sdpa_no_mask(
         self,
@@ -9736,6 +9736,24 @@ def test_sdpa_full_mask(self, dtype):
         out_mps = F.scaled_dot_product_attention(q.to('mps'), k.to('mps'), v.to('mps'), attn_mask=mask.to('mps'))
         self._compare_tensors(out_mps.cpu(), out_cpu)
 
+    @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float])
+    def test_sdpa_2pass(self, dtype):
+        # Regression test for https://github.com/pytorch/pytorch/issues/174861
+        q = torch.randn(1, 32, 1, 128, dtype=dtype)
+        k = torch.randn(1, 2, 1024, 128, dtype=dtype)
+        v = torch.randn(1, 2, 1024, 128, dtype=dtype)
+        sdpa_kwargs = {"enable_gqa": True}
+
+        out_cpu = F.scaled_dot_product_attention(q, k, v, **sdpa_kwargs)
+        out_mps = F.scaled_dot_product_attention(
+            q.to("mps"), k.to("mps"), v.to("mps"), **sdpa_kwargs
+        )
+
+        tol = 0.1 if dtype == torch.bfloat16 else 0.01
+
+        self.assertEqual(out_mps, out_cpu, atol=1e-3, rtol=1e-6)
+        self._compare_tensors(out_mps.cpu(), out_cpu, tol=tol)
+
     @parametrize("dtype", [torch.float16, torch.float32])
     def test_sdpa_3d_input(self, dtype):
         head_num, seq_len, embed_dim = 16, 16, 80

From ef5c69dec13056e58810a59663db7f8a6b8bebbd Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Tue, 24 Feb 2026 09:25:58 -0800
Subject: [PATCH 13/87] Disable einops 0.8.2 check on PyTorch (#175442)

Disable einops 0.8.2 check on PyTorch (#175351)

Partially revert #173611 and fall back to the previous behavior on einops, which uses `allow_in_graph`.

**Context**

* Dynamo does not trace into `@lru_cache` and warns on any usage.
* einops uses `@lru_cache` as part of `_prepare_transformation_recipe`.
* Every einops op goes through this function.
* Dynamo warns on every einops op trace and this creates a logspam
  problem.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/175351
Approved by: https://github.com/Lucaskabela

(cherry picked from commit 1fe0f51a5f14f566c6ab58a386eb86f3f5ca227e)

Co-authored-by: Guilherme Leobas <gleobas@quansight.com>
---
 test/dynamo/test_einops.py  | 11 +++++++++++
 torch/_dynamo/decorators.py | 17 +++++++++--------
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/test/dynamo/test_einops.py b/test/dynamo/test_einops.py
index fcd86e50b944c..2c445be38ee8e 100644
--- a/test/dynamo/test_einops.py
+++ b/test/dynamo/test_einops.py
@@ -11,6 +11,7 @@
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
     parametrize,
+    xfailIf,
 )
 
 
@@ -190,6 +191,7 @@ def f(x):
             else:
                 self.assertIn(einops_method, output)
 
+    @xfailIf(einops_version == "0.8.2")
     @parametrize(
         "method",
         ["reduce", "repeat", "pack", "unpack", "einsum", "rearrange"],
@@ -222,6 +224,15 @@ def test_einops_method(self, method):
             self.fail(method)
         self._run_in_subprocess(flag, method, einops_method, snippet)
 
+    def test_no_warning(self):
+        # checks that this doesn't produce any warnings
+        @torch.compile(backend="eager", fullgraph=True)
+        def fn(x):
+            return einops.rearrange(x, "... -> (...)")
+
+        x = torch.randn(5)
+        self.assertNotWarn(lambda: fn(x))
+
 
 instantiate_parametrized_tests(
     TestEinops,
diff --git a/torch/_dynamo/decorators.py b/torch/_dynamo/decorators.py
index ed1354555a7d7..b444eaea9fb70 100644
--- a/torch/_dynamo/decorators.py
+++ b/torch/_dynamo/decorators.py
@@ -1156,14 +1156,15 @@ def mark_static_address(t: Any, guard: bool = False) -> None:
 def _allow_in_graph_einops() -> None:
     import einops
 
-    if einops.__version__ >= "0.8.2":
-        if hasattr(einops, "einops") and hasattr(einops.einops, "get_backend"):
-            # trigger backend registration up front to avoid a later guard failure
-            # that would otherwise cause a recompilation
-            einops.rearrange(torch.randn(1), "i -> i")
-
-        # einops 0.8.2+ don't need explicit allow_in_graph calls
-        return
+    # There is a lru_cache logspam issue with einops when allow_in_graph is not
+    # used. Disabling this for now until the lru_cache issue is resolved.
+    # if einops.__version__ >= "0.8.2":
+    #     if hasattr(einops, "einops") and hasattr(einops.einops, "get_backend"):
+    #         # trigger backend registration up front to avoid a later guard failure
+    #         # that would otherwise cause a recompilation
+    #         einops.rearrange(torch.randn(1), "i -> i")
+    #     # einops 0.8.2+ don't need explicit allow_in_graph calls
+    #     return
 
     try:
         # requires einops > 0.6.1, torch >= 2.0

From 013fdc238535f85f3ad603016a88ab0f6fabecce Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Tue, 24 Feb 2026 14:47:31 -0800
Subject: [PATCH 14/87] [CPUBLAS] Fix UB: use vector::resize() instead of
 reserve() before operator[] access (#175579)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[CPUBLAS] Fix UB: use vector::resize() instead of reserve() before operator[] access (#175315)

Fixes #175302

## Summary
`reserve(1)` → `resize(1)`. See issue for details.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/175315
Approved by: https://github.com/zou3519, https://github.com/malfet

(cherry picked from commit f08aafa9e82c5ae142b97dbfcac1ebd5d9ca7fde)

Co-authored-by: mulatta <67085791+mulatta@users.noreply.github.com>
---
 aten/src/ATen/native/CPUBlas.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp
index 87351cdb98717..d36b5c813a140 100644
--- a/aten/src/ATen/native/CPUBlas.cpp
+++ b/aten/src/ATen/native/CPUBlas.cpp
@@ -1107,7 +1107,7 @@ struct GemmHelper {
     // Create a scratchpad buffer for the brgemm execution
     scratchpad = std::vector<uint8_t>(brg.get_scratchpad_size());
     // Prepare default vector of pairs of tensors A and B offsets for each batch.
-    A_B_offsets.reserve(1);
+    A_B_offsets.resize(1);
     A_B_offsets[0] = std::make_pair(0, 0);
   }
   dnnl::ukernel::brgemm brg;

From d70d867311dd195f80832486ff33ba6b3fdc271f Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Tue, 24 Feb 2026 14:49:38 -0800
Subject: [PATCH 15/87] Remove python constraint on setuptools (#175627)

Remove python constraint on setuptools (#175577)

Fixes https://github.com/pytorch/pytorch/issues/173823
Dependency on setuptools was added 8 years ago here: https://github.com/pytorch/pytorch/pull/5207
This issue remained hidden because we run the smoke tests in a conda env, and creating a conda env installs setuptools by default. It became apparent when testing with uv.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/175577
Approved by: https://github.com/malfet, https://github.com/seemethere

(cherry picked from commit eaa022177ba8b2d8b38e27316a18696c49299cd8)

Co-authored-by: atalman <atalman@fb.com>
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 7b9fba42e746e..0decaf66ed625 100644
--- a/setup.py
+++ b/setup.py
@@ -1703,7 +1703,7 @@ def main() -> None:
     install_requires = [
         "filelock",
         "typing-extensions>=4.10.0",
-        'setuptools<82 ; python_version >= "3.12"',
+        "setuptools<82",
         "sympy>=1.13.3",
         "networkx>=2.5.1",
         "jinja2",

From 883a7e2b5f257dcafb31da0b9210f292243ef7f7 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Tue, 24 Feb 2026 15:06:00 -0800
Subject: [PATCH 16/87] Supports custom empty tensor in InputObserver (#175581)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Supports custom empty tensor in InputObserver (#174964)

When running an LLM that handles images and text (Gemma3), the first call to the forward method has input_ids and pixel_values but no past_key_values. Subsequent calls do not have pixel_values but do have past_key_values. The InputObserver knows the whole list of inputs, but since there is only one example of pixel_values (and the batch dimension is usually constant across all calls), we need a way to tell the InputObserver what an empty tensor for pixel_values looks like when it is missing.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/174964
Approved by: https://github.com/titaiwangms, https://github.com/justinchuby


(cherry picked from commit bc9adaa4524c67014ed4945292930c215392d192)

Co-authored-by: Xavier Dupré <xadupre@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Justin Chu <justinchuby@users.noreply.github.com>
---
 test/onnx/exporter/test_input_observer.py     | 285 ++++++++++++++
 .../_internal/exporter/_input_observer.py     | 368 +++++++++++++++---
 2 files changed, 604 insertions(+), 49 deletions(-)

diff --git a/test/onnx/exporter/test_input_observer.py b/test/onnx/exporter/test_input_observer.py
index 39386a0775fc2..68e74c40b7cd3 100644
--- a/test/onnx/exporter/test_input_observer.py
+++ b/test/onnx/exporter/test_input_observer.py
@@ -872,6 +872,291 @@ def forward(self, x=None, y=None):
         with self.assertRaises(RuntimeError):
             observer.infer_dynamic_shapes()
 
+    def test_infer_dynamic_shapes_missing(self):
+        class Model(torch.nn.Module):
+            def forward(
+                self,
+                input_ids=None,
+                pixel_values=None,
+                attention_mask=None,
+                position_ids=None,
+                past_key_values=None,
+                token_type_ids=None,
+                cache_position=None,
+            ):
+                return input_ids
+
+        inputs = [
+            dict(
+                input_ids=torch.ones((1, 28), dtype=torch.int64),
+                pixel_values=torch.ones((1, 3, 112, 112), dtype=torch.int64),
+                attention_mask=torch.ones((1, 28), dtype=torch.int64),
+                position_ids=torch.ones((1, 28), dtype=torch.int64),
+                token_type_ids=torch.ones((1, 28), dtype=torch.int64),
+                cache_position=torch.ones((28,), dtype=torch.int64),
+            ),
+            dict(
+                input_ids=torch.ones((1, 1), dtype=torch.int64),
+                attention_mask=torch.ones((1, 29), dtype=torch.int64),
+                position_ids=torch.ones((1, 1), dtype=torch.int64),
+                past_key_values=torch.rand((1, 1, 28, 32)),
+                token_type_ids=torch.ones((1, 1), dtype=torch.int64),
+                cache_position=torch.ones((1,), dtype=torch.int64),
+            ),
+            dict(
+                input_ids=torch.ones((1, 1), dtype=torch.int64),
+                attention_mask=torch.ones((1, 30), dtype=torch.int64),
+                position_ids=torch.ones((1, 1), dtype=torch.int64),
+                past_key_values=torch.rand((1, 1, 29, 32)),
+                token_type_ids=torch.ones((1, 1), dtype=torch.int64),
+                cache_position=torch.ones((1,), dtype=torch.int64),
+            ),
+        ]
+
+        model = Model()
+        observer = InputObserver(
+            value_if_missing=dict(pixel_values=torch.empty((0, 3, 112, 112)))
+        )
+        with observer(model):
+            for kwargs in inputs:
+                model(**kwargs)
+
+        shapes = observer.infer_dynamic_shapes(set_batch_dimension_for=True)
+        cst = torch.export.Dim.DYNAMIC
+        expected = {
+            "input_ids": {0: cst, 1: cst},
+            "pixel_values": {0: cst},
+            "attention_mask": {0: cst, 1: cst},
+            "position_ids": {0: cst, 1: cst},
+            "past_key_values": {0: cst, 2: cst},
+            "token_type_ids": {0: cst, 1: cst},
+            "cache_position": {0: cst},
+        }
+        self.assertEqual(expected, shapes)
+        kwargs = observer.infer_arguments()
+        self.assertEqual(list(expected), list(kwargs))
+        self.assertEqual((0, 3, 112, 112), kwargs["pixel_values"].shape)
+
+    def test_infer_dynamic_shapes_missing_args(self):
+        class Model(torch.nn.Module):
+            def forward(
+                self,
+                input_ids=None,
+                pixel_values=None,
+                attention_mask=None,
+                past_key_values=None,
+            ):
+                return input_ids
+
+        inputs = [
+            (
+                torch.ones((1, 28), dtype=torch.int64),
+                torch.ones((1, 3, 112, 112), dtype=torch.int64),
+                torch.ones((1, 28), dtype=torch.int64),
+            ),
+            (
+                torch.ones((1, 1), dtype=torch.int64),
+                None,
+                torch.ones((1, 29), dtype=torch.int64),
+                torch.rand((1, 1, 28, 32)),
+            ),
+            (
+                torch.ones((1, 1), dtype=torch.int64),
+                None,
+                torch.ones((1, 30), dtype=torch.int64),
+                torch.rand((1, 1, 29, 32)),
+            ),
+        ]
+
+        model = Model()
+        observer = InputObserver(
+            value_if_missing={1: torch.empty((0, 3, 112, 112), dtype=torch.int64)}
+        )
+        with observer(model):
+            for args in inputs:
+                model(*args)
+
+        shapes = observer.infer_dynamic_shapes(set_batch_dimension_for=True)
+        cst = torch.export.Dim.DYNAMIC
+        expected = ({0: cst, 1: cst}, {0: cst}, {0: cst, 1: cst}, {0: cst, 2: cst})
+        self.assertEqual(expected, shapes)
+        args = observer.infer_arguments()
+        self.assertEqual(len(expected), len(args))
+        self.assertEqual((0, 3, 112, 112), args[1].shape)
+
+    def test_infer_dynamic_shapes_missing_kwargs_nested(self):
+        class Model(torch.nn.Module):
+            def forward(
+                self,
+                input_ids=None,
+                pixel_values=None,
+                attention_mask=None,
+                position_ids=None,
+                past_key_values=None,
+                token_type_ids=None,
+                cache_position=None,
+            ):
+                return input_ids
+
+        inputs = [
+            dict(
+                input_ids=torch.ones((1, 28), dtype=torch.int64),
+                pixel_values=(
+                    torch.ones((1, 3, 112, 112), dtype=torch.int64),
+                    torch.ones((1, 3, 112, 112), dtype=torch.int64),
+                ),
+                attention_mask=torch.ones((1, 28), dtype=torch.int64),
+                position_ids=torch.ones((1, 28), dtype=torch.int64),
+                token_type_ids=torch.ones((1, 28), dtype=torch.int64),
+                cache_position=torch.ones((28,), dtype=torch.int64),
+            ),
+            dict(
+                input_ids=torch.ones((1, 1), dtype=torch.int64),
+                attention_mask=torch.ones((1, 29), dtype=torch.int64),
+                position_ids=torch.ones((1, 1), dtype=torch.int64),
+                past_key_values=torch.rand((1, 1, 28, 32)),
+                token_type_ids=torch.ones((1, 1), dtype=torch.int64),
+                cache_position=torch.ones((1,), dtype=torch.int64),
+            ),
+            dict(
+                input_ids=torch.ones((1, 1), dtype=torch.int64),
+                attention_mask=torch.ones((1, 30), dtype=torch.int64),
+                position_ids=torch.ones((1, 1), dtype=torch.int64),
+                past_key_values=torch.rand((1, 1, 29, 32)),
+                token_type_ids=torch.ones((1, 1), dtype=torch.int64),
+                cache_position=torch.ones((1,), dtype=torch.int64),
+            ),
+        ]
+
+        model = Model()
+        observer = InputObserver(
+            value_if_missing=dict(
+                pixel_values=(
+                    torch.empty((0, 3, 112, 112), dtype=torch.int64),
+                    torch.empty((0, 3, 112, 112), dtype=torch.int64),
+                )
+            )
+        )
+        with observer(model):
+            for kwargs in inputs:
+                model(**kwargs)
+
+        shapes = observer.infer_dynamic_shapes(set_batch_dimension_for=True)
+        cst = torch.export.Dim.DYNAMIC
+        expected = {
+            "input_ids": {0: cst, 1: cst},
+            "pixel_values": ({0: cst}, {0: cst}),
+            "attention_mask": {0: cst, 1: cst},
+            "position_ids": {0: cst, 1: cst},
+            "past_key_values": {0: cst, 2: cst},
+            "token_type_ids": {0: cst, 1: cst},
+            "cache_position": {0: cst},
+        }
+        self.assertEqual(expected, shapes)
+        kwargs = observer.infer_arguments()
+        self.assertEqual(list(expected), list(kwargs))
+        self.assertIsInstance(kwargs["pixel_values"], tuple)
+        self.assertEqual(2, len(kwargs["pixel_values"]))
+        self.assertEqual((0, 3, 112, 112), kwargs["pixel_values"][0].shape)
+        self.assertEqual((0, 3, 112, 112), kwargs["pixel_values"][1].shape)
+
+    def test_io_captured_kwargs_kwargs(self):
+        class Model(torch.nn.Module):
+            def forward(self, x, **kwargs):
+                return x + kwargs["y"]
+
+        inputs = [
+            dict(x=torch.randn((5, 6)), y=torch.randn((1, 6))),
+            dict(x=torch.randn((7, 7)), y=torch.randn((1, 7))),
+            dict(x=torch.randn((7, 8)), y=torch.randn((1, 8))),
+            dict(x=torch.randn((7, 9)), y=torch.randn((1, 9))),
+        ]
+
+        model = Model()
+        expected = [model(**kwargs) for kwargs in inputs]
+        observer = InputObserver()
+        with observer(model):
+            for kwargs in inputs:
+                model(**kwargs)
+        self.assertEqual(len(observer.info), 3)
+        for i in range(3):
+            self.assertEqual(len(observer.info.flat_outputs[i]), 1)
+            torch.testing.assert_close(expected[i], observer.info.flat_outputs[i][0])
+
+        cst = torch.export.Dim.DYNAMIC
+        ds = observer.infer_dynamic_shapes()
+        self.assertEqual(dict(x={0: cst, 1: cst}, kwargs=dict(y={1: cst})), ds)
+        args = observer.infer_arguments()
+        self.assertIsInstance(args, dict)
+        self.assertEqual(2, len(args))
+        self.assertEqual(["x", "y"], list(args))
+
+        dynamic_shapes = torch.export.AdditionalInputs()
+        for kwargs in inputs:
+            dynamic_shapes.add((), kwargs)
+        dss = dynamic_shapes.dynamic_shapes(model, (), inputs[0])
+        self.assertEqual({"x": (cst, cst), "kwargs": {"y": (None, cst)}}, dss)
+
+    def test_io_captured_kwargs_kwargs_with_args(self):
+        class Model(torch.nn.Module):
+            def forward(self, a, *args, **kwargs):
+                return a - args[0] * args[1] + kwargs["x"] - kwargs["y"]
+
+        inputs = [
+            (
+                (torch.randn((5, 6)), torch.randn((5, 6)), torch.randn((5, 6))),
+                dict(x=torch.randn((5, 6)), y=torch.randn((1, 6))),
+            ),
+            (
+                (torch.randn((7, 7)), torch.randn((7, 7)), torch.randn((7, 7))),
+                dict(x=torch.randn((7, 7)), y=torch.randn((1, 7))),
+            ),
+        ]
+
+        model = Model()
+        expected = [model(*args, **kwargs) for args, kwargs in inputs]
+        observer = InputObserver()
+        with observer(model):
+            for args, kwargs in inputs:
+                model(*args, **kwargs)
+        self.assertEqual(len(observer.info), 2)
+        for i in range(2):
+            self.assertEqual(len(observer.info.flat_outputs[i]), 1)
+            torch.testing.assert_close(expected[i], observer.info.flat_outputs[i][0])
+
+        cst = torch.export.Dim.DYNAMIC
+        ds = observer.infer_dynamic_shapes()
+        self.assertEqual(
+            {
+                "a": {0: cst, 1: cst},
+                "args": ({0: cst, 1: cst}, {0: cst, 1: cst}),
+                "kwargs": {"x": {0: cst, 1: cst}, "y": {1: cst}},
+            },
+            ds,
+        )
+
+        dynamic_shapes = torch.export.AdditionalInputs()
+        for args, kwargs in inputs:
+            dynamic_shapes.add(args, kwargs)
+        dss = dynamic_shapes.dynamic_shapes(model, *inputs[0])
+        self.assertEqual(
+            {
+                "a": (cst, cst),
+                "args": ((cst, cst), (cst, cst)),
+                "kwargs": {"x": (cst, cst), "y": (None, cst)},
+            },
+            dss,
+        )
+
+        with self.assertRaises(RuntimeError):
+            observer.infer_arguments()
+
+        args, kwargs = observer.infer_arguments(as_args_kwargs=True)
+        self.assertIsInstance(kwargs, dict)
+        self.assertEqual(["x", "y"], list(kwargs))
+        self.assertIsInstance(args, tuple)
+        self.assertEqual(len(args), 3)
+
 
 if __name__ == "__main__":
     common_utils.run_tests()
diff --git a/torch/onnx/_internal/exporter/_input_observer.py b/torch/onnx/_internal/exporter/_input_observer.py
index 2a4392919e120..f6fb62b41d474 100644
--- a/torch/onnx/_internal/exporter/_input_observer.py
+++ b/torch/onnx/_internal/exporter/_input_observer.py
@@ -27,11 +27,11 @@ def _flatten_unflatten_for_dynamic_shapes(
 
     Args:
         obj: Object from a custom class.
-        change_function: Function to modify the tensor in the structure itself,
-            like replace them by a shape.
+        change_function: If not None, this function is called to modify the tensors
+            in the structure itself, like replace them by a shape.
 
     Returns:
-        the serialized object
+        The flattened object.
     """
     if isinstance(obj, torch.Tensor):
         return change_function(obj) if change_function else obj
@@ -77,10 +77,10 @@ def _infer_dynamic_dimensions(
 
     Args:
         shape_list:
-            list of shapes, they must all have the same length
+            List of shapes, they must all have the same length.
         set_batch_dimension:
-            forces the first dimension to be treated as dynamic,
-            even if all shapes have the same value for that dimension
+            Forces the first dimension to be treated as dynamic,
+            even if all shapes have the same value for that dimension.
 
     Returns:
         list of dynamic dimensions
@@ -88,7 +88,7 @@ def _infer_dynamic_dimensions(
     unique_ranks = {len(shape) for shape in shape_list}
     torch._check(
         len(unique_ranks) == 1,
-        lambda: "all shapes in shape_list must have the same rank",
+        lambda: f"All shapes in shape_list must have the same rank but {shape_list=}.",
     )
     rank = unique_ranks.pop()
     dynamic = []
@@ -102,6 +102,7 @@ def _infer_dynamic_dimensions(
 class InputCandidate:
     """Retains one set of inputs given to the forward method or any
     other method the class :class:`InputObserver` is stealing from.
+    Any class is allowed as long as it can be flattened.
 
     Args:
         args: Positional arguments.
@@ -110,6 +111,9 @@ class InputCandidate:
             may be modified inplace, the original value must be retained.
         cst_kwargs: Any optional arguments constant over multiple calls.
             int, float, str, bool values must be stored here.
+
+    The constructor flattens the received arguments.
+    Any necessary flattening function should have been registered first.
     """
 
     def __init__(
@@ -283,18 +287,36 @@ class InputObserverInfo:
             to be the same in the ordered dictionaries `add_inputs` receive.
         default_values: Default values defined by the signature of the function,
             any value equal to that is ignored to simplify the export.
+        value_if_missing: If an argument is missing,
+            a default value will be taken in this dictionary,
+            this is used when after the prefill step, an argument
+            disappears (such as `pixel_values`) and another one
+            is added (such as `past_key_values`).
+            The values are only to infer dynamic shapes and arguments,
+            not to run the model.
+        args_name_and_position: Name of parameter `*args`
+            and its position if it exists.
+        kwargs_name: Name of the variable keyword parameter `**kwargs` if it exists.
+
+    This is used by class :class:`InputObserver`.
     """
 
     def __init__(
         self,
         signature_names: list[str],
         default_values: dict[str, int | bool | str | float],
+        value_if_missing: dict[str | int, Any],
+        args_name_and_position: tuple[str, int] | None,
+        kwargs_name: str | None,
     ):
         self.default_values = default_values
+        self.value_if_missing = value_if_missing
         self.inputs: list[InputCandidate] = []
         self.outputs_specs: list[torch.utils._pytree.PyTreeSpec] = []
         self.flat_outputs: list[list[torch.Tensor | None]] = []
         self.latencies: list[float] = []
+        self.args_name_and_position = args_name_and_position
+        self.kwargs_name = kwargs_name
         self.signature_names = signature_names
         self._best_candidate: InputCandidate | None = None
         self._captured_inputs: dict[int | str, int] | None = None
@@ -316,6 +338,7 @@ def add_inputs(self, args: tuple[Any, ...], kwargs: dict[str, Any]):
             if k in self.signature_names
             and isinstance(v, (int, float, bool, str))
             and v != self.default_values.get(k, None)
+            and self.default_values.get(k, None) is not None
         }
         kwargs = {
             k: v
@@ -323,6 +346,50 @@ def add_inputs(self, args: tuple[Any, ...], kwargs: dict[str, Any]):
             if v is not None and not isinstance(v, (int, float, bool, str))
         }
 
+        # adds value_if_missing attributes
+        for k, v in self.value_if_missing.items():
+            if isinstance(k, str):
+                if k not in kwargs:
+                    # Validate that `value_if_missing` keys are compatible
+                    # with the observed signature.
+                    # If the function does not accept **kwargs,
+                    # all value_if_missing keys must be
+                    # present in the observed signature names.
+                    if k not in self.signature_names and not self.kwargs_name:
+                        raise ValueError(
+                            f"Unexpected keyword argument {k!r} "
+                            f"provided as a value_if_missing input "
+                            "for a function that does not accept it. "
+                            f"All value_if_missing keys must "
+                            f"be in the observed signature: {tuple(self.signature_names)}."
+                        )
+                    kwargs[k] = v
+            elif isinstance(k, int):
+                if k >= len(self.signature_names):
+                    raise ValueError(
+                        f"Unexpected keyword argument {k=} "
+                        f"provided as a value_if_missing input "
+                        "for a function that does not accept it. "
+                        f"All value_if_missing indices must "
+                        f"be in the observed signature: {tuple(self.signature_names)}."
+                    )
+                if k >= len(args):
+                    raise NotImplementedError(
+                        f"Unexpected keyword argument {k=} "
+                        f"provided as a value_if_missing input "
+                        "for a function that does not accept it. "
+                        f"All value_if_missing indices must "
+                        f"be in the observed signature: {tuple(self.signature_names)}, "
+                        f"only {len(args)} were given."
+                    )
+                list_args = list(args)
+                list_args[k] = v
+                args = tuple(list_args)
+            else:
+                raise TypeError(
+                    f"Unexpected type {type(k)} for a missing value. The key is {k!r}."
+                )
+
         # kwargs may come in a different order each time.
         # dictionaries are ordered and torch.export.export expects
         # dynamic shapes and kwargs to follow the same order.
@@ -458,27 +525,61 @@ def _set_batch_dimension_for_flat_index(index) -> bool:
         flat_dynamic_shapes = [dict.fromkeys(dims, cst) for dims in dynamic_shapes]
         if return_flat:
             return tuple(flat_dynamic_shapes)
+
+        # Let's regroup.
         if len(flat_dynamic_shapes) == len(self._best_candidate.args) + len(
             self._best_candidate.kwargs
         ):
             # It means forward method is called with tensors only.
-            if not self._best_candidate.kwargs and not self._best_candidate.cst_kwargs:
+            if (
+                not self._best_candidate.kwargs
+                and not self._best_candidate.cst_kwargs
+                and not self.args_name_and_position
+            ):
                 # only positional arguments
                 return tuple(flat_dynamic_shapes)
             if not self._best_candidate.args:
                 # only named arguments
                 ds = dict(zip(list(self._best_candidate.kwargs), flat_dynamic_shapes))
-                return {**ds, **dict.fromkeys(self._best_candidate.cst_kwargs, None)}
+                return self._post_process_for_kwargs(
+                    {**ds, **dict.fromkeys(self._best_candidate.cst_kwargs, None)}
+                )
+            if not self.args_name_and_position:
+                # positional arguments needs to be moved to the named arguments
+                n_args = len(self._best_candidate.args)
+                pos_names = self.signature_names[:n_args]
+                return self._post_process_for_kwargs(
+                    {
+                        **dict(zip(pos_names, flat_dynamic_shapes[:n_args])),
+                        **dict(
+                            zip(
+                                list(self._best_candidate.kwargs),
+                                flat_dynamic_shapes[n_args:],
+                            )
+                        ),
+                        **dict.fromkeys(self._best_candidate.cst_kwargs, None),
+                    }
+                )
             # positional arguments needs to be moved to the named arguments
-            n_args = len(self._best_candidate.args)
+            n_args = min(len(self._best_candidate.args), self.args_name_and_position[1])
+            i_kwargs = max(
+                len(self._best_candidate.args), self.args_name_and_position[1]
+            )
+            var_pos = self.args_name_and_position[0]
             pos_names = self.signature_names[:n_args]
-            return {
-                **dict(zip(pos_names, flat_dynamic_shapes[:n_args])),
-                **dict(
-                    zip(list(self._best_candidate.kwargs), flat_dynamic_shapes[n_args:])
-                ),
-                **dict.fromkeys(self._best_candidate.cst_kwargs, None),
-            }
+            return self._post_process_for_kwargs(
+                {
+                    **dict(zip(pos_names, flat_dynamic_shapes[:n_args])),
+                    var_pos: tuple(flat_dynamic_shapes[n_args:i_kwargs]),
+                    **dict(
+                        zip(
+                            list(self._best_candidate.kwargs),
+                            flat_dynamic_shapes[i_kwargs:],
+                        )
+                    ),
+                    **dict.fromkeys(self._best_candidate.cst_kwargs, None),
+                }
+            )
 
         # nested types, here comes the fun part because the shapes cannot be unflattened,
         # custom classes must appear in their flattened shape.
@@ -518,20 +619,62 @@ def change_function(t):
                 **ds_kwargs,
                 **dict.fromkeys(self._best_candidate.cst_kwargs, None),
             }
-        if not ds_kwargs:
+        if not ds_kwargs and not self.args_name_and_position:
             return tuple(ds_args)
         if not ds_args:
-            return ds_kwargs
-        pos_names = self.signature_names[: len(ds_args)]
-        return {**dict(zip(pos_names, ds_args)), **ds_kwargs}
+            return self._post_process_for_kwargs(ds_kwargs)
+
+        if not self.args_name_and_position:
+            pos_names = self.signature_names[: len(ds_args)]
+            return self._post_process_for_kwargs(
+                {**dict(zip(pos_names, ds_args)), **ds_kwargs}
+            )
+
+        n_args = min(len(ds_args), self.args_name_and_position[1])
+        pos_names = self.signature_names[:n_args]
+        return self._post_process_for_kwargs(
+            {
+                **dict(zip(pos_names, ds_args[:n_args])),
+                self.args_name_and_position[0]: tuple(ds_args[n_args:]),
+                **ds_kwargs,
+            }
+        )
 
     def infer_arguments(
         self,
         index_or_candidate: InputCandidate | int | None = None,
         /,
         flat: bool = False,
-    ) -> list[torch.Tensor | None] | tuple[torch.Tensor, ...] | dict[str, torch.Tensor]:
-        """Infers arguments based on the collected tensors."""
+        as_args_kwargs: bool = False,
+    ) -> (
+        list[torch.Tensor | None]
+        | tuple[torch.Tensor, ...]
+        | dict[str, torch.Tensor]
+        | tuple[list[torch.Tensor] | tuple[torch.Tensor, ...], dict[str, torch.Tensor]]
+    ):
+        """Infers arguments based on the collected tensors.
+
+        Args:
+            index_or_candidate: If missing, the method selects one set of inputs
+                among the available ones, usually the set of inputs containing
+                with the highest number of tensors.
+                It then replaces None values and missing tensors with empty tensors.
+                If not missing, it can be an integer to fetch one of the stored set
+                or some inputs.
+            flat: If True, it returns a flattened list of tensors,
+                if False, it returns a tuple or a dictionary preserving
+                the nested structures. The flat version is used internally.
+                It produces a single list of tensors easier to process or modify
+                rather than a nested structure holding the same tensors.
+                The original structure can be restored with
+                ``torch.utils._pytree.tree_unflatten(flat_list, self.aligned_spec)``.
+                This mechanism is used to replace None values by empty tensors.
+            as_args_kwargs: If True, the method always returns `(args, kwargs)`,
+                otherwise, it returns either a tuple (only args) or a dictionary
+                (only kwargs) or raises an exception if it cannot do so.
+        Returns:
+            Inferred arguments, every optional tensor is replaced by an empty tensor.
+        """
         # This is already checked by _build_inputs_completed_with_none_values
         # but this is not always well captured by tools checking types.
         self.align_inputs_none_values()
@@ -540,9 +683,9 @@ def infer_arguments(
         if index_or_candidate is None:
             for cand in self.inputs:
                 args, kwargs = cand.args, cand.kwargs
-                if len(args) == len(self._best_candidate.args) and len(kwargs) == len(
-                    self._best_candidate.kwargs
-                ):
+                if len(args) == len(self._best_candidate.args or ()) and len(
+                    kwargs
+                ) == len(self._best_candidate.kwargs or {}):
                     candidate = cand
                     break
         elif isinstance(index_or_candidate, int):
@@ -622,13 +765,50 @@ def infer_arguments(
             # pyrefly: ignore[invalid-argument]
             kwargs = {**kwargs, **self._best_candidate.cst_kwargs}
 
-        if not kwargs:
-            return args
-        if not args:
+        if not as_args_kwargs:
+            if not kwargs:
+                return args
+            if not args:
+                return kwargs
+
+            # We need to move args to kwargs
+            if self.args_name_and_position:
+                raise RuntimeError(
+                    "Cannot return arguments "
+                    "as a single tuple or a single dictionary "
+                    "because of '*args' in the function signature. "
+                    "You need to set `as_args_kwargs=True`."
+                )
+            n_args = len(args)
+            pos_names = self.signature_names[:n_args]
+            return {**dict(zip(pos_names, args[:n_args])), **kwargs}
+
+        # Generic case.
+        return tuple(args), kwargs
+
+    def _post_process_for_kwargs(self, kwargs: dict[str, Any]) -> dict[str, Any]:
+        """:func:`torch.export.export` requires dynamic shapes and keyword arguments
+        that are not part of the explicit function signature but are collected via
+        ``**<kwargs_name>`` to be wrapped under the corresponding parameter name
+        (``self.kwargs_name``) as ``{<kwargs_name>: {'param': shape or tensor}}``.
+        This function ensures this wrapping is performed when ``self.kwargs_name`` is set.
+        """
+        if not self.kwargs_name:
+            # Nothing to do here.
             return kwargs
-        # We need to move args to kwargs
-        pos_names = self.signature_names[: len(args)]
-        return {**dict(zip(pos_names, args)), **kwargs}
+        to_be_moved = {k for k in kwargs if k not in self.signature_names}
+        if not to_be_moved:
+            return kwargs
+        keywords = {k: v for k, v in kwargs.items() if k in to_be_moved}
+        new_kwargs = {k: v for k, v in kwargs.items() if k not in to_be_moved}
+        if self.kwargs_name in new_kwargs:
+            raise ValueError(
+                f"Keyword argument name collision: received a keyword argument "
+                f"'{self.kwargs_name}' which conflicts with the **{self.kwargs_name} "
+                "parameter used to collect extra keyword arguments. "
+                "Passing a keyword argument with this name is not supported."
+            )
+        return {**new_kwargs, self.kwargs_name: keywords}
 
 
 class InputObserver:
@@ -636,6 +816,15 @@ class InputObserver:
     This information is used to infer dynamic shapes and
     export arguments.
 
+    Args:
+        value_if_missing: If an argument is missing,
+            a default value will be taken in this dictionary,
+            this is used when after the prefill step, an argument
+            disappears (such as `pixel_values`) and another one
+            is added (such as `past_key_values`).
+            The values are only to infer dynamic shapes and arguments,
+            not to run the model.
+
     Examples
     --------
     >>> input_observer = InputObserver()
@@ -660,11 +849,59 @@ class InputObserver:
     >>>     dynamic_shapes.input_observer.infer_dynamic_shapes(),
     >>> )
 
+    The last example considers an LLM taking images and text as inputs.
+    The first call to the forward method which we try to export has `pixel_values`
+    but no `past_key_values`. The next calls do not have `pixel_values` but
+    `past_key_values`. The observer understands `pixel_values` and `past_key_values`
+    are needed but may not both be specified at the same time.
+    Since `pixel_values` only appears in the first call, the observer cannot
+    tell how to infer an empty tensor for this argument. That's what the argument
+    `value_if_missing` is for. The following example is more than a toy example:
+    it shows how to use the observer with ``transformers``.
+
+    .. code-block:: python
+
+        from transformers import pipeline
+
+        model_id = "tiny-random/gemma-3"
+        pipe = pipeline(
+            "image-text-to-text",
+            model=model_id,
+            device="cpu",
+            trust_remote_code=True,
+            max_new_tokens=3,
+            dtype=torch.float16,
+        )
+        messages = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are a helpful assistant."}],
+            },
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG",
+                    },
+                    {"type": "text", "text": "What animal is on the candy?"},
+                ],
+            },
+        ]
+        observer = InputObserver(
+            value_if_missing=dict(
+                pixel_values=torch.empty((0, 3, 896, 896), dtype=torch.float16)
+            )
+        )
+        with observer(pipe.model):
+            pipe(text=messages, max_new_tokens=4)
+
     .. versionadded:: 2.11.0
     """
 
-    def __init__(self):
+    def __init__(self, value_if_missing: dict[str | int, Any] | None = None):
         self.info: InputObserverInfo | None = None
+        self.value_if_missing = value_if_missing or {}
 
     def _replaced_method(
         self,
@@ -716,6 +953,16 @@ def __call__(
         captured_method = getattr(model, method_name)
         sig = inspect.signature(captured_method)
         if self.info is None:
+            kwargs_names = [
+                p
+                for p in sig.parameters
+                if sig.parameters[p].kind == inspect.Parameter.VAR_KEYWORD
+            ]
+            args_names = [
+                (p, i)
+                for (i, p) in enumerate(sig.parameters)
+                if sig.parameters[p].kind == inspect.Parameter.VAR_POSITIONAL
+            ]
             self.info = InputObserverInfo(
                 signature_names=list(sig.parameters),
                 default_values={
@@ -724,6 +971,9 @@ def __call__(
                     if p.default != inspect.Parameter.empty
                     and isinstance(p.default, (int, bool, str, float))
                 },
+                value_if_missing=self.value_if_missing,
+                args_name_and_position=args_names[0] if args_names else None,
+                kwargs_name=kwargs_names[0] if kwargs_names else None,
             )
         n_already_stored = len(self.info)
         lambda_method = lambda *args, _cm=captured_method, _snc=(  # noqa: E731
@@ -777,7 +1027,13 @@ def infer_arguments(
         self,
         index_or_args_or_kwargs: tuple[Any] | dict[str, Any] | int | None = None,
         flat: bool = False,
-    ) -> list[torch.Tensor | None] | tuple[torch.Tensor, ...] | dict[str, torch.Tensor]:
+        as_args_kwargs: bool = False,
+    ) -> (
+        list[torch.Tensor | None]
+        | tuple[torch.Tensor, ...]
+        | dict[str, torch.Tensor]
+        | tuple[list[torch.Tensor] | tuple[torch.Tensor, ...], dict[str, torch.Tensor]]
+    ):
         """Infers arguments based on the collected tensors.
 
         Args:
@@ -789,8 +1045,15 @@ def infer_arguments(
                 or some inputs.
             flat: If True, it returns a flattened list of tensors,
                 if False, it returns a tuple or a dictionary preserving
-                the nested structures.
-
+                the nested structures. The flat version is used internally.
+                It produces a single list of tensors easier to process or modify
+                rather than a nested structure holding the same tensors.
+                The original structure can be restored with
+                ``torch.utils._pytree.tree_unflatten(flat_list, self.aligned_spec)``.
+                This mechanism is used to replace None values by empty tensors.
+            as_args_kwargs: If True, the method always returns `(args, kwargs)`,
+                otherwise, it returns either a tuple (only args) or a dictionary
+                (only kwargs) or raises an exception if it cannot do so.
         Returns:
             Inferred arguments, every optional tensor is replaced by an empty tensor.
         """
@@ -832,7 +1095,11 @@ def infer_arguments(
                 self.info._captured_inputs,
                 self.info.signature_names,
             )
-        return self.info.infer_arguments(index_or_candidate, flat=flat)
+        return self.info.infer_arguments(
+            index_or_candidate,
+            flat=flat,
+            as_args_kwargs=as_args_kwargs,
+        )
 
     def check_discrepancies(
         self,
@@ -843,25 +1110,26 @@ def check_discrepancies(
         initializer: Callable[
             [str | bytes], ort.InferenceSession
         ] = _onnx_program._ort_session_initializer,
-    ) -> list[dict[str, str | int | float]]:
+        skip_none: bool = True,
+    ) -> list[dict[str, str | int | float | bool]]:
         """Computes the discrepancies between the saved inputs and outputs
         with the saved onnx model.
 
         Args:
-            onnx_program:
-                Exported Model to verify.
-            atol:
-                Absolute tolerance, recommended values, 1e-4 for float, 1e-2 for float16.
-            rtol:
-                Relative tolerance.
-            progress_bar:
-                Shows a progress bar (requires `tqdm`).
-            initializer: The function to initialize the ONNX Runtime inference
+            onnx_program: Exported Model to verify.
+            atol: Absolute tolerance. Recommended values: 1e-4 for float, 1e-2 for float16.
+            rtol: Relative tolerance.
+            progress_bar: Shows a progress bar (requires `tqdm`).
+            initializer: The function called to initialize the ONNX Runtime inference
                 session with the specified model. By default, it uses the
                 `_ort_session_initializer` function.
+            skip_none: Does not check discrepancies when an output is None.
 
         Returns:
             A list of dictionaries, ready to be consumed by a dataframe.
+
+        The function catches exceptions and reports the error in the returned
+        summary.
         """
         # For big models, we should consider taking a filename to avoid the users
         # creating the model proto twice.
@@ -904,7 +1172,9 @@ def check_discrepancies(
 
             duration = time.perf_counter() - begin
             if error:
-                diff: dict[str, Any] = dict(error=error, SUCCESS=False)
+                diff: dict[str, str | int | float | bool] = dict(
+                    error=error, SUCCESS=False
+                )
             elif ort_outputs is None or len(outputs) != len(ort_outputs):
                 diff = dict(SUCCESS=False, error="not the same number of outputs")
             else:
@@ -915,7 +1185,7 @@ def check_discrepancies(
                 # pyrefly: ignore[no-matching-overload]
                 for torch_tensor, ort_tensor in zip(outputs, ort_outputs):
                     if torch_tensor is None or ort_tensor is None:
-                        if type(torch_tensor) is not type(ort_tensor):
+                        if type(torch_tensor) is not type(ort_tensor) and not skip_none:
                             success = False
                             error = "missing output"
                             break

From 9bb65cc0961441322f2b5e1acb4d7b92fe199507 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Tue, 24 Feb 2026 15:15:36 -0800
Subject: [PATCH 17/87] Bump transformers version to 5.2.0 (#175661)

Bump transformers version to 5.2.0 (#175274)

Take over the Dependabot PR from https://github.com/pytorch/pytorch/pull/175147 to fix the failures there

Pull Request resolved: https://github.com/pytorch/pytorch/pull/175274
Approved by: https://github.com/xmfan, https://github.com/malfet

(cherry picked from commit 268cfa727a55fda9a68a791bc3d4e0d941b220ba)

Co-authored-by: Huy Do <huydhn@gmail.com>
---
 .../ci_commit_pins/huggingface-requirements.txt  |  2 +-
 .../aot_eager_huggingface_inference.csv          |  2 +-
 .../aot_eager_huggingface_training.csv           |  2 +-
 ...ductor_amp_freezing_huggingface_inference.csv |  2 +-
 ...u_inductor_freezing_huggingface_inference.csv |  2 +-
 .../cpu_inductor_huggingface_inference.csv       |  2 +-
 .../dynamic_aot_eager_huggingface_inference.csv  |  2 +-
 .../dynamic_aot_eager_huggingface_training.csv   |  2 +-
 ...ynamic_cpu_inductor_huggingface_inference.csv |  2 +-
 ...ductor_amp_freezing_huggingface_inference.csv |  2 +-
 .../dynamic_inductor_huggingface_inference.csv   |  2 +-
 .../dynamic_inductor_huggingface_training.csv    |  2 +-
 .../dynamo_eager_huggingface_inference.csv       |  2 +-
 .../dynamo_eager_huggingface_training.csv        |  2 +-
 .../inductor_huggingface_inference.csv           |  2 +-
 .../inductor_huggingface_training.csv            |  2 +-
 .../rocm/aot_eager_huggingface_inference.csv     |  2 +-
 .../rocm/aot_eager_huggingface_training.csv      |  4 ++--
 .../dynamic_aot_eager_huggingface_inference.csv  |  2 +-
 .../dynamic_aot_eager_huggingface_training.csv   |  4 ++--
 .../dynamic_inductor_huggingface_inference.csv   |  2 +-
 .../dynamic_inductor_huggingface_training.csv    |  4 ++--
 .../rocm/dynamo_eager_huggingface_inference.csv  |  2 +-
 .../rocm/dynamo_eager_huggingface_training.csv   |  4 ++--
 .../rocm/inductor_huggingface_inference.csv      |  2 +-
 .../rocm/inductor_huggingface_training.csv       |  2 +-
 test/distributed/test_dynamo_distributed.py      | 16 ++++++++++++++++
 27 files changed, 46 insertions(+), 30 deletions(-)

diff --git a/.ci/docker/ci_commit_pins/huggingface-requirements.txt b/.ci/docker/ci_commit_pins/huggingface-requirements.txt
index 408343c9099c8..08538ff511057 100644
--- a/.ci/docker/ci_commit_pins/huggingface-requirements.txt
+++ b/.ci/docker/ci_commit_pins/huggingface-requirements.txt
@@ -1,2 +1,2 @@
-transformers==4.57.5
+transformers==5.2.0
 soxr==0.5.0
diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv
index d3f0fbba71826..87dd88078f222 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0
 
 
-M2M100ForConditionalGeneration,pass,0
+M2M100ForConditionalGeneration,pass,7
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv
index 82d21ea3cb298..5ca03b5ecf9fb 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5
 
 
-M2M100ForConditionalGeneration,pass,4
+M2M100ForConditionalGeneration,pass,11
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv
index 439c9bf530468..e4aabce10466d 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0
 
 
-M2M100ForConditionalGeneration,pass,0
+M2M100ForConditionalGeneration,pass,7
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv
index 439c9bf530468..e4aabce10466d 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0
 
 
-M2M100ForConditionalGeneration,pass,0
+M2M100ForConditionalGeneration,pass,7
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv
index 439c9bf530468..e4aabce10466d 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0
 
 
-M2M100ForConditionalGeneration,pass,0
+M2M100ForConditionalGeneration,pass,7
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv
index d3f0fbba71826..87dd88078f222 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0
 
 
-M2M100ForConditionalGeneration,pass,0
+M2M100ForConditionalGeneration,pass,7
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv
index 82d21ea3cb298..5ca03b5ecf9fb 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5
 
 
-M2M100ForConditionalGeneration,pass,4
+M2M100ForConditionalGeneration,pass,11
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv
index 439c9bf530468..e4aabce10466d 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0
 
 
-M2M100ForConditionalGeneration,pass,0
+M2M100ForConditionalGeneration,pass,7
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv
index 439c9bf530468..e4aabce10466d 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0
 
 
-M2M100ForConditionalGeneration,pass,0
+M2M100ForConditionalGeneration,pass,7
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv
index d3f0fbba71826..87dd88078f222 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0
 
 
-M2M100ForConditionalGeneration,pass,0
+M2M100ForConditionalGeneration,pass,7
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_training.csv
index 82d21ea3cb298..5ca03b5ecf9fb 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_training.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5
 
 
-M2M100ForConditionalGeneration,pass,4
+M2M100ForConditionalGeneration,pass,11
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv
index d3f0fbba71826..87dd88078f222 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0
 
 
-M2M100ForConditionalGeneration,pass,0
+M2M100ForConditionalGeneration,pass,7
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv
index 82d21ea3cb298..5ca03b5ecf9fb 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5
 
 
-M2M100ForConditionalGeneration,pass,4
+M2M100ForConditionalGeneration,pass,11
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv
index d3f0fbba71826..87dd88078f222 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0
 
 
-M2M100ForConditionalGeneration,pass,0
+M2M100ForConditionalGeneration,pass,7
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv
index 82d21ea3cb298..5ca03b5ecf9fb 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5
 
 
-M2M100ForConditionalGeneration,pass,4
+M2M100ForConditionalGeneration,pass,11
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_inference.csv
index d3f0fbba71826..87dd88078f222 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_inference.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0
 
 
-M2M100ForConditionalGeneration,pass,0
+M2M100ForConditionalGeneration,pass,7
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_training.csv
index deea60d3203e2..af120d3d31b3b 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_training.csv
@@ -30,7 +30,7 @@ DistilBertForMaskedLM,pass,5
 
 
-DistillGPT2,pass,7
+DistillGPT2,fail_accuracy,7
 
 
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5
 
 
-M2M100ForConditionalGeneration,pass,4
+M2M100ForConditionalGeneration,pass,11
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_inference.csv
index d3f0fbba71826..87dd88078f222 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_inference.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0
 
 
-M2M100ForConditionalGeneration,pass,0
+M2M100ForConditionalGeneration,pass,7
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_training.csv
index deea60d3203e2..af120d3d31b3b 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_training.csv
@@ -30,7 +30,7 @@ DistilBertForMaskedLM,pass,5
 
 
-DistillGPT2,pass,7
+DistillGPT2,fail_accuracy,7
 
 
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5
 
 
-M2M100ForConditionalGeneration,pass,4
+M2M100ForConditionalGeneration,pass,11
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_inference.csv
index d3f0fbba71826..87dd88078f222 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_inference.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0
 
 
-M2M100ForConditionalGeneration,pass,0
+M2M100ForConditionalGeneration,pass,7
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_training.csv
index deea60d3203e2..af120d3d31b3b 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_training.csv
@@ -30,7 +30,7 @@ DistilBertForMaskedLM,pass,5
 
 
-DistillGPT2,pass,7
+DistillGPT2,fail_accuracy,7
 
 
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5
 
 
-M2M100ForConditionalGeneration,pass,4
+M2M100ForConditionalGeneration,pass,11
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_inference.csv
index d3f0fbba71826..87dd88078f222 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_inference.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0
 
 
-M2M100ForConditionalGeneration,pass,0
+M2M100ForConditionalGeneration,pass,7
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv
index deea60d3203e2..af120d3d31b3b 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv
@@ -30,7 +30,7 @@ DistilBertForMaskedLM,pass,5
 
 
-DistillGPT2,pass,7
+DistillGPT2,fail_accuracy,7
 
 
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5
 
 
-M2M100ForConditionalGeneration,pass,4
+M2M100ForConditionalGeneration,pass,11
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_inference.csv
index 46f1e5adf4ec9..6f65795e3f04e 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_inference.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,0
 
 
-M2M100ForConditionalGeneration,pass,0
+M2M100ForConditionalGeneration,pass,7
 
 
diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_training.csv
index e06f3bde8af13..07ec2bb634b39 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_training.csv
@@ -50,7 +50,7 @@ LayoutLMForMaskedLM,pass,5
 
 
-M2M100ForConditionalGeneration,pass,4
+M2M100ForConditionalGeneration,pass,11
 
 
diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index 61186034c746f..418c845c88e86 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -351,6 +351,8 @@ def run_hf_bert_ddp(self, model, inputs, backend):
 
 
 class TestFakeDistributedSingleProc(torch._dynamo.test_case.TestCase):
+    @unittest.expectedFailure
+    # https://github.com/huggingface/transformers/issues/44188
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     @patch.object(config, "optimize_ddp", True)
     @patch.object(torch._inductor.config, "fallback_random", True)
@@ -363,6 +365,8 @@ def test_hf_bert_ddp_inductor(self):
         model = FakeDDP(model)
         run_hf_bert_ddp(self, model, inputs, "inductor")
 
+    @unittest.expectedFailure
+    # https://github.com/huggingface/transformers/issues/44188
     @patch.object(config, "optimize_ddp", True)
     def test_hf_bert_ddp_aot_eager(self):
         model, inputs = get_hf_bert(0)
@@ -597,6 +601,8 @@ def _test_hf_bert_ddp_inductor(self, static_graph):
             model = DDP(model, static_graph=static_graph)
             run_hf_bert_ddp(self, model, inputs, "inductor")
 
+    @unittest.expectedFailure
+    # https://github.com/huggingface/transformers/issues/44188
     @skip_if_lt_x_gpu(2)
     @import_transformers_or_skip()
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@@ -605,6 +611,8 @@ def _test_hf_bert_ddp_inductor(self, static_graph):
     def test_hf_bert_ddp_inductor(self):
         self._test_hf_bert_ddp_inductor(static_graph=False)
 
+    @unittest.expectedFailure
+    # https://github.com/huggingface/transformers/issues/44188
     @skip_if_lt_x_gpu(2)
     @import_transformers_or_skip()
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@@ -619,12 +627,16 @@ def _test_hf_bert_aot_eager(self, static_graph):
             model = DDP(model, static_graph=static_graph)
             run_hf_bert_ddp(self, model, inputs, "aot_eager")
 
+    @unittest.expectedFailure
+    # https://github.com/huggingface/transformers/issues/44188
     @skip_if_lt_x_gpu(2)
     @import_transformers_or_skip()
     @config.patch(optimize_ddp=True, enable_compiler_collectives=True)
     def test_hf_bert_ddp_aot_eager(self):
         self._test_hf_bert_aot_eager(static_graph=False)
 
+    @unittest.expectedFailure
+    # https://github.com/huggingface/transformers/issues/44188
     @skip_if_lt_x_gpu(2)
     @import_transformers_or_skip()
     @config.patch(optimize_ddp=True, enable_compiler_collectives=True)
@@ -843,6 +855,8 @@ def test_fsdp_activation_checkpointing(self):
                 find_first_node(cnt.graphs[0], tag_activation_checkpoint) is not None
             )
 
+    @unittest.expectedFailure
+    # https://github.com/huggingface/transformers/issues/44188
     @import_transformers_or_skip()
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     # TODO(whc) Investigate why cudagraphs breaks inductor+fsdp for hf_bert
@@ -888,6 +902,8 @@ def apply_fsdp(model, wrap_policy):
                 )
                 self.assertTrue(same(correct_results, opt_results))
 
+    @unittest.expectedFailure
+    # https://github.com/huggingface/transformers/issues/44188
     @import_transformers_or_skip()
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     # TODO(whc) Investigate why cudagraphs breaks inductor+fsdp for hf_bert

From 8fcce8e16da24ecf53d6a4b1227e76fbf1d8bd46 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Wed, 25 Feb 2026 12:12:08 -0800
Subject: [PATCH 18/87] [CI] Switch vLLM test and benchmark workflows to CUDA
 13.0 (#175781)

[CI] Switch vLLM test and benchmark workflows to CUDA 13.0 (#175393)

We should run vLLM test and benchmark on CUDA 13.0 now
Pull Request resolved: https://github.com/pytorch/pytorch/pull/175393
Approved by: https://github.com/zou3519

(cherry picked from commit 72d0e643eb90f14085bab5e9cab8d3cceb0d7847)

Co-authored-by: Huy Do <huydhn@gmail.com>
---
 .ci/docker/build.sh                                    | 4 ++--
 .ci/lumen_cli/cli/lib/core/vllm/vllm_test_library.yaml | 4 ++--
 .github/workflows/_vllm-benchmark.yml                  | 4 ++--
 .github/workflows/_vllm-build.yml                      | 2 +-
 .github/workflows/docker-builds.yml                    | 3 +--
 .github/workflows/vllm-benchmark.yml                   | 6 +++---
 .github/workflows/vllm.yml                             | 8 ++++----
 7 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 37c082e7d378e..1bc7286d4abd0 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -151,8 +151,8 @@ case "$tag" in
     TRITON=yes
     INDUCTOR_BENCHMARKS=yes
     ;;
-  pytorch-linux-jammy-cuda12.9-cudnn9-py3.12-gcc11-vllm)
-    CUDA_VERSION=12.9.1
+  pytorch-linux-jammy-cuda13.0-cudnn9-py3.12-gcc11-vllm)
+    CUDA_VERSION=13.0.2
     ANACONDA_PYTHON_VERSION=3.12
     GCC_VERSION=11
     VISION=yes
diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test_library.yaml b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test_library.yaml
index f2f450b6f9004..0327172b414a2 100644
--- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test_library.yaml
+++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test_library.yaml
@@ -104,10 +104,10 @@ vllm_pytorch_compilation_unit_tests:
 
 vllm_language_model_test_extended_generation_28_failure_test:
   title: Language Models Test (Extended Generation) 2.8 release failure
-  id: vllm_languagde_model_test_extended_generation_28_failure_test
+  id: vllm_language_model_test_extended_generation_28_failure_test
   package_install:
     - --no-build-isolation
-    - git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8
+    - git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0
   steps:
     - pytest -v -s models/language/generation/test_mistral.py
 
diff --git a/.github/workflows/_vllm-benchmark.yml b/.github/workflows/_vllm-benchmark.yml
index d5aa61a6341c7..7d43c271c4f1e 100644
--- a/.github/workflows/_vllm-benchmark.yml
+++ b/.github/workflows/_vllm-benchmark.yml
@@ -14,7 +14,7 @@ on:
       build_environment:
         required: true
         type: string
-        description: The build environment name, e.g. linux-jammy-cuda12.9-py3.12-gcc11
+        description: The build environment name, e.g. linux-jammy-cuda13.0-py3.12-gcc11
       pytorch_branch:
         required: false
         type: string
@@ -106,7 +106,7 @@ jobs:
             dist/ao/torchao-*.whl \
             dist/vllm/vllm-*.whl \
             dist/deepgemm/deep_gemm-*.whl \
-            --extra-index-url https://download.pytorch.org/whl/cu129 \
+            --extra-index-url https://download.pytorch.org/whl/cu130 \
             --index-strategy unsafe-best-match
 
       - name: Print some debug information
diff --git a/.github/workflows/_vllm-build.yml b/.github/workflows/_vllm-build.yml
index 630b5e8b6075d..b3e8c546c66f0 100644
--- a/.github/workflows/_vllm-build.yml
+++ b/.github/workflows/_vllm-build.yml
@@ -14,7 +14,7 @@ on:
       build_environment:
         required: true
         type: string
-        description: The build environment name, e.g. linux-jammy-cuda12.9-py3.12-gcc11
+        description: The build environment name, e.g. linux-jammy-cuda13.0-py3.12-gcc11
       pytorch_branch:
         required: false
         type: string
diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index a0df8bccc8df9..dc9ecef7860ae 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -51,10 +51,9 @@ jobs:
         docker-image-name: [
           pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
           pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
-          pytorch-linux-jammy-cuda12.9-cudnn9-py3.12-gcc11-vllm,
+          pytorch-linux-jammy-cuda13.0-cudnn9-py3.12-gcc11-vllm,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks,
           pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks,
-          pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
           pytorch-linux-jammy-py3.10-clang15,
           pytorch-linux-jammy-py3.11-clang15,
           pytorch-linux-jammy-py3.12-clang15,
diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml
index b15dbc7c2db2e..2d066241594b3 100644
--- a/.github/workflows/vllm-benchmark.yml
+++ b/.github/workflows/vllm-benchmark.yml
@@ -41,8 +41,8 @@ jobs:
     outputs:
       benchmark_matrix: ${{ steps.set-parameters.outputs.benchmark_matrix }}
       docker_image: ${{ steps.calculate-docker-image.outputs.docker-image }}
-      torch_cuda_arch_list: '8.0 8.9 9.0 10.0'
-      build_environment: linux-jammy-cuda12.9-py3.12-gcc11
+      torch_cuda_arch_list: '8.0 8.9 9.0 10.0 12.0'
+      build_environment: linux-jammy-cuda13.0-py3.12-gcc11
     steps:
       - uses: pytorch/test-infra/.github/actions/setup-uv@release/2.11
         with:
@@ -85,7 +85,7 @@ jobs:
         uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.11
         with:
           working-directory: pytorch/pytorch
-          docker-image-name: ci-image:pytorch-linux-jammy-cuda12.9-cudnn9-py3.12-gcc11-vllm
+          docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3.12-gcc11-vllm
 
   build:
     name: Build PyTorch and vLLM
diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml
index df657d057bbd0..eb1c78019b10e 100644
--- a/.github/workflows/vllm.yml
+++ b/.github/workflows/vllm.yml
@@ -27,9 +27,9 @@ jobs:
       allow-reuse-old-whl: false
       build-additional-packages: "vision audio"
       build-external-packages: "vllm"
-      build-environment: linux-jammy-cuda12.9-py3.12-gcc11
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.9-cudnn9-py3.12-gcc11-vllm
-      cuda-arch-list: '8.0 8.9 9.0 10.0'
+      build-environment: linux-jammy-cuda13.0-py3.12-gcc11
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3.12-gcc11-vllm
+      cuda-arch-list: '8.0 8.9 9.0 10.0 12.0'
       runner: linux.24xlarge.memory
       test-matrix: |
         { include: [
@@ -58,7 +58,7 @@ jobs:
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     with:
-      build-environment: linux-jammy-cuda12.9-py3.12-gcc11
+      build-environment: linux-jammy-cuda13.0-py3.12-gcc11
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
     secrets: inherit

From 6fe4bfbdcf13df37ee8297072d0977cde30d51e3 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Wed, 25 Feb 2026 12:15:15 -0800
Subject: [PATCH 19/87] Update vLLM pinned commit (#175238) (#175783)

Two tweaks:

* Move some tests around to match what they are in vLLM.  I'll work on a proper fix for this later to avoid the need to do this manually
* Fix 12.8 build. See https://github.com/vllm-project/vllm/pull/34791

Pull Request resolved: https://github.com/pytorch/pytorch/pull/175238
Approved by: https://github.com/angelayi, https://github.com/zou3519

Co-authored-by: PyTorch UpdateBot <pytorchupdatebot@users.noreply.github.com>
---
 .../cli/lib/core/vllm/vllm_test_library.yaml  | 24 +++++++++----------
 .github/ci_commit_pins/vllm.txt               |  2 +-
 .github/ci_configs/vllm/Dockerfile            |  6 ++---
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test_library.yaml b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test_library.yaml
index 0327172b414a2..948a771385686 100644
--- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test_library.yaml
+++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test_library.yaml
@@ -20,7 +20,8 @@ vllm_basic_models_test:
     - pytest -v -s models/test_registry.py
     - pytest -v -s models/test_utils.py
     - pytest -v -s models/test_vision.py
-    - pytest -v -s models/test_initialization.py
+    - pytest -v -s models/test_initialization.py -k 'not voxtral'
+    - HF_DATASETS_OFFLINE=0 TRANSFORMERS_OFFLINE=0 pytest -v -s models/test_initialization.py -k voxtral
 
 vllm_entrypoints_test:
   title: Entrypoints Test
@@ -60,7 +61,7 @@ vllm_distributed_test_28_failure_test:
     VLLM_WORKER_MULTIPROC_METHOD: spawn
   num_gpus: 4
   steps:
-    - pytest -v -s distributed/test_sequence_parallel.py
+    - pytest -v -s compile/correctness_e2e/test_sequence_parallel.py
 
 vllm_lora_28_failure_test:
   title: LoRA pytorch 2.8 failure test
@@ -85,21 +86,20 @@ vllm_multi_model_test_28_failure_test:
   package_install:
     - git+https://github.com/TIGER-AI-Lab/Mantis.git
   steps:
-    - pytest -v -s models/multimodal/generation/test_voxtral.py -k 'not 5-128-half'
-    - HF_DATASETS_OFFLINE=0 TRANSFORMERS_OFFLINE=0 pytest -v -s models/multimodal/generation/test_voxtral.py -k 5-128-half
+    - HF_DATASETS_OFFLINE=0 TRANSFORMERS_OFFLINE=0 pytest -v -s models/multimodal/generation/test_voxtral.py
     - pytest -v -s models/multimodal/pooling
 
 vllm_pytorch_compilation_unit_tests:
   title: PyTorch Compilation Unit Tests
   id: vllm_pytorch_compilation_unit_tests
   steps:
-    - pytest -v -s compile/test_pass_manager.py
-    - pytest -v -s compile/test_fusion.py
-    - pytest -v -s compile/test_fusion_attn.py
-    - pytest -v -s compile/test_silu_mul_quant_fusion.py
-    - pytest -v -s compile/distributed/test_sequence_parallelism.py
-    - pytest -v -s compile/distributed/test_async_tp.py
-    - pytest -v -s compile/distributed/test_fusion_all_reduce.py
+    - pytest -v -s compile/passes/test_pass_manager.py
+    - pytest -v -s compile/passes/test_fusion.py
+    - pytest -v -s compile/passes/test_fusion_attn.py
+    - pytest -v -s compile/passes/test_silu_mul_quant_fusion.py
+    - pytest -v -s compile/passes/distributed/test_sequence_parallelism.py
+    - pytest -v -s compile/passes/distributed/test_async_tp.py
+    - pytest -v -s compile/passes/distributed/test_fusion_all_reduce.py
     - pytest -v -s compile/test_decorator.py
 
 vllm_language_model_test_extended_generation_28_failure_test:
@@ -118,7 +118,7 @@ vllm_distributed_test_2_gpu_28_failure_test:
     VLLM_WORKER_MULTIPROC_METHOD: spawn
   num_gpus: 4
   steps:
-    - pytest -v -s distributed/test_sequence_parallel.py
+    - pytest -v -s compile/correctness_e2e/test_sequence_parallel.py
 
 vllm_lora_test:
   title: LoRA Test %N
diff --git a/.github/ci_commit_pins/vllm.txt b/.github/ci_commit_pins/vllm.txt
index 235f99edfa759..c211a526574ec 100644
--- a/.github/ci_commit_pins/vllm.txt
+++ b/.github/ci_commit_pins/vllm.txt
@@ -1 +1 @@
-52ee21021a87735d46c4245c60bc0be42dd58c73
+a4047d4ea993fd52038433d87c16e603bee4f214
diff --git a/.github/ci_configs/vllm/Dockerfile b/.github/ci_configs/vllm/Dockerfile
index 1d6ac16926834..549c336a444cb 100644
--- a/.github/ci_configs/vllm/Dockerfile
+++ b/.github/ci_configs/vllm/Dockerfile
@@ -146,6 +146,9 @@ ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
 ARG SCCACHE_REGION_NAME=us-west-2
 ARG SCCACHE_S3_NO_CREDENTIALS=0
 
+ARG torch_cuda_arch_list='8.0;8.9;9.0;10.0;12.0'
+ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
+
 # Use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=.git,target=.git \
@@ -171,9 +174,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         && sccache --show-stats; \
     fi
 
-ARG torch_cuda_arch_list='8.0 8.6 8.9 9.0'
-ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
-
 ARG vllm_target_device="cuda"
 ENV VLLM_TARGET_DEVICE=${vllm_target_device}
 ENV CCACHE_DIR=/root/.cache/ccache

From 57614edc43a7e6c902b7a12389058252171a49eb Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Wed, 25 Feb 2026 12:31:27 -0800
Subject: [PATCH 20/87] [ROCm][CI] Upgrade ROCm CI to 7.2 - 4/N (#175767)

[ROCm][CI] Upgrade ROCm CI to 7.2 - 4/N (#173188)

In parallel with https://github.com/pytorch/pytorch/pull/173187

Pull Request resolved: https://github.com/pytorch/pytorch/pull/173188
Approved by: https://github.com/jeffdaily


(cherry picked from commit 8301e14b7003034ed707e3164361f789c93a45f5)

Co-authored-by: Jithun Nair <jithun.nair@amd.com>
Co-authored-by: Jeff Daily <jeff.daily@amd.com>
Co-authored-by: Jack Taylor <jack.taylor@amd.com>
---
 .ci/docker/build.sh                           |  2 +-
 .ci/docker/common/install_rocm.sh             | 23 ++++++++++++
 c10/util/complex_math.h                       | 35 +++++++++++++++++++
 test/distributed/test_dynamo_distributed.py   |  5 ++-
 test/inductor/test_aot_inductor.py            |  1 +
 test/inductor/test_ck_backend.py              |  4 +++
 .../test_torchinductor_dynamic_shapes.py      |  2 ++
 test/run_test.py                              |  4 +++
 test/test_linalg.py                           | 10 +++++-
 .../_internal/common_methods_invocations.py   | 16 +++++++++
 10 files changed, 99 insertions(+), 3 deletions(-)

diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 1bc7286d4abd0..9a051dc84aef5 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -195,7 +195,7 @@ case "$tag" in
     fi
     GCC_VERSION=11
     VISION=yes
-    ROCM_VERSION=7.1
+    ROCM_VERSION=7.2
     NINJA_VERSION=1.9.0
     TRITON=yes
     KATEX=yes
diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh
index 21e5968016bd6..8b673a23f9de5 100644
--- a/.ci/docker/common/install_rocm.sh
+++ b/.ci/docker/common/install_rocm.sh
@@ -154,6 +154,29 @@ EOF
       fi
     fi
 
+    # ROCm 7.2 needs a rocprofiler-sdk fix that isn't available until 7.2.1
+    if [[ $(ver $ROCM_VERSION) -eq $(ver 7.2) ]]; then
+        git clone --no-checkout --filter=blob:none https://github.com/ROCm/rocm-systems.git
+        pushd rocm-systems/
+        git sparse-checkout init --cone
+        git sparse-checkout set projects/rocprofiler-sdk shared/rocprofiler-compute
+        git checkout develop
+        git checkout rocm-7.2.0
+        git config --global user.email "you@example.com"
+        git config --global user.name "Your Name"
+        git cherry-pick a71cc3cc88ed68b24c40cefec77d764053044862
+        sudo apt install -y cmake libdw-dev libsqlite3-dev
+        cmake                                         \
+              -B rocprofiler-sdk-build                \
+              -DCMAKE_INSTALL_PREFIX=/opt/rocm        \
+              -DCMAKE_PREFIX_PATH=/opt/rocm           \
+              -DGPU_TARGETS="${PYTORCH_ROCM_ARCH}"    \
+              projects/rocprofiler-sdk
+        cmake --build rocprofiler-sdk-build --target all --parallel $(nproc)
+        cmake --build rocprofiler-sdk-build --target install
+        popd
+    fi
+
     # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
     for kdb in /opt/rocm/share/miopen/db/*.kdb
     do
diff --git a/c10/util/complex_math.h b/c10/util/complex_math.h
index 2b591026c94da..d369df5059231 100644
--- a/c10/util/complex_math.h
+++ b/c10/util/complex_math.h
@@ -86,6 +86,41 @@ C10_HOST_DEVICE inline c10::complex<T> pow(
 #endif
 }
 
+// Regression in ROCm 7.2. See https://github.com/ROCm/rocm-libraries/pull/3836.
+// Specialized version for complex<float> on AMD GPUs to use FMA-based
+// multiplication
+#if defined(__HIPCC__)
+namespace detail {
+// FMA-aware complex multiplication for float precision on AMD GPUs.
+// This prevents the SLP vectorizer from breaking FMA formation, which causes
+// numerical precision loss in complex arithmetic.
+// The issue occurs when the vectorizer packs scalar multiplies before the
+// backend can form FMA instructions, so results are rounded twice, not once.
+C10_HOST_DEVICE inline thrust::complex<float> complex_mul_fma(
+    thrust::complex<float> a,
+    thrust::complex<float> b) {
+  // Complex multiplication: (a.r + a.i*i) * (b.r + b.i*i)
+  // = (a.r*b.r - a.i*b.i) + (a.r*b.i + a.i*b.r)*i
+  // Using __builtin_fmaf ensures FMA at source level:
+  // real: a.r*b.r + (-(a.i*b.i)) = FMA(a.r, b.r, -(a.i*b.i))
+  // imag: a.i*b.r + a.r*b.i = FMA(a.r, b.i, a.i*b.r)
+  float real_part = __builtin_fmaf(a.real(), b.real(), -(a.imag() * b.imag()));
+  float imag_part = __builtin_fmaf(a.real(), b.imag(), a.imag() * b.real());
+  return thrust::complex<float>(real_part, imag_part);
+}
+} // namespace detail
+
+template <>
+C10_HOST_DEVICE inline c10::complex<float> pow(
+    const c10::complex<float>& x,
+    const c10::complex<float>& y) {
+  auto log_x = thrust::log(static_cast<thrust::complex<float>>(x));
+  auto y_log_x =
+      detail::complex_mul_fma(static_cast<thrust::complex<float>>(y), log_x);
+  return static_cast<c10::complex<float>>(thrust::exp(y_log_x));
+}
+#endif
+
 template <typename T>
 C10_HOST_DEVICE inline c10::complex<T> pow(
     const c10::complex<T>& x,
diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index 418c845c88e86..cf09bf7ed9606 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -47,7 +47,7 @@
     requires_accelerator_dist_backend,
     skip_if_lt_x_gpu,
 )
-from torch.testing._internal.common_utils import skipIfXpu
+from torch.testing._internal.common_utils import MI350_ARCH, skipIfRocmArch, skipIfXpu
 from torch.testing._internal.inductor_utils import HAS_GPU
 from torch.testing._internal.triton_utils import requires_cuda_and_triton
 
@@ -808,6 +808,7 @@ def test_fsdp_unspecialized_forced_getattr_inline(self):
             outputs = fsdp_m(inputs)
             self.assertTrue(same(correct_outputs, outputs))
 
+    @skipIfRocmArch(MI350_ARCH)  # regression in ROCm 7.2
     @config.patch(enable_compiler_collectives=True)
     @skip_if_lt_x_gpu(1)
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@@ -1495,6 +1496,7 @@ def test_ddp_baseline_aot_eager(self):
         outputs = ddp_m(inputs)
         self.assertTrue(same(correct_outputs, outputs))
 
+    @skipIfRocmArch(MI350_ARCH)  # regression in ROCm 7.2
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     @patch.object(config, "optimize_ddp", False)
     def test_ddp_baseline_inductor(self):
@@ -1700,6 +1702,7 @@ def alibi_score_mod(self, score, b, h, q_idx, kv_idx):
         model(hidden_states)
         torch.accelerator.synchronize()
 
+    @skipIfRocmArch(MI350_ARCH)  # regression in ROCm 7.2
     @patch.object(config, "optimize_ddp", True)
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     def test_graph_split_inductor(self):
diff --git a/test/inductor/test_aot_inductor.py b/test/inductor/test_aot_inductor.py
index 0f0fcdb842085..8ac981f0bbc37 100644
--- a/test/inductor/test_aot_inductor.py
+++ b/test/inductor/test_aot_inductor.py
@@ -3502,6 +3502,7 @@ def forward(self, x):
         example_inputs = (torch.randn(3, 10, device=self.device),)
         self.check_model(Model(), example_inputs)
 
+    @skipIfRocmArch(NAVI_ARCH)  # regression on ROCm 7.2
     def test_repeated_calling(self):
         if self.device != "cuda":
             raise unittest.SkipTest("requires CUDA")
diff --git a/test/inductor/test_ck_backend.py b/test/inductor/test_ck_backend.py
index 079be79fcc9d8..8cae41dfbae37 100644
--- a/test/inductor/test_ck_backend.py
+++ b/test/inductor/test_ck_backend.py
@@ -16,7 +16,9 @@
 from torch.testing._internal.common_cuda import tf32_off
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
+    MI350_ARCH,
     parametrize,
+    skipIfRocmArch,
 )
 from torch.testing._internal.inductor_utils import (
     _quantize_rowwise,
@@ -235,6 +237,8 @@ def mm(a, b):
             Y_eager = a @ b
             torch.testing.assert_close(Y_compiled, Y_eager, equal_nan=True)
 
+    # regression in ROCm 7.2: significantly mismatched elements
+    @skipIfRocmArch(MI350_ARCH)
     @unittest.skipIf(not torch.version.hip, "ROCM only")
     @unittest.mock.patch.dict(os.environ, _test_env)
     @parametrize("max_autotune_gemm_backends", ("CK", "ATen,Triton,CK"))
diff --git a/test/inductor/test_torchinductor_dynamic_shapes.py b/test/inductor/test_torchinductor_dynamic_shapes.py
index e579184978349..0d1d2427855f8 100644
--- a/test/inductor/test_torchinductor_dynamic_shapes.py
+++ b/test/inductor/test_torchinductor_dynamic_shapes.py
@@ -29,6 +29,7 @@
     IS_FBCODE,
     parametrize,
     serialTest,
+    skipIfRocm,
     TEST_CUDA_MEM_LEAK_CHECK,
     TEST_WITH_ASAN,
 )
@@ -630,6 +631,7 @@ def f(x, w):
         torch.compile(fullgraph=True)(f)(x, w).sum().backward()
         self.assertEqual(orig_w, w.grad)
 
+    @skipIfRocm  # regression in ROCm 7.2, XBLOCK should remain 64 (got 256)
     @torch._dynamo.config.patch(
         capture_scalar_outputs=True, capture_dynamic_output_shape_ops=True
     )
diff --git a/test/run_test.py b/test/run_test.py
index 820206480be11..a4eaaedb0f544 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -202,6 +202,10 @@ def __contains__(self, item):
 if TEST_WITH_ROCM and isRocmArchAnyOf(("gfx1100",)):
     # Some autotune tests on gfx1100 are hanging, disable for now
     ROCM_BLOCKLIST.append("inductor/test_max_autotune")
+    # ROCm 7.2 gfx1100 started timing out due to these
+    ROCM_BLOCKLIST.append("inductor/test_torchinductor_dynamic_shapes")
+    ROCM_BLOCKLIST.append("inductor/test_torchinductor_opinfo")
+    ROCM_BLOCKLIST.append("inductor/test_ck_backend")
 
 S390X_BLOCKLIST = [
     # these tests fail due to various reasons
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 9a84211dde6c7..346a6c0204479 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -3767,7 +3767,15 @@ def run_test(a_shape, ind):
             self.assertEqual(ans, result)
 
         # compare to NumPy output
-        run_test((12, 3, 4), ind=1)
+        if not torch.version.hip:
+            # https://github.com/pytorch/pytorch/issues/174913
+            # Skip only this config on ROCm (instead of the entire unit test via
+            # @skipIfRocm) because of a hipSolver regression in ROCm 7.2.
+            # The failure below was seen on MI355, MI300, and MI200.
+            # Mismatched elements: 1 / 144 (0.7%)
+            # Greatest absolute difference: 0.00130462646484375 at index (1, 3, 6) (up to 0.001 allowed)
+            # Greatest relative difference: 1.5133813576539978e-05 at index (1, 3, 6) (up to 1.3e-06 allowed)
+            run_test((12, 3, 4), ind=1)
         run_test((3, 8, 24), ind=2)
         run_test((18, 3, 3, 2), ind=1)
         run_test((1, 4, 2, 2), ind=2)
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 7a9e9df8a519d..e3ac2e1c4aade 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -16052,6 +16052,10 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
                DecorateInfo(
                    toleranceOverride({torch.chalf: tol(atol=9e-2, rtol=9e-2), }),
                    'TestCommon', 'test_complex_half_reference_testing'),
+               DecorateInfo(
+                   toleranceOverride({torch.float32: tol(atol=5e-5, rtol=5e-6)}),
+                   'TestOperators', 'test_vjpvmap', device_type='cuda'
+               ),
                DecorateInfo(
                    toleranceOverride({torch.half: tol(atol=9e-3, rtol=2e-1), }),
                    'TestInductorOpInfo', 'test_comprehensive', device_type='cpu')],
@@ -16205,6 +16209,18 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
                    toleranceOverride({torch.float32: tol(atol=5e-5, rtol=5e-6)}),
                    'TestOperators', 'test_vjpvmap',
                ),
+               DecorateInfo(
+                   toleranceOverride({torch.float32: tol(atol=5e-5, rtol=5e-6)}),
+                   'TestOperators', 'test_jvpvjp', device_type="cuda"
+               ),
+               DecorateInfo(
+                   toleranceOverride({torch.float32: tol(atol=5e-5, rtol=5e-6)}),
+                   'TestOperators', 'test_vjp', device_type="cuda"
+               ),
+               DecorateInfo(
+                   toleranceOverride({torch.float32: tol(atol=5e-5, rtol=5e-6)}),
+                   'TestCompositeCompliance', 'test_backward', device_type="cuda"
+               ),
                DecorateInfo(
                    toleranceOverride({torch.float16: tol(atol=5e-3, rtol=1e-3)}),
                    'TestInductorOpInfo', 'test_comprehensive',

From c67ec25fdb87e850f542408b2610656b9da66c4a Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Wed, 25 Feb 2026 12:33:32 -0800
Subject: [PATCH 21/87] [ROCm] Added CUDA check to test_pattern_matcher
 (#175766)

[ROCm] Added CUDA check to test_pattern_matcher (#175092)

Forward fix to #173856.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/175092
Approved by: https://github.com/jeffdaily, https://github.com/Skylion007

(cherry picked from commit f6dcaa37201c0b6499f31e264b482574507b3085)

Co-authored-by: Arash Pakbin <arash.pakbin@amd.com>
---
 test/dynamo/test_activation_checkpointing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/dynamo/test_activation_checkpointing.py b/test/dynamo/test_activation_checkpointing.py
index 5d3e52612309f..dc6b910f9b111 100644
--- a/test/dynamo/test_activation_checkpointing.py
+++ b/test/dynamo/test_activation_checkpointing.py
@@ -1821,7 +1821,7 @@ def debug_compile_fx_inner(graph, example_inputs, *args, **kwargs):
         prefer_cudnn = (
             cudnn_version > 91500 and dprops.major in (9, 10) and dprops.minor in (0, 3)
         )
-        if prefer_cudnn:
+        if prefer_cudnn and torch.version.cuda:
             sdpa_op = torch.ops.aten._scaled_dot_product_cudnn_attention.default
         else:
             sdpa_op = torch.ops.aten._scaled_dot_product_flash_attention.default

From 9e94d2dace4cdea1661fb0ba6123d8906bd0e568 Mon Sep 17 00:00:00 2001
From: eqy <eddiey@nvidia.com>
Date: Wed, 25 Feb 2026 14:56:03 -0800
Subject: [PATCH 22/87] [cuDNN][2.11] cuDNN upgrade / sync to 9.19 for
 linux/windows on 2.11  (#175672)

* [WINDOWS][cuDNN] Fix cuDNN version mismatch in Windows (#175547)

Authored with claude code
Previous PRs such as https://github.com/pytorch/pytorch/pull/174310 updated cuDNN versions for Linux builds but neglected to do so for Windows.

Claude wrote all of the lintrunner additions for consistency checking
Pull Request resolved: https://github.com/pytorch/pytorch/pull/175547
Approved by: https://github.com/Skylion007, https://github.com/atalman, https://github.com/malfet

* [cuDNN] Upgrade cuDNN to 9.19 for 12.8 and 13.0 wheels (#174310)

Currently being tested internally, currently looks OK

also needed for https://github.com/pytorch/pytorch/pull/172108

Pull Request resolved: https://github.com/pytorch/pytorch/pull/174310
Approved by: https://github.com/Skylion007, https://github.com/ngimel, https://github.com/malfet
---
 .ci/docker/common/install_cuda.sh             |  4 +-
 .ci/pytorch/windows/internal/cuda_install.bat |  8 +--
 .../scripts/generate_binary_build_matrix.py   | 66 ++++++++++++++++++-
 ...linux-aarch64-binary-manywheel-nightly.yml | 28 ++++----
 ...nerated-linux-binary-manywheel-nightly.yml | 28 ++++----
 test/jit/test_freezing.py                     |  3 +-
 6 files changed, 100 insertions(+), 37 deletions(-)

diff --git a/.ci/docker/common/install_cuda.sh b/.ci/docker/common/install_cuda.sh
index 2d1db795d9cb4..c031e0928784b 100644
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@@ -129,7 +129,7 @@ function install_129 {
 }
 
 function install_128 {
-  CUDNN_VERSION=9.17.1.4
+  CUDNN_VERSION=9.19.0.56
   echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
   # install CUDA 12.8.1 in the same container
   install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux
@@ -147,7 +147,7 @@ function install_128 {
 }
 
 function install_130 {
-  CUDNN_VERSION=9.17.1.4
+  CUDNN_VERSION=9.19.0.56
   echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
   # install CUDA 13.0 in the same container
   install_cuda 13.0.2 cuda_13.0.2_580.95.05_linux
diff --git a/.ci/pytorch/windows/internal/cuda_install.bat b/.ci/pytorch/windows/internal/cuda_install.bat
index 1349d3e661f55..0c8c023831e45 100644
--- a/.ci/pytorch/windows/internal/cuda_install.bat
+++ b/.ci/pytorch/windows/internal/cuda_install.bat
@@ -43,7 +43,7 @@ if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
     set "ARGS=cuda_profiler_api_12.6 thrust_12.6 nvcc_12.6 cuobjdump_12.6 nvprune_12.6 nvprof_12.6 cupti_12.6 cublas_12.6 cublas_dev_12.6 cudart_12.6 cufft_12.6 cufft_dev_12.6 curand_12.6 curand_dev_12.6 cusolver_12.6 cusolver_dev_12.6 cusparse_12.6 cusparse_dev_12.6 npp_12.6 npp_dev_12.6 nvrtc_12.6 nvrtc_dev_12.6 nvml_dev_12.6 nvjitlink_12.6 nvtx_12.6"
 )
 
-set CUDNN_FOLDER=cudnn-windows-x86_64-9.5.0.50_cuda12-archive
+set CUDNN_FOLDER=cudnn-windows-x86_64-9.10.2.21_cuda12-archive
 set CUDNN_LIB_FOLDER="lib"
 set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
 if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
@@ -70,7 +70,7 @@ if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
     set "ARGS=cuda_profiler_api_12.8 thrust_12.8 nvcc_12.8 cuobjdump_12.8 nvprune_12.8 nvprof_12.8 cupti_12.8 cublas_12.8 cublas_dev_12.8 cudart_12.8 cufft_12.8 cufft_dev_12.8 curand_12.8 curand_dev_12.8 cusolver_12.8 cusolver_dev_12.8 cusparse_12.8 cusparse_dev_12.8 npp_12.8 npp_dev_12.8 nvrtc_12.8 nvrtc_dev_12.8 nvml_dev_12.8 nvjitlink_12.8 nvtx_12.8"
 )
 
-set CUDNN_FOLDER=cudnn-windows-x86_64-9.7.0.66_cuda12-archive
+set CUDNN_FOLDER=cudnn-windows-x86_64-9.19.0.56_cuda12-archive
 set CUDNN_LIB_FOLDER="lib"
 set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
 if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
@@ -97,7 +97,7 @@ if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
     set "ARGS=cuda_profiler_api_12.9 thrust_12.9 nvcc_12.9 cuobjdump_12.9 nvprune_12.9 nvprof_12.9 cupti_12.9 cublas_12.9 cublas_dev_12.9 cudart_12.9 cufft_12.9 cufft_dev_12.9 curand_12.9 curand_dev_12.9 cusolver_12.9 cusolver_dev_12.9 cusparse_12.9 cusparse_dev_12.9 npp_12.9 npp_dev_12.9 nvrtc_12.9 nvrtc_dev_12.9 nvml_dev_12.9 nvjitlink_12.9 nvtx_12.9"
 )
 
-set CUDNN_FOLDER=cudnn-windows-x86_64-9.10.2.21_cuda12-archive
+set CUDNN_FOLDER=cudnn-windows-x86_64-9.17.1.4_cuda12-archive
 set CUDNN_LIB_FOLDER="lib"
 set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
 if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
@@ -124,7 +124,7 @@ if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
     set "ARGS="
 )
 
-set CUDNN_FOLDER=cudnn-windows-x86_64-9.12.0.46_cuda13-archive
+set CUDNN_FOLDER=cudnn-windows-x86_64-9.19.0.56_cuda13-archive
 set CUDNN_LIB_FOLDER="lib"
 set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
 if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py
index 267afbe216fa4..eda03260446be 100644
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@@ -60,7 +60,7 @@
     "12.8": (
         "cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | "  # noqa: B950
         "cuda-bindings==12.9.4; platform_system == 'Linux' | "
-        "nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | "
+        "nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | "
         "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
         "nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | "
         "nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'"
@@ -76,7 +76,7 @@
     "13.0": (
         "cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | "  # noqa: B950
         "cuda-bindings==13.0.3; platform_system == 'Linux' | "
-        "nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | "
+        "nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | "
         "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
         "nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | "
         "nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'"
@@ -175,6 +175,67 @@ def validate_nccl_dep_consistency(arch_version: str) -> None:
         )
 
 
+def _parse_linux_cudnn_versions() -> dict[str, str]:
+    """Return {cuda_short_version: cudnn_version} from install_cuda.sh."""
+    text = (REPO_ROOT / ".ci" / "docker" / "common" / "install_cuda.sh").read_text()
+    results: dict[str, str] = {}
+    func_re = re.compile(r"^function install_(\d+)\s*\{")
+    cudnn_re = re.compile(r"^\s*CUDNN_VERSION=(\S+)")
+    current_func: str | None = None
+    for line in text.splitlines():
+        m = func_re.match(line)
+        if m:
+            digits = m.group(1)
+            current_func = digits[:-1] + "." + digits[-1]
+            continue
+        if current_func is not None:
+            m = cudnn_re.match(line)
+            if m:
+                results[current_func] = m.group(1)
+                current_func = None
+    return results
+
+
+def _parse_windows_cudnn_versions() -> dict[str, str]:
+    """Return {cuda_short_version: cudnn_version} from cuda_install.bat."""
+    text = (
+        REPO_ROOT / ".ci" / "pytorch" / "windows" / "internal" / "cuda_install.bat"
+    ).read_text()
+    results: dict[str, str] = {}
+    label_re = re.compile(r"^:cuda(\d+)\s*$")
+    cudnn_re = re.compile(
+        r"^set CUDNN_FOLDER=cudnn-windows-x86_64-([0-9.]+)_cuda\d+-archive"
+    )
+    current_label: str | None = None
+    for line in text.splitlines():
+        m = label_re.match(line)
+        if m:
+            digits = m.group(1)
+            current_label = digits[:-1] + "." + digits[-1]
+            continue
+        if current_label is not None:
+            m = cudnn_re.match(line)
+            if m:
+                results[current_label] = m.group(1)
+                current_label = None
+    return results
+
+
+def validate_cudnn_version_consistency(arch_version: str) -> None:
+    linux_versions = _parse_linux_cudnn_versions()
+    windows_versions = _parse_windows_cudnn_versions()
+    linux_ver = linux_versions.get(arch_version)
+    windows_ver = windows_versions.get(arch_version)
+    if linux_ver is None or windows_ver is None:
+        return
+    if linux_ver != windows_ver:
+        raise RuntimeError(
+            f"cuDNN version mismatch for CUDA {arch_version}: "
+            f"Linux has {linux_ver} (.ci/docker/common/install_cuda.sh) "
+            f"but Windows has {windows_ver} (.ci/pytorch/windows/internal/cuda_install.bat)"
+        )
+
+
 def arch_type(arch_version: str) -> str:
     if arch_version in CUDA_ARCHES:
         return "cuda"
@@ -421,6 +482,7 @@ def generate_wheels_matrix(
 arch_version = ""
 for arch_version in CUDA_ARCHES:
     validate_nccl_dep_consistency(arch_version)
+    validate_cudnn_version_consistency(arch_version)
 del arch_version
 
 
diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
index 0dc6a42e77d24..567d099675a60 100644
--- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
@@ -204,7 +204,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_10-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -346,7 +346,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_10-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -554,7 +554,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_11-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -696,7 +696,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_11-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -904,7 +904,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_12-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1046,7 +1046,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_12-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1254,7 +1254,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1396,7 +1396,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1604,7 +1604,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13t-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1746,7 +1746,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13t-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1954,7 +1954,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -2096,7 +2096,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -2304,7 +2304,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14t-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -2446,7 +2446,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14t-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
index fcd006886abed..2e3aaa64f4d42 100644
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@@ -195,7 +195,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_10-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -329,7 +329,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_10-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -859,7 +859,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_11-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -993,7 +993,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_11-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -1523,7 +1523,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_12-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -1657,7 +1657,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_12-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -2187,7 +2187,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -2321,7 +2321,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -2851,7 +2851,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13t-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -2985,7 +2985,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13t-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -3515,7 +3515,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -3649,7 +3649,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -4179,7 +4179,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14t-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -4313,7 +4313,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14t-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py
index d7fbbdab0096e..a695e0e9b3f7d 100644
--- a/test/jit/test_freezing.py
+++ b/test/jit/test_freezing.py
@@ -11,7 +11,7 @@
 import torch.nn.functional as F
 from torch.jit._recursive import wrap_cpp_module
 from torch.testing import FileCheck
-from torch.testing._internal.common_cuda import TEST_CUDA, TEST_CUDNN
+from torch.testing._internal.common_cuda import TEST_CUDA, TEST_CUDNN, tf32_on_and_off
 from torch.testing._internal.common_quantization import skipIfNoFBGEMM
 from torch.testing._internal.common_quantized import override_quantized_engine
 from torch.testing._internal.common_utils import (
@@ -2964,6 +2964,7 @@ def test_conv_to_mkldnn_no_mkldnn(self):
             inp = torch.rand([4, 3, 4, 4])
             self.assertEqual(frozen(inp), mod(inp))
 
+    @tf32_on_and_off(0.005)
     @unittest.skipIf(not (TEST_CUDNN or TEST_WITH_ROCM), "requires CUDNN")
     def test_freeze_conv_relu_fusion(self):
         with set_default_dtype(torch.float):

From 195f9cd260844ec2ba9644cdb5a8d3c98e7d602e Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Thu, 26 Feb 2026 07:19:59 -0800
Subject: [PATCH 23/87] Fix pep517 release handling (#175793)

Fix pep517 release handling (#175635)

Fix pep517 release handling

Fix sdist upload: correct PEP 440 version and file path
PYTORCH_BUILD_VERSION was being set unconditionally to the raw tag/branch
name (including the 'v' prefix for tags), which fails PEP 440 validation in
get_torch_version(); it was also not exported, so Python subprocesses
couldn't see it anyway.

Fix both issues: set and export PYTORCH_BUILD_VERSION only for release/RC
tags, stripping the 'v' prefix and converting '-rc' to 'rc' for PEP 440
compliance. For branch pushes and PRs, leave it unset so get_torch_version
falls back to version.txt.

Also fix the sdist upload path: python -m build places the sdist in dist/,
so move it to the workspace root for consistency with all upload steps
(release, GHA artifact, and S3).

These fixes are tested/verified in the second PR in this stack.

This commit was created with the help of Claude Sonnet 4.6.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/175635
Approved by: https://github.com/atalman, https://github.com/malfet

(cherry picked from commit 11eba5b6efcd78f68730bdff9bb701d47a9e256f)

Co-authored-by: Klaus Zimmermann <klaus.zimmermann@quansight.com>
---
 .github/workflows/create_release.yml | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml
index 4932631f2d2eb..2506ad4192bfd 100644
--- a/.github/workflows/create_release.yml
+++ b/.github/workflows/create_release.yml
@@ -54,11 +54,19 @@ jobs:
           tag_or_branch="${tag_or_branch#refs/heads/}"
           # replace directory separators with _ in branch name
           tag_or_branch="${tag_or_branch//\//_}"
+          # Set PYTORCH_BUILD_VERSION only for release/RC tags; convert to PEP 440
+          if [[ "$PT_GITHUB_REF" =~ ^refs/tags/v[0-9]+\.[0-9]+\.[0-9]+(-rc[0-9]+)?$ ]]; then
+            ver="${PT_GITHUB_REF#refs/tags/v}"
+            export PYTORCH_BUILD_VERSION="${ver/-rc/rc}"
+            export PYTORCH_BUILD_NUMBER=0
+          fi
           torch_version="$(python -c 'from tools.generate_torch_version import get_torch_version; print(get_torch_version())')"
           {
             echo "PT_RELEASE_NAME=pytorch-$tag_or_branch";
             echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz";
             echo "PT_PEP517_RELEASE_FILE=torch-${torch_version}.tar.gz";
+            echo "PYTORCH_BUILD_VERSION=${PYTORCH_BUILD_VERSION:-}";
+            echo "PYTORCH_BUILD_NUMBER=${PYTORCH_BUILD_NUMBER:-}";
           } >> "$GITHUB_ENV"
       - name: Checkout optional submodules
         run: python3 tools/optional_submodules.py
@@ -83,7 +91,7 @@ jobs:
         run: |
           pip install build==1.2.2.post1 || exit 1
           python -m build --sdist || exit 1
-          cd dist || exit 1
+          mv dist/$PT_PEP517_RELEASE_FILE . || exit 1
       - name: Upload source distribution for release
         if: ${{ github.event_name == 'release' }}
         uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2
@@ -102,7 +110,7 @@ jobs:
         uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
         with:
           name: ${{ env.PT_PEP517_RELEASE_FILE }}
-          path: dist/${{ env.PT_PEP517_RELEASE_FILE }}
+          path: ${{ env.PT_PEP517_RELEASE_FILE }}
       - name: Set output
         id: release_name
         run: |

From f99ab991dcd3719ee25dd3377a53ea12e518308e Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Fri, 27 Feb 2026 11:06:08 -0800
Subject: [PATCH 24/87] [CI] Update inductor CI jobs to CUDA 13.0 (#175826)
 (#175955)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Docker image switch — All workflows that used pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks now use the cuda13.0 variant. The
   unused CUDA 12.8 image definition was removed from .ci/docker/build.sh and its corresponding entry dropped from docker-builds.yml.
2. Duplicate cleanup — Five workflows previously had both a CUDA 12.8 build and a separate -cuda13 build. After migrating the main build to CUDA 13.0, the
   -cuda13 duplicates were removed:
    - inductor-periodic.yml — removed periodic-dynamo-benchmarks-build-cuda13 + test
    - inductor-micro-benchmark.yml — removed build-cuda13 + test-cuda13
    - inductor-perf-compare.yml — removed build-cuda13 + test-cuda13
    - inductor-perf-test-nightly.yml — removed build-cuda13 + 3 test jobs
    - trunk.yml — removed inductor-build-cuda13
Pull Request resolved: https://github.com/pytorch/pytorch/pull/175826
Approved by: https://github.com/atalman

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 .ci/docker/build.sh                           | 11 ---
 .github/workflows/docker-builds.yml           |  1 -
 .../workflows/inductor-micro-benchmark.yml    | 35 +------
 .github/workflows/inductor-perf-compare.yml   | 43 +-------
 .github/workflows/inductor-perf-test-b200.yml | 12 +--
 .../inductor-perf-test-nightly-h100.yml       |  4 +-
 .../workflows/inductor-perf-test-nightly.yml  | 98 ++-----------------
 .github/workflows/inductor-periodic.yml       | 59 +----------
 .github/workflows/inductor-unittest.yml       |  4 +-
 .github/workflows/inductor.yml                | 35 +------
 .github/workflows/pull.yml                    | 32 ------
 .github/workflows/torchbench.yml              |  8 +-
 .github/workflows/trunk.yml                   | 13 ---
 13 files changed, 34 insertions(+), 321 deletions(-)

diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 9a051dc84aef5..7df6453c22da9 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -129,17 +129,6 @@ case "$tag" in
     UCC_COMMIT=${_UCC_COMMIT}
     TRITON=yes
     ;;
-  pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks)
-    CUDA_VERSION=12.8.1
-    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=11
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    TRITON=yes
-    INDUCTOR_BENCHMARKS=yes
-    ;;
   pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks)
     CUDA_VERSION=13.0.2
     ANACONDA_PYTHON_VERSION=3.10
diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index dc9ecef7860ae..10dc09fcec2f1 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -52,7 +52,6 @@ jobs:
           pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
           pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
           pytorch-linux-jammy-cuda13.0-cudnn9-py3.12-gcc11-vllm,
-          pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks,
           pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks,
           pytorch-linux-jammy-py3.10-clang15,
           pytorch-linux-jammy-py3.11-clang15,
diff --git a/.github/workflows/inductor-micro-benchmark.yml b/.github/workflows/inductor-micro-benchmark.yml
index 35a1a4ef972a5..19a6c764e403f 100644
--- a/.github/workflows/inductor-micro-benchmark.yml
+++ b/.github/workflows/inductor-micro-benchmark.yml
@@ -30,14 +30,14 @@ jobs:
       opt_out_experiments: lf
 
   build:
-    name: cuda12.8-py3.10-gcc11-sm80
+    name: cuda13.0-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-build.yml
     needs:
       - get-default-label-prefix
     with:
       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.0'
       test-matrix: |
         { include: [
@@ -46,7 +46,7 @@ jobs:
     secrets: inherit
 
   test:
-    name: cuda12.8-py3.10-gcc11-sm80
+    name: cuda13.0-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     with:
@@ -55,30 +55,3 @@ jobs:
       test-matrix: ${{ needs.build.outputs.test-matrix }}
       timeout-minutes: 720
     secrets: inherit
-
-  build-cuda13:
-    name: cuda13.0-py3.10-gcc11-sm80
-    uses: ./.github/workflows/_linux-build.yml
-    needs:
-      - get-default-label-prefix
-    with:
-      runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks
-      cuda-arch-list: '8.0'
-      test-matrix: |
-        { include: [
-          { config: "inductor-micro-benchmark", shard: 1, num_shards: 1, runner: "linux.aws.a100", owners: ["oncall:pt2"] },
-        ]}
-    secrets: inherit
-
-  test-cuda13:
-    name: cuda13.0-py3.10-gcc11-sm80
-    uses: ./.github/workflows/_linux-test.yml
-    needs: build-cuda13
-    with:
-      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80
-      docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
-      test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
-      timeout-minutes: 720
-    secrets: inherit
diff --git a/.github/workflows/inductor-perf-compare.yml b/.github/workflows/inductor-perf-compare.yml
index 6235d02970849..3a9b01e97aed5 100644
--- a/.github/workflows/inductor-perf-compare.yml
+++ b/.github/workflows/inductor-perf-compare.yml
@@ -27,15 +27,15 @@ jobs:
       opt_out_experiments: lf
 
   build:
-    name: cuda12.8-py3.10-gcc11-sm80
+    name: cuda13.0-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-build.yml
     needs:
       - get-default-label-prefix
     with:
       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
       runner: linux.4xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.0'
       test-matrix: |
         { include: [
@@ -48,7 +48,7 @@ jobs:
     secrets: inherit
 
   test:
-    name: cuda12.8-py3.10-gcc11-sm80
+    name: cuda13.0-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     with:
@@ -60,38 +60,3 @@ jobs:
       monitor-log-interval: 15
       monitor-data-collect-interval: 4
     secrets: inherit
-
-  build-cuda13:
-    name: cuda13.0-py3.10-gcc11-sm80
-    uses: ./.github/workflows/_linux-build.yml
-    needs:
-      - get-default-label-prefix
-    with:
-      runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      runner: linux.4xlarge.memory
-      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks
-      cuda-arch-list: '8.0'
-      test-matrix: |
-        { include: [
-          { config: "inductor_huggingface_perf_compare", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
-          { config: "inductor_timm_perf_compare", shard: 1, num_shards: 2, runner: "linux.aws.a100" },
-          { config: "inductor_timm_perf_compare", shard: 2, num_shards: 2, runner: "linux.aws.a100" },
-          { config: "inductor_torchbench_perf_compare", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
-        ]}
-      build-additional-packages: "vision audio torchao"
-    secrets: inherit
-
-  test-cuda13:
-    name: cuda13.0-py3.10-gcc11-sm80
-    uses: ./.github/workflows/_linux-test.yml
-    needs: build-cuda13
-    with:
-      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80
-      docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
-      test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
-      # disable monitor in perf tests for more investigation
-      disable-monitor: false
-      monitor-log-interval: 15
-      monitor-data-collect-interval: 4
-    secrets: inherit
diff --git a/.github/workflows/inductor-perf-test-b200.yml b/.github/workflows/inductor-perf-test-b200.yml
index 003f27476bcb9..0c2558fb772a8 100644
--- a/.github/workflows/inductor-perf-test-b200.yml
+++ b/.github/workflows/inductor-perf-test-b200.yml
@@ -78,7 +78,7 @@ jobs:
       opt_out_experiments: lf
 
   build:
-    name: cuda12.8-py3.10-gcc11-sm100
+    name: cuda13.0-py3.10-gcc11-sm100
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
@@ -88,8 +88,8 @@ jobs:
       # from trunk. Also use a memory-intensive runner here because memory is
       # usually the bottleneck
       runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm100
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '10.0'
       test-matrix: |
         { include: [
@@ -102,7 +102,7 @@ jobs:
     secrets: inherit
 
   test-periodically:
-    name: cuda12.8-py3.10-gcc11-sm100
+    name: cuda13.0-py3.10-gcc11-sm100
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     if: github.event.schedule == '0 7 * * 1-6'
@@ -119,7 +119,7 @@ jobs:
     secrets: inherit
 
   test-weekly:
-    name: cuda12.8-py3.10-gcc11-sm100
+    name: cuda13.0-py3.10-gcc11-sm100
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     if: github.event.schedule == '0 7 * * 0'
@@ -136,7 +136,7 @@ jobs:
     secrets: inherit
 
   test:
-    name: cuda12.8-py3.10-gcc11-sm100
+    name: cuda13.0-py3.10-gcc11-sm100
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     with:
diff --git a/.github/workflows/inductor-perf-test-nightly-h100.yml b/.github/workflows/inductor-perf-test-nightly-h100.yml
index a929475355888..0c027682cc168 100644
--- a/.github/workflows/inductor-perf-test-nightly-h100.yml
+++ b/.github/workflows/inductor-perf-test-nightly-h100.yml
@@ -93,8 +93,8 @@ jobs:
       # from trunk. Also use a memory-intensive runner here because memory is
       # usually the bottleneck
       runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm90
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '9.0'
       test-matrix: |
         { include: [
diff --git a/.github/workflows/inductor-perf-test-nightly.yml b/.github/workflows/inductor-perf-test-nightly.yml
index 6539a81f7c196..1c684a9d7a270 100644
--- a/.github/workflows/inductor-perf-test-nightly.yml
+++ b/.github/workflows/inductor-perf-test-nightly.yml
@@ -78,15 +78,15 @@ jobs:
       opt_out_experiments: lf
 
   build:
-    name: cuda12.8-py3.10-gcc11-sm80
+    name: cuda13.0-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       # Every bit to make perf run faster helps
       runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.0'
       test-matrix: |
         { include: [
@@ -115,7 +115,7 @@ jobs:
     secrets: inherit
 
   test-nightly:
-    name: cuda12.8-py3.10-gcc11-sm80
+    name: cuda13.0-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     if: github.event.schedule == '0 7 * * 1-6'
@@ -131,7 +131,7 @@ jobs:
     secrets: inherit
 
   test-weekly:
-    name: cuda12.8-py3.10-gcc11-sm80
+    name: cuda13.0-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     if: github.event.schedule == '0 7 * * 0'
@@ -148,7 +148,7 @@ jobs:
     secrets: inherit
 
   test:
-    name: cuda12.8-py3.10-gcc11-sm80
+    name: cuda13.0-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     if: github.event_name == 'workflow_dispatch'
@@ -162,89 +162,3 @@ jobs:
       monitor-log-interval: 15
       monitor-data-collect-interval: 4
     secrets: inherit
-
-  build-cuda13:
-    name: cuda13.0-py3.10-gcc11-sm80
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      # Every bit to make perf run faster helps
-      runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks
-      cuda-arch-list: '8.0'
-      test-matrix: |
-        { include: [
-          { config: "inductor_huggingface_perf", shard: 1, num_shards: 5, runner: "linux.aws.a100" },
-          { config: "inductor_huggingface_perf", shard: 2, num_shards: 5, runner: "linux.aws.a100" },
-          { config: "inductor_huggingface_perf", shard: 3, num_shards: 5, runner: "linux.aws.a100" },
-          { config: "inductor_huggingface_perf", shard: 4, num_shards: 5, runner: "linux.aws.a100" },
-          { config: "inductor_huggingface_perf", shard: 5, num_shards: 5, runner: "linux.aws.a100" },
-          { config: "inductor_timm_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
-          { config: "inductor_timm_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
-          { config: "inductor_timm_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
-          { config: "inductor_timm_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
-          { config: "inductor_timm_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
-          { config: "inductor_timm_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
-          { config: "inductor_torchbench_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
-          { config: "inductor_torchbench_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
-          { config: "inductor_torchbench_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
-          { config: "inductor_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
-          { config: "inductor_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
-          { config: "inductor_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
-          { config: "cachebench", shard: 1, num_shards: 2, runner: "linux.aws.a100" },
-          { config: "cachebench", shard: 2, num_shards: 2, runner: "linux.aws.a100" },
-        ]}
-      selected-test-configs: ${{ inputs.benchmark_configs }}
-      build-additional-packages: "vision audio torchao"
-    secrets: inherit
-
-  test-nightly-cuda13:
-    name: cuda13.0-py3.10-gcc11-sm80
-    uses: ./.github/workflows/_linux-test.yml
-    needs: build-cuda13
-    if: github.event.schedule == '0 7 * * 1-6'
-    with:
-      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80
-      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true
-      docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
-      test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
-      timeout-minutes: 720
-      disable-monitor: false
-      monitor-log-interval: 15
-      monitor-data-collect-interval: 4
-    secrets: inherit
-
-  test-weekly-cuda13:
-    name: cuda13.0-py3.10-gcc11-sm80
-    uses: ./.github/workflows/_linux-test.yml
-    needs: build-cuda13
-    if: github.event.schedule == '0 7 * * 0'
-    with:
-      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80
-      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true
-      docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
-      test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
-      timeout-minutes: 1440
-      # disable monitor in perf tests, next step is to enable it
-      disable-monitor: false
-      monitor-log-interval: 15
-      monitor-data-collect-interval: 4
-    secrets: inherit
-
-  test-cuda13:
-    name: cuda13.0-py3.10-gcc11-sm80
-    uses: ./.github/workflows/_linux-test.yml
-    needs: build-cuda13
-    if: github.event_name == 'workflow_dispatch'
-    with:
-      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80
-      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}
-      docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
-      test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
-      timeout-minutes: 720
-      disable-monitor: false
-      monitor-log-interval: 15
-      monitor-data-collect-interval: 4
-    secrets: inherit
diff --git a/.github/workflows/inductor-periodic.yml b/.github/workflows/inductor-periodic.yml
index 1e87adc965c74..1506f1ac375d9 100644
--- a/.github/workflows/inductor-periodic.yml
+++ b/.github/workflows/inductor-periodic.yml
@@ -38,8 +38,8 @@ jobs:
     with:
       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
       runner: linux.4xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm86
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.0;8.6'
       test-matrix: |
         { include: [
@@ -82,57 +82,6 @@ jobs:
       test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }}
     secrets: inherit
 
-  periodic-dynamo-benchmarks-build-cuda13:
-    name: periodic-dynamo-benchmarks-build-cuda13
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-default-label-prefix
-    with:
-      runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      runner: linux.4xlarge.memory
-      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm86
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks
-      cuda-arch-list: '8.0;8.6'
-      test-matrix: |
-        { include: [
-          { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
-          { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-        ]}
-      build-additional-packages: "vision audio torchao"
-    secrets: inherit
-
-  periodic-dynamo-benchmarks-test-cuda13:
-    name: periodic-dynamo-benchmarks-test-cuda13
-    uses: ./.github/workflows/_linux-test.yml
-    needs: periodic-dynamo-benchmarks-build-cuda13
-    with:
-      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm86
-      docker-image: ${{ needs.periodic-dynamo-benchmarks-build-cuda13.outputs.docker-image }}
-      test-matrix: ${{ needs.periodic-dynamo-benchmarks-build-cuda13.outputs.test-matrix }}
-    secrets: inherit
-
   rocm-periodic-dynamo-benchmarks-build:
     if: github.repository_owner == 'pytorch'
     name: rocm-periodic-dynamo-benchmarks-build
@@ -191,8 +140,8 @@ jobs:
     with:
       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
       runner: linux.4xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.0'
       test-matrix: |
         { include: [
diff --git a/.github/workflows/inductor-unittest.yml b/.github/workflows/inductor-unittest.yml
index ea6ce55dbd470..bce149251a477 100644
--- a/.github/workflows/inductor-unittest.yml
+++ b/.github/workflows/inductor-unittest.yml
@@ -36,8 +36,8 @@ jobs:
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm86
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.6'
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       test-matrix: |
diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml
index 3736415f11b74..7fe6a193283f5 100644
--- a/.github/workflows/inductor.yml
+++ b/.github/workflows/inductor.yml
@@ -49,8 +49,8 @@ jobs:
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm86
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.6'
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       runner: linux.4xlarge.memory
@@ -75,37 +75,6 @@ jobs:
       test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
     secrets: inherit
 
-  inductor-build-cuda13:
-    name: inductor-build-cuda13
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm86
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks
-      cuda-arch-list: '8.6'
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: linux.4xlarge.memory
-      test-matrix: |
-        { include: [
-          { config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
-        ]}
-      build-additional-packages: "vision audio torchao"
-    secrets: inherit
-
-  inductor-test-cuda13:
-    name: inductor-test-cuda13
-    uses: ./.github/workflows/_linux-test.yml
-    needs: inductor-build-cuda13
-    with:
-      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm86
-      docker-image: ${{ needs.inductor-build-cuda13.outputs.docker-image }}
-      test-matrix: ${{ needs.inductor-build-cuda13.outputs.test-matrix }}
-    secrets: inherit
-
   inductor-cpu-build:
     name: inductor-cpu-build
     uses: ./.github/workflows/_linux-build.yml
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 22989263dd22f..96024e2ef1a6d 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -430,38 +430,6 @@ jobs:
         ]}
     secrets: inherit
 
-  linux-jammy-cuda12_8-py3_10-gcc11-inductor-build:
-    if: ${{ needs.job-filter.outputs.jobs == '' || contains(needs.job-filter.outputs.jobs, ' cuda12.8-py3.10-gcc11-sm75 ') }}
-    name: cuda12.8-py3.10-gcc11-sm75
-    uses: ./.github/workflows/_linux-build.yml
-    needs:
-      - get-label-type
-      - job-filter
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm75
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
-      cuda-arch-list: '7.5'
-      test-matrix: |
-        { include: [
-          { config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
-        ]}
-    secrets: inherit
-
-  linux-jammy-cuda12_8-py3_10-gcc11-inductor-test:
-    if: ${{ needs.job-filter.outputs.jobs == '' || contains(needs.job-filter.outputs.jobs, ' cuda12.8-py3.10-gcc11-sm75 ') }}
-    name: cuda12.8-py3.10-gcc11-sm75
-    uses: ./.github/workflows/_linux-test.yml
-    needs:
-      - linux-jammy-cuda12_8-py3_10-gcc11-inductor-build
-      - job-filter
-    with:
-      build-environment: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-inductor-build.outputs.build-environment }}
-      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-inductor-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-inductor-build.outputs.test-matrix }}
-      tests-to-include: ${{ github.event.inputs.tests-to-include || '' }}
-    secrets: inherit
-
   linux-jammy-cuda13_0-py3_10-gcc11-inductor-build:
     if: ${{ needs.job-filter.outputs.jobs == '' || contains(needs.job-filter.outputs.jobs, ' cuda13.0-py3.10-gcc11-sm75 ') }}
     name: cuda13.0-py3.10-gcc11-sm75
diff --git a/.github/workflows/torchbench.yml b/.github/workflows/torchbench.yml
index a84ff38e72471..ba18a1c8c2a32 100644
--- a/.github/workflows/torchbench.yml
+++ b/.github/workflows/torchbench.yml
@@ -26,14 +26,14 @@ jobs:
       curr_ref_type: ${{ github.ref_type }}
 
   build:
-    name: cuda12.8-py3.10-gcc11-sm80
+    name: cuda13.0-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-build.yml
     needs:
       - get-default-label-prefix
     with:
       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.0'
       test-matrix: |
         { include: [
@@ -42,7 +42,7 @@ jobs:
     secrets: inherit
 
   test:
-    name: cuda12.8-py3.10-gcc11-sm80
+    name: cuda13.0-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     with:
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 5b741eb67954a..18de36f752130 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -313,19 +313,6 @@ jobs:
     if: ${{ needs.job-filter.outputs.jobs == '' || contains(needs.job-filter.outputs.jobs, ' inductor-build ') }}
     name: inductor-build
     uses: ./.github/workflows/_linux-build.yml
-    needs:
-      - get-label-type
-      - job-filter
-    with:
-      build-environment: linux-jammy-cuda12.8-py3.12-gcc11-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
-      cuda-arch-list: '8.0'
-    secrets: inherit
-
-  inductor-build-cuda13:
-    if: ${{ needs.job-filter.outputs.jobs == '' || contains(needs.job-filter.outputs.jobs, ' inductor-build-cuda13 ') }}
-    name: inductor-build-cuda13
-    uses: ./.github/workflows/_linux-build.yml
     needs:
       - get-label-type
       - job-filter

From f95d7a4bacff6a1e4f11a232c0f8a3f2b42bed4e Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Sun, 8 Mar 2026 06:55:48 -0700
Subject: [PATCH 25/87] update previous version 2.10 installation in get start
 xpu (#176408)

update previous version 2.10 installation in get start xpu  (#176141)

Update the previous-version (2.10) installation instructions in the XPU getting-started guide for release 2.11.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/176141
Approved by: https://github.com/EikanWang

(cherry picked from commit 14f828cb8c2ac10e66497b3bfe32ffe557753d5f)

Co-authored-by: ZhaoqiongZ <106125927+ZhaoqiongZ@users.noreply.github.com>
---
 docs/source/notes/get_start_xpu.rst | 45 +++++++++++++++++------------
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/docs/source/notes/get_start_xpu.rst b/docs/source/notes/get_start_xpu.rst
index 5a1442e813805..dc0cb984c47fc 100644
--- a/docs/source/notes/get_start_xpu.rst
+++ b/docs/source/notes/get_start_xpu.rst
@@ -4,7 +4,8 @@ Getting Started on Intel GPU
 Hardware Prerequisite
 ---------------------
 
-For Intel Data Center GPU
+Intel Data Center GPU
+^^^^^^^^^^^^^^^^^^^^^
 
 .. list-table::
    :widths: 50 50 50 50
@@ -19,7 +20,8 @@ For Intel Data Center GPU
      - yes
      - yes
 
-For Intel Client GPU
+Intel Client GPU
+^^^^^^^^^^^^^^^^
 
 +---------------------------------------+-----------------------------------------------------------------------------------------------------+
 | Supported OS                          | Validated Hardware                                                                                  |
@@ -51,37 +53,42 @@ Binaries
 
 Now that we have `Intel GPU Driver <https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpu.html>`_ installed, use the following commands to install ``pytorch``, ``torchvision``, ``torchaudio``.
 
-For release wheels
+Stable Releases
+~~~~~~~~~~~~~~~
+
+To install the latest stable release wheels for Intel GPU (XPU):
 
 .. code-block:: bash
 
     pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
 
-For nightly wheels
+Nightly Builds
+~~~~~~~~~~~~~~
+
+To install the latest preview/nightly wheels:
 
 .. code-block:: bash
 
     pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu
 
-For previous versions
+Previous Versions
+~~~~~~~~~~~~~~~~~
 
-v2.9.1
+**v2.10.0**
 
 .. code-block:: bash
 
-    pip install torch==2.9.1 torchvision==0.24.1 torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/xpu
+    pip install torch==2.10.0 torchvision==0.25.0 torchaudio==2.10.0 --index-url https://download.pytorch.org/whl/xpu
 
-v2.9.0
+**v2.9.1**
 
 .. code-block:: bash
 
-    pip install torch==2.9.0 torchvision==0.24.0 torchaudio==2.9.0 --index-url https://download.pytorch.org/whl/xpu
-
-v2.8.0
+    pip install torch==2.9.1 torchvision==0.24.1 torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/xpu
 
-.. code-block:: bash
+.. note::
 
-    pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/xpu
+   For older wheels, please refer to the `previous versions <https://pytorch.org/get-started/previous-versions/>`_ page and ensure you use the ``xpu`` index URL.
 
 From Source
 ^^^^^^^^^^^
@@ -137,7 +144,7 @@ Here are a few inference workflow examples.
 
 
 Inference with FP32
-"""""""""""""""""""
+~~~~~~~~~~~~~~~~~~~
 
 .. code-block:: python
 
@@ -157,7 +164,7 @@ Inference with FP32
     print("Execution finished")
 
 Inference with AMP
-""""""""""""""""""
+~~~~~~~~~~~~~~~~~~
 
 .. code-block:: python
 
@@ -181,7 +188,7 @@ Inference with AMP
     print("Execution finished")
 
 Inference with ``torch.compile``
-""""""""""""""""""""""""""""""""
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. code-block:: python
 
@@ -222,7 +229,7 @@ Training Examples
 Here is a few training workflow examples.
 
 Train with FP32
-"""""""""""""""
+~~~~~~~~~~~~~~~
 
 .. code-block:: python
 
@@ -279,7 +286,7 @@ Train with FP32
     print("Execution finished")
 
 Train with AMP
-""""""""""""""
+~~~~~~~~~~~~~~
 
 .. note::
    Training with ``GradScaler`` requires hardware support for ``FP64``. ``FP64`` is not natively supported by the Intel® Arc™ A-Series Graphics. If you run your workloads on Intel® Arc™ A-Series Graphics, please disable ``GradScaler``.
@@ -347,7 +354,7 @@ Train with AMP
     print("Execution finished")
 
 Train with ``torch.compile``
-""""""""""""""""""""""""""""
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. code-block:: python
 

From dc12b65cd31ba18cbdc7f2e12e7d1564a67770d0 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Sun, 8 Mar 2026 06:57:24 -0700
Subject: [PATCH 26/87] [inductor] Fix Identity comparability and evalf
 recursion (#176783)

[inductor] Fix Identity comparability and evalf recursion (#175975)

Fixes #175856

## Summary

This PR adds a narrow `Identity._eval_evalf(self, prec)` override in
`torch/utils/_sympy/functions.py` to fix the SymPy recursion/comparison failure
seen in Inductor simplification (e.g. `Max(0, Identity(-6))`).

The implementation only unwraps comparable integer constants:

```python
def _eval_evalf(self, prec):
    arg = self.args[0]
    if arg.is_Integer and arg.is_comparable:
        return arg
    return None
```

This keeps the fix minimal for the index-math path involved in the bug.

Tests
Added targeted tests in test/inductor/test_utils.py:

`testIdentityComparisonNoRecursion`
`testIdentityComparableNumbersInMinMax`
`testIdentityRationalComparisonNoRecursion`

Validation
Repro fails on unpatched builds in the same SymPy/Inductor path.
Repro passes with this fix applied.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/175975
Approved by: https://github.com/azahed98, https://github.com/laithsakka

(cherry picked from commit cea64de6cc14c55cf3d909787a51b2cd64b3aa04)

Co-authored-by: bhack <bhack@users.noreply.github.com>
---
 test/inductor/test_utils.py     | 16 ++++++++++++++++
 torch/utils/_sympy/functions.py | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)

diff --git a/test/inductor/test_utils.py b/test/inductor/test_utils.py
index 41bb05c4cf594..cd37bcb57ede3 100644
--- a/test/inductor/test_utils.py
+++ b/test/inductor/test_utils.py
@@ -87,6 +87,22 @@ def testSympySubsIdentityNonComparable(self):
         result = sympy_subs(expr, {q0: I})
         self.assertTrue(result.has(I))
 
+    def testIdentityComparisonNoRecursion(self):
+        self.assertTrue(Identity(sympify("0")) >= 0)
+        self.assertFalse(Identity(sympify("-6")) >= 0)
+        self.assertTrue(0 >= Identity(sympify("-6")))
+
+    def testIdentityComparableNumbersInMinMax(self):
+        expr = Identity(sympify("-6"))
+        self.assertTrue(expr.is_number)
+        self.assertTrue(expr.is_comparable)
+        self.assertEqual(Max(0, expr), 0)
+
+    def testIdentityRationalComparisonNoRecursion(self):
+        expr = Identity(sympify("1/7"))
+        self.assertTrue(expr >= 0)
+        self.assertTrue(Max(0, expr).has(expr))
+
     def test_sympy_str(self):
         self.assertEqual(sympy_str(sympify("a+b+c")), "a + b + c")
         self.assertEqual(sympy_str(sympify("a*b+c")), "c + a * b")
diff --git a/torch/utils/_sympy/functions.py b/torch/utils/_sympy/functions.py
index ee04c2461f3c6..5102afd9b530e 100644
--- a/torch/utils/_sympy/functions.py
+++ b/torch/utils/_sympy/functions.py
@@ -1366,6 +1366,38 @@ def __int__(self) -> int:
         # pyrefly: ignore [missing-attribute]
         return int(self.args[0])
 
+    def _identity_atom_compare(self, other, op):
+        """
+        Fast path for comparing wrapped numeric atomics against other numeric atomics.
+        Keep compound expressions on SymPy's default symbolic path.
+        """
+        arg = self.args[0]
+        if isinstance(other, int):
+            other = sympy.Integer(other)
+        if not isinstance(other, sympy.Expr):
+            return None
+        if not (arg.is_Atom and arg.is_number and arg.is_comparable):
+            return None
+        if not (other.is_Atom and other.is_number and other.is_comparable):
+            return None
+        return sympy.S.true if op(arg, other) else sympy.S.false
+
+    def __ge__(self, other):
+        out = self._identity_atom_compare(other, lambda a, b: a >= b)
+        return out if out is not None else super().__ge__(other)
+
+    def __gt__(self, other):
+        out = self._identity_atom_compare(other, lambda a, b: a > b)
+        return out if out is not None else super().__gt__(other)
+
+    def __le__(self, other):
+        out = self._identity_atom_compare(other, lambda a, b: a <= b)
+        return out if out is not None else super().__le__(other)
+
+    def __lt__(self, other):
+        out = self._identity_atom_compare(other, lambda a, b: a < b)
+        return out if out is not None else super().__lt__(other)
+
     def __float__(self) -> float:
         # pyrefly: ignore [missing-attribute]
         return float(self.args[0])

From 63fcbe1040ffef63e82abd4e66da1d7554d23aa4 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Sun, 8 Mar 2026 09:40:08 -0700
Subject: [PATCH 27/87] [XPU] Fix SyclExtension Windows build for oneAPI
 2025.3+ breaking change (#175333)

[XPU] Fix SyclExtension Windows build for oneAPI 2025.3+ breaking change (#170701)

## Summary
Fixes SyclExtension compilation on Windows when using oneAPI 2025.3 or higher.

## Problem
oneAPI 2025.3 introduced a breaking change in how include paths are ordered to align with MSVC behavior. This causes build failures when compiling SyclExtension on Windows.

The issue occurs because MSVC include directories are explicitly passed on the compiler command line. With the new include path ordering in oneAPI 2025.3, this causes the wrong std headers to be included.

These MSVC directories are already added as correctly-ordered implicit include paths by the compiler, so they should not need to be passed explicitly on the command line. Passing them explicitly disrupts the intended include order.

## Solution
When building SYCL extensions on Windows with oneAPI version >= 2025.3, filter out Microsoft Visual Studio paths from the compiler's include directories.

The fix is version-gated to only apply for oneAPI 2025.3+ to avoid affecting users on older oneAPI versions.

Fixes:
https://github.com/intel/torch-xpu-ops/issues/2574

Pull Request resolved: https://github.com/pytorch/pytorch/pull/170701
Approved by: https://github.com/dvrogozh, https://github.com/EikanWang, https://github.com/atalman

(cherry picked from commit a09b29e732f52a690d4ca3764256f26369115858)

Co-authored-by: astachowiczhabana <adam.stachowicz@intel.com>
---
 torch/utils/cpp_extension.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index a63bff50d5ec3..109a6608aa4bd 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -947,6 +947,20 @@ def win_cuda_flags(cflags):
         def win_hip_flags(cflags):
             return (COMMON_HIPCC_FLAGS + COMMON_HIP_FLAGS + cflags + _get_rocm_arch_flags(cflags))
 
+        def win_filter_msvc_include_dirs(pp_opts) -> list[str]:
+            """Filter out MSVC include dirs from pp_opts for oneAPI 2025.3+."""
+            # oneAPI 2025.3+ changed include path ordering to match MSVC behavior.
+            # Filter out MSVC headers to avoid conflicting declarations with oneAPI's std headers.
+            icpx_version = int(_get_icpx_version())
+            if icpx_version >= 20250300:
+                vc_tools_dir = os.path.normcase(os.environ.get('VCToolsInstallDir', ''))
+                if vc_tools_dir:
+                    pp_opts = [
+                        path for path in pp_opts
+                        if vc_tools_dir not in os.path.normcase(path)
+                    ]
+            return pp_opts
+
         def win_wrap_single_compile(sources,
                                     output_dir=None,
                                     macros=None,
@@ -1116,7 +1130,7 @@ def win_wrap_ninja_compile(sources,
             sycl_post_cflags = None
             sycl_dlink_post_cflags = None
             if with_sycl:
-                sycl_cflags = common_cflags + pp_opts + _COMMON_SYCL_FLAGS
+                sycl_cflags = common_cflags + win_filter_msvc_include_dirs(pp_opts) + _COMMON_SYCL_FLAGS
                 if isinstance(extra_postargs, dict):
                     sycl_post_cflags = extra_postargs['sycl']
                 else:

From 5d919bfe0f2ba7c7aabdb75ef6a20512f163e662 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Mon, 9 Mar 2026 06:02:02 -0700
Subject: [PATCH 28/87] [Inductor] Reject non-contiguous subnode fusion in
 mix-order reduction. (#176410)

[Inductor] Reject non-contiguous subnode fusion in mix-order reduction. (#176131)

We observed an assertion error after PR #174947 on XPU in https://github.com/intel/torch-xpu-ops/issues/2932.
The failing assertion is at line 2125:
https://github.com/pytorch/pytorch/blob/f99ab991dcd3719ee25dd3377a53ea12e518308e/torch/_inductor/scheduler.py#L2122-L2125

which is caused by:
https://github.com/pytorch/pytorch/blob/f99ab991dcd3719ee25dd3377a53ea12e518308e/torch/_inductor/scheduler.py#L2200-L2203

Root cause:
- MixOrderReduction.can_fuse is a pre-fusion heuristic; it only checks static conditions (both reductions, reversed orders, common reads, one contiguous pre-fusion, size/heuristics). It cannot see access-pattern changes introduced by backend.fuse.
- In the failing case, self.node1=op1115 (reduction, contiguous=True) is fused with other=op1123 (pointwise, contiguous=False), producing fused_node=op1115_op1123 (non-contiguous). self.node2=op1117_op1119 is already non-contiguous. The mix-order reduction invariant (at least one side contiguous) is violated, so FusedMixOrderReductions would assert.
```
self.node1 = op1115  (SchedulerNode, reduction, contiguous=True)
other      = op1123  (SchedulerNode, pointwise, contiguous=False)

backend.fuse(self.node1, other)
        |
        v
fused_node = op1115_op1123 (FusedSchedulerNode, reduction+pointwise, contiguous=False)

self.node2 = op1117_op1119 (FusedSchedulerNode, reduction+reduction, contiguous=False)

mix-order reduction attempt:
fused_node  +  self.node2  ->  FusedMixOrderReductions  (assert fails)
```

Fix:
- Add a general post-fusion validation in FusedMixOrderReductions.fuse_with: after backend.fuse, re-check the contiguity invariant and reject the fusion if both sides are non-contiguous.
- Implement a FusionRejected signal and catch it in Scheduler.fuse_two_nodes to keep nodes unfused.

Test:
- Added a regression test that reproduces the assertion error on **cuda/xpu** without this PR and passes with it.
-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Pull Request resolved: https://github.com/pytorch/pytorch/pull/176131
Approved by: https://github.com/shunting314

(cherry picked from commit 5a6d6b3ece55cc4f4db0e377fa33b9c8374a507f)

Co-authored-by: xinan.lin <xinan.lin@intel.com>
---
 test/inductor/test_mix_order_reduction.py | 24 +++++++++++++++++++++++
 torch/_inductor/scheduler.py              |  7 +++++++
 2 files changed, 31 insertions(+)

diff --git a/test/inductor/test_mix_order_reduction.py b/test/inductor/test_mix_order_reduction.py
index 5dc30015976b6..e858f542e0e75 100644
--- a/test/inductor/test_mix_order_reduction.py
+++ b/test/inductor/test_mix_order_reduction.py
@@ -160,6 +160,30 @@ def f(x, y):
         # shared memory.
         self.assertEqual(metrics.codegen_mix_order_reduction, 0)
 
+    @inductor_config.patch(split_reductions=False)
+    def test_fuse_non_contiguous_pointwise(self):
+        if not inductor_config.triton.mix_order_reduction:
+            self.skipTest("Mix order reduction not enabled")
+
+        # Regression: mix-order reduction can appear valid pre-fusion, but a pointwise
+        # fused into one side can change access patterns and break the contiguity
+        # invariant. This test builds a reduction + pointwise path plus a second
+        # reduction, matching the shape/ordering pattern seen in the E2E failure.
+
+        def f(x):
+            # First reduction (contiguous on its own).
+            r1 = x.sum(dim=1)
+            # Pointwise depends on both reduced and unreduced data, so fusing it
+            # with the reduction can change access strides.
+            y = r1 * x[:, 0]
+            # Second reduction across a different dimension to trigger mix-order logic.
+            r2 = x.sum(dim=0)
+            return y, r2
+
+        # Large, asymmetric shape encourages mix-order reduction heuristics.
+        x = torch.randn(32768, 768, dtype=torch.float, device=GPU_TYPE)
+        self.check_numeric(f, (x,))
+
     @inductor_config.patch(coordinate_descent_tuning=True)
     def test_XBLOCK_coordest_tuning(self):
         """
diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py
index 6e1323ca942a3..55e55a6eda421 100644
--- a/torch/_inductor/scheduler.py
+++ b/torch/_inductor/scheduler.py
@@ -2153,6 +2153,13 @@ def sub_node_can_fuse(
         if not self.scheduler.can_fuse(node1, node2, allow_mix_order_reduction=False):
             return False
 
+        # Since node1 is from the current mix order reduction, if node1 is
+        # contiguous, the fused node should also be contiguous.
+        if MixOrderReduction.is_contiguous_node(
+            node1
+        ) and not MixOrderReduction.is_contiguous_node(node2):
+            return False
+
         def _get_ancestors(nodes: tuple[BaseSchedulerNode, ...]) -> OrderedSet[str]:
             out = OrderedSet()
             return out.union(*(n.ancestors for n in nodes))

From 3c40486f8a515b3f6f851a0cc4b3a2dc07744f6c Mon Sep 17 00:00:00 2001
From: shunting314 <52589240+shunting314@users.noreply.github.com>
Date: Mon, 9 Mar 2026 14:26:09 -0700
Subject: [PATCH 29/87] [inductor] avoid multi-stage for mix-order-red by
 default (#176228) (#176495)

The default of num_stages > 1 has been causing multiple out-of-shared-memory issues. Make it 1 by default.

We could explore other alternatives:
1. Always add a config with num_stages=1 while keeping the current heuristics. This could increase compilation time.
2. Dynamically scale down num_stages if all configs fail to compile due to running out of shared memory.
3. Mimic Triton's logic to estimate the amount of shared memory needed per stage and set num_stages accordingly based on smem capacity.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/176228
Approved by: https://github.com/eellison, https://github.com/drisspg, https://github.com/jansel

(cherry picked from commit ab17a385d9ce099ac68080b5493ab4e6e8b3131b)
---
 test/inductor/test_mix_order_reduction.py    | 123 +++++++++++++++++++
 torch/_inductor/codegen/triton.py            |   1 +
 torch/_inductor/config.py                    |   5 +
 torch/_inductor/runtime/triton_heuristics.py |   5 +-
 4 files changed, 133 insertions(+), 1 deletion(-)

diff --git a/test/inductor/test_mix_order_reduction.py b/test/inductor/test_mix_order_reduction.py
index e858f542e0e75..de96245ed51d3 100644
--- a/test/inductor/test_mix_order_reduction.py
+++ b/test/inductor/test_mix_order_reduction.py
@@ -6,11 +6,13 @@
 import torch
 import torch._inductor.config as inductor_config
 import torch.nn.functional as F
+from torch import nn
 from torch._dynamo.utils import same
 from torch._inductor import metrics, utils
 from torch._inductor.scheduler import MixOrderReduction
 from torch._inductor.test_case import run_tests, TestCase
 from torch.testing import FileCheck
+from torch.testing._internal.common_device_type import largeTensorTest
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
     isRocmArchAnyOf,
@@ -750,6 +752,127 @@ def f(x):
         compile_metrics = torch._dynamo.utils._compilation_metrics
         self.assertEqual(len(compile_metrics), 1, "Don't recompile")
 
+    @largeTensorTest("36GB", device=GPU_TYPE, inductor=True)
+    def test_out_of_shared_memory(self):
+        """
+        Fix https://github.com/pytorch/pytorch/issues/175250
+        """
+        if not inductor_config.triton.mix_order_reduction:
+            self.skipTest("Mix order reduction not enabled")
+
+        NUM_HEADS = 32
+        NUM_KV_HEADS = 8
+        HEAD_DIM = 128
+        HIDDEN_SIZE = NUM_HEADS * HEAD_DIM * 2
+        SEQ_LEN = 8192 * 2
+
+        def rotate_half(x):
+            x1 = x[..., : x.shape[-1] // 2]
+            x2 = x[..., x.shape[-1] // 2 :]
+            return torch.cat((-x2, x1), dim=-1)
+
+        def apply_rotary_pos_emb(q, k, cos, sin):
+            cos = cos[:, None, :, :]
+            sin = sin[:, None, :, :]
+            return (q * cos) + (rotate_half(q) * sin), (k * cos) + (
+                rotate_half(k) * sin
+            )
+
+        @torch.compile
+        def forward(
+            x,
+            q_proj,
+            k_proj,
+            v_proj,
+            o_proj,
+            embed_norm,
+            hidden_norm,
+            cos,
+            sin,
+        ):
+            batch, seq_len, _ = x.shape
+
+            # Eagle3 first layer: split concatenated [embeds, hidden] input
+            mid = x.shape[2] // 2
+            embeds, hidden = x.split(mid, dim=-1)
+
+            # Dual RMSNorm (pow, sum, div, mul in backward)
+            embeds = embed_norm(embeds)
+            hidden = hidden_norm(hidden)
+            residual = hidden
+
+            # Recombine for attention input (2 * HIDDEN_SIZE)
+            x = torch.cat([embeds, hidden], dim=-1)
+
+            # Adding a graph break here "fixes" the issue
+            # by breaking up the fused op
+            # torch._dynamo.graph_break()
+
+            # Q/K/V projections from 2*hidden_size input
+            q = q_proj(x).view(batch, seq_len, NUM_HEADS, HEAD_DIM).transpose(1, 2)
+            k = k_proj(x).view(batch, seq_len, NUM_KV_HEADS, HEAD_DIM).transpose(1, 2)
+            v = v_proj(x).view(batch, seq_len, NUM_KV_HEADS, HEAD_DIM).transpose(1, 2)
+
+            q, k = apply_rotary_pos_emb(q, k, cos, sin)
+            k = torch.repeat_interleave(k, NUM_HEADS // NUM_KV_HEADS, dim=1)
+            v = torch.repeat_interleave(v, NUM_HEADS // NUM_KV_HEADS, dim=1)
+            out = q.contiguous() @ k.contiguous().transpose(-2, -1) @ v.contiguous()
+
+            out = out.transpose(1, 2).contiguous().reshape(batch, seq_len, -1)
+            return o_proj(out) + residual
+
+        # Layers
+        embed_norm = nn.RMSNorm(HIDDEN_SIZE).to(GPU_TYPE)
+        hidden_norm = nn.RMSNorm(HIDDEN_SIZE).to(GPU_TYPE)
+        # Q/K/V project from 2*HIDDEN_SIZE (concatenated embeds + hidden)
+        q_proj = nn.Linear(2 * HIDDEN_SIZE, NUM_HEADS * HEAD_DIM, bias=False).to(
+            GPU_TYPE
+        )
+        k_proj = nn.Linear(2 * HIDDEN_SIZE, NUM_KV_HEADS * HEAD_DIM, bias=False).to(
+            GPU_TYPE
+        )
+        v_proj = nn.Linear(2 * HIDDEN_SIZE, NUM_KV_HEADS * HEAD_DIM, bias=False).to(
+            GPU_TYPE
+        )
+        o_proj = nn.Linear(NUM_HEADS * HEAD_DIM, HIDDEN_SIZE, bias=False).to(GPU_TYPE)
+
+        # Block mask - simple causal only
+        def causal_mask(_b, _h, q, kv):
+            return q >= kv
+
+        # Rotary embeddings (precomputed, no grad needed)
+        inv_freq = 1.0 / (
+            500000.0
+            ** (
+                torch.arange(0, HEAD_DIM, 2, dtype=torch.float32, device=GPU_TYPE)
+                / HEAD_DIM
+            )
+        )
+        pos = torch.arange(1, SEQ_LEN + 1, dtype=torch.float32, device=GPU_TYPE)
+        freqs = torch.outer(pos, inv_freq)
+        emb = torch.cat((freqs, freqs), dim=-1).unsqueeze(0)
+        cos, sin = emb.cos(), emb.sin()
+
+        # Input: 2*HIDDEN_SIZE to match split [embeds, hidden]
+        x = torch.randn(
+            1, SEQ_LEN, 2 * HIDDEN_SIZE, device=GPU_TYPE, requires_grad=True
+        )
+
+        out = forward(
+            x,
+            q_proj,
+            k_proj,
+            v_proj,
+            o_proj,
+            embed_norm,
+            hidden_norm,
+            cos,
+            sin,
+        )
+        loss = out.sum()
+        loss.backward()
+        self.assertTrue(metrics.codegen_mix_order_reduction > 1)
+
 
 @inductor_config.patch(
     "triton.mix_order_reduction", not inductor_config.triton.mix_order_reduction
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index bca5da15e3e22..039b53ee1f2fc 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -5297,6 +5297,7 @@ def inductor_meta_common(cls):
             "store_cubin": config.triton.store_cubin,
             "deterministic": config.deterministic,
             "force_filter_reduction_configs": config.test_configs.force_filter_reduction_configs,
+            "mix_order_reduction_allow_multi_stages": config.triton.mix_order_reduction_allow_multi_stages,
         }
 
         if config.write_are_deterministic_algorithms_enabled:
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index 9bf1f70fa4c4c..e2fee26f45cc1 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -1793,6 +1793,11 @@ class triton:
     # this could be helpful to avoid recompilations in some cases
     mix_order_reduction_non_strict_mode = False
 
+    # Don't allow multi-stages by default to avoid out of shared memory
+    mix_order_reduction_allow_multi_stages = (
+        os.environ.get("TORCHINDUCTOR_MIX_ORDER_REDUCTION_ALLOW_MULTI_STAGES") == "1"
+    )
+
     enable_tlx_templates: bool = (
         os.environ.get("TORCHINDUCTOR_ENABLE_TLX_TEMPLATES", "0") == "1"
     )
diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py
index 61bb640f5a072..2a1447fbf0bda 100644
--- a/torch/_inductor/runtime/triton_heuristics.py
+++ b/torch/_inductor/runtime/triton_heuristics.py
@@ -3776,7 +3776,10 @@ def persistent_reduction(
             # With large rnumel, we have higher chance of out-of-shared memory
             # To avoid adding too much autotuning overhead, we just constrain NUM_STAGES
             # if rnumel is large
-            MAX_NUM_STAGES = 2 if rnumel_hint > 8192 else 3
+            if inductor_meta.get("mix_order_reduction_allow_multi_stages", True):
+                MAX_NUM_STAGES = 2 if rnumel_hint > 8192 else 3
+            else:
+                MAX_NUM_STAGES = 1
             c.kwargs["NUM_STAGES"] = min(max(num_iters // 4, 1), MAX_NUM_STAGES)
 
             if rnumel_hint <= 1024:

From f31baaae5d066cf833ac96a270495d71bdc1d508 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Mon, 9 Mar 2026 14:49:46 -0700
Subject: [PATCH 30/87] Fix the torch.Stream context manager reentrance
 (#176603)

Fix the torch.Stream context manager reentrance (#176568)

# Motivation
This PR fixes nested/reentrant use of `torch.Stream` as a context manager. `torch.cuda.stream` and `torch.xpu.stream` can already handle these usages.

The following scenarios are fixed by this PR:
```python
import torch
s0 = torch.Stream()
with s0, s0:
    pass
```
```python
import torch
s0 = torch.Stream()
s1 = torch.Stream()
with s0, s1:
    with s0, s1:
        pass
```

# Additional Context
Fix https://github.com/pytorch/pytorch/issues/176560

Pull Request resolved: https://github.com/pytorch/pytorch/pull/176568
Approved by: https://github.com/albanD

(cherry picked from commit d43570c98bf31b1f7a14e821b0e197abecf92758)

Co-authored-by: Yu, Guangye <guangye.yu@intel.com>
---
 test/test_accelerator.py | 15 ++++++++
 torch/csrc/Stream.cpp    | 74 ++++++++++++++++++++++++++++++++++------
 2 files changed, 78 insertions(+), 11 deletions(-)

diff --git a/test/test_accelerator.py b/test/test_accelerator.py
index 7daebc01adfe9..67b92969aefd0 100644
--- a/test/test_accelerator.py
+++ b/test/test_accelerator.py
@@ -111,6 +111,21 @@ def test_stream_context_manager(self):
             self.assertEqual(torch.accelerator.current_stream(), s)
         self.assertEqual(torch.accelerator.current_stream(), prev_stream)
 
+    def test_stream_context_manager_reentrance(self):
+        prev_stream = torch.accelerator.current_stream()
+        s0 = torch.Stream()
+        with s0, s0:
+            self.assertEqual(torch.accelerator.current_stream(), s0)
+        self.assertEqual(torch.accelerator.current_stream(), prev_stream)
+        s1 = torch.Stream()
+        with s0:
+            self.assertEqual(torch.accelerator.current_stream(), s0)
+            with s1:
+                self.assertEqual(torch.accelerator.current_stream(), s1)
+                with s0:
+                    self.assertEqual(torch.accelerator.current_stream(), s0)
+        self.assertEqual(torch.accelerator.current_stream(), prev_stream)
+
     @unittest.skipIf(not TEST_MULTIACCELERATOR, "only one accelerator detected")
     def test_multi_device_stream_context_manager(self):
         src_device = 0
diff --git a/torch/csrc/Stream.cpp b/torch/csrc/Stream.cpp
index fc1ca916cfbed..c5a8e343e6b27 100644
--- a/torch/csrc/Stream.cpp
+++ b/torch/csrc/Stream.cpp
@@ -119,6 +119,7 @@ PyObject* THPStream_Wrap(const c10::Stream& stream) {
 
 static void THPStream_dealloc(THPStream* self) {
   PyObject_ClearWeakRefs((PyObject*)self);
+  Py_CLEAR(self->context);
   Py_TYPE(self)->tp_free(reinterpret_cast<PyObject*>(self));
 }
 
@@ -277,38 +278,71 @@ static PyObject* THPStream_enter(PyObject* _self, PyObject* unused) {
   auto self = reinterpret_cast<THPStream*>(_self);
   c10::DeviceType stream_device_type =
       static_cast<c10::DeviceType>(self->device_type);
+
   // No operation is performed if the stream does not belong to an accelerator.
   if (C10_UNLIKELY(!at::accelerator::isAccelerator(stream_device_type))) {
     Py_INCREF(_self);
     return _self;
   }
+
+  // Note [Reentrant Stream Context Manager]
+  //
+  // We maintain a stack of context entries to support nested/reentrant
+  // stream context managers. Each entry records the previously active
+  // stream and device so that they can be restored in __exit__.
+  //
+  // The stack is stored as a Python list where each entry is either:
+  //   - Py_None: no-op enter (stream was already current);
+  //   - dict:    {_ctx_stream, _ctx_device_index} saved before switching.
+  //
+  // self->context is initialized lazily as a PyList on first __enter__.
+  if (!self->context) {
+    auto list = THPObjectPtr(PyList_New(0));
+    if (!list) {
+      throw python_error();
+    }
+    self->context = list.release();
+  }
+
   c10::DeviceIndex cur_device_idx = at::accelerator::getDeviceIndex();
   c10::DeviceIndex stream_device_idx =
       static_cast<c10::DeviceIndex>(self->device_index);
+  c10::Stream cur_stream = at::accelerator::getCurrentStream(stream_device_idx);
+
+  // No-op (push None) only if the stream is current AND its device is active.
+  if (cur_stream.id() == self->stream_id &&
+      cur_device_idx == stream_device_idx) {
+    if (PyList_Append(self->context, Py_None) < 0) {
+      throw python_error();
+    }
+    Py_INCREF(_self);
+    return _self;
+  }
+
   // If the stream is not on the current device, switch the current device to
   // the device of the stream.
   if (stream_device_idx != cur_device_idx) {
     at::accelerator::setDeviceIndex(stream_device_idx);
   }
-  c10::Stream cur_stream = at::accelerator::getCurrentStream(stream_device_idx);
   at::accelerator::setCurrentStream(c10::Stream::unpack3(
       self->stream_id, stream_device_idx, stream_device_type));
-  // Save the current device index and previous stream to the context.
+
+  // Save the current device index and previous stream as a dict on the stack.
   auto ctx_device_index =
       THPObjectPtr(THPUtils_packDeviceIndex(cur_device_idx));
   auto ctx_stream = THPObjectPtr(THPStream_Wrap(cur_stream));
-  TORCH_CHECK(!(self->context), "Stream's context should not be initialized.");
   auto dict = THPObjectPtr(PyDict_New());
   if (!dict) {
     throw python_error();
   }
-  self->context = dict.release();
   if (PyDict_SetItemString(
-          self->context, "_ctx_device_index", ctx_device_index.get()) < 0) {
+          dict.get(), "_ctx_device_index", ctx_device_index.get()) < 0) {
+    throw python_error();
+  }
+  if (PyDict_SetItemString(dict.get(), "_ctx_stream", ctx_stream.get()) < 0) {
     throw python_error();
   }
-  if (PyDict_SetItemString(self->context, "_ctx_stream", ctx_stream.get()) <
-      0) {
+  if (PyList_Append(self->context, dict.get()) < 0) {
     throw python_error();
   }
   Py_INCREF(_self);
@@ -319,19 +353,34 @@ static PyObject* THPStream_enter(PyObject* _self, PyObject* unused) {
 static PyObject* THPStream_exit(PyObject* _self, PyObject* unused) {
   HANDLE_TH_ERRORS
   auto self = reinterpret_cast<THPStream*>(_self);
+
   // No operation is performed if the stream does not belong to an accelerator.
   if (C10_UNLIKELY(!at::accelerator::isAccelerator(
           static_cast<c10::DeviceType>(self->device_type)))) {
     Py_RETURN_NONE;
   }
+
+  // Pop the top entry from the stack.
+  Py_ssize_t stack_size = PyList_Size(self->context);
+  TORCH_INTERNAL_ASSERT(stack_size > 0, "Stream context stack is empty.");
+  PyObject* top = PyList_GET_ITEM(self->context, stack_size - 1);
+
+  // Sentinel: this __enter__ was a no-op, nothing to restore.
+  if (top == Py_None) {
+    if (PyList_SetSlice(self->context, stack_size - 1, stack_size, nullptr) <
+        0) {
+      throw python_error();
+    }
+    Py_RETURN_NONE;
+  }
+
   PyObject* py_stream = nullptr;
-  if (PyDict_GetItemStringRef(self->context, "_ctx_stream", &py_stream) < 0) {
+  if (PyDict_GetItemStringRef(top, "_ctx_stream", &py_stream) < 0) {
     throw python_error();
   }
   auto ctx_stream = THPObjectPtr(py_stream);
   PyObject* py_device_index = nullptr;
-  if (PyDict_GetItemStringRef(
-          self->context, "_ctx_device_index", &py_device_index) < 0) {
+  if (PyDict_GetItemStringRef(top, "_ctx_device_index", &py_device_index) < 0) {
     throw python_error();
   }
   auto ctx_device_index = THPObjectPtr(py_device_index);
@@ -342,6 +391,7 @@ static PyObject* THPStream_exit(PyObject* _self, PyObject* unused) {
       ctx_device_index.get(),
       "ctx_device_index should be present on the context dict.");
   auto prev_device_index = THPUtils_unpackDeviceIndex(ctx_device_index.get());
+
   at::accelerator::setCurrentStream(c10::Stream::unpack3(
       prev_stream->stream_id,
       static_cast<c10::DeviceIndex>(prev_stream->device_index),
@@ -350,7 +400,9 @@ static PyObject* THPStream_exit(PyObject* _self, PyObject* unused) {
   if (static_cast<c10::DeviceIndex>(self->device_index) != prev_device_index) {
     at::accelerator::setDeviceIndex(prev_device_index);
   }
-  Py_CLEAR(self->context);
+  if (PyList_SetSlice(self->context, stack_size - 1, stack_size, nullptr) < 0) {
+    throw python_error();
+  }
   Py_RETURN_NONE;
   END_HANDLE_TH_ERRORS
 }

From 052ff5c474718ac457abc5cfb7674f2876231eb3 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Wed, 11 Mar 2026 10:15:02 -0700
Subject: [PATCH 31/87] Windows override AMI pre-installed cudnn (#177094)

Windows override AMI pre-installed cudnn (#177027)

Fixes: https://github.com/pytorch/pytorch/issues/167242

Also refactor code to avoid duplication.

Test via ciflow/binaries. I do see:
```
cuDNN version       : 9.19.0
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/177027
Approved by: https://github.com/malfet

(cherry picked from commit 61fae89be8357e04d4dc38a5424d6b307025d2e6)

Co-authored-by: atalman <atalman@fb.com>
---
 .ci/pytorch/windows/internal/cuda_install.bat | 172 +++++++++---------
 1 file changed, 91 insertions(+), 81 deletions(-)

diff --git a/.ci/pytorch/windows/internal/cuda_install.bat b/.ci/pytorch/windows/internal/cuda_install.bat
index 0c8c023831e45..456b53183f186 100644
--- a/.ci/pytorch/windows/internal/cuda_install.bat
+++ b/.ci/pytorch/windows/internal/cuda_install.bat
@@ -20,8 +20,8 @@ set CUDA_VERSION_STR=%CUDA_VER_MAJOR%.%CUDA_VER_MINOR%
 set CUDNN_FOLDER="cuda"
 set CUDNN_LIB_FOLDER="lib\x64"
 
-:: Skip all of this if we already have cuda installed
-if exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" goto set_cuda_env_vars
+:: If CUDA is already installed, skip CUDA installation but still verify cuDNN
+if exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" goto check_cudnn
 
 if %CUDA_VER% EQU 126 goto cuda126
 if %CUDA_VER% EQU 128 goto cuda128
@@ -34,110 +34,47 @@ exit /b 1
 goto cuda_common
 
 :cuda126
-
 set CUDA_INSTALL_EXE=cuda_12.6.2_560.94_windows.exe
-if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
-    curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore
-    if errorlevel 1 exit /b 1
-    set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
-    set "ARGS=cuda_profiler_api_12.6 thrust_12.6 nvcc_12.6 cuobjdump_12.6 nvprune_12.6 nvprof_12.6 cupti_12.6 cublas_12.6 cublas_dev_12.6 cudart_12.6 cufft_12.6 cufft_dev_12.6 curand_12.6 curand_dev_12.6 cusolver_12.6 cusolver_dev_12.6 cusparse_12.6 cusparse_dev_12.6 npp_12.6 npp_dev_12.6 nvrtc_12.6 nvrtc_dev_12.6 nvml_dev_12.6 nvjitlink_12.6 nvtx_12.6"
-)
-
+set "ARGS=cuda_profiler_api_12.6 thrust_12.6 nvcc_12.6 cuobjdump_12.6 nvprune_12.6 nvprof_12.6 cupti_12.6 cublas_12.6 cublas_dev_12.6 cudart_12.6 cufft_12.6 cufft_dev_12.6 curand_12.6 curand_dev_12.6 cusolver_12.6 cusolver_dev_12.6 cusparse_12.6 cusparse_dev_12.6 npp_12.6 npp_dev_12.6 nvrtc_12.6 nvrtc_dev_12.6 nvml_dev_12.6 nvjitlink_12.6 nvtx_12.6"
 set CUDNN_FOLDER=cudnn-windows-x86_64-9.10.2.21_cuda12-archive
-set CUDNN_LIB_FOLDER="lib"
-set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
-if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
-    curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore
-    if errorlevel 1 exit /b 1
-    set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
-)
-
-@REM cuDNN 8.3+ required zlib to be installed on the path
-echo Installing ZLIB dlls
-curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip"
-7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib"
-xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32"
-
-goto cuda_common
+goto cuda_download
 
 :cuda128
-
 set CUDA_INSTALL_EXE=cuda_12.8.0_571.96_windows.exe
-if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
-    curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore
-    if errorlevel 1 exit /b 1
-    set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
-    set "ARGS=cuda_profiler_api_12.8 thrust_12.8 nvcc_12.8 cuobjdump_12.8 nvprune_12.8 nvprof_12.8 cupti_12.8 cublas_12.8 cublas_dev_12.8 cudart_12.8 cufft_12.8 cufft_dev_12.8 curand_12.8 curand_dev_12.8 cusolver_12.8 cusolver_dev_12.8 cusparse_12.8 cusparse_dev_12.8 npp_12.8 npp_dev_12.8 nvrtc_12.8 nvrtc_dev_12.8 nvml_dev_12.8 nvjitlink_12.8 nvtx_12.8"
-)
-
+set "ARGS=cuda_profiler_api_12.8 thrust_12.8 nvcc_12.8 cuobjdump_12.8 nvprune_12.8 nvprof_12.8 cupti_12.8 cublas_12.8 cublas_dev_12.8 cudart_12.8 cufft_12.8 cufft_dev_12.8 curand_12.8 curand_dev_12.8 cusolver_12.8 cusolver_dev_12.8 cusparse_12.8 cusparse_dev_12.8 npp_12.8 npp_dev_12.8 nvrtc_12.8 nvrtc_dev_12.8 nvml_dev_12.8 nvjitlink_12.8 nvtx_12.8"
 set CUDNN_FOLDER=cudnn-windows-x86_64-9.19.0.56_cuda12-archive
-set CUDNN_LIB_FOLDER="lib"
-set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
-if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
-    curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore
-    if errorlevel 1 exit /b 1
-    set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
-)
-
-@REM cuDNN 8.3+ required zlib to be installed on the path
-echo Installing ZLIB dlls
-curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip"
-7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib"
-xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32"
-
-goto cuda_common
+goto cuda_download
 
 :cuda129
-
 set CUDA_INSTALL_EXE=cuda_12.9.1_576.57_windows.exe
-if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
-    curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore
-    if errorlevel 1 exit /b 1
-    set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
-    set "ARGS=cuda_profiler_api_12.9 thrust_12.9 nvcc_12.9 cuobjdump_12.9 nvprune_12.9 nvprof_12.9 cupti_12.9 cublas_12.9 cublas_dev_12.9 cudart_12.9 cufft_12.9 cufft_dev_12.9 curand_12.9 curand_dev_12.9 cusolver_12.9 cusolver_dev_12.9 cusparse_12.9 cusparse_dev_12.9 npp_12.9 npp_dev_12.9 nvrtc_12.9 nvrtc_dev_12.9 nvml_dev_12.9 nvjitlink_12.9 nvtx_12.9"
-)
-
+set "ARGS=cuda_profiler_api_12.9 thrust_12.9 nvcc_12.9 cuobjdump_12.9 nvprune_12.9 nvprof_12.9 cupti_12.9 cublas_12.9 cublas_dev_12.9 cudart_12.9 cufft_12.9 cufft_dev_12.9 curand_12.9 curand_dev_12.9 cusolver_12.9 cusolver_dev_12.9 cusparse_12.9 cusparse_dev_12.9 npp_12.9 npp_dev_12.9 nvrtc_12.9 nvrtc_dev_12.9 nvml_dev_12.9 nvjitlink_12.9 nvtx_12.9"
 set CUDNN_FOLDER=cudnn-windows-x86_64-9.17.1.4_cuda12-archive
-set CUDNN_LIB_FOLDER="lib"
-set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
-if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
-    curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore
-    if errorlevel 1 exit /b 1
-    set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
-)
-
-@REM cuDNN 8.3+ required zlib to be installed on the path
-echo Installing ZLIB dlls
-curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip"
-7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib"
-xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32"
-
-goto cuda_common
+goto cuda_download
 
 :cuda130
-
 set CUDA_INSTALL_EXE=cuda_13.0.0_windows.exe
+set "ARGS="
+set CUDNN_FOLDER=cudnn-windows-x86_64-9.19.0.56_cuda13-archive
+goto cuda_download
+
+:: Common download logic for CUDA toolkit, cuDNN, and ZLIB
+:cuda_download
+set CUDNN_LIB_FOLDER="lib"
+set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
+
 if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
     curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore
     if errorlevel 1 exit /b 1
     set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
-    set "ARGS="
 )
 
-set CUDNN_FOLDER=cudnn-windows-x86_64-9.19.0.56_cuda13-archive
-set CUDNN_LIB_FOLDER="lib"
-set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
 if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
     curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore
     if errorlevel 1 exit /b 1
     set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
 )
 
-@REM cuDNN 8.3+ required zlib to be installed on the path
-echo Installing ZLIB dlls
-curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip"
-7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib"
-xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32"
+call :install_zlib
 
 goto cuda_common
 
@@ -211,6 +148,69 @@ if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_
 
 goto set_cuda_env_vars
 
+:check_cudnn
+:: When CUDA is pre-installed on the AMI, cuDNN may be missing or not the expected version.
+:: Set the correct cuDNN variables for the CUDA version, then install if needed.
+
+set CUDNN_LIB_FOLDER="lib"
+if %CUDA_VER% EQU 126 (
+    set CUDNN_FOLDER=cudnn-windows-x86_64-9.10.2.21_cuda12-archive
+    set EXPECTED_CUDNN_VERSION=9.10.2
+)
+if %CUDA_VER% EQU 128 (
+    set CUDNN_FOLDER=cudnn-windows-x86_64-9.19.0.56_cuda12-archive
+    set EXPECTED_CUDNN_VERSION=9.19.0
+)
+if %CUDA_VER% EQU 129 (
+    set CUDNN_FOLDER=cudnn-windows-x86_64-9.17.1.4_cuda12-archive
+    set EXPECTED_CUDNN_VERSION=9.17.1
+)
+if %CUDA_VER% EQU 130 (
+    set CUDNN_FOLDER=cudnn-windows-x86_64-9.19.0.56_cuda13-archive
+    set EXPECTED_CUDNN_VERSION=9.19.0
+)
+set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
+
+set "CUDNN_VERSION_FILE=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include\cudnn_version.h"
+
+if not exist "%CUDNN_VERSION_FILE%" (
+    echo cuDNN not found, installing %CUDNN_FOLDER%...
+    goto install_cudnn
+)
+
+for /f "tokens=3" %%a in ('findstr /C:"#define CUDNN_MAJOR " "%CUDNN_VERSION_FILE%"') do set INSTALLED_MAJOR=%%a
+for /f "tokens=3" %%a in ('findstr /C:"#define CUDNN_MINOR " "%CUDNN_VERSION_FILE%"') do set INSTALLED_MINOR=%%a
+for /f "tokens=3" %%a in ('findstr /C:"#define CUDNN_PATCHLEVEL " "%CUDNN_VERSION_FILE%"') do set INSTALLED_PATCHLEVEL=%%a
+set "INSTALLED_CUDNN_VERSION=%INSTALLED_MAJOR%.%INSTALLED_MINOR%.%INSTALLED_PATCHLEVEL%"
+
+if "%INSTALLED_CUDNN_VERSION%" == "%EXPECTED_CUDNN_VERSION%" (
+    echo cuDNN %INSTALLED_CUDNN_VERSION% already installed at %ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%
+    goto set_cuda_env_vars
+)
+
+echo cuDNN version mismatch: installed %INSTALLED_CUDNN_VERSION%, expected %EXPECTED_CUDNN_VERSION%. Reinstalling...
+
+:install_cudnn
+
+if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build"
+
+curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore
+if errorlevel 1 exit /b 1
+
+7z x "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" -o"%SRC_DIR%\temp_build\cudnn"
+if errorlevel 1 (
+    echo Failed to extract cuDNN archive %CUDNN_INSTALL_ZIP%
+    exit /b 1
+)
+xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin"
+xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\%CUDNN_LIB_FOLDER%\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\lib\x64"
+xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\include\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include"
+
+call :install_zlib
+
+echo Cleaning temp files
+rd /s /q "%SRC_DIR%\temp_build" || ver > nul
+
 :set_cuda_env_vars
 
 echo Setting up environment...
@@ -218,3 +218,13 @@ set "PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\b
 set "CUDA_PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%"
 set "CUDA_PATH_V%CUDA_VER_MAJOR%_%CUDA_VER_MINOR%=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%"
 set "NVTOOLSEXT_PATH=%ProgramFiles%\NVIDIA Corporation\NvToolsExt"
+
+goto :eof
+
+@REM cuDNN 8.3+ requires zlib to be installed on the path
+:install_zlib
+echo Installing ZLIB dlls
+curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip"
+7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib"
+xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32"
+goto :eof

From bac7b59c6fe3241bb6d6cca89cb4bf1da0662788 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Wed, 11 Mar 2026 11:47:12 -0700
Subject: [PATCH 32/87] fix acc failure for vit_base_patch14_dinov2.lvd142m
 (#177142)

fix acc failure for vit_base_patch14_dinov2.lvd142m (#177042)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/177042
Approved by: https://github.com/v0i0, https://github.com/jansel

(cherry picked from commit 78eae0472f45575d2d45b7d45d5fe5eccc4a8dcd)

Co-authored-by: Shunting Zhang <shunting@fb.com>
---
 benchmarks/dynamo/timm_models.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/dynamo/timm_models.py b/benchmarks/dynamo/timm_models.py
index 5d845e4fcee00..94da21d25bc1b 100755
--- a/benchmarks/dynamo/timm_models.py
+++ b/benchmarks/dynamo/timm_models.py
@@ -100,6 +100,7 @@ def pip_install(package):
 REQUIRE_LARGER_MULTIPLIER_FOR_SMALLER_TENSOR = {
     "inception_v3",
     "mobilenetv3_large_100",
+    "vit_base_patch14_dinov2.lvd142m",
 }
 
 
From e04ddeaf45651e14b819f232af8e19d5615adfcd Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Wed, 11 Mar 2026 11:47:54 -0700
Subject: [PATCH 33/87] [Inductor] Don't unfuse addmm for bf16/fp16 to avoid
 precision loss (#177144)

[Inductor] Don't unfuse addmm for bf16/fp16 to avoid precision loss (#176848)

This fixes https://github.com/pytorch/pytorch/issues/176411 (see: [PyTorch Hud](https://hud.pytorch.org/benchmark/v3/dashboard/compiler_inductor?renderGroupId=detail_view&time.start=2026-03-02T00%3A00%3A00.000Z&time.end=2026-03-09T23%3A59%3A59.999Z&filters.benchmarkName=compiler&filters.mode=inference&filters.dtype=bfloat16&filters.deviceName=cuda+%28h100%29&filters.device=cuda&filters.arch=h100&filters.suite=huggingface&filters.compiler=cudagraphs&filters.model=openai%2Fwhisper-tiny&lcommit.commit=3bfa1aaa46152e895089d5314002e216092e924a&lcommit.workflow_id=22865042382&lcommit.date=2026-03-09T18%3A00%3A00Z&lcommit.branch=gh%2FNikhilAPatel%2F124%2Fhead&rcommit.commit=3bfa1aaa46152e895089d5314002e216092e924a&rcommit.workflow_id=22865042382&rcommit.date=2026-03-09T18%3A00%3A00Z&rcommit.branch=gh%2FNikhilAPatel%2F124%2Fhead&lbranch=gh%2FNikhilAPatel%2F124%2Fhead&rbranch=gh%2FNikhilAPatel%2F124%2Fhead&maxSampling=110))

It looks like the cuDNN frontend 1.16.1 upgrade changed the default SDPA backend. With addmm, cuBLAS keeps the matmul result in higher precision before combining with bias. With mm + add, the result is truncated to bf16 first. This ~1.4x per-layer RMSE difference compounds through whisper's 8 attention layers, exceeding the 3.0x accuracy threshold.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/176848
Approved by: https://github.com/jansel, https://github.com/mlazos

(cherry picked from commit 1a270b4aa73b7169a76dd3aa09c52a04d1312c13)

Co-authored-by: NikhilAPatel <nikhilap@meta.com>
---
 test/inductor/test_pattern_matcher.py  | 17 +++++++++++++++++
 torch/_inductor/fx_passes/post_grad.py |  5 +++++
 2 files changed, 22 insertions(+)

diff --git a/test/inductor/test_pattern_matcher.py b/test/inductor/test_pattern_matcher.py
index 08575462bef5f..d0504470bd976 100644
--- a/test/inductor/test_pattern_matcher.py
+++ b/test/inductor/test_pattern_matcher.py
@@ -1212,6 +1212,23 @@ def fn2(inp, a, b):
         _, (code) = run_and_get_code(fn2, args[0], args[1], args[2])
         FileCheck().check_not("extern_kernels.addmm(").run(code[0])
 
+    @parametrize("dtype", [torch.bfloat16, torch.float16])
+    def test_unfuse_bias_addmm_half_dtypes(self, dtype):
+        args = [
+            torch.randn(20, device=GPU_TYPE, dtype=dtype),
+            torch.randn(10, 15, device=GPU_TYPE, dtype=dtype),
+            torch.randn(15, 20, device=GPU_TYPE, dtype=dtype),
+        ]
+
+        # addmm with pointwise consumer should not be unfused for half dtypes
+        # to avoid precision loss from extra truncation at the mm output
+        @torch.compile()
+        def fn(inp, a, b):
+            return torch.nn.functional.gelu(torch.ops.aten.addmm(inp, a, b))
+
+        _, (code) = run_and_get_code(fn, args[0], args[1], args[2])
+        FileCheck().check("extern_kernels.addmm(").run(code[0])
+
     def test_addmm_alpha_beta_with_pointwise(self):
         # Test that addmm with alpha/beta != 1 is unfused correctly with pointwise ops
         # See https://github.com/pytorch/pytorch/issues/167313
diff --git a/torch/_inductor/fx_passes/post_grad.py b/torch/_inductor/fx_passes/post_grad.py
index 427a6918a9cea..5c560b9dda4b7 100644
--- a/torch/_inductor/fx_passes/post_grad.py
+++ b/torch/_inductor/fx_passes/post_grad.py
@@ -1517,6 +1517,11 @@ def should_prefer_unfused_addmm(match):
     extra_check=should_prefer_unfused_addmm,
 )
 def unfuse_bias_add_to_pointwise(match: Match, mat1, mat2, *, inp, alpha, beta):
+    # Unfusing addmm introduces an extra bf16/fp16 truncation at the mm output
+    # that compounds through deep models and causes accuracy failures.
+    if inp.meta["val"].dtype in (torch.bfloat16, torch.float16):
+        return
+
     def repl(inp, x1, x2, alpha, beta):
         mm_result = x1 @ x2
         if alpha != 1:

From e2fa2953033020ad7e0f823ec534044fac15a3c7 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Wed, 11 Mar 2026 11:49:21 -0700
Subject: [PATCH 34/87] [CD] Unpin cuda-bindings dependencies (#177159)

[CD] Unpin cuda-bindings dependencies (#176042)

Allow any cuda-bindings release within the same CUDA major version.

Fixes https://github.com/pytorch/pytorch/issues/175948
Pull Request resolved: https://github.com/pytorch/pytorch/pull/176042
Approved by: https://github.com/ngimel, https://github.com/drisspg

(cherry picked from commit 87f052cebb66c799b6fef71c5e5fa13af2165ac3)

Co-authored-by: Nikita Shulga <nshulga@meta.com>
---
 .../scripts/generate_binary_build_matrix.py   |  8 +--
 ...linux-aarch64-binary-manywheel-nightly.yml | 56 +++++++++----------
 ...nerated-linux-binary-manywheel-nightly.yml | 56 +++++++++----------
 3 files changed, 60 insertions(+), 60 deletions(-)

diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py
index eda03260446be..afcd637c6c57c 100644
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@@ -51,7 +51,7 @@
 PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
     "12.6": (
         "cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | "  # noqa: B950
-        "cuda-bindings==12.9.4; platform_system == 'Linux' | "
+        "cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | "
         "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
         "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
         "nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | "
@@ -59,7 +59,7 @@
     ),
     "12.8": (
         "cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | "  # noqa: B950
-        "cuda-bindings==12.9.4; platform_system == 'Linux' | "
+        "cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | "
         "nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | "
         "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
         "nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | "
@@ -67,7 +67,7 @@
     ),
     "12.9": (
         "cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | "  # noqa: B950
-        "cuda-bindings==12.9.4; platform_system == 'Linux' | "
+        "cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | "
         "nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | "
         "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
         "nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | "
@@ -75,7 +75,7 @@
     ),
     "13.0": (
         "cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | "  # noqa: B950
-        "cuda-bindings==13.0.3; platform_system == 'Linux' | "
+        "cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | "
         "nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | "
         "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
         "nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | "
diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
index 567d099675a60..fe4f51edf2c19 100644
--- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
@@ -133,7 +133,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_10-cuda-aarch64-12_6
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -204,7 +204,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_10-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -275,7 +275,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_10-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -346,7 +346,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_10-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -483,7 +483,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_11-cuda-aarch64-12_6
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -554,7 +554,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_11-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -625,7 +625,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_11-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -696,7 +696,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_11-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -833,7 +833,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_12-cuda-aarch64-12_6
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -904,7 +904,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_12-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -975,7 +975,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_12-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1046,7 +1046,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_12-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1183,7 +1183,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13-cuda-aarch64-12_6
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1254,7 +1254,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1325,7 +1325,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1396,7 +1396,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1533,7 +1533,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13t-cuda-aarch64-12_6
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1604,7 +1604,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13t-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1675,7 +1675,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13t-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1746,7 +1746,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13t-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1883,7 +1883,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14-cuda-aarch64-12_6
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1954,7 +1954,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -2025,7 +2025,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -2096,7 +2096,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -2233,7 +2233,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14t-cuda-aarch64-12_6
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -2304,7 +2304,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14t-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -2375,7 +2375,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14t-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -2446,7 +2446,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14t-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
index 2e3aaa64f4d42..b820042fba170 100644
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@@ -128,7 +128,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_10-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -195,7 +195,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_10-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -262,7 +262,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_10-cuda12_9
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -329,7 +329,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_10-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -792,7 +792,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_11-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -859,7 +859,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_11-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -926,7 +926,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_11-cuda12_9
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -993,7 +993,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_11-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -1456,7 +1456,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_12-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -1523,7 +1523,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_12-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -1590,7 +1590,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_12-cuda12_9
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -1657,7 +1657,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_12-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -2120,7 +2120,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -2187,7 +2187,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -2254,7 +2254,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13-cuda12_9
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -2321,7 +2321,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -2784,7 +2784,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13t-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -2851,7 +2851,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13t-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -2918,7 +2918,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13t-cuda12_9
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -2985,7 +2985,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13t-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -3448,7 +3448,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -3515,7 +3515,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -3582,7 +3582,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14-cuda12_9
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -3649,7 +3649,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -4112,7 +4112,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14t-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.6.3; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -4179,7 +4179,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14t-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.8.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -4246,7 +4246,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14t-cuda12_9
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==12.9.1; platform_system == 'Linux' | cuda-bindings>=12.9.4,<13; platform_system == 'Linux' | nvidia-cudnn-cu12==9.17.1.4; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -4313,7 +4313,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14t-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-toolkit[nvrtc,cudart,cupti,cufft,curand,cusolver,cusparse,cublas,cufile,nvjitlink,nvtx]==13.0.2; platform_system == 'Linux' | cuda-bindings>=13.0.3,<14; platform_system == 'Linux' | nvidia-cudnn-cu13==9.19.0.56; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.9; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 

From 41f8e3e0381395e1669ca4bc6e36a7872d25cdcd Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Wed, 11 Mar 2026 11:52:13 -0700
Subject: [PATCH 35/87] [CI] Stop using G3 runners (#177161)

[CI] Stop using G3 runners (#175938)

G3 runners use the old Tesla M60, which reached EOL back in October 2025.

It's really hard to find an official doc, but here is a public issue about it: https://github.com/SummitRoute/aws_breaking_changes/issues/114
Pull Request resolved: https://github.com/pytorch/pytorch/pull/175938
Approved by: https://github.com/seemethere, https://github.com/jeanschmidt

(cherry picked from commit 3b68f13463ea5499c5af8ca1a3138ea06a26c852)

Co-authored-by: Nikita Shulga <nshulga@meta.com>
---
 .github/actionlint.yaml                       |  6 --
 .../linux_binary_build_workflow.yml.j2        |  5 +-
 .github/workflows/_binary-test-linux.yml      |  2 +-
 ...nerated-linux-binary-manywheel-nightly.yml | 56 +++++++++----------
 4 files changed, 30 insertions(+), 39 deletions(-)

diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml
index 95637501e1069..8ef1ee2240a2e 100644
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@@ -25,9 +25,6 @@ self-hosted-runner:
     - linux.aws.h100
     - linux.aws.h100.4
     - linux.aws.h100.8
-    - linux.4xlarge.nvidia.gpu
-    - linux.8xlarge.nvidia.gpu
-    - linux.16xlarge.nvidia.gpu
     - linux.g5.4xlarge.nvidia.gpu
     - linux.c7i.2xlarge
     # Pytorch/pytorch AWS Linux Runners on Linux Foundation account
@@ -37,9 +34,6 @@ self-hosted-runner:
     - lf.linux.12xlarge
     - lf.linux.24xlarge
     - lf.linux.arm64.2xlarge
-    - lf.linux.4xlarge.nvidia.gpu
-    - lf.linux.8xlarge.nvidia.gpu
-    - lf.linux.16xlarge.nvidia.gpu
     - lf.linux.g5.4xlarge.nvidia.gpu
     - lf.linux.c7i.2xlarge
     # Repo-specific IBM hosted S390x runner
diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2
index e110f33d8ce39..3f41256728e52 100644
--- a/.github/templates/linux_binary_build_workflow.yml.j2
+++ b/.github/templates/linux_binary_build_workflow.yml.j2
@@ -116,12 +116,9 @@ jobs:
       ALPINE_IMAGE: "docker.io/s390x/alpine"
       {%- elif config["gpu_arch_type"] == "rocm" %}
       runs_on: linux.rocm.gpu
-      {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] in ["12.6"] %}
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu  # 12.6 build can use maxwell (sm_50) runner
       {%- elif config["gpu_arch_type"] == "cuda" %}
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
       {%- else %}
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       runs_on: linux.4xlarge
diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml
index ed7738ecbdcc2..9f22bf2a01dda 100644
--- a/.github/workflows/_binary-test-linux.yml
+++ b/.github/workflows/_binary-test-linux.yml
@@ -63,7 +63,7 @@ on:
       runs_on:
         required: true
         type: string
-        description: Hardware to run this job on. Valid values are linux.4xlarge, linux.4xlarge.nvidia.gpu, linux.arm64.2xlarge, and linux.rocm.gpu
+        description: Hardware to run this job on. Valid values are linux.4xlarge, linux.g4dn.4xlarge.nvidia.gpu, linux.arm64.2xlarge, and linux.rocm.gpu
     secrets:
       github-token:
         required: true
diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
index b820042fba170..3dd2d544fd7f0 100644
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@@ -152,7 +152,7 @@ jobs:
       build_name: manywheel-py3_10-cuda12_6
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu  # 12.6 build can use maxwell (sm_50) runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_10-cuda12_6-upload:  # Uploading
@@ -219,7 +219,7 @@ jobs:
       build_name: manywheel-py3_10-cuda12_8
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_10-cuda12_8-upload:  # Uploading
@@ -286,7 +286,7 @@ jobs:
       build_name: manywheel-py3_10-cuda12_9
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_10-cuda12_9-upload:  # Uploading
@@ -353,7 +353,7 @@ jobs:
       build_name: manywheel-py3_10-cuda13_0
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_10-cuda13_0-upload:  # Uploading
@@ -816,7 +816,7 @@ jobs:
       build_name: manywheel-py3_11-cuda12_6
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu  # 12.6 build can use maxwell (sm_50) runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_11-cuda12_6-upload:  # Uploading
@@ -883,7 +883,7 @@ jobs:
       build_name: manywheel-py3_11-cuda12_8
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_11-cuda12_8-upload:  # Uploading
@@ -950,7 +950,7 @@ jobs:
       build_name: manywheel-py3_11-cuda12_9
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_11-cuda12_9-upload:  # Uploading
@@ -1017,7 +1017,7 @@ jobs:
       build_name: manywheel-py3_11-cuda13_0
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_11-cuda13_0-upload:  # Uploading
@@ -1480,7 +1480,7 @@ jobs:
       build_name: manywheel-py3_12-cuda12_6
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu  # 12.6 build can use maxwell (sm_50) runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_12-cuda12_6-upload:  # Uploading
@@ -1547,7 +1547,7 @@ jobs:
       build_name: manywheel-py3_12-cuda12_8
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_12-cuda12_8-upload:  # Uploading
@@ -1614,7 +1614,7 @@ jobs:
       build_name: manywheel-py3_12-cuda12_9
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_12-cuda12_9-upload:  # Uploading
@@ -1681,7 +1681,7 @@ jobs:
       build_name: manywheel-py3_12-cuda13_0
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_12-cuda13_0-upload:  # Uploading
@@ -2144,7 +2144,7 @@ jobs:
       build_name: manywheel-py3_13-cuda12_6
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu  # 12.6 build can use maxwell (sm_50) runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13-cuda12_6-upload:  # Uploading
@@ -2211,7 +2211,7 @@ jobs:
       build_name: manywheel-py3_13-cuda12_8
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13-cuda12_8-upload:  # Uploading
@@ -2278,7 +2278,7 @@ jobs:
       build_name: manywheel-py3_13-cuda12_9
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13-cuda12_9-upload:  # Uploading
@@ -2345,7 +2345,7 @@ jobs:
       build_name: manywheel-py3_13-cuda13_0
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13-cuda13_0-upload:  # Uploading
@@ -2808,7 +2808,7 @@ jobs:
       build_name: manywheel-py3_13t-cuda12_6
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu  # 12.6 build can use maxwell (sm_50) runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13t-cuda12_6-upload:  # Uploading
@@ -2875,7 +2875,7 @@ jobs:
       build_name: manywheel-py3_13t-cuda12_8
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13t-cuda12_8-upload:  # Uploading
@@ -2942,7 +2942,7 @@ jobs:
       build_name: manywheel-py3_13t-cuda12_9
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13t-cuda12_9-upload:  # Uploading
@@ -3009,7 +3009,7 @@ jobs:
       build_name: manywheel-py3_13t-cuda13_0
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13t-cuda13_0-upload:  # Uploading
@@ -3472,7 +3472,7 @@ jobs:
       build_name: manywheel-py3_14-cuda12_6
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu  # 12.6 build can use maxwell (sm_50) runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_14-cuda12_6-upload:  # Uploading
@@ -3539,7 +3539,7 @@ jobs:
       build_name: manywheel-py3_14-cuda12_8
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_14-cuda12_8-upload:  # Uploading
@@ -3606,7 +3606,7 @@ jobs:
       build_name: manywheel-py3_14-cuda12_9
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_14-cuda12_9-upload:  # Uploading
@@ -3673,7 +3673,7 @@ jobs:
       build_name: manywheel-py3_14-cuda13_0
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_14-cuda13_0-upload:  # Uploading
@@ -4136,7 +4136,7 @@ jobs:
       build_name: manywheel-py3_14t-cuda12_6
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu  # 12.6 build can use maxwell (sm_50) runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_14t-cuda12_6-upload:  # Uploading
@@ -4203,7 +4203,7 @@ jobs:
       build_name: manywheel-py3_14t-cuda12_8
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_14t-cuda12_8-upload:  # Uploading
@@ -4270,7 +4270,7 @@ jobs:
       build_name: manywheel-py3_14t-cuda12_9
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_14t-cuda12_9-upload:  # Uploading
@@ -4337,7 +4337,7 @@ jobs:
       build_name: manywheel-py3_14t-cuda13_0
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_14t-cuda13_0-upload:  # Uploading

From 036b25f5a29dc58cbc62e7b976efb860ff128c3f Mon Sep 17 00:00:00 2001
From: mikaylagawarecki <mikaylagawarecki@gmail.com>
Date: Wed, 11 Mar 2026 15:00:15 -0400
Subject: [PATCH 36/87] Let stable::from_blob accept a lambda as deleter
 (cherry-pick) (#176440)

---
 .../csrc/my_from_blob_with_lambda_deleter.cpp | 101 ++++++++++++++++++
 .../libtorch_agn_2_11/ops.py                  |  53 +++++++++
 test/cpp_extensions/test_libtorch_agnostic.py |  75 +++++++++++++
 torch/csrc/shim_common.cpp                    |  10 +-
 torch/csrc/stable/c/shim.h                    |   8 +-
 torch/csrc/stable/ops.h                       |  71 ++++++++----
 6 files changed, 294 insertions(+), 24 deletions(-)
 create mode 100644 test/cpp_extensions/libtorch_agn_2_11_extension/csrc/my_from_blob_with_lambda_deleter.cpp

diff --git a/test/cpp_extensions/libtorch_agn_2_11_extension/csrc/my_from_blob_with_lambda_deleter.cpp b/test/cpp_extensions/libtorch_agn_2_11_extension/csrc/my_from_blob_with_lambda_deleter.cpp
new file mode 100644
index 0000000000000..8e498cbf2b9f4
--- /dev/null
+++ b/test/cpp_extensions/libtorch_agn_2_11_extension/csrc/my_from_blob_with_lambda_deleter.cpp
@@ -0,0 +1,101 @@
+#include <torch/csrc/stable/device.h>
+#include <torch/csrc/stable/library.h>
+#include <torch/csrc/stable/ops.h>
+#include <torch/csrc/stable/tensor.h>
+
+#ifdef LAE_USE_CUDA
+#include <cuda_runtime.h>
+#endif
+
+using torch::stable::Tensor;
+
+// Global counter to track lambda deleter calls for testing
+static int64_t g_lambda_deleter_call_count = 0;
+
+// Wrapper for from_blob with a capturing-lambda deleter.
+// The lambda captures a pointer to the global counter and increments it,
+// which exercises the capturing-lambda code path in torch_from_blob.
+Tensor my_from_blob_with_lambda_deleter(
+    int64_t data_ptr,
+    torch::headeronly::HeaderOnlyArrayRef<int64_t> sizes,
+    torch::headeronly::HeaderOnlyArrayRef<int64_t> strides,
+    torch::stable::Device device,
+    torch::headeronly::ScalarType dtype) {
+  void* data = reinterpret_cast<void*>(data_ptr);
+  int64_t* counter = &g_lambda_deleter_call_count;
+  auto deleter = [counter](void* /*data*/) { (*counter)++; };
+  return torch::stable::from_blob(data, sizes, strides, device, dtype, deleter);
+}
+
+int64_t get_lambda_deleter_call_count() {
+  return g_lambda_deleter_call_count;
+}
+
+void reset_lambda_deleter_call_count() {
+  g_lambda_deleter_call_count = 0;
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(STABLE_LIB_NAME, m) {
+  m.def(
+      "my_from_blob_with_lambda_deleter(int data_ptr, int[] sizes, int[] strides, Device device, ScalarType dtype) -> Tensor");
+  m.def("get_lambda_deleter_call_count() -> int");
+  m.def("reset_lambda_deleter_call_count() -> ()");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(
+    STABLE_LIB_NAME,
+    CompositeExplicitAutograd,
+    m) {
+  m.impl(
+      "my_from_blob_with_lambda_deleter",
+      TORCH_BOX(&my_from_blob_with_lambda_deleter));
+  m.impl(
+      "get_lambda_deleter_call_count",
+      TORCH_BOX(&get_lambda_deleter_call_count));
+  m.impl(
+      "reset_lambda_deleter_call_count",
+      TORCH_BOX(&reset_lambda_deleter_call_count));
+}
+
+#ifdef LAE_USE_CUDA
+
+// Same as my_from_blob_with_cuda_deleter (from 2.11) but uses a non-capturing
+// lambda deleter.
+Tensor my_from_blob_with_cuda_lambda_deleter(
+    int64_t numel,
+    torch::stable::Device device) {
+  size_t size_bytes = numel * sizeof(float);
+
+  void* data = nullptr;
+  cudaError_t err = cudaMalloc(&data, size_bytes);
+  if (err != cudaSuccess) {
+    throw std::runtime_error("cudaMalloc failed");
+  }
+
+  // Zero the memory
+  cudaMemset(data, 0, size_bytes);
+
+  std::array<int64_t, 1> sizes = {numel};
+  std::array<int64_t, 1> strides = {1};
+
+  // This lambda doesn't capture anything, but capture is tested above in
+  // my_from_blob_with_lambda_deleter
+  auto deleter = [](void* data) { cudaFree(data); };
+  return torch::stable::from_blob(
+      data,
+      torch::headeronly::HeaderOnlyArrayRef<int64_t>(sizes.data(), sizes.size()),
+      torch::headeronly::HeaderOnlyArrayRef<int64_t>(strides.data(), strides.size()),
+      device,
+      torch::headeronly::ScalarType::Float,
+      deleter);
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(STABLE_LIB_NAME, m) {
+  m.def("my_from_blob_with_cuda_lambda_deleter(int numel, Device device) -> Tensor");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(STABLE_LIB_NAME, CompositeExplicitAutograd, m) {
+  m.impl("my_from_blob_with_cuda_lambda_deleter", TORCH_BOX(&my_from_blob_with_cuda_lambda_deleter));
+}
+
+#endif  // LAE_USE_CUDA
diff --git a/test/cpp_extensions/libtorch_agn_2_11_extension/libtorch_agn_2_11/ops.py b/test/cpp_extensions/libtorch_agn_2_11_extension/libtorch_agn_2_11/ops.py
index 4315898009269..442f9c97ce5db 100644
--- a/test/cpp_extensions/libtorch_agn_2_11_extension/libtorch_agn_2_11/ops.py
+++ b/test/cpp_extensions/libtorch_agn_2_11_extension/libtorch_agn_2_11/ops.py
@@ -57,6 +57,59 @@ def my_from_blob_with_cuda_deleter(numel: int, device) -> Tensor:
     )
 
 
+def my_from_blob_with_lambda_deleter(data_ptr, sizes, strides, device, dtype) -> Tensor:
+    """
+    Creates a Tensor from existing memory with a capturing-lambda deleter.
+
+    The deleter is a capturing lambda that updates a global call count,
+    exercising the capturing-lambda code path in torch_from_blob.
+
+    Args:
+        data_ptr: int - pointer to the data buffer
+        sizes: tuple[int] - size of the tensor
+        strides: tuple[int] - strides of the tensor
+        device: Device - device on which the tensor resides
+        dtype: ScalarType - data type of the tensor
+
+    Returns: Tensor - tensor wrapping the existing memory
+    """
+    return torch.ops.libtorch_agn_2_11.my_from_blob_with_lambda_deleter.default(
+        data_ptr, sizes, strides, device, dtype
+    )
+
+
+def get_lambda_deleter_call_count() -> int:
+    """
+    Returns the number of times the lambda test deleter has been called.
+    """
+    return torch.ops.libtorch_agn_2_11.get_lambda_deleter_call_count.default()
+
+
+def reset_lambda_deleter_call_count() -> None:
+    """
+    Resets the lambda deleter call counter to zero.
+    """
+    torch.ops.libtorch_agn_2_11.reset_lambda_deleter_call_count.default()
+
+
+def my_from_blob_with_cuda_lambda_deleter(numel: int, device) -> Tensor:
+    """
+    Creates a CUDA tensor that owns its memory via cudaMalloc, using a lambda deleter.
+
+    Similar to my_from_blob_with_cuda_deleter but passes a non-capturing
+    lambda as the deleter.
+
+    Args:
+        numel: int - number of elements in the tensor
+        device: Device - CUDA device
+
+    Returns: Tensor - a 1D float32 tensor of zeros
+    """
+    return torch.ops.libtorch_agn_2_11.my_from_blob_with_cuda_lambda_deleter.default(
+        numel, device
+    )
+
+
 # =============================================================================
 # Proxy for inherited ops (from libtorch_agn_2_9 and libtorch_agn_2_10 csrc/)
 #
diff --git a/test/cpp_extensions/test_libtorch_agnostic.py b/test/cpp_extensions/test_libtorch_agnostic.py
index 195651c4284ae..b2843fef09822 100644
--- a/test/cpp_extensions/test_libtorch_agnostic.py
+++ b/test/cpp_extensions/test_libtorch_agnostic.py
@@ -1809,6 +1809,81 @@ def test_my_from_blob_with_cuda_deleter_no_leak(self, device):
             curr_mem = torch.cuda.memory_allocated(device)
             self.assertEqual(curr_mem, init_mem)
 
+    @skipIfTorchVersionLessThan(2, 11)
+    @skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor")
+    def test_my_from_blob_with_lambda_deleter(self, device):
+        """Test for from_blob with capturing-lambda deleter (2.11 feature)."""
+        import libtorch_agn_2_11 as libtorch_agnostic
+
+        from_blob_fn = libtorch_agnostic.ops.my_from_blob_with_lambda_deleter
+        get_count = libtorch_agnostic.ops.get_lambda_deleter_call_count
+        reset_count = libtorch_agnostic.ops.reset_lambda_deleter_call_count
+
+        is_cuda = torch.device(device).type == "cuda"
+        if is_cuda:
+            init_mem = torch.cuda.memory_allocated(device)
+
+        def inner():
+            reset_count()
+            self.assertEqual(get_count(), 0)
+
+            # We need an original tensor to create the tensor with from_blob.
+            original = torch.rand(2, 3, device=device, dtype=torch.float32)
+            blob_tensor = from_blob_fn(
+                original.data_ptr(),
+                original.size(),
+                original.stride(),
+                device,
+                torch.float32,
+            )
+
+            self.assertEqual(blob_tensor, original)
+            self.assertEqual(blob_tensor.data_ptr(), original.data_ptr())
+
+            self.assertEqual(get_count(), 0)
+
+            del blob_tensor
+            gc.collect()
+
+            # Ensure the deleter was called. The original tensor still exists
+            # and can be used.
+            self.assertEqual(get_count(), 1)
+            original += 1
+            # original goes out of scope here and its cuda memory should be
+            # freed.
+
+        inner()
+
+        if is_cuda:
+            # original tensor is out of scope, all the memory should be freed
+            torch.cuda.synchronize(device)
+            curr_mem = torch.cuda.memory_allocated(device)
+            self.assertEqual(curr_mem, init_mem)
+
+    @onlyCUDA
+    @skipIfTorchVersionLessThan(2, 11)
+    def test_my_from_blob_with_cuda_lambda_deleter_no_leak(self, device):
+        """Test that from_blob lambda deleter properly frees cudaMalloc'd memory."""
+        import libtorch_agn_2_11 as libtorch_agnostic
+
+        from_blob_fn = libtorch_agnostic.ops.my_from_blob_with_cuda_lambda_deleter
+
+        torch.cuda.synchronize(device)
+        init_mem = torch.cuda.memory_allocated(device)
+        numel = 1024 * 1024  # 4 MB per tensor
+
+        for _ in range(10):
+            tensor = from_blob_fn(numel, device)
+            # Verify tensor was created correctly
+            self.assertEqual(tensor.numel(), numel)
+            self.assertEqual(tensor.device, torch.device(device))
+            del tensor
+            gc.collect()
+            torch.cuda.synchronize(device)
+
+            curr_mem = torch.cuda.memory_allocated(device)
+            self.assertEqual(curr_mem, init_mem)
+
     @onlyCPU
     def test_my_layout(self, device):
         """Test layout() method for various tensor layouts."""
diff --git a/torch/csrc/shim_common.cpp b/torch/csrc/shim_common.cpp
index d6c3cd2b41e0d..eac8147a30b29 100644
--- a/torch/csrc/shim_common.cpp
+++ b/torch/csrc/shim_common.cpp
@@ -667,7 +667,8 @@ AOTI_TORCH_EXPORT AOTITorchError torch_from_blob(
     int32_t layout,
     const uint8_t* opaque_metadata,
     int64_t opaque_metadata_size,
-    void (*deleter)(void*)) {
+    void (*deleter_callback)(void* data, void* ctx),
+    void* deleter_ctx) {
   AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({
     c10::IntArrayRef sizes(sizes_ptr, ndim);
     c10::IntArrayRef strides(strides_ptr, ndim);
@@ -676,11 +677,14 @@ AOTI_TORCH_EXPORT AOTITorchError torch_from_blob(
         static_cast<c10::ScalarType>(dtype));
     at::Tensor tensor;
     if (data != nullptr) {
-      if (deleter != nullptr) {
+      if (deleter_callback != nullptr) {
+        auto wrapped_deleter = [deleter_callback, deleter_ctx](void* data) {
+          deleter_callback(data, deleter_ctx);
+        };
         tensor = at::for_blob(data, sizes)
                      .strides(strides)
                      .storage_offset(storage_offset)
-                     .deleter(deleter)
+                     .deleter(wrapped_deleter)
                      .options(options)
                      .make_tensor();
       } else {
diff --git a/torch/csrc/stable/c/shim.h b/torch/csrc/stable/c/shim.h
index f8a21e6c570f6..ec3caec593beb 100644
--- a/torch/csrc/stable/c/shim.h
+++ b/torch/csrc/stable/c/shim.h
@@ -165,8 +165,9 @@ AOTI_TORCH_EXPORT int32_t torch_dtype_float8_e8m0fnu();
 AOTI_TORCH_EXPORT int32_t torch_dtype_float4_e2m1fn_x2();
 
 // Creates a tensor from an existing data blob with an optional deleter.
-// The deleter is called with the data pointer when the tensor's storage
-// is deallocated.
+// The deleter receives both the data pointer and a caller-supplied context
+// pointer, which allows passing capturing lambdas across the C ABI boundary
+// by heap-allocating the callable and passing it as deleter_ctx.
 AOTI_TORCH_EXPORT AOTITorchError torch_from_blob(
     void* data,
     int64_t ndim,
@@ -180,7 +181,8 @@ AOTI_TORCH_EXPORT AOTITorchError torch_from_blob(
     int32_t layout,
     const uint8_t* opaque_metadata,
     int64_t opaque_metadata_size,
-    void (*deleter)(void*));
+    void (*deleter)(void* data, void* ctx),
+    void* deleter_ctx);
 
 #endif // TORCH_FEATURE_VERSION >= TORCH_VERSION_2_11_0
 
diff --git a/torch/csrc/stable/ops.h b/torch/csrc/stable/ops.h
index 19c109404cb5b..dbede4faba49e 100644
--- a/torch/csrc/stable/ops.h
+++ b/torch/csrc/stable/ops.h
@@ -5,6 +5,7 @@
 #include <cstdint>
 #include <optional>
 #include <string>
+#include <type_traits>
 
 #include <torch/csrc/inductor/aoti_torch/generated/c_shim_aten.h>
 #include <torch/csrc/stable/c/shim.h>
@@ -722,26 +723,28 @@ inline torch::stable::Tensor from_blob(
 ///
 /// This is the same as the from_blob function above, but allows specifying a
 /// custom deleter function that will be called when the tensor's storage is
-/// deallocated.
+/// deallocated. Accepts both plain function pointers and capturing lambdas.
+///
 /// Minimum compatible version: PyTorch 2.11.
 ///
+/// @tparam F The callable type. Must be invocable with (void*).
 /// @param data Pointer to the data buffer.
 /// @param sizes The size of each dimension of the tensor.
 /// @param strides The stride for each dimension.
 /// @param device The device where the data resides.
 /// @param dtype The scalar type of the data.
-/// @param deleter Function to call when the tensor is deallocated. May be
-///                nullptr if no cleanup is needed.
+/// @param deleter Called on deallocation; function pointers must be non-null.
 /// @param storage_offset The offset into the data buffer. Defaults to 0.
 /// @param layout The memory layout. Defaults to Strided.
 /// @return A tensor backed by the provided data.
+template <class F, std::enable_if_t<std::is_invocable_v<F, void*>, int> = 0>
 inline torch::stable::Tensor from_blob(
     void* data,
     torch::headeronly::IntHeaderOnlyArrayRef sizes,
     torch::headeronly::IntHeaderOnlyArrayRef strides,
     torch::stable::Device device,
     torch::headeronly::ScalarType dtype,
-    DeleterFnPtr deleter,
+    F deleter,
     int64_t storage_offset = 0,
     torch::headeronly::Layout layout = torch::headeronly::Layout::Strided) {
   auto shim_dtype =
@@ -750,21 +753,53 @@ inline torch::stable::Tensor from_blob(
       torch::stable::detail::from(device.type()));
   auto shim_layout =
       torch::stable::detail::to<int32_t>(torch::stable::detail::from(layout));
+
   AtenTensorHandle ath;
-  TORCH_ERROR_CODE_CHECK(torch_from_blob(
-      data,
-      sizes.size(),
-      sizes.data(),
-      strides.data(),
-      storage_offset,
-      shim_dtype,
-      shim_device_type,
-      device.index(),
-      &ath,
-      shim_layout,
-      nullptr,
-      0,
-      deleter));
+  if constexpr (std::is_convertible_v<F, DeleterFnPtr>) {
+    // Convertible to a function pointer: pass it as ctx, no heap allocation.
+    auto deleter_callback = [](void* data, void* ctx) {
+      auto fn = reinterpret_cast<DeleterFnPtr>(ctx);
+      fn(data);
+    };
+    TORCH_ERROR_CODE_CHECK(torch_from_blob(
+        data,
+        sizes.size(),
+        sizes.data(),
+        strides.data(),
+        storage_offset,
+        shim_dtype,
+        shim_device_type,
+        device.index(),
+        &ath,
+        shim_layout,
+        nullptr,
+        0,
+        deleter_callback,
+        reinterpret_cast<void*>(static_cast<DeleterFnPtr>(deleter))));
+  } else {
+    // Capturing lambda: heap-allocate and type-erase.
+    F* heap_allocated_deleter = new F(std::move(deleter));
+    auto deleter_callback = [](void* data, void* ctx) {
+      F* func = static_cast<F*>(ctx);
+      (*func)(data);
+      delete func;
+    };
+    TORCH_ERROR_CODE_CHECK(torch_from_blob(
+        data,
+        sizes.size(),
+        sizes.data(),
+        strides.data(),
+        storage_offset,
+        shim_dtype,
+        shim_device_type,
+        device.index(),
+        &ath,
+        shim_layout,
+        nullptr,
+        0,
+        deleter_callback,
+        static_cast<void*>(heap_allocated_deleter)));
+  }
   return torch::stable::Tensor(ath);
 }
 #endif // TORCH_FEATURE_VERSION >= TORCH_VERSION_2_11_0

From fa384de31efe6548e694758d47ff295f2c2edb57 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Date: Wed, 11 Mar 2026 15:03:16 -0700
Subject: [PATCH 37/87] [Inductor][MPS] Fix half-precision type mismatches in
 Metal shader codegen (#176436) (#177193)

Metal Shading Language rejects implicit float-to-bfloat conversions, so
bare float literals like `0.0` in generated shaders cause compilation
failures when the target variable is `bfloat` (or `half`). Three codegen
methods were affected:

- `constant()` ignored its `dtype` parameter and returned raw literals.
- `masked()` assigned a bare literal in the else-branch (`} else tmp = 0.0;`).
- `where()` passed a bare literal through the ternary without casting.

All three now emit `static_cast<bfloat>(...)` / `static_cast<half>(...)`
where needed. Tests added for half-precision constants, reductions, and
conditionals.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/176436
Approved by: https://github.com/malfet

Test plan: Run `python -c "import torch;F=torch.nn.functional;print(torch.compile(lambda x: F.pad(F.gelu(x), [1, 0]))(torch.randn(4, device='mps', dtype=torch.bfloat16)))"`

(cherry picked from commit 3b161e7a756798e6eb1ab096f4ef1232d163a68d)

Co-authored-by: Mergen Nachin <mnachin@meta.com>
---
 torch/_inductor/codegen/mps.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/torch/_inductor/codegen/mps.py b/torch/_inductor/codegen/mps.py
index 05d0e84c681ad..4e409238d0b72 100644
--- a/torch/_inductor/codegen/mps.py
+++ b/torch/_inductor/codegen/mps.py
@@ -240,13 +240,17 @@ def masked(mask: CSEVariable, body: sympy.Expr, other: CSEVariable) -> str:
             )
             with V.kernel.compute.indent():
                 V.kernel.compute.splice(scoped_body)
-                V.kernel.compute.writeline(f"{var} = {rc};")
-            V.kernel.compute.writeline(f"}} else {var} = {other_str};")
+                V.kernel.compute.writeline(
+                    f"{var} = static_cast<decltype({var})>({rc});"
+                )
+            V.kernel.compute.writeline(
+                f"}} else {var} = static_cast<decltype({var})>({other_str});"
+            )
         return var
 
     @staticmethod
     def where(a: OpVarT, b: OpVarT, c: OpVarT) -> str:
-        return f"{a} ? {b} : {value_to_metal(c)}"
+        return f"{a} ? {b} : static_cast<decltype({b})>({value_to_metal(c)})"
 
     @staticmethod
     def remainder(a: OpVarT, b: OpVarT) -> str:

From 76fd07897dd9126df160e9723d97511b79888087 Mon Sep 17 00:00:00 2001
From: Andrey Talman <atalman@fb.com>
Date: Thu, 12 Mar 2026 15:59:41 -0400
Subject: [PATCH 38/87] [release-only] Fix libtorch builds. Fix lint (#177299)

---
 .../workflows/generated-linux-binary-libtorch-nightly.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml
index db8ed62b924ef..88152f2cf92dd 100644
--- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml
@@ -156,7 +156,7 @@ jobs:
       build_name: libtorch-cuda12_6-shared-with-deps-release
       build_environment: linux-binary-libtorch
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu  # 12.6 build can use maxwell (sm_50) runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   libtorch-cuda12_6-shared-with-deps-release-upload:  # Uploading
@@ -225,7 +225,7 @@ jobs:
       build_name: libtorch-cuda12_8-shared-with-deps-release
       build_environment: linux-binary-libtorch
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   libtorch-cuda12_8-shared-with-deps-release-upload:  # Uploading
@@ -294,7 +294,7 @@ jobs:
       build_name: libtorch-cuda12_9-shared-with-deps-release
       build_environment: linux-binary-libtorch
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   libtorch-cuda12_9-shared-with-deps-release-upload:  # Uploading
@@ -363,7 +363,7 @@ jobs:
       build_name: libtorch-cuda13_0-shared-with-deps-release
       build_environment: linux-binary-libtorch
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   libtorch-cuda13_0-shared-with-deps-release-upload:  # Uploading

From 7f2cdeb75b76bf07bb73776444bbb94456adbfa0 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Thu, 12 Mar 2026 20:43:15 -0700
Subject: [PATCH 39/87] [windows][smoke test] Add an option to install cuda if
 required cuda/cudnn on windows AMI do not match (#177369)

[windows][smoke test] Add an option to install cuda if required cuda/cudnn on windows AMI do not match (#177273)

Followup fix for https://github.com/pytorch/pytorch/issues/167242
After https://github.com/pytorch/pytorch/pull/175547 the Windows AMI and the intended version of cuDNN do not match, hence follow-up fixes are required. With this approach we have the flexibility to update the Windows cuDNN version without actually updating the Windows AMI. The Windows AMI update can be done later.

Fixes failure during test: https://github.com/pytorch/pytorch/actions/runs/22979336872/job/66724249070

```
RuntimeError: cuDNN version incompatibility: PyTorch was compiled  against (9, 19, 0) but found runtime version (9, 10, 2). PyTorch already comes bundled with cuDNN. One option to resolving this error is to ensure PyTorch can find the bundled cuDNN.
```

Please note: The cuda/cudnn version will not be updated if the right version is already installed.

Test Plan: In CI via ciflow/binaries
Pull Request resolved: https://github.com/pytorch/pytorch/pull/177273
Approved by: https://github.com/malfet, https://github.com/albanD

(cherry picked from commit e55da9f31ef3ca0a5e0bb0ba29c6d2a6b1352f52)

Co-authored-by: atalman <atalman@fb.com>
---
 .ci/pytorch/windows/internal/cuda_install.bat | 35 +++++++++++++------
 .ci/pytorch/windows/internal/smoke_test.bat   |  4 +++
 2 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/.ci/pytorch/windows/internal/cuda_install.bat b/.ci/pytorch/windows/internal/cuda_install.bat
index 456b53183f186..3538c7aa2d323 100644
--- a/.ci/pytorch/windows/internal/cuda_install.bat
+++ b/.ci/pytorch/windows/internal/cuda_install.bat
@@ -17,8 +17,8 @@ set /a CUDA_VER=%CUDA_VERSION%
 set CUDA_VER_MAJOR=%CUDA_VERSION:~0,-1%
 set CUDA_VER_MINOR=%CUDA_VERSION:~-1,1%
 set CUDA_VERSION_STR=%CUDA_VER_MAJOR%.%CUDA_VER_MINOR%
-set CUDNN_FOLDER="cuda"
-set CUDNN_LIB_FOLDER="lib\x64"
+set CUDNN_FOLDER=cuda
+set CUDNN_LIB_FOLDER=lib\x64
 
 :: If CUDA is already installed, skip CUDA installation but still verify cuDNN
 if exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" goto check_cudnn
@@ -59,7 +59,7 @@ goto cuda_download
 
 :: Common download logic for CUDA toolkit, cuDNN, and ZLIB
 :cuda_download
-set CUDNN_LIB_FOLDER="lib"
+set CUDNN_LIB_FOLDER=lib
 set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
 
 if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
@@ -126,9 +126,12 @@ if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_
 
     echo Installing cuDNN...
     7z x %CUDNN_SETUP_FILE% -o"%SRC_DIR%\temp_build\cudnn"
-    xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin"
-    xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\%CUDNN_LIB_FOLDER%\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\lib\x64"
-    xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\include\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include"
+    xcopy /Y /S "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\"
+    if exist "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\x64\*.*" (
+        xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\x64\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\"
+    )
+    xcopy /Y /S "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\%CUDNN_LIB_FOLDER%\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\lib\x64\"
+    xcopy /Y /S "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\include\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include\"
 
     echo Installing GPU driver DLLs
     7z x %SRC_DIR%\temp_build\gpu_driver_dlls.zip -o"C:\Windows\System32"
@@ -152,7 +155,7 @@ goto set_cuda_env_vars
 :: When CUDA is pre-installed on the AMI, cuDNN may still be missing.
 :: Set the correct cuDNN variables for the CUDA version, then install if needed.
 
-set CUDNN_LIB_FOLDER="lib"
+set CUDNN_LIB_FOLDER=lib
 if %CUDA_VER% EQU 126 (
     set CUDNN_FOLDER=cudnn-windows-x86_64-9.10.2.21_cuda12-archive
     set EXPECTED_CUDNN_VERSION=9.10.2
@@ -190,6 +193,11 @@ if "%INSTALLED_CUDNN_VERSION%" == "%EXPECTED_CUDNN_VERSION%" (
 
 echo cuDNN version mismatch: installed %INSTALLED_CUDNN_VERSION%, expected %EXPECTED_CUDNN_VERSION%. Reinstalling...
 
+:: Remove old cuDNN DLLs so they don't shadow the new version at runtime.
+:: AMI-installed cuDNN places DLLs directly in bin\, while newer archives
+:: use bin\x64\. Without cleanup the old DLLs in bin\ are found first.
+del /Q "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\cudnn*.dll" 2>nul
+
 :install_cudnn
 
 if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build"
@@ -202,9 +210,16 @@ if errorlevel 1 (
     echo Failed to extract cuDNN archive %CUDNN_INSTALL_ZIP%
     exit /b 1
 )
-xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin"
-xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\%CUDNN_LIB_FOLDER%\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\lib\x64"
-xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\include\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include"
+echo Listing extracted cuDNN archive contents:
+dir /S /B "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%"
+xcopy /Y /S "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\"
+:: Newer cuDNN archives place DLLs under bin\x64\. Flatten them into bin\
+:: so they are found via PATH (which only includes bin\, not bin\x64\).
+if exist "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\x64\*.*" (
+    xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\x64\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\"
+)
+xcopy /Y /S "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\%CUDNN_LIB_FOLDER%\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\lib\x64\"
+xcopy /Y /S "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\include\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include\"
 
 call :install_zlib
 
diff --git a/.ci/pytorch/windows/internal/smoke_test.bat b/.ci/pytorch/windows/internal/smoke_test.bat
index f671a9d0e0abb..c920dc2aeb165 100644
--- a/.ci/pytorch/windows/internal/smoke_test.bat
+++ b/.ci/pytorch/windows/internal/smoke_test.bat
@@ -5,6 +5,10 @@ pushd %SRC_DIR%\..
 if not "%CUDA_VERSION%" == "cpu" if not "%CUDA_VERSION%" == "xpu" call internal\driver_update.bat
 if errorlevel 1 exit /b 1
 
+echo "Check if CUDA and CUDNN versions need to be updated"
+call internal\cuda_install.bat
+if errorlevel 1 exit /b 1
+
 if "%CUDA_VERSION%" == "xpu" (
     call internal\xpu_install.bat
     if errorlevel 1 exit /b 1

From 483b55d84c74b92b3c2c67be4b9b7c7359ec2bbc Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Tue, 17 Mar 2026 11:03:09 -0700
Subject: [PATCH 40/87] Update pytorch_sphinx_theme2 version to 0.4.6 (#177616)

Update pytorch_sphinx_theme2 version to 0.4.6 (#177562)

Changelog here: https://github.com/pytorch/pytorch_sphinx_theme/blob/pytorch_sphinx_theme2/CHANGELOG.md

Pull Request resolved: https://github.com/pytorch/pytorch/pull/177562
Approved by: https://github.com/AlannaBurke, https://github.com/albanD

(cherry picked from commit c5dcefde3fe9bd88f25e626aa8201c54b4143f87)

Co-authored-by: Svetlana Karslioglu <svekars@meta.com>
---
 .ci/docker/requirements-docs.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt
index 7f3e0b5cc9215..7e556a80e0025 100644
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@@ -2,9 +2,9 @@ sphinx==7.2.6
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 7.2.6
 
-pytorch_sphinx_theme2==0.4.3
+pytorch_sphinx_theme2==0.4.6
 #Description: This is needed to generate PyTorch docs
-#Pinned versions: 0.4.3
+#Pinned versions: 0.4.6
 
 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought that it is probably

From db741c72097871e384b22ee6fff1d6083adf23cc Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Fri, 20 Mar 2026 20:04:26 -0700
Subject: [PATCH 41/87] [MPS] fix compiling of SDPA producing nan results
 (#178009)

[MPS] fix compiling of SDPA producing nan results (#175481)

Fixes #171764

Took me a while to figure out wth was going wrong.

Mini reproducer:
```python
import torch

# (uint / 65536) % non_power of 2, gives wrong result
lib = torch.mps.compile_shader('''
    kernel void func(device int* out, uint idx [[thread_position_in_grid]]) {
        out[idx] = (idx / 65536) % 6;
    }
''')

out = torch.empty(128, device='mps', dtype=torch.int32)
lib.func(out)

# Every value should be 0 since xindex/65536 == 0 for xindex in [0,127]
for i in [0, 5, 6, 7, 63, 64]:
    print(f"{i=} got {out[i].item()}")
```

Same purely in swift
```swift
import Metal

let device = MTLCreateSystemDefaultDevice()!
let queue = device.makeCommandQueue()!

let shaderSource = """
  kernel void func(device int* out [[buffer(0)]], uint idx [[thread_position_in_grid]]) {
    out[idx] = (idx / 65536) % 6;
  }
"""

let library = try device.makeLibrary(source: shaderSource, options: nil)
let function = library.makeFunction(name: "func")!
let pipeline = try device.makeComputePipelineState(function: function)

let count = 128
let buffer = device.makeBuffer(length: count * MemoryLayout<Int32>.stride, options: .storageModeShared)!

let cmdBuf = queue.makeCommandBuffer()!
let encoder = cmdBuf.makeComputeCommandEncoder()!
encoder.setComputePipelineState(pipeline)
encoder.setBuffer(buffer, offset: 0, index: 0)
encoder.dispatchThreads(
    MTLSizeMake(count, 1, 1),
    threadsPerThreadgroup: MTLSizeMake(min(count, pipeline.maxTotalThreadsPerThreadgroup), 1, 1)
)
encoder.endEncoding()
cmdBuf.commit()
cmdBuf.waitUntilCompleted()

let ptr = buffer.contents().bindMemory(to: Int32.self, capacity: count)
for i in [0, 5, 6, 7, 63, 64] {
    print("i=\(i) got \(ptr[i])")
}
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/175481
Approved by: https://github.com/malfet

(cherry picked from commit 3a9554c6436a1636d98db225af699d7e40c3bf12)

Co-authored-by: Isalia20 <irakli.salia854@gmail.com>
---
 c10/metal/utils.h               | 12 ++++++++++++
 test/inductor/test_mps_basic.py | 19 +++++++++++++++++++
 test/test_mps.py                | 16 ++++++++++++++++
 torch/_inductor/codegen/mps.py  |  5 +++++
 4 files changed, 52 insertions(+)

diff --git a/c10/metal/utils.h b/c10/metal/utils.h
index 8d58d0dfdd1f2..cc946e4fc4aa4 100644
--- a/c10/metal/utils.h
+++ b/c10/metal/utils.h
@@ -189,6 +189,18 @@ inline common_dtype<T, U> floor_divide(T x, U y) {
   return ::metal::floor(x / y);
 }
 
+// Workaround for Metal compiler bug: the compiler produces wrong results
+// when optimizing fused (x / A) % B expressions for integral types.
+template <
+    typename T,
+    typename U,
+    ::metal::enable_if_t<
+        is_scalar_integral_v<T> && is_scalar_integral_v<U>,
+        bool> = true>
+inline common_dtype<T, U> safe_mod(volatile T x, U y) {
+  return x % y;
+}
+
 // fmod
 template <
     typename T,
diff --git a/test/inductor/test_mps_basic.py b/test/inductor/test_mps_basic.py
index 5c1691e581ccc..49d25247a8e32 100644
--- a/test/inductor/test_mps_basic.py
+++ b/test/inductor/test_mps_basic.py
@@ -191,6 +191,25 @@ def fn(a, b):
             ),
         )
 
+    def test_sdpa_split_qkv(self):
+        # regression test for metal compiler bug where fused (x / A) % B
+        # produces wrong results, causing incorrect reads from non-contiguous.
+        n_head, n_embd, seq_len = 6, 384, 1024
+        x = torch.randn(16, seq_len, n_embd, device="mps")
+        c_attn = torch.nn.Linear(n_embd, 3 * n_embd).to("mps").eval()
+        qkv = c_attn(x)
+        q, k, v = qkv.split(n_embd, dim=2)
+        q = q.view(16, seq_len, n_head, n_embd // n_head).transpose(1, 2)
+        k = k.view(16, seq_len, n_head, n_embd // n_head).transpose(1, 2)
+        v = v.view(16, seq_len, n_head, n_embd // n_head).transpose(1, 2)
+
+        def fn(q, k, v):
+            return torch.nn.functional.scaled_dot_product_attention(
+                q, k, v, is_causal=True
+            )
+
+        self.common(fn, (q, k, v), atol=1e-4, rtol=1e-4, check_lowp=False)
+
 
 class MPSBasicTestsAOTI(TestCase):
     def check_model(self, m, inp, dynamic_shapes=None):
diff --git a/test/test_mps.py b/test/test_mps.py
index 02e291017582e..31a2e0161c2e6 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -13313,6 +13313,22 @@ def test_metal_error_buffer(self):
         with self.assertRaisesRegex(RuntimeError, "Index .* exceeds limit"):
             torch.mps.synchronize()
 
+    def test_metal_compiler_bug_workaround(self):
+        # (uint / 65536) % non_power of 2, gives wrong result without safe_mod
+        lib = torch.mps.compile_shader('''
+            #include <c10/metal/utils.h>
+
+            kernel void func(device int* out, uint idx [[thread_position_in_grid]]) {
+                out[idx] = c10::metal::safe_mod((idx / 65536), 6);
+            }
+        ''')
+        out = torch.empty(128, device='mps', dtype=torch.int32)
+        lib.func(out)
+        # Every value should be 0 since xindex/65536 == 0 for xindex in [0,127]
+        for i in [0, 5, 6, 7, 63, 64]:
+            self.assertEqual(out[i], 0)
+
+
 
 # TODO: Actually instantiate that test for the "mps" device to better reflect what it is doing.
 # This requires mps to be properly registered in the device generic test framework which is not the
diff --git a/torch/_inductor/codegen/mps.py b/torch/_inductor/codegen/mps.py
index 4e409238d0b72..b413a6f43f636 100644
--- a/torch/_inductor/codegen/mps.py
+++ b/torch/_inductor/codegen/mps.py
@@ -80,6 +80,9 @@ def _print_FloorDiv(self, expr: sympy.Expr) -> str:
 
     def _print_ModularIndexing(self, expr: sympy.Expr) -> str:
         x, div, mod = expr.args
+        # Workaround for Metal compiler bug with fused (x / A) % B, see PR 175481
+        use_safe_mod = div == 65536 and (mod & (mod - 1)) != 0
+
         x = self.doprint(x)
         if div != 1:
             div = self.doprint(div)
@@ -88,6 +91,8 @@ def _print_ModularIndexing(self, expr: sympy.Expr) -> str:
             else:
                 x = f"metal::floor({x}) / ({div})"
         mod = self.doprint(mod)
+        if use_safe_mod:
+            return f"c10::metal::safe_mod({x}, {mod})"
         return f"({x}) % ({mod})"
 
     def _print_Min(self, expr: sympy.Expr) -> str:

From 3e05c5a9ca8aacd0d137541876f8bf4cfca7e940 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Fri, 20 Mar 2026 20:05:16 -0700
Subject: [PATCH 42/87] [MPS] Properly handle conjugated tensors in bmm
 (#178010)

[MPS] Properly handle conjugated tensors in bmm (#177522)

Both `bmm` and `addmm` lacked proper handling of conjugated inputs for some of their arguments

- Add regression tests
- Fixes `test_noncontiguous_samples_linalg_svd_complex64`

Fixes https://github.com/pytorch/pytorch/issues/177474
Pull Request resolved: https://github.com/pytorch/pytorch/pull/177522
Approved by: https://github.com/Skylion007, https://github.com/kurtamohler

(cherry picked from commit bd1afa6b33a9f933a6da464b2f688b042bb5f275)

Co-authored-by: Nikita Shulga <nikita.shulga@gmail.com>
---
 .../native/mps/operations/LinearAlgebra.mm    | 24 +++++++++++--------
 test/test_mps.py                              | 21 ++++++++++++++++
 .../_internal/common_methods_invocations.py   |  8 -------
 .../_internal/opinfo/definitions/linalg.py    | 11 ---------
 4 files changed, 35 insertions(+), 29 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm
index d98134469ec04..fa3796e59b969 100644
--- a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm
+++ b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm
@@ -888,7 +888,8 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const
     std::string key = "addmm_out_mps_impl" + getTensorsStringKey({self, other, *bias_}) + ":" +
         std::to_string(beta.toDouble()) + ":" + std::to_string(alpha.toDouble());
     auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
-      MPSGraphTensor* biasTensor = mpsGraphRankedPlaceHolder(mpsGraph, *bias_);
+      auto biasTensor = mpsGraphRankedPlaceHolder(mpsGraph, *bias_);
+      auto biasTensor_ = bias_->is_conj() ? [mpsGraph conjugateWithTensor:biasTensor name:nil] : biasTensor;
 
       // TODO: Use alpha and beta here with fill_.Scalar and mul
       auto [selfTensor, otherTensor, productTensor] = do_mm(mpsGraph, self, other);
@@ -901,11 +902,11 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const
                                                             secondaryTensor:alphaTensor
                                                                        name:@"MM/alpha*(mat1@mat2)"];
       }
-      auto biasTimesBetaTensor = biasTensor;
+      auto biasTimesBetaTensor = biasTensor_;
       if (is_beta_non_zero && beta.toDouble() != 1.0) {
         auto betaTensor = [mpsGraph constantWithScalar:beta.toDouble()
                                               dataType:getMPSScalarType((*bias_).scalar_type())];
-        biasTimesBetaTensor = [mpsGraph multiplicationWithPrimaryTensor:biasTensor
+        biasTimesBetaTensor = [mpsGraph multiplicationWithPrimaryTensor:biasTensor_
                                                         secondaryTensor:betaTensor
                                                                    name:@"MM/beta*input"];
       }
@@ -1112,7 +1113,8 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const
   // Call tiled implementation if the number of elements exceeds 2^32
   uint64_t resultSize = batch1.size(0) * batch1.size(1) * batch2.size(2);
   if (resultSize > pow(2, 32)) {
-    result = tiled_bmm_out_mps_impl(batch1, batch2, result);
+    // Tiled path uses MPSNDArray directly, so resolve conjugate views upfront
+    result = tiled_bmm_out_mps_impl(batch1.resolve_conj(), batch2.resolve_conj(), result);
     return result;
   }
 
@@ -1130,16 +1132,18 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const
         std::to_string(doTranspose);
 
     auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
-      MPSGraphTensor* batch1Tensor = mps::mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(batch1.scalar_type()));
-      MPSGraphTensor* batch2Tensor = mps::mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(batch2.scalar_type()));
-      MPSGraphTensor* batch2TensorTranspose = batch2Tensor;
+      auto batch1Tensor = mps::mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(batch1.scalar_type()));
+      auto batch2Tensor = mps::mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(batch2.scalar_type()));
+
+      auto batch1TensorOp = batch1.is_conj() ? [mpsGraph conjugateWithTensor:batch1Tensor name:nil] : batch1Tensor;
+      auto batch2TensorOp = batch2.is_conj() ? [mpsGraph conjugateWithTensor:batch2Tensor name:nil] : batch2Tensor;
 
       if (doTranspose) {
-        batch2TensorTranspose = [mpsGraph transposeTensor:batch2Tensor dimension:-1 withDimension:-2 name:nil];
+        batch2TensorOp = [mpsGraph transposeTensor:batch2TensorOp dimension:-1 withDimension:-2 name:nil];
       }
 
-      MPSGraphTensor* productTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:batch1Tensor
-                                                                      secondaryTensor:batch2TensorTranspose
+      MPSGraphTensor* productTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:batch1TensorOp
+                                                                      secondaryTensor:batch2TensorOp
                                                                                  name:@"MM/(batch1@batch2)"];
 
       newCachedGraph->batch1Tensor_ = batch1Tensor;
diff --git a/test/test_mps.py b/test/test_mps.py
index 31a2e0161c2e6..5ca9f2ea484b8 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -1207,6 +1207,27 @@ def test_bmm(self):
         self.assertEqual(output_cpu, output_mps)
         self.assertEqual(output_cpu.size(), output_mps.size())
 
+    def test_bmm_conj(self):
+        # bmm must respect the conjugate bit on input tensors.
+        # See https://github.com/pytorch/pytorch/issues/177474
+        a = torch.randn(4, 3, 5, dtype=torch.complex64, device="mps")
+        b = torch.randn(4, 5, 2, dtype=torch.complex64, device="mps")
+        result_mps = torch.bmm(a, b.conj())
+        result_cpu = torch.bmm(a.cpu(), b.cpu().conj())
+        self.assertEqual(result_cpu, result_mps)
+        result_mps = torch.bmm(a.conj(), b)
+        result_cpu = torch.bmm(a.cpu().conj(), b.cpu())
+        self.assertEqual(result_cpu, result_mps)
+
+    def test_addmm_conj(self):
+        # Regression test: addmm must respect the conjugate bit on the bias tensor.
+        bias = torch.randn(3, 2, dtype=torch.complex64, device="mps")
+        a = torch.randn(3, 5, dtype=torch.complex64, device="mps")
+        b = torch.randn(5, 2, dtype=torch.complex64, device="mps")
+        result_mps = torch.addmm(bias.conj(), a, b)
+        result_cpu = torch.addmm(bias.cpu().conj(), a.cpu(), b.cpu())
+        self.assertEqual(result_cpu, result_mps)
+
     @xfailIf(MACOS_VERSION < 15.0)
     @parametrize("dtype", [torch.float16, torch.bfloat16])
     def test_large_bmm(self, dtype):
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index e3ac2e1c4aade..a019f5f1bfdce 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -19251,14 +19251,6 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
                             device_type='mps', dtypes=[torch.float32]),
                # The operator 'aten::take' is not currently implemented for the MPS device
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning', device_type='mps'),
-               # RuntimeError: svd_backward: The singular vectors in the complex
-               # case are specified up to multiplication by e^{i phi}. The
-               # specified loss function depends on this phase term, making it
-               # ill-defined.
-               DecorateInfo(
-                   unittest.expectedFailure, 'TestCommon', 'test_noncontiguous_samples',
-                   device_type='mps', dtypes=(torch.complex64,)
-               ),
            )),
     OpInfo('svd_lowrank',
            op=lambda *args, **kwargs: wrapper_set_seed(
diff --git a/torch/testing/_internal/opinfo/definitions/linalg.py b/torch/testing/_internal/opinfo/definitions/linalg.py
index 70c650dc327d8..b1b68d744751f 100644
--- a/torch/testing/_internal/opinfo/definitions/linalg.py
+++ b/torch/testing/_internal/opinfo/definitions/linalg.py
@@ -2728,17 +2728,6 @@ def make_input():
                 "test_out_warning",
                 device_type="mps",
             ),
-            # MPS: RuntimeError: svd_backward: The singular vectors in the
-            # complex case are specified up to multiplication by e^{i phi}. The
-            # specified loss function depends on this phase term, making it
-            # ill-defined.
-            DecorateInfo(
-                unittest.expectedFailure,
-                "TestCommon",
-                "test_noncontiguous_samples",
-                device_type="mps",
-                dtypes=(torch.complex64,),
-            ),
         ),
     ),
     OpInfo(

From 70d99e998b4955e0049d13a98d77ae1b14db1f45 Mon Sep 17 00:00:00 2001
From: Andrey Talman <atalman@fb.com>
Date: Fri, 20 Mar 2026 23:11:45 -0400
Subject: [PATCH 43/87] [release only] Increase timeout for rocm libtorch and
 manywheel builds (#178006)

---
 .../linux_binary_build_workflow.yml.j2        |  2 +-
 ...enerated-linux-binary-libtorch-nightly.yml |  4 +--
 ...nerated-linux-binary-manywheel-nightly.yml | 28 +++++++++----------
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2
index 3f41256728e52..f1b85e8a8bf65 100644
--- a/.github/templates/linux_binary_build_workflow.yml.j2
+++ b/.github/templates/linux_binary_build_workflow.yml.j2
@@ -79,7 +79,7 @@ jobs:
       timeout-minutes: 420
       {%- elif config["gpu_arch_type"] == "rocm" %}
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
+      timeout-minutes: 420
       {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %}
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       runs_on: linux.24xlarge.ephemeral
diff --git a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml
index 88152f2cf92dd..b05b969362f13 100644
--- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml
@@ -406,7 +406,7 @@ jobs:
       LIBTORCH_CONFIG: release
       LIBTORCH_VARIANT: shared-with-deps
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
+      timeout-minutes: 420
       build_name: libtorch-rocm7_1-shared-with-deps-release
       build_environment: linux-binary-libtorch
     secrets:
@@ -524,7 +524,7 @@ jobs:
       LIBTORCH_CONFIG: release
       LIBTORCH_VARIANT: shared-with-deps
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
+      timeout-minutes: 420
       build_name: libtorch-rocm7_2-shared-with-deps-release
       build_environment: linux-binary-libtorch
     secrets:
diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
index 3dd2d544fd7f0..08e238832f474 100644
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@@ -394,7 +394,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm7.1
       DESIRED_PYTHON: "3.10"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
+      timeout-minutes: 420
       build_name: manywheel-py3_10-rocm7_1
       build_environment: linux-binary-manywheel
     secrets:
@@ -509,7 +509,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm7.2
       DESIRED_PYTHON: "3.10"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
+      timeout-minutes: 420
       build_name: manywheel-py3_10-rocm7_2
       build_environment: linux-binary-manywheel
     secrets:
@@ -1058,7 +1058,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm7.1
       DESIRED_PYTHON: "3.11"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
+      timeout-minutes: 420
       build_name: manywheel-py3_11-rocm7_1
       build_environment: linux-binary-manywheel
     secrets:
@@ -1173,7 +1173,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm7.2
       DESIRED_PYTHON: "3.11"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
+      timeout-minutes: 420
       build_name: manywheel-py3_11-rocm7_2
       build_environment: linux-binary-manywheel
     secrets:
@@ -1722,7 +1722,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm7.1
       DESIRED_PYTHON: "3.12"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
+      timeout-minutes: 420
       build_name: manywheel-py3_12-rocm7_1
       build_environment: linux-binary-manywheel
     secrets:
@@ -1837,7 +1837,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm7.2
       DESIRED_PYTHON: "3.12"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
+      timeout-minutes: 420
       build_name: manywheel-py3_12-rocm7_2
       build_environment: linux-binary-manywheel
     secrets:
@@ -2386,7 +2386,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm7.1
       DESIRED_PYTHON: "3.13"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
+      timeout-minutes: 420
       build_name: manywheel-py3_13-rocm7_1
       build_environment: linux-binary-manywheel
     secrets:
@@ -2501,7 +2501,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm7.2
       DESIRED_PYTHON: "3.13"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
+      timeout-minutes: 420
       build_name: manywheel-py3_13-rocm7_2
       build_environment: linux-binary-manywheel
     secrets:
@@ -3050,7 +3050,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm7.1
       DESIRED_PYTHON: "3.13t"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
+      timeout-minutes: 420
       build_name: manywheel-py3_13t-rocm7_1
       build_environment: linux-binary-manywheel
     secrets:
@@ -3165,7 +3165,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm7.2
       DESIRED_PYTHON: "3.13t"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
+      timeout-minutes: 420
       build_name: manywheel-py3_13t-rocm7_2
       build_environment: linux-binary-manywheel
     secrets:
@@ -3714,7 +3714,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm7.1
       DESIRED_PYTHON: "3.14"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
+      timeout-minutes: 420
       build_name: manywheel-py3_14-rocm7_1
       build_environment: linux-binary-manywheel
     secrets:
@@ -3829,7 +3829,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm7.2
       DESIRED_PYTHON: "3.14"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
+      timeout-minutes: 420
       build_name: manywheel-py3_14-rocm7_2
       build_environment: linux-binary-manywheel
     secrets:
@@ -4378,7 +4378,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm7.1
       DESIRED_PYTHON: "3.14t"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
+      timeout-minutes: 420
       build_name: manywheel-py3_14t-rocm7_1
       build_environment: linux-binary-manywheel
     secrets:
@@ -4493,7 +4493,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm7.2
       DESIRED_PYTHON: "3.14t"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
+      timeout-minutes: 420
       build_name: manywheel-py3_14t-rocm7_2
       build_environment: linux-binary-manywheel
     secrets:

From 4e45bcd8746bd77118ebfe853cbeda940a540b83 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 16 Jul 2025 03:43:38 +0000
Subject: [PATCH 44/87] [release/2.8] Upgrade numpy versions; Use different
 package versions for py3.9; upgrade tensorboard compatible with numpy 2

Co-authored-by: Ethan Wee <Ethan.Wee@amd.com>
(cherry picked from commit e867a3de4b0196621e8e53d5338a8bb8bb62e828)
(cherry picked from commit c7a1e32fbcf9e0a458d959a453de65c27c51452c)
(cherry picked from commit 2a215e4a2115c999e4bb058956d888aed67787d1)
(cherry picked from commit 866cc1dbb9c93f807af1ef59801c645062cbb95e)
(cherry picked from commit 4b46310999bc247e0a5b97ea90a96a44b8579d09)
---
 .ci/docker/requirements-ci.txt | 23 +++++++++++------------
 requirements.txt               |  3 +++
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
index 9a033b90fcb46..0e59119e3c4f7 100644
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -117,8 +117,10 @@ ninja==1.11.1.4
 #Pinned versions: 1.11.1.4
 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
 
-numba==0.57.1 ; python_version == "3.10" and platform_machine != "s390x"
-numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
+numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x"
+numba==0.60.0 ; python_version == "3.9" and platform_machine != "s390x"
+numba==0.61.2 ; python_version > "3.9" and platform_machine != "s390x"
+
 #Description: Just-In-Time Compiler for Numerical Functions
 #Pinned versions: 0.55.2, 0.60.0
 #test that import: test_numba_integration.py
@@ -136,13 +138,10 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
 #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
 #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
 #test_binary_ufuncs.py
-numpy==1.23.2; python_version == "3.10"
-numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
-numpy==2.1.2; python_version >= "3.13" and python_version < "3.14"
-numpy==2.3.4; python_version >= "3.14"
+numpy==2.0.2 ; python_version == "3.9"
+numpy==2.1.2 ; python_version > "3.9"
 
-pandas==2.0.3; python_version < "3.12"
-pandas==2.2.3; python_version >= "3.12" and python_version < "3.14"
+pandas==2.2.3; python_version < "3.14"
 pandas==2.3.3; python_version >= "3.14"
 
 #onnxruntime
@@ -254,9 +253,10 @@ scikit-image==0.22.0
 #Pinned versions: 0.20.3
 #test that import:
 
-scipy==1.10.1 ; python_version <= "3.11"
-scipy==1.14.1 ; python_version > "3.11" and python_version < "3.14"
+scipy==1.13.1 ; python_version == "3.9"
+scipy==1.14.1 ; python_version > "3.9" and python_version < "3.14"
 scipy==1.16.2 ; python_version >= "3.14"
+
 # Pin SciPy because of failing distribution tests (see #60347)
 #Description: scientific python
 #Pinned versions: 1.10.1
@@ -316,8 +316,7 @@ z3-solver==4.15.1.0 ; platform_machine != "s390x"
 #Pinned versions:
 #test that import:
 
-tensorboard==2.13.0 ; python_version < "3.13"
-tensorboard==2.18.0 ; python_version >= "3.13"
+tensorboard==2.18.0
 #Description: Also included in .ci/docker/requirements-docs.txt
 #Pinned versions:
 #test that import: test_tensorboard
diff --git a/requirements.txt b/requirements.txt
index ae7f335c883cf..4559c2ad331f0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,6 +12,9 @@ hypothesis
 jinja2
 lintrunner ; platform_machine != "s390x"
 networkx>=2.5.1
+ninja
+numpy==2.0.2 ; python_version == "3.9"
+numpy==2.1.2 ; python_version > "3.9"
 optree>=0.13.0
 psutil
 spin

From 3482eafe33d25e146394b685e23e61758122002c Mon Sep 17 00:00:00 2001
From: Ramya Ramineni <62723901+rraminen@users.noreply.github.com>
Date: Mon, 14 Jul 2025 12:23:45 -0500
Subject: [PATCH 45/87] Clean up CUDA state between tests (#2335)

This PR fixes the unit test,

test/test_cuda.py::TestCuda::test_set_per_process_memory_fraction FAILED
[0.1163s]

```
Traceback (most recent call last):
  File "/var/lib/jenkins/pytorch/test/test_cuda.py", line 471, in test_set_per_process_memory_fraction
    tmp_tensor = torch.empty(application, dtype=torch.int8, device="cuda")
RuntimeError: Trying to create tensor with negative dimension -5681285432: [-5681285432]
```
This error occurs only on gfx1101 arch.

This error is coming from an integer overflow when another unit test,
test/test_cuda.py::TestCuda::test_randint_generation_for_large_numel
creates a tensor with a huge numel, which overflows into a higher
torch.cuda.max_memory_reserved() when you call
test/test_cuda.py::TestCuda::test_set_per_process_memory_fraction
afterward. To avoid this we introduced torch.cuda.empty_cache() and
torch.cuda.reset_peak_memory_stats() to clean up CUDA states.

JIRA: https://ontrack-internal.amd.com/browse/SWDEV-535295
(cherry picked from commit f86d18439897232a374504c36b40da99c14ade1a)
(cherry picked from commit 1b442282359fd69384634c882051c18565a5f744)
---
 test/test_cuda.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/test_cuda.py b/test/test_cuda.py
index 72a4e5e1296a6..bfbec8c308985 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -460,6 +460,9 @@ def test_out_of_memory_retry(self):
         IS_JETSON, "oom reporting has issues on jetson igx due to partial nvml support"
     )
     def test_set_per_process_memory_fraction(self):
+        if torch.version.hip and ('gfx1101' in torch.cuda.get_device_properties(0).gcnArchName):
+           torch.cuda.empty_cache()
+           torch.cuda.reset_peak_memory_stats()
         orig = torch.cuda.get_per_process_memory_fraction(0)
         torch.cuda.reset_peak_memory_stats(0)
         try:

From 55881ef8485b9929394af12bda963617c6490d2b Mon Sep 17 00:00:00 2001
From: Jeff Daily <jeff.daily@amd.com>
Date: Mon, 17 Nov 2025 16:34:42 -0800
Subject: [PATCH 46/87] reset per process memory fraction in test_cuda.py
 test_mempool_limited_memory_with_allocator (#2811)

Use try/finally block. This follows a similar pattern elsewhere in
test_cuda.py.

Fixes https://github.com/ROCm/TheRock/issues/2118.
---
 test/test_cuda.py | 109 +++++++++++++++++++++++-----------------------
 1 file changed, 55 insertions(+), 54 deletions(-)

diff --git a/test/test_cuda.py b/test/test_cuda.py
index bfbec8c308985..df317f1f01cbf 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -6058,67 +6058,68 @@ def test_mempool_limited_memory_with_allocator(self):
         nelem_1mb = 1024 * 1024 // 4
 
         self._setup_mempool_limited_memory_test(80)
-        # remaining free mem: 80 mb
-        # mempool_use [] 0 mb
-        # mempool_do_not_use [] 0 mb
-        # default pool [] 0 mb
-        with torch.cuda.use_mem_pool(pool_do_not_use):
-            a = torch.randn(40 * nelem_1mb, device="cuda")
-        with torch.cuda.use_mem_pool(pool_use):
-            b = torch.randn(40 * nelem_1mb, device="cuda")
-        a_dataptr = a.data_ptr()
-        b_dataptr = b.data_ptr()
-        # remaining free mem: 0 mb
-        # mempool_do_not_use [aaaa] 40 mb
-        # mempool_use [bbbb] 40 mb
-        # default pool [] 0 mb
-        with self.assertRaises(torch.OutOfMemoryError):
-            # out of memory
-            c = torch.randn(40 * nelem_1mb, device="cuda")
-
-        del a, b
-        # remaining free mem: 0 mb
-        # mempool_do_not_use [____] 40 mb
-        # mempool_use [____] 40 mb
-        # default pool [] 0 mb
-
-        # c should not oom and instead can use mempool_use as fallback
-        c = torch.randn(30 * nelem_1mb, device="cuda")
-        c_dataptr = c.data_ptr()
-        # remaining free mem: 0 mb
-        # mempool_do_not_use [____] 40 mb
-        # mempool_use [ccc_] 40 mb
-        # default pool [] 0 mb
-        with self.assertRaises(torch.OutOfMemoryError):
-            # out of memory since can't use mempool_do_not_use
-            d = torch.randn(30 * nelem_1mb, device="cuda")
+        try:
+            # remaining free mem: 80 mb
+            # mempool_use [] 0 mb
+            # mempool_do_not_use [] 0 mb
+            # default pool [] 0 mb
+            with torch.cuda.use_mem_pool(pool_do_not_use):
+                a = torch.randn(40 * nelem_1mb, device="cuda")
+            with torch.cuda.use_mem_pool(pool_use):
+                b = torch.randn(40 * nelem_1mb, device="cuda")
+            a_dataptr = a.data_ptr()
+            b_dataptr = b.data_ptr()
+            # remaining free mem: 0 mb
+            # mempool_do_not_use [aaaa] 40 mb
+            # mempool_use [bbbb] 40 mb
+            # default pool [] 0 mb
+            with self.assertRaises(torch.OutOfMemoryError):
+                # out of memory
+                c = torch.randn(40 * nelem_1mb, device="cuda")
 
-        del c
-        # remaining free mem: 0 mb
-        # mempool_do_not_use [____] 40 mb
-        # mempool_use [____] 40 mb
-        # default pool [] 0 mb
+            del a, b
+            # remaining free mem: 0 mb
+            # mempool_do_not_use [____] 40 mb
+            # mempool_use [____] 40 mb
+            # default pool [] 0 mb
+
+            # c should not oom and instead can use mempool_use as fallback
+            c = torch.randn(30 * nelem_1mb, device="cuda")
+            c_dataptr = c.data_ptr()
+            # remaining free mem: 0 mb
+            # mempool_do_not_use [____] 40 mb
+            # mempool_use [ccc_] 40 mb
+            # default pool [] 0 mb
+            with self.assertRaises(torch.OutOfMemoryError):
+                # out of memory since can't use mempool_do_not_use
+                d = torch.randn(30 * nelem_1mb, device="cuda")
 
-        # expect that we used same memory address for both a and c
-        self.assertEqual(b_dataptr, c_dataptr)
+            del c
+            # remaining free mem: 0 mb
+            # mempool_do_not_use [____] 40 mb
+            # mempool_use [____] 40 mb
+            # default pool [] 0 mb
 
-        # make sure we can still use mempool_use as intended after c is deleted
-        with torch.cuda.use_mem_pool(pool_use):
-            e = torch.randn(20 * nelem_1mb, device="cuda")
-        # remaining free mem: 0 mb
-        # mempool_do_not_use [____] 40 mb
-        # mempool_use [ee__] 40 mb
-        # default pool [] 0 mb
+            # expect that we used same memory address for both a and c
+            self.assertEqual(b_dataptr, c_dataptr)
 
-        e_dataptr = e.data_ptr()
-        del e
+            # make sure we can still use mempool_use as intended after c is deleted
+            with torch.cuda.use_mem_pool(pool_use):
+                e = torch.randn(20 * nelem_1mb, device="cuda")
+            # remaining free mem: 0 mb
+            # mempool_do_not_use [____] 40 mb
+            # mempool_use [ee__] 40 mb
+            # default pool [] 0 mb
 
-        self.assertEqual(e_dataptr, c_dataptr)
+            e_dataptr = e.data_ptr()
+            del e
 
-        # pool's destructor calls emptyCache()
-        del pool_use, pool_do_not_use
+            self.assertEqual(e_dataptr, c_dataptr)
 
-        self._teardown_mempool_limited_memory_test()
+            # pool's destructor calls emptyCache()
+            del pool_use, pool_do_not_use
+        finally:
+            self._teardown_mempool_limited_memory_test()
 
     @serialTest()
     def test_mempool_no_split(self):

From 54750ed7ba6aed71b473c44a2f0cb9e82b46ef57 Mon Sep 17 00:00:00 2001
From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com>
Date: Sun, 10 Aug 2025 19:41:41 -0500
Subject: [PATCH 47/87] [AUTOGENERATED] [release/2.8] [SWDEV-539215] - Autotune
 support for persistent reduction and no_x_dim removal (#2454)

Cherry-pick of https://github.com/ROCm/pytorch/pull/2417
Need to resolve conflicts

---------

Co-authored-by: Jack Taylor <108682042+jataylo@users.noreply.github.com>
(cherry picked from commit eb4715850bcdab5abb35de94bef8981153a1f0fe)

[release/2.9][ROCm][inductor] Add ROCm specific persistent reduction config. (#2861)

In support of
[SWDEV-566103](https://ontrack-internal.amd.com/browse/SWDEV-566103)

[release/2.10] Fix Inductor Triton Heuristics (#2931)

The ROCm release/2.10 branch was created by applying 15 commits to
upstream release/2.10 branch.
(See
https://github.com/pytorch/pytorch/compare/release/2.10...ROCm:pytorch:release/2.10)

This PR fixes the issue with the missing disable_pointwise_autotuning
function.

There are three commits in this PR:

First commit is a revert:
1c96f23e68227384c34f3fe3191de44902ddd159 - Autotuning support for
persistent reduction

since it is already available in upstream release/2.10 and is not
needed. (It reintroduced disable_pointwise_autotuning function.)

The second commit (b9facd069668dad33f9bb550f85e1773b937bb91) is needed
for provenance, so I can apply the third commit:
e5eee742f431738f97a03ba8ff7c69e4541577e3 - Heuristics improvements for
reduction kernels

which was reverted at the last minute before the release/2.10 cutoff and
then re-landed shortly after the cutoff date, but with a minor change.

---------

Co-authored-by: Pandya, Vivek Vasudevbhai <vpandya@qti.qualcomm.com>
---
 torch/_inductor/runtime/triton_heuristics.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py
index 2a1447fbf0bda..8108636663b90 100644
--- a/torch/_inductor/runtime/triton_heuristics.py
+++ b/torch/_inductor/runtime/triton_heuristics.py
@@ -3691,6 +3691,17 @@ def _persistent_reduction_configs(
                 if conf not in configs:
                     configs.append(conf)
 
+            # Additional custom configs in support of customer workloads
+            configs.append(
+                triton_config_reduction(
+                    size_hints,
+                    1,
+                    rnumel,
+                    num_stages=3,
+                    num_warps=2,
+                )  # 18% improvement
+            )
+
     for c in configs:
         # we don't need Rn_BLOCK for persistent reduction
         for prefix in size_hints:

From 0bfe1e3c39ecc644a620eecffc596d21f471d263 Mon Sep 17 00:00:00 2001
From: Chinmay Kuchinad <ChinmayDattanand.Kuchinad@amd.com>
Date: Wed, 17 Dec 2025 07:05:17 +0000
Subject: [PATCH 48/87] Update version to 2.11.0

---
 version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.txt b/version.txt
index f925b7d0ce58a..46b81d815a23b 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-2.11.0a0
+2.11.0

From 03639f4d63fb3999058c24acea028616772acc43 Mon Sep 17 00:00:00 2001
From: "rocm-repo-management-api[bot]"
 <189310625+rocm-repo-management-api[bot]@users.noreply.github.com>
Date: Tue, 10 Mar 2026 08:16:02 -0700
Subject: [PATCH 49/87] [AUTOGENERATED] [release/2.11] Move getenv to main
 thread to avoid NCCL race condition (#3054)

Cherry-pick of https://github.com/ROCm/pytorch/pull/3043

Co-authored-by: tom.jen <tomjen12@amd.com>
Co-authored-by: Jeff Daily <jeff.daily@amd.com>
---
 torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp |  4 +++-
 torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp | 13 +++++--------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index 827a8c1b13db7..ae9a73c01e189 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -922,6 +922,7 @@ ProcessGroupNCCL::ProcessGroupNCCL(
   PrefixStore* prefixStore = dynamic_cast<PrefixStore*>(store_.get());
   globalStore_ =
       prefixStore ? prefixStore->getUnderlyingNonPrefixStore() : store_;
+  debugInfoPipeFile_ = getCvarString({"TORCH_NCCL_DEBUG_INFO_PIPE_FILE"}, "");
   auto desyncDebug = getCvarBool(TORCH_NCCL_DESYNC_DEBUG, false) ||
       (dist_debug_level_ >= DebugLevel::Detail);
 #ifdef ENABLE_NCCL_ERROR_CHECKING
@@ -1778,7 +1779,8 @@ void ProcessGroupNCCL::HeartbeatMonitor::runLoop() {
     // DumpPipe is one per-trainer process, and its convenient to name them
     // after 'global' ranks in the system, So we assume processgroup (uid)==0 is
     // the global PG and has globally unique rank ids across trainers.
-    dumpPipe.emplace(pg_->globalRank());
+    dumpPipe.emplace(
+        pg_->globalRank(), pg_->debugInfoPipeFile_, pg_->traceBufferSize_);
   }
   while (true) {
     // This won't have any lock since this lock is only used here.
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
index 12aeb49660f6c..c5d3eec1a03db 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
@@ -195,16 +195,10 @@ static std::vector<std::string> TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK =
 
 #if defined(__linux__)
 struct DumpPipe {
-  DumpPipe(int rank) {
-    std::string fileStem =
-        getCvarString({"TORCH_NCCL_DEBUG_INFO_PIPE_FILE"}, "");
-    // NOTE: This default value (2000) is duplicated in FlightRecorder.hpp.
-    // Keep in sync. See FlightRecorder.hpp for details.
-    if (fileStem.empty() ||
-        getCvarInt({"TORCH_NCCL_TRACE_BUFFER_SIZE"}, 2000) <= 0) {
+  DumpPipe(int rank, const std::string& fileStem, int traceBufferSize) {
+    if (fileStem.empty() || traceBufferSize <= 0) {
       return;
     }
-    TORCH_CHECK(!fileStem.empty(), "TORCH_NCCL_DEBUG_INFO_PIPE_FILE is empty");
     std::string filename = c10::str(fileStem, rank, ".pipe");
     TORCH_CHECK(
         unlink(filename.c_str()) != -1 || errno == ENOENT,
@@ -1357,6 +1351,9 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   // Size of ring buffer where we store NCCL Traces for debugging.
   int traceBufferSize_;
 
+  // Stores TORCH_NCCL_DEBUG_INFO_PIPE_FILE
+  std::string debugInfoPipeFile_;
+
   // We gate the cudaEventCache so that we can roll it out gradually.
   std::atomic<bool> cudaEventCacheEnabled_;
 

From 41a0f89e53b36a3dd95fe906ab266dc2168cd2c0 Mon Sep 17 00:00:00 2001
From: Anatoliy Litvinenko <alitvine@amd.com>
Date: Thu, 12 Mar 2026 10:04:48 -0500
Subject: [PATCH 50/87] [Release/2.11] No-fence in normalization kernel
 (#175286) (#3057)

Remove the need for fences in the normalization kernel by converting the
stores into atomics+return. This is crucial for perf on architectures
with split caches (e.g. MI300), where fences are inherently costly. This
change speeds up the `batch_norm_stats` function for tensors in
`channels_last` format.

### Performance result on MI300:
<img width="2311" height="1537" alt="batchnorm_latency_comparison"
src="https://github.com/user-attachments/assets/dee39088-9f55-499a-a39b-b170805416bb"
/>

**Particular Example:**
Before:
Avg time for shape (20, 896, 59, 91): **1102.39 us**

After:
Avg time for shape (20, 896, 59, 91): **122.94 us**

Reproducer:
```

import torch

shapes = [(20, 896, 59, 91)]

eps = 1e-5

for shape in shapes:
    x = torch.randn(shape, device='cuda', dtype=torch.bfloat16)
    x = x.to(memory_format=torch.channels_last)
    for _ in range(20):
        _ = torch.batch_norm_stats(x, eps)
    torch.cuda.synchronize()

    start_evt = torch.cuda.Event(enable_timing=True)
    end_evt = torch.cuda.Event(enable_timing=True)
    start_evt.record()
    for _ in range(100):
        _ = torch.batch_norm_stats(x, eps)
    end_evt.record()
    torch.cuda.synchronize()
    print(f"Avg time for shape {shape}: {start_evt.elapsed_time(end_evt) / 100 * 1e3:.2f} us")

```

Related fix which is released:
https://github.com/pytorch/pytorch/pull/161180

Pull Request resolved: https://github.com/pytorch/pytorch/pull/175286
Approved by: https://github.com/amd-hhashemi,
https://github.com/jerrymannil, https://github.com/jeffdaily
---
 aten/src/ATen/native/cuda/KernelUtils.cuh   | 15 ++++++++++-----
 aten/src/ATen/native/cuda/Normalization.cuh | 20 ++++++++++++++++++++
 2 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/aten/src/ATen/native/cuda/KernelUtils.cuh b/aten/src/ATen/native/cuda/KernelUtils.cuh
index 9c100ca206adf..12feeb6d63af3 100644
--- a/aten/src/ATen/native/cuda/KernelUtils.cuh
+++ b/aten/src/ATen/native/cuda/KernelUtils.cuh
@@ -228,7 +228,9 @@ __device__ __forceinline__ void fastAtomicAdd(
 // This function implements a committed store.
 // Upon returning, the store is committed to global memory.
 // This is useful in avoiding the need for fences.
-template <typename T>
+// If multiple stores are done in a row there is option to skip
+// waiting for commit for all but the last store.
+template <typename T, bool wait_for_commit = true>
 __device__ inline void cmtdStore(void* address, T value) {
       int constexpr num_long_per_val = sizeof(value)/sizeof(long);
       int constexpr num_int_per_val = sizeof(value)/sizeof(int);
@@ -252,13 +254,16 @@ __device__ inline void cmtdStore(void* address, T value) {
       else if constexpr (num_char_per_val*sizeof(char) == sizeof(value))
         for (int i=0; i<num_char_per_val; i++)
           __hip_atomic_store(reinterpret_cast<char *>(address)+i, _pnr.c[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-      __atomic_signal_fence(__ATOMIC_SEQ_CST);
+      if constexpr (wait_for_commit)
+      {
+        __atomic_signal_fence(__ATOMIC_SEQ_CST);
 #ifdef __gfx1250__
-      asm volatile("s_wait_loadcnt(0)" ::: "memory");
+        asm volatile("s_wait_loadcnt(0)" ::: "memory");
 #else
-      asm volatile("s_waitcnt vmcnt(0)" ::: "memory");
+        asm volatile("s_waitcnt vmcnt(0)" ::: "memory");
 #endif
-      __atomic_signal_fence(__ATOMIC_SEQ_CST);
+        __atomic_signal_fence(__ATOMIC_SEQ_CST);
+      }
 }
 #endif
 
diff --git a/aten/src/ATen/native/cuda/Normalization.cuh b/aten/src/ATen/native/cuda/Normalization.cuh
index bbd65419bbb92..8e31f8fa9a694 100644
--- a/aten/src/ATen/native/cuda/Normalization.cuh
+++ b/aten/src/ATen/native/cuda/Normalization.cuh
@@ -8,6 +8,7 @@
 #include <ATen/cuda/DeviceUtils.cuh>
 #include <ATen/native/cuda/block_reduce.cuh>
 #include <ATen/native/cuda/DeviceSqrt.cuh>
+#include <ATen/native/cuda/KernelUtils.cuh>
 #include <ATen/native/cuda/LaunchUtils.h>
 #include <c10/macros/Macros.h>
 
@@ -1063,12 +1064,22 @@ batch_norm_collect_statistics_channels_last_kernel(
     address_base = c_offset + blockIdx.y * stride;
     // write data to staging_data;
     if (threadIdx.y == 0 && c_offset < stride) {
+#ifndef USE_ROCM
       staging_mean[address_base] = mean_th;
       staging_m2n[address_base] = m2_th;
       staging_count[address_base] = count_th;
+#else
+      // In architectures with split caches, global fences are costly.
+      // Here we preempt need for fences by committing stores to global memory.
+      cmtdStore<accscalar_t, false>((void*)&staging_mean[address_base], mean_th);
+      cmtdStore<accscalar_t, false>((void*)&staging_m2n[address_base], m2_th);
+      cmtdStore((void*)&staging_count[address_base], count_th);
+#endif
     }
 
+#ifndef USE_ROCM
     __threadfence();
+#endif
     __syncthreads(); // ensuring writes to staging_ is visible to all blocks
 
     __shared__ bool is_last_block_done;
@@ -1288,11 +1299,20 @@ __global__ void batch_norm_backward_reduce_channels_last_kernel(
     address_base = c_offset + blockIdx.y * stride;
     // write data to staging_data;
     if (threadIdx.y == 0 && c_offset < stride) {
+#ifndef USE_ROCM
       staging_sum_dy[address_base] = sum_dy_th;
       staging_sum_dy_xmu[address_base] = sum_dy_xmu_th;
+#else
+      // In architectures with split caches, global fences are costly.
+      // Here we preempt need for fences by committing stores to global memory.
+      cmtdStore<accscalar_t, false>((void*)&staging_sum_dy[address_base], sum_dy_th);
+      cmtdStore((void*)&staging_sum_dy_xmu[address_base], sum_dy_xmu_th);
+#endif
     }
 
+#ifndef USE_ROCM
     __threadfence();
+#endif
     __syncthreads(); // ensuring writes to staging_ is visible to all blocks
 
     __shared__ bool is_last_block_done;

From be321e462bfbcf5319669a96be19c65e0ff71f48 Mon Sep 17 00:00:00 2001
From: Milica Stankovic <milica.stankovic@amd.com>
Date: Tue, 17 Mar 2026 11:51:54 +0100
Subject: [PATCH 51/87] Prefer cublas when TORCH_BLAS_PREFER_CUBLASLT is false
 (#https://github.com/pytorch/pytorch/pull/174377) (#3077)

### Summary

Set blas_preferred_backend = at::BlasBackend::Cublas when
TORCH_BLAS_PREFER_CUBLASLT / TORCH_BLAS_PREFER_HIPBLASLT is explicitly
set to false.

For ROCm, if a gfx arch is in the list returned by
getHipblasltPreferredArchs() hipBLASLt will be set as
blas_preferred_backend by default regardless of
TORCH_BLAS_PREFER_HIPBLASLT setting. This PR enables users to explicitly
select cublas/rocblas and makes this env. variable behave like a binary
toggle.

### Changes

- Modified checks for
TORCH_BLAS_PREFER_CUBLASLT/TORCH_BLAS_PREFER_HIPBLASLT env. variables
- Updated test_preferred_blas_library_settings

Pull Request resolved: https://github.com/pytorch/pytorch/pull/174377
Approved by: https://github.com/jeffdaily, https://github.com/nikitaved

Co-authored-by: Filip Jankovic <filip.jankovic@amd.com>
---
 aten/src/ATen/Context.h |  6 +++++-
 test/test_cuda.py       | 18 ++++++++++++++----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h
index 9f24ea32245ec..de6a7dda66d73 100644
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@@ -502,7 +502,11 @@ class TORCH_API Context {
       (c10::utils::check_env("TORCH_BLAS_PREFER_CUBLASLT") == true ||
        c10::utils::check_env("TORCH_BLAS_PREFER_HIPBLASLT") == true) // alias
       ? at::BlasBackend::Cublaslt
-      : at::BlasBackend::Default;
+      : ((c10::utils::check_env("TORCH_BLAS_PREFER_CUBLASLT") == false ||
+          c10::utils::check_env("TORCH_BLAS_PREFER_HIPBLASLT") ==
+              false) // alias
+             ? at::BlasBackend::Cublas
+             : at::BlasBackend::Default);
   at::ROCmFABackend rocm_fa_preferred_backend =
       c10::utils::check_env("TORCH_ROCM_FA_PREFER_CK") == true
       ? at::ROCmFABackend::Ck
diff --git a/test/test_cuda.py b/test/test_cuda.py
index df317f1f01cbf..2a3e27dab814e 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -660,11 +660,13 @@ def _check_default():
             torch.backends.cuda.preferred_blas_library(1.0)
         # check env var override
         custom_envs = [
-            {"TORCH_BLAS_PREFER_CUBLASLT": "1"},
-            {"TORCH_BLAS_PREFER_HIPBLASLT": "1"},
+            ({"TORCH_BLAS_PREFER_CUBLASLT": "1"}, "_BlasBackend.Cublaslt"),
+            ({"TORCH_BLAS_PREFER_HIPBLASLT": "1"}, "_BlasBackend.Cublaslt"),
+            ({"TORCH_BLAS_PREFER_CUBLASLT": "0"}, "_BlasBackend.Cublas"),
+            ({"TORCH_BLAS_PREFER_HIPBLASLT": "0"}, "_BlasBackend.Cublas"),
         ]
         test_script = "import torch;print(torch.backends.cuda.preferred_blas_library())"
-        for env_config in custom_envs:
+        for env_config, expected in custom_envs:
             env = os.environ.copy()
             for key, value in env_config.items():
                 env[key] = value
@@ -673,7 +675,15 @@ def _check_default():
                 .decode("ascii")
                 .strip()
             )
-            self.assertEqual("_BlasBackend.Cublaslt", r)
+            self.assertEqual(expected, r)
+
+        # explicitly check default when no env vars are set
+        if not any(
+            os.environ.get(v)
+            for v in ("TORCH_BLAS_PREFER_CUBLASLT", "TORCH_BLAS_PREFER_HIPBLASLT")
+        ):
+            torch.backends.cuda.preferred_blas_library("default")
+            _check_default()
 
     @unittest.skipIf(TEST_CUDAMALLOCASYNC, "temporarily disabled for async")
     @setBlasBackendsToDefaultFinally

From cc97152722937113b932e4e1c39e11a4484635a8 Mon Sep 17 00:00:00 2001
From: Anatoliy Litvinenko <alitvine@amd.com>
Date: Thu, 19 Mar 2026 21:55:09 -0500
Subject: [PATCH 52/87] Increase precision for golden solution in transformer
 tests. (#3087)

# Overview

Force FP32 precision when computing the "golden" solution, even when
TF32 is enabled for computing the test solution.

# Rationale

The `test/test_transformers.py` test suite calculates the numerical
tolerance by comparing output tensors computed at the same precision
("reference") and at a higher precision ("golden"), both calculated by
`SDPBackend.MATH`. However, if the golden output is calculated with TF32
rather than FP32, it is in fact less accurate than the FA/ME backends
when they use IEEE rather than TF32 for their accumulation.

The loss of precision causes false negatives in SDPA tests like

`TestSDPACudaOnlyCUDA.test_flash_attention_vs_math_ref_grads_batch_size_8_seq_len_q_143_seq_len_k_4_head_dim_203_is_causal_False_dropout_p_0_22_float16_scale_l1_enable_gqa_True_n_heads1_cuda_float16`
, at least on ROCM platform. The false negative disappears after forcing
`higher_precision_dtype = torch.float64`

# Major Changes

To restore the precision of the golden output, a wrapper function is
used where the golden solution is calculated. This function sets FP32
precision in the scope of the calculation of the golden solution and
its gradient.

This is based on PR https://github.com/pytorch/pytorch/pull/167157
Upstream PR: https://github.com/pytorch/pytorch/pull/169694
---
 test/test_transformers.py | 74 ++++++++++++++++++++++++---------------
 1 file changed, 45 insertions(+), 29 deletions(-)

diff --git a/test/test_transformers.py b/test/test_transformers.py
index 777b85cb173d3..284eb5ad64704 100644
--- a/test/test_transformers.py
+++ b/test/test_transformers.py
@@ -53,6 +53,7 @@
     PLATFORM_SUPPORTS_MEM_EFF_ATTENTION,
     PLATFORM_SUPPORTS_FUSED_ATTENTION,
     PLATFORM_SUPPORTS_CUDNN_ATTENTION,
+    tf32_off,
     tf32_on_and_off,
     tf32_enabled,
 )
@@ -3575,8 +3576,9 @@ def _get_mem_eff_drop_mask(batch_size, n_heads, q_len, kv_len, p, seed, offset,
         if dropout_p == 0.0:
             with sdpa_kernel(backends=[SDPBackend.MATH]):
                 # High Precision Math Reference
-                out_ref = F.scaled_dot_product_attention(query_ref, key_ref, value_ref,
-                                                         dropout_p=dropout_p, is_causal=is_causal, scale=scale)
+                with tf32_off():
+                    out_ref = F.scaled_dot_product_attention(query_ref, key_ref, value_ref,
+                                                             dropout_p=dropout_p, is_causal=is_causal, scale=scale)
                 # Low Precision Math Reference
                 out_lp_ref = F.scaled_dot_product_attention(query, key, value,
                                                             dropout_p=dropout_p, is_causal=is_causal, scale=scale)
@@ -3587,8 +3589,9 @@ def _get_mem_eff_drop_mask(batch_size, n_heads, q_len, kv_len, p, seed, offset,
             torch.manual_seed(seed)
             dropout_mask = _get_mem_eff_drop_mask(batch_size, n_heads, seq_len_q, seq_len_k, dropout_p, seed, 0, device=device)
             # High Precision Math Reference
-            out_ref = torch.ops.aten._scaled_dot_product_attention_math(
-                query_ref, key_ref, value_ref, dropout_p=dropout_p, is_causal=is_causal, scale=scale, dropout_mask=dropout_mask)[0]
+            with tf32_off():
+                out_ref = torch.ops.aten._scaled_dot_product_attention_math(
+                    query_ref, key_ref, value_ref, dropout_p=dropout_p, is_causal=is_causal, scale=scale, dropout_mask=dropout_mask)[0]
             # Low Precision Math Reference
             out_lp_ref = torch.ops.aten._scaled_dot_product_attention_math(
                 query, key, value, dropout_p=dropout_p, is_causal=is_causal, scale=scale,
@@ -3598,7 +3601,8 @@ def _get_mem_eff_drop_mask(batch_size, n_heads, q_len, kv_len, p, seed, offset,
 
         grads = torch.autograd.grad(out, (query, key, value), upstream_grad)
         grads_ref_lp = torch.autograd.grad(out_lp_ref, (query, key, value), upstream_grad)
-        grads_ref = torch.autograd.grad(out_ref, (query_ref, key_ref, value_ref), upstream_grad)
+        with tf32_off():
+            grads_ref = torch.autograd.grad(out_ref, (query_ref, key_ref, value_ref), upstream_grad)
 
         fudge_factors = {
             'out': 3.0 ,
@@ -3695,8 +3699,9 @@ def _get_mem_eff_drop_mask(batch_size, n_heads, q_len, kv_len, p, seed, offset,
         if dropout_p == 0.0:
             with sdpa_kernel(backends=[SDPBackend.MATH]):
                 # High Precision Math Reference
-                out_ref = F.scaled_dot_product_attention(query_ref, key_ref, value_ref, attn_mask_ref,
-                                                         dropout_p=dropout_p, is_causal=is_causal, scale=scale)
+                with tf32_off():
+                    out_ref = F.scaled_dot_product_attention(query_ref, key_ref, value_ref, attn_mask_ref,
+                                                             dropout_p=dropout_p, is_causal=is_causal, scale=scale)
                 # Low Precision Math Reference
                 out_lp_ref = F.scaled_dot_product_attention(query, key, value, attn_mask,
                                                             dropout_p=dropout_p, is_causal=is_causal, scale=scale)
@@ -3708,9 +3713,10 @@ def _get_mem_eff_drop_mask(batch_size, n_heads, q_len, kv_len, p, seed, offset,
             dropout_mask = _get_mem_eff_drop_mask(batch_size, n_heads, seq_len_q,
                                                   seq_len_k, dropout_p, seed, 0, device=device)
             # High Precision Math Reference
-            out_ref = torch.ops.aten._scaled_dot_product_attention_math(
-                query_ref, key_ref, value_ref, attn_mask_ref, dropout_p=dropout_p, is_causal=is_causal,
-                scale=scale, dropout_mask=dropout_mask)[0]
+            with tf32_off():
+                out_ref = torch.ops.aten._scaled_dot_product_attention_math(
+                    query_ref, key_ref, value_ref, attn_mask_ref, dropout_p=dropout_p, is_causal=is_causal,
+                    scale=scale, dropout_mask=dropout_mask)[0]
             # Low Precision Math Reference
             out_lp_ref = torch.ops.aten._scaled_dot_product_attention_math(
                 query, key, value, attn_mask,
@@ -3721,7 +3727,8 @@ def _get_mem_eff_drop_mask(batch_size, n_heads, q_len, kv_len, p, seed, offset,
 
         grads = torch.autograd.grad(out, (query, key, value, attn_mask), upstream_grad)
         grads_ref_lp = torch.autograd.grad(out_lp_ref, (query, key, value, attn_mask), upstream_grad)
-        grads_ref = torch.autograd.grad(out_ref, (query_ref, key_ref, value_ref, attn_mask_ref), upstream_grad)
+        with tf32_off():
+            grads_ref = torch.autograd.grad(out_ref, (query_ref, key_ref, value_ref, attn_mask_ref), upstream_grad)
 
         fudge_factors = {
             "out": 4,
@@ -3825,8 +3832,9 @@ def test_flash_attention_vs_math_ref_grads(self, device, batch_size: int, seq_le
                     query, key, value, dropout_p=dropout_p, is_causal=is_causal, scale=scale, enable_gqa=enable_gqa)
             with sdpa_kernel(backends=[SDPBackend.MATH]):
                 # High Precision Math Reference
-                out_ref = F.scaled_dot_product_attention(
-                    query_ref, key_ref, value_ref, is_causal=is_causal, scale=scale, enable_gqa=enable_gqa)
+                with tf32_off():
+                    out_ref = F.scaled_dot_product_attention(
+                        query_ref, key_ref, value_ref, is_causal=is_causal, scale=scale, enable_gqa=enable_gqa)
                 # Low Precision Math Reference
                 out_lp_ref = F.scaled_dot_product_attention(
                     query, key, value, is_causal=is_causal, scale=scale, enable_gqa=enable_gqa)
@@ -3855,9 +3863,10 @@ def test_flash_attention_vs_math_ref_grads(self, device, batch_size: int, seq_le
                 causal=is_causal)[:, :, :seq_len_q, :seq_len_k]
             dropout_mask = softmax_mask >= 0
             # High Precision Math Reference
-            out_ref = torch.ops.aten._scaled_dot_product_attention_math(
-                query_ref, key_ref, value_ref, dropout_p=dropout_p, is_causal=is_causal,
-                scale=scale, dropout_mask=dropout_mask, enable_gqa=enable_gqa)[0]
+            with tf32_off():
+                out_ref = torch.ops.aten._scaled_dot_product_attention_math(
+                    query_ref, key_ref, value_ref, dropout_p=dropout_p, is_causal=is_causal,
+                    scale=scale, dropout_mask=dropout_mask, enable_gqa=enable_gqa)[0]
             # Low Precision Math Reference
             out_lp_ref = torch.ops.aten._scaled_dot_product_attention_math(
                 query, key, value, dropout_p=dropout_p, is_causal=is_causal, scale=scale,
@@ -3872,7 +3881,8 @@ def test_flash_attention_vs_math_ref_grads(self, device, batch_size: int, seq_le
 
         grads = torch.autograd.grad(out, (query, key, value), upstream_grad)
         grads_ref_lp = torch.autograd.grad(out_lp_ref, (query, key, value), upstream_grad)
-        grads_ref = torch.autograd.grad(out_ref, (query_ref, key_ref, value_ref), upstream_grad)
+        with tf32_off():
+            grads_ref = torch.autograd.grad(out_ref, (query_ref, key_ref, value_ref), upstream_grad)
 
         fudge_factors = {
             'out': 4,
@@ -4034,8 +4044,9 @@ def get_dropout_mask(output, fused_kernel, batch_size, n_heads, q_len, kv_len, d
         with sdpa_kernel(backends=[SDPBackend.MATH]):
             if dropout_p == 0.0:
                 # High Precision Math Reference
-                out_ref = F.scaled_dot_product_attention(query_ref, key_ref, value_ref,
-                                                         dropout_p=dropout_p, is_causal=is_causal)
+                with tf32_off():
+                    out_ref = F.scaled_dot_product_attention(query_ref, key_ref, value_ref,
+                                                             dropout_p=dropout_p, is_causal=is_causal)
                 # Low Precision Math Reference
                 out_lp_ref = F.scaled_dot_product_attention(query, key, value,
                                                             dropout_p=dropout_p, is_causal=is_causal)
@@ -4045,9 +4056,10 @@ def get_dropout_mask(output, fused_kernel, batch_size, n_heads, q_len, kv_len, d
                 dropout_mask = get_dropout_mask(output_tuple, fused_kernel, batch_size,
                                                 n_heads, seq_len_q, seq_len_k, dropout_p, device)
                 # High Precision Math Reference
-                out_ref = torch.ops.aten._scaled_dot_product_attention_math(
-                    query_ref, key_ref, value_ref, dropout_p=dropout_p, is_causal=is_causal,
-                    dropout_mask=dropout_mask)[0]
+                with tf32_off():
+                    out_ref = torch.ops.aten._scaled_dot_product_attention_math(
+                        query_ref, key_ref, value_ref, dropout_p=dropout_p, is_causal=is_causal,
+                        dropout_mask=dropout_mask)[0]
                 # Low Precision Math Reference
                 out_lp_ref = torch.ops.aten._scaled_dot_product_attention_math(
                     query, key, value, dropout_p=dropout_p, is_causal=is_causal,
@@ -4059,7 +4071,8 @@ def get_dropout_mask(output, fused_kernel, batch_size, n_heads, q_len, kv_len, d
         g1.replay()
         if fused_kernel != SDPBackend.CUDNN_ATTENTION or dropout_p == 0.0:
             grads_ref_lp = torch.autograd.grad(out_lp_ref, (query, key, value), upstream_grad)
-            grads_ref = torch.autograd.grad(out_ref, (query_ref, key_ref, value_ref), upstream_grad)
+            with tf32_off():
+                grads_ref = torch.autograd.grad(out_ref, (query_ref, key_ref, value_ref), upstream_grad)
 
             fudge_factors = {
                 'out': 3.0,
@@ -4283,8 +4296,9 @@ def rand_nt(sequence_list, num_heads, head_dim):
                 out = F.scaled_dot_product_attention(query, key, value, dropout_p=dropout_p, is_causal=is_causal, scale=scale)
             with sdpa_kernel(backends=[SDPBackend.MATH]):
                 # High Precision Math Reference
-                out_ref = F.scaled_dot_product_attention(
-                    query_ref, key_ref, value_ref, is_causal=is_causal, scale=scale)
+                with tf32_off():
+                    out_ref = F.scaled_dot_product_attention(
+                        query_ref, key_ref, value_ref, is_causal=is_causal, scale=scale)
                 # Low Precision Math Reference
                 out_lp_ref = F.scaled_dot_product_attention(
                     query_ref_lp, key_ref_lp, value_ref_lp, is_causal=is_causal, scale=scale)
@@ -4319,9 +4333,10 @@ def rand_nt(sequence_list, num_heads, head_dim):
                 nt_stack.append(torch.cat(batch_stack))
             nested_dropout_mask = torch.nested.nested_tensor(nt_stack)
             # High Precision Math Reference
-            out_ref = torch.ops.aten._scaled_dot_product_attention_math(
-                query_ref, key_ref, value_ref, dropout_p=dropout_p,
-                is_causal=is_causal, scale=scale, dropout_mask=nested_dropout_mask)[0]
+            with tf32_off():
+                out_ref = torch.ops.aten._scaled_dot_product_attention_math(
+                    query_ref, key_ref, value_ref, dropout_p=dropout_p,
+                    is_causal=is_causal, scale=scale, dropout_mask=nested_dropout_mask)[0]
             # Low Precision Math Reference
             out_lp_ref = torch.ops.aten._scaled_dot_product_attention_math(
                 query_ref_lp, key_ref_lp, value_ref_lp, dropout_p=dropout_p, is_causal=is_causal, scale=scale,
@@ -4330,7 +4345,8 @@ def rand_nt(sequence_list, num_heads, head_dim):
         upstream_grad = out.detach().clone().contiguous()
 
         out.backward(upstream_grad)
-        out_ref.backward(upstream_grad.to(out_ref.dtype))
+        with tf32_off():
+            out_ref.backward(upstream_grad.to(out_ref.dtype))
         out_lp_ref.backward(upstream_grad.to(out_lp_ref.dtype))
 
         dropout_fudge_factor = 1.0 if dropout_p == 0.0 else 2.0

From dfff4e13b4a95f6f5e3e4c50aaf402e76c5cdedb Mon Sep 17 00:00:00 2001
From: Milica Stankovic <milica.stankovic@amd.com>
Date: Fri, 20 Mar 2026 11:29:15 +0100
Subject: [PATCH 53/87] [ROCm][CI] Fix failing FP8 tests on RDNA4 (#174873)
 (#3090)

## Summary

This PR fixes FP8 inductor test failures that occur on AMD RDNA4 GPUs
when testing matrix multiplications with small M dimensions (M < 16).

## Problem

On gfx120x GPUs, FP8 scaled matrix multiplication tests fail with:
- 92.4% NaN outputs when M < BLOCK_M (typically 16)
- Large numerical mismatches between eager and compiled results
- Only occurs in `max-autotune` mode

**Root cause:** Autotuned Triton kernels on gfx120x generate incorrect
tensor indexing for small M values, using partial indices instead of
full computed indices in load/store operations.

## Solution

 - Added GPU-specific compile mode selection for small M values
 - gfx120x with M < 16: use `compile_mode="default"`
 - All other cases: use `compile_mode="max-autotune"`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/174873
Approved by: https://github.com/jeffdaily

(cherry picked from commit d667ffef1f48caafc745b5c4266d1e1f23be1d5a)
---
 test/inductor/test_fp8.py | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/test/inductor/test_fp8.py b/test/inductor/test_fp8.py
index aab220511c374..1d540ffca635e 100644
--- a/test/inductor/test_fp8.py
+++ b/test/inductor/test_fp8.py
@@ -1045,9 +1045,20 @@ def linear(x_fp8, x_inverse_scale, w_t_fp8, w_inverse_scale, bias):
             w_inverse_scale,
             bias,
         )
+
+        # On gfx120x, autotuned kernels have issues with small M
+        compile_mode = "max-autotune"
+        if (
+            torch.version.hip is not None
+            and M < 16
+            and torch.cuda.is_available()
+            and "gfx120" in torch.cuda.get_device_properties(0).gcnArchName
+        ):
+            compile_mode = "default"
+
         with config.patch({"triton.enable_persistent_tma_matmul": persistent_matmul}):
             linear_compiled = torch.compile(
-                linear, backend="inductor", mode="max-autotune"
+                linear, backend="inductor", mode=compile_mode
             )
             y_compiled = linear_compiled(
                 x_fp8,
@@ -1344,9 +1355,20 @@ def linear(x_fp8, x_inverse_scale, w_t_fp8, w_inverse_scale, bias):
             w_inverse_scale,
             bias,
         )
+
+        # On gfx120x, autotuned kernels have issues with small M
+        compile_mode = "max-autotune"
+        if (
+            torch.version.hip is not None
+            and M < 16
+            and torch.cuda.is_available()
+            and "gfx120" in torch.cuda.get_device_properties(0).gcnArchName
+        ):
+            compile_mode = "default"
+
         with config.patch({"triton.enable_persistent_tma_matmul": persistent_matmul}):
             linear_compiled = torch.compile(
-                linear, backend="inductor", mode="max-autotune"
+                linear, backend="inductor", mode=compile_mode
             )
             y_compiled = linear_compiled(
                 x_fp8,

From 83524a4c7d748e5de89a7a93cd72dab4bd1fa42d Mon Sep 17 00:00:00 2001
From: Ethan Wee <Ethan.Wee@amd.com>
Date: Wed, 25 Mar 2026 13:12:51 -0700
Subject: [PATCH 54/87] [CI][release/2.11]Pin all Python dependency versions in
 requirements files (#3098)

Pin dependencies in release/2.11. Tested and installed with python 3.10,
3.11, 3.12, 3.13.

The build failed because triton requires cmake less than 4, which we fixed
in release/2.10 with
https://github.com/ROCm/triton/commit/8edc6c4f11ac73ec145e9a0dbe311b83d58d54d7

```
 #63 35.94 writing manifest file 'python/triton.egg-info/SOURCES.txt'
 #63 35.98
 #63 35.98 ERROR Missing dependencies:
 #63 35.98 	cmake<4.0,>=3.20
 #63 35.98 ERROR conda.cli.main_run:execute(142): `conda run python -m build --wheel --no-isolation` failed. (See above for error)
 #63 ERROR: process "/bin/sh -c if [ -n \"${TRITON}\" ]; then bash ./install_triton.sh; fi" did not complete successfully: exit code: 1
 ------
  > [stage-0 55/62] RUN if [ -n "yes" ]; then bash ./install_triton.sh; fi:
 35.92 writing top-level names to python/triton.egg-info/top_level.txt
 35.92 writing manifest file 'python/triton.egg-info/SOURCES.txt'
 35.93 reading manifest file 'python/triton.egg-info/SOURCES.txt'
 35.93 reading manifest template 'MANIFEST.in'
 35.94 adding license file 'LICENSE'
 35.94 writing manifest file 'python/triton.egg-info/SOURCES.txt'
 35.98
 35.98 ERROR Missing dependencies:
 35.98 	cmake<4.0,>=3.20
 35.98 ERROR conda.cli.main_run:execute(142): `conda run python -m build --wheel --no-isolation` failed. (See above for error)
 ------
 Dockerfile:122
 --------------------
  120 |     COPY ci_commit_pins/triton.txt triton.txt
  121 |     COPY triton_version.txt triton_version.txt
  122 | >>> RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
  123 |     RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
```
---
 .ci/docker/requirements-ci.txt | 41 +++++++++++++++++-----------------
 requirements-build.txt         | 23 ++++++++++---------
 requirements.txt               | 29 ++++++++++++------------
 3 files changed, 48 insertions(+), 45 deletions(-)

diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
index 0e59119e3c4f7..24be093b31e7e 100644
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -15,13 +15,13 @@ build==1.3.0
 #Pinned versions: 1.3.0
 #test that import:
 
-click
+click==8.3.1
 #Description: Command Line Interface Creation Kit
-#Pinned versions:
+#Pinned versions: 8.3.1
 #test that import:
 
 coremltools==5.0b5 ; python_version < "3.12"
-coremltools==8.3 ; python_version == "3.12"
+coremltools==8.3.0 ; python_version == "3.12"
 #Description: Apple framework for ML integration
 #Pinned versions: 5.0b5
 #test that import:
@@ -68,7 +68,7 @@ lark==0.12.0
 #Pinned versions: 0.12.0
 #test that import:
 
-librosa>=0.6.2 ; python_version < "3.11" and platform_machine != "s390x"
+librosa==0.10.2 ; python_version < "3.11" and platform_machine != "s390x"
 librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
 #Description: A python package for music and audio analysis
 #Pinned versions: >=0.6.2
@@ -149,9 +149,9 @@ pandas==2.3.3; python_version >= "3.14"
 #Pinned versions: 1.9.0
 #test that import:
 
-opt-einsum==3.3
+opt-einsum==3.3.0
 #Description: Python library to optimize tensor contraction order, used in einsum
-#Pinned versions: 3.3
+#Pinned versions: 3.3.0
 #test that import: test_linalg.py
 
 optree==0.13.0 ; python_version < "3.14"
@@ -178,9 +178,9 @@ protobuf==6.33.5
 #Pinned versions: 6.33.2
 #test that import: test_tensorboard.py, test/onnx/*
 
-psutil
+psutil==7.2.2
 #Description: information on running processes and system utilization
-#Pinned versions:
+#Pinned versions: 7.2.2
 #test that import: test_profiler.py, test_openmp.py, test_dataloader.py
 
 pytest==7.3.2
@@ -198,9 +198,9 @@ pytest-flakefinder==1.1.0
 #Pinned versions: 1.1.0
 #test that import:
 
-pytest-rerunfailures>=10.3
+pytest-rerunfailures==14.0
 #Description: plugin for rerunning failure tests in pytest
-#Pinned versions:
+#Pinned versions: 14.0
 #test that import:
 
 pytest-subtests==0.13.1
@@ -270,8 +270,7 @@ scipy==1.16.2 ; python_version >= "3.14"
 #test that import:
 
 # needed by torchgen utils
-typing-extensions==4.12.2 ; python_version < "3.14"
-typing-extensions==4.15.0 ; python_version >= "3.14"
+typing-extensions==4.15.0
 #Description: type hints for python
 #Pinned versions:
 #test that import:
@@ -281,7 +280,7 @@ typing-extensions==4.15.0 ; python_version >= "3.14"
 #Pinned versions:
 #test that import:
 
-unittest-xml-reporting<=3.2.0,>=2.0.0
+unittest-xml-reporting==3.2.0
 #Description: saves unit test results to xml
 #Pinned versions:
 #test that import:
@@ -292,7 +291,7 @@ lintrunner==0.12.11
 #Pinned versions: 0.12.11
 #test that import:
 
-redis>=4.0.0
+redis==7.4.0
 #Description: redis database
 #test that import: anything that tests OSS caching/mocking (inductor/test_codecache.py, inductor/test_max_autotune.py)
 
@@ -370,10 +369,10 @@ pwlf==2.2.1
 
 # To build PyTorch itself
 pyyaml==6.0.3
-pyzstd
-setuptools==78.1.1
-packaging==24.0
-six
+pyzstd==0.16.2
+setuptools==79.0.1
+packaging==25.0
+six==1.17.0
 
 scons==4.5.2 ; platform_machine == "aarch64"
 
@@ -387,7 +386,7 @@ dataclasses_json==0.6.7
 #Pinned versions: 0.6.7
 #test that import:
 
-cmake==3.31.6
+cmake==4.0.0
 #Description: required for building
 
 tlparse==0.4.0
@@ -396,7 +395,7 @@ tlparse==0.4.0
 filelock==3.20.3
 #Description: required for inductor testing
 
-cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x" and platform_system != "Darwin"
+cuda-bindings==12.9.6 ; platform_machine != "s390x" and platform_system != "Darwin"
 #Description: required for testing CUDAGraph::raw_cuda_graph(). See https://nvidia.github.io/cuda-python/cuda-bindings/latest/support.html for how this version was chosen. Note "Any fix in the latest bindings would be backported to the prior major version" means that only the newest version of cuda-bindings will get fixes. Depending on the latest version of 12.x is okay because all 12.y versions will be supported via "CUDA minor version compatibility". Pytorch builds against 13.z versions of cuda toolkit work with 12.x versions of cuda-bindings as well because newer drivers work with old toolkits.
 #test that import: test_cuda.py
 
@@ -406,7 +405,7 @@ pyre-extensions==0.0.32
 tabulate==0.9.0
 #Description: These package are needed to build FBGEMM and torchrec on PyTorch CI
 
-tqdm>=4.66.0
+tqdm==4.67.3
 #Description: progress bar library required for dynamo benchmarks
 #test that import: benchmarks/dynamo/*
 
diff --git a/requirements-build.txt b/requirements-build.txt
index 863bc9f921d8d..88a80dfaf1b30 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -1,11 +1,14 @@
 # Build System requirements
-setuptools>=70.1.0,<82
-cmake>=3.27
-ninja
-numpy
-packaging
-pyyaml
-requests
-six  # dependency chain: NNPACK -> PeachPy -> six
-typing-extensions>=4.15.0
-pip  # not technically needed, but this makes setup.py invocation work
+# setuptools and cmake pinned to match rocm/pytorch release/2.10:
+# https://github.com/ROCm/pytorch/blob/0b21eac93ff682d862b257770fff5f9fc069b30a/requirements-build.txt
+setuptools==79.0.1
+cmake==4.0.0
+ninja==1.11.1.4
+numpy==2.0.2 ; python_version == "3.9"
+numpy==2.1.2 ; python_version > "3.9"
+packaging==25.0
+pyyaml==6.0.3
+requests==2.32.5
+six==1.17.0  # dependency chain: NNPACK -> PeachPy -> six
+typing_extensions==4.15.0
+pip==26.0.1  # not technically needed, but this makes setup.py invocation work
diff --git a/requirements.txt b/requirements.txt
index 4559c2ad331f0..f8b6ebfd25ce1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,19 +4,20 @@
 --requirement requirements-build.txt
 
 # Install / Development extra requirements
-build[uv]  # for building sdist and wheel
-expecttest>=0.3.0
-filelock
-fsspec>=0.8.5
-hypothesis
-jinja2
-lintrunner ; platform_machine != "s390x"
-networkx>=2.5.1
-ninja
+build[uv]==1.3.0  # for building sdist and wheel
+expecttest==0.3.0
+filelock==3.20.3
+fsspec==2026.2.0
+hypothesis==6.56.4
+jinja2==3.1.6
+lintrunner==0.12.11 ; platform_machine != "s390x"
+networkx==2.8.8
+ninja==1.11.1.4
 numpy==2.0.2 ; python_version == "3.9"
 numpy==2.1.2 ; python_version > "3.9"
-optree>=0.13.0
-psutil
-spin
-sympy>=1.13.3
-wheel
+optree==0.13.0 ; python_version < "3.14"
+optree==0.17.0 ; python_version >= "3.14"
+psutil==7.2.2
+spin==0.17
+sympy==1.13.3
+wheel==0.46.3

From bb7978b519536a4f38b7f4988a36af8a4e606869 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 25 Mar 2026 21:50:11 +0000
Subject: [PATCH 55/87] Update triton pin to tip of
 https://github.com/ROCm/triton/commits/release/internal/3.6.x

---
 .ci/docker/ci_commit_pins/triton.txt | 2 +-
 .ci/docker/common/install_triton.sh  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt
index 23407b4d540c4..0a2a5f707f24f 100644
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@@ -1 +1 @@
-9844da955a9db14ec69c9aac828ee9803085e288
+b31789602ee0e40b06a1fbc6e63dfae6df7e131d
diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh
index 1b68e3c247839..b2fdebdcc4747 100755
--- a/.ci/docker/common/install_triton.sh
+++ b/.ci/docker/common/install_triton.sh
@@ -21,7 +21,7 @@ elif [ -n "${TRITON_CPU}" ]; then
   TRITON_REPO="https://github.com/triton-lang/triton-cpu"
   TRITON_TEXT_FILE="triton-cpu"
 else
-  TRITON_REPO="https://github.com/triton-lang/triton"
+  TRITON_REPO="https://github.com/ROCm/triton"
   TRITON_TEXT_FILE="triton"
 fi
 

From 752cc24c376a7e329963ff3a5dce11ce6f480c02 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 25 Mar 2026 21:52:30 +0000
Subject: [PATCH 56/87] Skip nccl_device.h header include for ROCm (causes
 build failures in theRock nightly builds); changes ported from
 https://github.com/pytorch/pytorch/pull/175443

---
 torch/csrc/distributed/c10d/symm_mem/nccl_dev_cap.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch/csrc/distributed/c10d/symm_mem/nccl_dev_cap.hpp b/torch/csrc/distributed/c10d/symm_mem/nccl_dev_cap.hpp
index 63b3d452b7e4f..fbf0cf6b50c3e 100644
--- a/torch/csrc/distributed/c10d/symm_mem/nccl_dev_cap.hpp
+++ b/torch/csrc/distributed/c10d/symm_mem/nccl_dev_cap.hpp
@@ -10,9 +10,11 @@
 #endif
 
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 28, 0)
+#if !defined(USE_ROCM)
 #define NCCL_HAS_SYMMEM_DEVICE_SUPPORT
 #include <nccl_device.h>
 #endif
+#endif
 
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 29, 0)
 #define NCCL_HAS_ONE_SIDED_API

From 2fea1465481a731ee40accce0b0a420edfbd22d3 Mon Sep 17 00:00:00 2001
From: Prachi Gupta <pracgupt@amd.com>
Date: Fri, 27 Mar 2026 16:39:30 -0500
Subject: [PATCH 57/87] [ROCm] Reland: Enable expandable segments (#173330)
 (#177974) (#3106)

Summary:
Original pull request: https://github.com/pytorch/pytorch/pull/173330
Fixes https://github.com/pytorch/pytorch/issues/168737. Fixes
https://github.com/pytorch/pytorch/issues/168736.

The original diff enabled expandable segments for ROCm by adding `#ifdef
USE_ROCM` guards throughout CUDACachingAllocator.cpp to use HIP APIs
(hipMemAddressReserve, hipMemCreate, hipMemMap, etc.) instead of CUDA
driver APIs when building for ROCm.

Root cause: In HIP/ROCm 6.2.1, the field name for memory allocation
properties is `requestedHandleType` (singular), not
`requestedHandleTypes` (plural) as in CUDA. Additionally,
`hipMemHandleTypeFabric` does not exist in HIP, so the
`CU_MEM_HANDLE_TYPE_FABRIC` assignment must be skipped on ROCm.

Fix applied on top of the original diff (from D96652342):
- Use `prop.requestedHandleType = hipMemHandleTypePosixFileDescriptor`
under `#ifdef USE_ROCM` (singular field name, HIP constant)
- Use `prop.requestedHandleTypes =
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR` for CUDA (plural field name,
CUDA constant)
- Skip the `CU_MEM_HANDLE_TYPE_FABRIC` assignment entirely on ROCm under
`#ifndef USE_ROCM`, as `hipMemHandleTypeFabric` does not exist in HIP

Co-authored-by: Prachi Gupta <prachi.gupta@amd.com>
Co-authored-by: Jeff Daily <jeff.daily@amd.com>
Co-authored-by: moonshadow-25 <moonshadow-25@users.noreply.github.com>
Co-authored-by: Vighanesh Sharma <vighaneshsharma@gmail.com>

Test Plan:
```
fbpkg build //aps_models/ads/ecosystem/eval/cogwheel_tests/amd:cogwheel_aps_ads_icvr_kd_eval_amd_test_harness --build-remote
```

https://www.internalfb.com/sandcastle/workflow/1049338713192153464

Differential Revision: D97211385

Pull Request resolved: https://github.com/pytorch/pytorch/pull/177974
Approved by: https://github.com/jeffdaily, https://github.com/echen4096

(cherry picked from commit 57927012e4360a14acf8f48801a1f4f2c49a32ad)

## Motivation

<!-- Explain the purpose of this PR and the goals it aims to achieve.
-->

## Technical Details

<!-- Explain the changes along with any relevant GitHub links. -->

## Test Plan

<!-- Explain any relevant testing done to verify this PR. -->

## Test Result

<!-- Briefly summarize test outcomes. -->

## Submission Checklist

- [ ] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.

Co-authored-by: Haoyu Zhang <haoyuz@meta.com>
---
 c10/cuda/CUDAAllocatorConfig.h          |   2 +-
 c10/cuda/CUDACachingAllocator.cpp       | 122 +++++++++++++++++++++++-
 test/distributed/test_cupy_as_tensor.py |   6 +-
 test/test_cuda.py                       |   9 +-
 test/test_cuda_expandable_segments.py   |   9 +-
 torch/_C/__init__.pyi.in                |   1 +
 torch/_dynamo/trace_rules.py            |   1 +
 torch/csrc/DeviceAccelerator.cpp        |   4 +
 8 files changed, 142 insertions(+), 12 deletions(-)

diff --git a/c10/cuda/CUDAAllocatorConfig.h b/c10/cuda/CUDAAllocatorConfig.h
index 4e6097a406bc2..cd9c9b86285c4 100644
--- a/c10/cuda/CUDAAllocatorConfig.h
+++ b/c10/cuda/CUDAAllocatorConfig.h
@@ -34,7 +34,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
   static bool expandable_segments() {
     bool enabled = c10::CachingAllocator::AcceleratorAllocatorConfig::
         use_expandable_segments();
-#ifndef PYTORCH_C10_DRIVER_API_SUPPORTED
+#if !defined(PYTORCH_C10_DRIVER_API_SUPPORTED) && !defined(USE_ROCM)
     if (enabled) {
       TORCH_WARN_ONCE("expandable_segments not supported on this platform")
     }
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index 2ab4effc7853d..7a053b8134ef7 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -17,11 +17,17 @@
 #include <c10/util/llvmMathExtras.h>
 #include <c10/util/static_tracepoint.h>
 
-#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
+#if defined(PYTORCH_C10_DRIVER_API_SUPPORTED) || defined(USE_ROCM)
+#if defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
 #include <c10/cuda/driver_api.h>
+#endif
+#ifndef _WIN32
 #include <sys/syscall.h>
 #include <sys/types.h>
 #include <unistd.h>
+#else
+#include <process.h>
+#endif
 #endif
 
 #include <c10/util/Exception.h>
@@ -269,7 +275,8 @@ struct SegmentRange {
   SegmentRange(void* p, size_t s) : ptr(static_cast<char*>(p)), size(s) {}
 };
 
-#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
+#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) || \
+    defined(USE_ROCM)
 
 /*
 Note [Expandable Segments]
@@ -383,8 +390,13 @@ struct ExpandableSegment {
     // This allows for some cases where we have to unmap pages earlier in the
     // segment to put them at the end.
     max_handles_ = numSegments(prop.totalGlobalMem + prop.totalGlobalMem / 8);
+#ifdef USE_ROCM
+    C10_CUDA_CHECK(hipMemAddressReserve(
+        &ptr_, segment_size_ * max_handles_, 0ULL, 0, 0ULL));
+#else
     C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemAddressReserve_(
         &ptr_, segment_size_ * max_handles_, 0ULL, 0, 0ULL));
+#endif
   }
   ExpandableSegment(const ExpandableSegment&) = delete;
   ExpandableSegment(ExpandableSegment&&) = delete;
@@ -408,12 +420,14 @@ struct ExpandableSegment {
     // if it fails, use posix file handle
     if (CUDAAllocatorConfig::expandable_segments_handle_type() ==
         Expandable_Segments_Handle_Type::UNSPECIFIED) {
+#ifndef USE_ROCM
       CUDAAllocatorConfig::set_expandable_segments_handle_type(
           Expandable_Segments_Handle_Type::FABRIC_HANDLE);
       auto output = map(range);
       if (output.ptr != nullptr) {
         return output;
       }
+#endif
       // if fabric handle is not supported, use posix file handle.
       CUDAAllocatorConfig::set_expandable_segments_handle_type(
           Expandable_Segments_Handle_Type::POSIX_FD);
@@ -445,33 +459,60 @@ struct ExpandableSegment {
       if (enable_ipc_handles) {
         if (CUDAAllocatorConfig::expandable_segments_handle_type() !=
             Expandable_Segments_Handle_Type::FABRIC_HANDLE) {
+#ifdef USE_ROCM
+          prop.requestedHandleType = hipMemHandleTypePosixFileDescriptor;
+#else
           prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
+#endif
         } else {
+#ifndef USE_ROCM
           prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
+#endif
         }
       }
       int flag = 0;
+#ifndef USE_ROCM
       C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuDeviceGetAttribute_(
           &flag,
           CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
           device_));
+#endif
       if (flag)
         prop.allocFlags.gpuDirectRDMACapable = 1;
       prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
       // NOLINTNEXTLINE(bugprone-signed-char-misuse)
       prop.location.id = static_cast<int>(device_);
+#ifdef USE_ROCM
+      auto status = hipMemCreate(&handle, segment_size_, &prop, 0);
+#else
       auto status =
           DriverAPI::get()->cuMemCreate_(&handle, segment_size_, &prop, 0);
+#endif
       if (status != CUDA_SUCCESS) {
         if (status == CUDA_ERROR_OUT_OF_MEMORY) {
+#ifdef USE_ROCM
+          // hipMemCreate above returned hipErrorOutOfMemory, and HIP treats it
+          // like a sticky runtime error, so we need to clear it here. The
+          // corresponding CUDA Driver API call does not require this.
+          (void)hipGetLastError();
+#endif
           for (auto j : c10::irange(begin, i)) {
             // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
             auto h = handles_.at(j).value();
             handles_.at(j) = std::nullopt;
+#ifdef USE_ROCM
+            C10_CUDA_CHECK(hipMemRelease(h.handle));
+#else
             C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemRelease_(h.handle));
+#endif
           }
           trimHandles();
           return rangeFromHandles(begin, begin);
+#ifdef USE_ROCM
+        } else {
+          C10_CUDA_CHECK(status);
+        }
+#else
         } else if (
             CUDAAllocatorConfig::expandable_segments_handle_type() ==
             Expandable_Segments_Handle_Type::FABRIC_HANDLE) {
@@ -487,6 +528,7 @@ struct ExpandableSegment {
         } else {
           C10_CUDA_DRIVER_CHECK(status);
         }
+#endif
       }
       handles_.at(i) = Handle{handle, std::nullopt};
     }
@@ -522,7 +564,11 @@ struct ExpandableSegment {
     // thereby ensuring that the handle can be correctly matched in
     // ipcMemHandle_to_devptr.
     ShareHeader header{};
+#ifdef _WIN32
+    header.pid = _getpid();
+#else
     header.pid = getpid();
+#endif
     header.segment_size = segment_size_;
     header.num_handles = end - begin;
 
@@ -534,8 +580,13 @@ struct ExpandableSegment {
           Expandable_Segments_Handle_Type::FABRIC_HANDLE) {
         if (!handle.shareable_handle) {
           int fd = 0;
+#ifdef USE_ROCM
+          C10_CUDA_CHECK(hipMemExportToShareableHandle(
+              &fd, handle.handle, hipMemHandleTypePosixFileDescriptor, 0));
+#else
           C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemExportToShareableHandle_(
               &fd, handle.handle, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0));
+#endif
           handle.shareable_handle = fd;
           LOG(INFO) << "use posix fd to share expandable segments.";
         }
@@ -546,6 +597,10 @@ struct ExpandableSegment {
             reinterpret_cast<const char*>(&*handle.shareable_handle),
             sizeof(int));
       } else {
+#ifdef USE_ROCM
+        TORCH_INTERNAL_ASSERT(
+            false, "expandable segment with fabric handle not supported");
+#else
         if (!handle.shareable_handle) {
           CUmemFabricHandle fabric_handle;
           C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemExportToShareableHandle_(
@@ -559,6 +614,7 @@ struct ExpandableSegment {
         buf.write(
             reinterpret_cast<const char*>(&*handle.shareable_handle),
             sizeof(CUmemFabricHandle));
+#endif
       }
     }
     return rangeFromHandles(begin, end);
@@ -574,14 +630,20 @@ struct ExpandableSegment {
         device, std::nullopt, header.segment_size, std::move(peers));
 // older build setups (e.g. multiwheels) do not have this syscall, added 2020
 // but the kernel on the system might still support it.
+#ifndef _WIN32
 #ifndef SYS_pidfd_open
 #define SYS_pidfd_open 434
 #endif
 #ifndef SYS_pidfd_getfd
 #define SYS_pidfd_getfd 438
 #endif
+#endif // !_WIN32
     if (CUDAAllocatorConfig::expandable_segments_handle_type() !=
         Expandable_Segments_Handle_Type::FABRIC_HANDLE) {
+#ifdef _WIN32
+      TORCH_CHECK(
+          false, "IPC expandable segments are not supported on Windows");
+#else
       auto pidfd = syscall(SYS_pidfd_open, header.pid, 0);
       TORCH_CHECK(
           pidfd != -1 || errno != ENOSYS,
@@ -597,9 +659,13 @@ struct ExpandableSegment {
           auto err = errno;
           close(static_cast<int>(pidfd));
           for (auto& h : segment->handles_) {
+#ifdef USE_ROCM
+            C10_CUDA_CHECK(hipMemRelease(h.value().handle));
+#else
             C10_CUDA_DRIVER_CHECK(
                 // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
                 DriverAPI::get()->cuMemRelease_(h.value().handle));
+#endif
             h = std::nullopt;
           }
           TORCH_CHECK(
@@ -609,17 +675,33 @@ struct ExpandableSegment {
           TORCH_CHECK(false, "pidfd_getfd: ", c10::utils::str_error(err));
         }
         CUmemGenericAllocationHandle handle = 0;
+#ifdef USE_ROCM
+#if ROCM_VERSION >= 70100
+        void* myfd_handle =
+            reinterpret_cast<void*>(static_cast<uintptr_t>(myfd));
+#else
+        void* myfd_handle = (void*)(uintptr_t)&myfd;
+#endif
+        C10_CUDA_CHECK(hipMemImportFromShareableHandle(
+            &handle, myfd_handle, hipMemHandleTypePosixFileDescriptor));
+#else
         C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemImportFromShareableHandle_(
             &handle,
             // NOLINTNEXTLINE(performance-no-int-to-ptr)
             (void*)(uintptr_t)myfd,
             CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
+#endif
         LOG(INFO) << "use posix fd to import expandable segments.";
         close(static_cast<int>(myfd));
         segment->handles_.emplace_back(Handle{handle, std::nullopt});
       }
       close(static_cast<int>(pidfd));
+#endif // !_WIN32
     } else {
+#ifdef USE_ROCM
+      TORCH_INTERNAL_ASSERT(
+          false, "expandable segment with fabric handle not supported");
+#else
       for (auto i : c10::irange(header.num_handles)) {
         (void)i;
         CUmemFabricHandle fabric_handle;
@@ -634,6 +716,7 @@ struct ExpandableSegment {
         LOG(INFO) << "use fabric handle to import expandable segments.";
         segment->handles_.emplace_back(Handle{handle, std::nullopt});
       }
+#endif
     }
     segment->mapAndSetAccess(0, header.num_handles);
     return segment;
@@ -669,8 +752,12 @@ struct ExpandableSegment {
   ~ExpandableSegment() {
     forEachAllocatedRange(
         [&](size_t begin, size_t end) { unmapHandles(begin, end); });
+#ifdef USE_ROCM
+    C10_CUDA_CHECK(hipMemAddressFree(ptr_, segment_size_ * max_handles_));
+#else
     C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemAddressFree_(
         ptr_, segment_size_ * max_handles_));
+#endif
   }
 
  private:
@@ -680,12 +767,28 @@ struct ExpandableSegment {
     // NOLINTNEXTLINE(bugprone-signed-char-misuse)
     desc.location.id = static_cast<int>(device);
     desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+#ifdef USE_ROCM
+    C10_CUDA_CHECK(hipMemSetAccess(
+        ptr() + begin * segment_size_,
+        (end - begin) * segment_size_,
+        &desc,
+        1));
+#else
     C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemSetAccess_(
         ptr_ + begin * segment_size_, (end - begin) * segment_size_, &desc, 1));
+#endif
   }
 
   void mapAndSetAccess(size_t begin, size_t end) {
     for (auto i : c10::irange(begin, end)) {
+#ifdef USE_ROCM
+      C10_CUDA_CHECK(hipMemMap(
+          ptr() + i * segment_size_,
+          segment_size_,
+          0,
+          handles_.at(i).value().handle,
+          0ULL));
+#else
       C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemMap_(
           ptr_ + i * segment_size_,
           segment_size_,
@@ -693,6 +796,7 @@ struct ExpandableSegment {
           // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
           handles_.at(i).value().handle,
           0ULL));
+#endif
     }
     mapped_size_ += (end - begin) * segment_size_;
     setAccess(device_, begin, end);
@@ -719,12 +823,22 @@ struct ExpandableSegment {
       // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
       Handle h = handles_.at(i).value();
       handles_.at(i) = std::nullopt;
+#ifdef USE_ROCM
+      C10_CUDA_CHECK(hipMemUnmap(ptr() + segment_size_ * i, segment_size_));
+#else
       C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemUnmap_(
           ptr_ + segment_size_ * i, segment_size_));
+#endif
       if (h.shareable_handle) {
+#ifndef _WIN32
         close(std::get<int>(*h.shareable_handle));
+#endif
       }
+#ifdef USE_ROCM
+      C10_CUDA_CHECK(hipMemRelease(h.handle));
+#else
       C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemRelease_(h.handle));
+#endif
     }
     trimHandles();
   }
@@ -770,7 +884,11 @@ struct ExpandableSegment {
     std::optional<std::variant<int, CUmemFabricHandle>> shareable_handle;
   };
   struct ShareHeader {
+#ifdef _WIN32
+    int pid;
+#else
     pid_t pid;
+#endif
     size_t segment_size;
     size_t num_handles;
   };
diff --git a/test/distributed/test_cupy_as_tensor.py b/test/distributed/test_cupy_as_tensor.py
index e0a98ae960426..57b44ff496adf 100644
--- a/test/distributed/test_cupy_as_tensor.py
+++ b/test/distributed/test_cupy_as_tensor.py
@@ -8,7 +8,10 @@
 import torch
 from torch.multiprocessing.reductions import reduce_tensor
 from torch.testing._internal.common_cuda import SM100OrLater
-from torch.testing._internal.common_distributed import MultiProcContinuousTest
+from torch.testing._internal.common_distributed import (
+    MultiProcContinuousTest,
+    skip_if_rocm_multiprocess,
+)
 from torch.testing._internal.common_utils import (
     requires_cuda_p2p_access,
     run_tests,
@@ -64,6 +67,7 @@ def _init_device(self) -> None:
     def device(self) -> torch.device:
         return torch.device(device_type, self.rank)
 
+    @skip_if_rocm_multiprocess  # RuntimeError: pidfd_getfd: Operation not permitted
     @skip_but_pass_in_sandcastle_if(
         SM100OrLater,
         "Fails if ran in docker environment without privileged access (https://github.com/pytorch/pytorch/issues/165170)",
diff --git a/test/test_cuda.py b/test/test_cuda.py
index 2a3e27dab814e..df9bdd5b0be11 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -4990,6 +4990,14 @@ def cb(device, alloc, device_alloc, device_free):
 
     def test_allocator_fuzz(self):
         # fuzz
+        if (
+            torch.version.hip
+            and "expandable_segments:True"
+            in torch._C._accelerator_getAllocatorSettings()
+        ):
+            raise unittest.SkipTest(
+                "ROCm needs https://github.com/ROCm/rocm-systems/pull/3023"
+            )
         state = random.getstate()
         random.seed(123)
         N = 10000
@@ -6448,7 +6456,6 @@ def test_graph_capture_reclaim_4_streams(self):
             "graph_capture_record_stream_reuse:False"
         )
 
-    @skipIfRocm(msg="expandable_segments mode is not supported on ROCm")
     @unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, "Load_inline doesn't work in fbcode")
     def test_mempool_expandable(self):
         torch.cuda.empty_cache()
diff --git a/test/test_cuda_expandable_segments.py b/test/test_cuda_expandable_segments.py
index 78e4cddab84ed..f22b50c64313e 100644
--- a/test/test_cuda_expandable_segments.py
+++ b/test/test_cuda_expandable_segments.py
@@ -12,7 +12,7 @@
 
 import torch
 from torch.testing._internal.common_cuda import IS_JETSON, IS_WINDOWS
-from torch.testing._internal.common_utils import run_tests, TEST_WITH_ROCM
+from torch.testing._internal.common_utils import run_tests
 
 
 REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent
@@ -25,12 +25,7 @@
 sys.path.remove(str(REPO_ROOT))
 
 if __name__ == "__main__":
-    if (
-        torch.cuda.is_available()
-        and not IS_JETSON
-        and not IS_WINDOWS
-        and not TEST_WITH_ROCM
-    ):
+    if torch.cuda.is_available() and not IS_JETSON and not IS_WINDOWS:
         get_disabled_tests(".")
 
         torch.cuda.memory._set_allocator_settings("expandable_segments:True")
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 565a2035fc663..14c5e699cf4c1 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -2598,6 +2598,7 @@ def _accelerator_getDeviceStats(device_index: _int) -> dict[str, Any]: ...
 def _accelerator_resetAccumulatedStats(device_index: _int) -> None: ...
 def _accelerator_resetPeakStats(device_index: _int) -> None: ...
 def _accelerator_getMemoryInfo(device_index: _int) -> tuple[_int, _int]: ...
+def _accelerator_getAllocatorSettings() -> str: ...
 def _accelerator_setAllocatorSettings(env: str) -> None: ...
 
 # Defined in torch/csrc/jit/python/python_tracer.cpp
diff --git a/torch/_dynamo/trace_rules.py b/torch/_dynamo/trace_rules.py
index 17cd668bb8eaa..797bbcb2c79db 100644
--- a/torch/_dynamo/trace_rules.py
+++ b/torch/_dynamo/trace_rules.py
@@ -463,6 +463,7 @@
         "torch._C._accelerator_getAccelerator",
         "torch._C._accelerator_getDeviceIndex",
         "torch._C._accelerator_getStream",
+        "torch._C._accelerator_getAllocatorSettings",
         "torch._C._accelerator_setAllocatorSettings",
         "torch._C._accelerator_setStream",
         "torch._C._accelerator_synchronizeDevice",
diff --git a/torch/csrc/DeviceAccelerator.cpp b/torch/csrc/DeviceAccelerator.cpp
index c6ffa893d95ae..c75643e2fa129 100644
--- a/torch/csrc/DeviceAccelerator.cpp
+++ b/torch/csrc/DeviceAccelerator.cpp
@@ -164,6 +164,10 @@ void initModule(PyObject* module) {
     return at::accelerator::getMemoryInfo(device_index);
   });
 
+  m.def("_accelerator_getAllocatorSettings", []() {
+    return c10::CachingAllocator::getAllocatorSettings();
+  });
+
   m.def("_accelerator_setAllocatorSettings", [](std::string env) {
     c10::CachingAllocator::setAllocatorSettings(env);
   });

From 5f21bad281c1046eb6017ffa513cf2a815ea82f2 Mon Sep 17 00:00:00 2001
From: Ethan Wee <Ethan.Wee@amd.com>
Date: Tue, 31 Mar 2026 11:58:40 -0700
Subject: [PATCH 58/87] [release/2.11] Enable wheels (#3111)

http://rocm-ci.amd.com/job/pytorch2.11-manylinux-wheels_rel-7.2/
Enabling wheel build for release/2.11
---
 .circleci/scripts/binary_populate_env.sh |  7 ++++-
 .github/scripts/build_triton_wheel.py    | 36 +++++++++++++++++++++++-
 2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh
index 74ad225db933b..c25e351768607 100755
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@@ -5,7 +5,9 @@ export TZ=UTC
 tagged_version() {
   GIT_DIR="${workdir}/pytorch/.git"
   GIT_DESCRIBE="git --git-dir ${GIT_DIR} describe --tags --match v[0-9]*.[0-9]*.[0-9]*"
-  if [[ ! -d "${GIT_DIR}" ]]; then
+  if [[ -n "${CIRCLE_TAG:-}" ]]; then
+    echo "${CIRCLE_TAG}"
+  elif [[ ! -d "${GIT_DIR}" ]]; then
     echo "Abort, abort! Git dir ${GIT_DIR} does not exists!"
     kill $$
   elif ${GIT_DESCRIBE} --exact >/dev/null; then
@@ -69,6 +71,8 @@ fi
 
 export PYTORCH_BUILD_NUMBER=1
 
+# This is handled in the builder scripts, so the duplicate code below is commented out.
+: <<'BLOCK_COMMENT'
 # Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS
 TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
 TRITON_CONSTRAINT="platform_system == 'Linux'"
@@ -110,6 +114,7 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B
         export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}"
     fi
 fi
+BLOCK_COMMENT
 
 USE_GLOO_WITH_OPENSSL="ON"
 if [[ "$GPU_ARCH_TYPE" =~ .*aarch64.* ]]; then
diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py
index 34c6c3549f9c7..fa5f81bf06d9b 100644
--- a/.github/scripts/build_triton_wheel.py
+++ b/.github/scripts/build_triton_wheel.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 
 import os
+import re
 import shutil
 import sys
 from pathlib import Path
@@ -51,6 +52,31 @@ def patch_init_py(
         f.write(orig)
 
 
+def get_rocm_version() -> str:
+    """Return the installed ROCm version as "major.minor.patch".
+
+    The ROCm root comes from ROCM_HOME, then ROCM_PATH, then /opt/rocm. Returns
+    "0.0.0" when no rocm_version.h is found; components the header does not
+    define are reported as 0.
+    """
+    rocm_path = os.environ.get('ROCM_HOME') or os.environ.get('ROCM_PATH') or "/opt/rocm"
+    rocm_version_h = f"{rocm_path}/include/rocm-core/rocm_version.h"
+    if not os.path.isfile(rocm_version_h):
+        rocm_version_h = f"{rocm_path}/include/rocm_version.h"
+    if not os.path.isfile(rocm_version_h):
+        return "0.0.0"
+    parts = {"MAJOR": 0, "MINOR": 0, "PATCH": 0}
+    pattern = re.compile(r"#define\s+ROCM_VERSION_(MAJOR|MINOR|PATCH)\s+(\d+)")
+    # Context manager so the header's file handle is closed deterministically
+    # instead of being left for the garbage collector.
+    with open(rocm_version_h) as header:
+        for line in header:
+            match = pattern.search(line)
+            if match:
+                parts[match.group(1)] = int(match.group(2))
+    return f"{parts['MAJOR']}.{parts['MINOR']}.{parts['PATCH']}"
+
+
 def build_triton(
     *,
     version: str,
@@ -65,13 +91,20 @@ def build_triton(
         max_jobs = os.cpu_count() or 1
         env["MAX_JOBS"] = str(max_jobs)
 
+    version_suffix = ""
+    if not release:
+        rocm_version = get_rocm_version()
+        version_suffix = f"+rocm{rocm_version}.git{commit_hash[:8]}"
+        version += version_suffix
+
     with TemporaryDirectory() as tmpdir:
         triton_basedir = Path(tmpdir) / "triton"
         triton_pythondir = triton_basedir / "python"
 
         triton_repo = "https://github.com/openai/triton"
         if device == "rocm":
-            triton_pkg_name = "triton-rocm"
+            triton_repo = "https://github.com/ROCm/triton"
+            triton_pkg_name = "triton"
         elif device == "xpu":
             triton_pkg_name = "triton-xpu"
             triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton"
@@ -89,6 +122,7 @@ def build_triton(
 
         # change built wheel name and version
         env["TRITON_WHEEL_NAME"] = triton_pkg_name
+        env["TRITON_WHEEL_VERSION_SUFFIX"] = version_suffix
         if with_clang_ldd:
             env["TRITON_BUILD_WITH_CLANG_LLD"] = "1"
 

From c8e635b1be2fcb9d8a7aa7ebdd0f8128afc5e28f Mon Sep 17 00:00:00 2001
From: Ethan Wee <Ethan.Wee@amd.com>
Date: Tue, 31 Mar 2026 12:24:33 -0700
Subject: [PATCH 59/87] [release/2.11] Only skip linalg.eig assertion in
 test_torch_return_types_returns (#3097)

---
 test/functorch/test_vmap.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index 28572e528ebbd..7033b41c90f95 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -71,6 +71,7 @@
     run_tests,
     skipIfTorchDynamo,
     subtest,
+    TEST_WITH_ROCM,
     TEST_WITH_TORCHDYNAMO,
     TestCase,
     unMarkDynamoStrictTest,
@@ -5064,9 +5065,12 @@ def test_torch_return_types_returns(self, device):
                 vmap(torch.topk, (0, None, None))(t, 1, 0), torch.return_types.topk
             )
         )
-        self.assertTrue(
-            isinstance(vmap(torch.linalg.eig, (0))(t), torch.return_types.linalg_eig)
-        )
+        if not (TEST_WITH_ROCM and not torch.cuda.has_magma):
+            self.assertTrue(
+                isinstance(
+                    vmap(torch.linalg.eig, (0))(t), torch.return_types.linalg_eig
+                )
+            )
 
     def test_namedtuple_returns(self, device):
         Point = namedtuple("Point", ["x", "y"])

From 8ecd5da996b21f178791f2ada27409d5ac6f3257 Mon Sep 17 00:00:00 2001
From: Arash Pakbin <arash.pakbin@amd.com>
Date: Fri, 27 Feb 2026 19:21:06 +0000
Subject: [PATCH 60/87] [ROCm] Optimize RadixSelect synchronization overhead
 (#174837)

**Summary:**
This PR optimizes the `radixSelect` kernel on ROCm by reducing synchronization overhead when aggregating radix counts across warps. The previous implementation used 3 block-level `__syncthreads()` calls plus atomic operations on 4 radix buckets (contended by all warps). The new implementation uses 2 `__syncthreads()` calls with no atomic contention, reducing synchronization overhead and improving performance.

**Background:**
The `radixSelect` algorithm finds the k-th element by iteratively uncovering its bit pattern through multiple passes over the data. Each pass determines 2 bits of that bit pattern (up to 16 passes for float32). Each iteration involves:
1. Counting input elements that match the already uncovered pattern
2. Grouping them by radix bucket (4 buckets per iteration)
3. Aggregating counts across all warps
4. Broadcasting the aggregated counts back to all threads

**Previous Implementation:**
The original sequence for each iteration was:
```cpp
initialize smem[RadixSize] to 0
__syncthreads()                    // Sync 1
count within warp
if (lane_id == 0) {
    atomicAdd(&smem[i], counts[i]) // Atomic contention on 4 buckets
}
__syncthreads()                    // Sync 2
read back total counts from smem
__syncthreads()                    // Sync 3
```

This involved **3 synchronizations** and **atomic contention** on 4 buckets from all warps.

**Changes:**

* **Warp-level reduction without atomics:**
  - Each warp's lane 0 writes its counts to a dedicated location in shared memory
  - Warp 0's lanes perform parallel reduction: each lane reduces one radix bin across all warps
  - This eliminates atomic contention while maintaining correctness

* **Double-buffering for concurrent iterations:**
  - Observation: Due to block-level synchronization, at most two consecutive iterations can be in-flight simultaneously
  - When threads are in "section 2" (post-sync) of iteration `i`, other threads can only reach "section 1" (pre-sync) of iteration `i+1` and wait there
  - We use `buffer_index` (0 or 1) to alternate between two shared memory segments, allowing safe concurrent execution
  - This enables removing the first and last `__syncthreads()` calls, reducing from 3 to 2 synchronizations per iteration (2 = 3 - 2 + 1, where the +1 is required for the new warp-level aggregation step that replaces atomics)

**Performance:**
Measured on AMD MI350 (gfx950) using single-block TopK operator, where RadixSelect accounts for ~80% of total latency for typical workloads.

- **Smaller datatypes (bfloat16, float16):** 4-5% improvement on smaller inputs, ~1% on larger inputs
- **float32:** Similar improvements, slightly less pronounced
- **Average improvement:** ~2% (weighted by larger input latencies)

**Testing:**
- Verified correctness across multiple data types (float32, float16, bfloat16) and input shapes
- Tested with various K values to ensure correct behavior across all iteration counts
- Performance benchmarks included below
<img width="2307" height="1537" alt="topk_latency_comparison" src="https://github.com/user-attachments/assets/1d0b8428-055a-4fa9-b3b0-31427021adf9" />

**Testing code:**
 - benchmark code: [code](https://github.com/user-attachments/files/24484540/benchmark.py)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/174837
Approved by: https://github.com/jeffdaily
---
 aten/src/ATen/native/cuda/Sorting.cu          |  14 +++
 .../ATen/native/cuda/SortingRadixSelect.cuh   | 117 +++++++++++++-----
 aten/src/ATen/native/cuda/TensorTopK.cu       |   6 +-
 3 files changed, 107 insertions(+), 30 deletions(-)

diff --git a/aten/src/ATen/native/cuda/Sorting.cu b/aten/src/ATen/native/cuda/Sorting.cu
index 1c4f06fe262e6..717e08140226e 100644
--- a/aten/src/ATen/native/cuda/Sorting.cu
+++ b/aten/src/ATen/native/cuda/Sorting.cu
@@ -31,7 +31,14 @@ __global__ void gatherKthValue(
     cuda::detail::TensorInfo<int64_t, index_t> indices) {
   // Indices are limited to integer fp precision, so counts can fit in
   // int32, regardless of index_t
+#ifndef USE_ROCM
   __shared__ int smem[C10_WARP_SIZE]; // one per each warp, up to warp limit
+#else
+  // Maximum shared memory size for radix select (used in countRadixAggregateCounts): NUM_BUFFERS * MAX_WARPS * RADIX_SIZE.
+  // HIP workgroups have at most 1024 threads. Warp size is at least 32 (can be 64 on some
+  // architectures), so we use 32 for safety: 2 buffers * (1024/32) warps * 4 radix bins = 256.
+  __shared__ int smem[256];
+#endif
 
   index_t slice = getLinearBlockId<index_t>();
   if (slice >= numInputSlices) {
@@ -108,7 +115,14 @@ __global__ void gatherMedian(
     bool ignore_nan) {
   // Shared memory for the subroutine RadixSelect. Note that RadixSelect converts the
   // floating point type to int with the same relative ordering.
+#ifndef USE_ROCM
   __shared__ int smem[C10_WARP_SIZE]; // one per each warp, up to warp limit
+#else
+  // Maximum shared memory size for radix select (used in countRadixAggregateCounts): NUM_BUFFERS * MAX_WARPS * RADIX_SIZE.
+  // HIP workgroups have at most 1024 threads. Warp size is at least 32 (can be 64 on some
+  // architectures), so we use 32 for safety: 2 buffers * (1024/32) warps * 4 radix bins = 256.
+  __shared__ int smem[256];
+#endif
 
   index_t slice = getLinearBlockId<index_t>();
   if (slice >= numInputSlices) {
diff --git a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh
index 977fc76b295be..a82cff9f227d6 100644
--- a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh
+++ b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh
@@ -436,6 +436,76 @@ __device__ __forceinline__ void countRadixLoop(
   }
 }
 
+// Sums per-warp radix counts across the block and hands the totals back to every thread.
+// On entry only lane 0's counts[] of each warp is read; on exit counts[] holds block totals.
+// buffer_index (0 or 1) selects one of two smem segments of MAX_WARPS * RadixSize entries,
+// so smem must provide 2 * MAX_WARPS * RadixSize elements. The caller toggles buffer_index
+// after each countRadixUsingMaskDataSmem invocation so that consecutive iterations, which
+// can overlap across the __syncthreads() calls below, never share a segment.
+template <
+    typename CountType,
+    int RadixSize,
+    int RadixBits> // RadixBits is not referenced in the body.
+__device__ __forceinline__ void countRadixAggregateCounts(
+    CountType counts[RadixSize], // counts[i] will be the number of matching
+                                 // elements ((val & desiredMask) == desired)
+                                 // that have the digits [radixDigitPos,
+                                 // radixDigitPos+RADIX_BITS-1] set to i.
+    CountType* smem, // shared memory for inter-warp reduction of counts.
+    int buffer_index){ // selects the smem segment to use: 0 or 1.
+
+  // Maximum number of warps per workgroup. HIP workgroups have at most 1024 threads.
+  // Warp size is at least 32 (can be 64 on some architectures), so we use 32 for safety.
+  // This sizes shared memory buffers to accommodate all possible warps: 1024/32 = 32.
+  constexpr uint MAX_WARPS = 1024/32;
+  const int buffer_offset = buffer_index * MAX_WARPS * RadixSize; // offset of the buffer in smem.
+  const uint WARP_BITS = __builtin_ctz(warpSize);
+
+  const uint num_warps = blockDim.x >> WARP_BITS;  // full warps only; assumes blockDim.x % warpSize == 0 — TODO confirm
+  const uint warp_id = threadIdx.x >> WARP_BITS; // = threadIdx.x / warpSize
+  const int lane_id = at::cuda::getLaneId(); // = threadIdx.x % warpSize
+
+  // Stage 1: Each warp's lane 0 stores its counts in smem.
+  // Layout after Stage 1: [warp0: all radix bins], [warp1: all radix bins], ...
+  // this layout starts from index buffer_offset.
+  if (lane_id == 0) {
+#pragma unroll
+    for (uint32_t i = 0; i < RadixSize; ++i) {
+      smem[
+            buffer_offset
+          + warp_id * RadixSize
+          + i
+          ] = counts[i];
+    }
+  }
+
+  __syncthreads(); // wait for all warps to finish storing their counts to smem.
+
+  // Stage 2: Warp 0 reduces all bins; lane i sums bin i over every warp's slot.
+  // Layout after Stage 2: [final radix0 sum], [final radix1 sum], ..., [final radix(RadixSize-1) sum]
+  // starting at index buffer_offset, i.e. the totals overwrite warp 0's own Stage 1 slots.
+  if (warp_id == 0 && lane_id < RadixSize) {
+    CountType sum = 0;
+#pragma unroll
+    for (int w = 0; w < num_warps; ++w) {
+      sum += smem[
+                    buffer_offset
+                  + w * RadixSize
+                  + lane_id
+                  ];
+    }
+    smem[buffer_offset + lane_id] = sum;
+  }
+
+  __syncthreads(); // Wait for warp 0 to finish reduction.
+
+  // Stage 3: Each thread reads the final counts from smem.
+#pragma unroll
+  for (uint32_t i = 0; i < RadixSize; ++i) {
+    counts[i] = smem[buffer_offset + i];
+  }
+}
+
 // This function counts the distribution of all input values in a
 // slice we are selecting by radix digit at `radixDigitPos`, but only
 // those that pass the filter `((v & desiredMask) == desired)`.
@@ -457,6 +527,7 @@ __device__ void countRadixUsingMaskDataSmem(
                            // digits [radixDigitPos, radixDigitPos+RADIX_BITS-1]
                            // set to i in the warp.
     CountType* smem, // shared memory for inter-warp reduction of counts.
+    int buffer_index, // buffer index for smem.
     bitwise_t
         desired, // combined with desiredMask to filter relevant elements. An
                  // element is relevant if ((val & desiredMask) == desired).
@@ -479,14 +550,6 @@ __device__ void countRadixUsingMaskDataSmem(
     counts[i] = 0; // initialize counts to 0.
   }
 
-  // initialize smem to 0. This is for reduction of counts across all warps.
-  if (threadIdx.x < RadixSize) {
-    smem[threadIdx.x] = 0;
-  }
-
-  __syncthreads(); // wait for all threads in the block to finish initializing
-                   // smem.
-
   // count the distribution of the bits in the radix digit at `radixDigitPos` to
   // `radixDigitPos`+RADIX_BITS-1 for values that match the desired pattern
   // ((val & desiredMask) == desired). counts[] will hold the results for the
@@ -512,27 +575,11 @@ __device__ void countRadixUsingMaskDataSmem(
         });
   }
 
-  // accumulate the counts across all warps.
-  // sum for each warp is added to smem by thread 0 in the warp.
-  if (at::cuda::getLaneId() == 0) {
-#pragma unroll
-    for (uint32_t i = 0; i < RadixSize; ++i) {
-      gpuAtomicAddNoReturn(
-          &smem[i],
-          counts[i]); // thread0 in warp atomically adds the counts to smem.
-    }
-  }
-
-  __syncthreads(); // wait for all warps to finish adding their counts to smem.
-
-// each thread reads the final counts from smem.
-#pragma unroll
-  for (uint32_t i = 0; i < RadixSize; ++i) {
-    counts[i] = smem[i];
-  }
-
-  __syncthreads(); // wait for all threads in the block to finish reading the
-                   // counts.
+  // aggregate counts across all warps and distribute results back to all threads.
+  countRadixAggregateCounts<CountType, RadixSize, RadixBits>(
+    counts,
+    smem,
+    buffer_index);
 }
 
 // This is the main loop of the findPattern function that finds the unique value
@@ -849,6 +896,14 @@ __device__ void radixSelect(
   __syncthreads(); // so the initialization is visible to all threads in the
                    // blocks.
 
+  // buffer index for smem. We use two segments of smem for inter-warp communication of counts.
+  // Given the counting operation in countRadixUsingMaskDataSmem performs __syncthreads() internally,
+  // we need to alternate between the at most two segments of smem to avoid race conditions.
+  // No more than two iterations of the loop will be "in flight" at any given time because
+  // of the __syncthreads() in countRadixUsingMaskDataSmem.
+  // buffer_index is either 0 or 1. It is toggled after each countRadixUsingMaskDataSmem invocation.
+  int buffer_index = 0;
+
 #endif
 
   // We only consider elements x such that (x & desiredMask) == desired
@@ -895,6 +950,7 @@ __device__ void radixSelect(
         RADIX_BITS>(
         counts,
         smem,
+        buffer_index,
         desired,
         desiredMask,
         digitPos,
@@ -903,6 +959,9 @@ __device__ void radixSelect(
         data,
         dataSmem,
         dataSmemSize);
+
+    buffer_index ^= 1; // toggle buffer index.
+
 #else
     countRadixUsingMask<
         scalar_t,
diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu
index e3025bace508e..1b8585ff3beab 100644
--- a/aten/src/ATen/native/cuda/TensorTopK.cu
+++ b/aten/src/ATen/native/cuda/TensorTopK.cu
@@ -257,7 +257,11 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo<const T, IndexType> inpu
 
   // Indices are limited to integer fp precision, so counts can fit in
   // int32, regardless of IndexType
-  __shared__ int smem[64];
+
+  // Maximum shared memory size for radix select (used in countRadixAggregateCounts): NUM_BUFFERS * MAX_WARPS * RADIX_SIZE.
+  // HIP workgroups have at most 1024 threads. Warp size is at least 32 (can be 64 on some
+  // architectures), so we use 32 for safety: 2 buffers * (1024/32) warps * 4 radix bins = 256.
+  __shared__ int smem[256];
   __shared__ int writeIndexStart; // index to track where to write results. This is shared by all threads in the block. Increases atomically.
 
   IndexType slice = getLinearBlockId<IndexType>();

From e0fbdce0b47d00092fd9c84d9e9c3fecba998a93 Mon Sep 17 00:00:00 2001
From: Arash Pakbin <arash.pakbin@amd.com>
Date: Tue, 24 Feb 2026 16:54:35 +0000
Subject: [PATCH 61/87] [ROCm] RadixSelect: Remove loop padding and make
 prefetching conditional (#174897)

**Summary:**
This PR optimizes the `radixSelect` kernel on ROCm by removing unnecessary loop padding and making prefetching conditional. The previous implementation padded loop bounds to ensure all threads participate in warp-level operations, which added overhead. This does not seem necessary as other parts in PyTorch have not been doing it (see [example](https://github.com/pytorch/pytorch/blob/c8062c4fe279e840407ebf9e2457573498bee464/aten/src/ATen/native/cuda/TensorTopK.cu#L120)). Additionally, prefetching was always enabled even when accessing shared memory, where it provides no benefit and hurts performance by adding long dependency chains within the loop. This PR makes prefetching a compile-time conditional feature and removes the padding overhead.

**Previous Implementation:**
- Loop bounds were padded: `i < round_up(loopBound, warpSize)`
- Work was guarded: `if (i < loopBound) { ... }`
- Prefetching in `countRadixLoop` was always enabled, even for shared memory access

**Changes:**

* **Removed loop padding:**
  - Changed loop bounds from `round_up(loopBound, warpSize)` to `loopBound`
  - Removed `if (i < loopBound)` guards since padding is no longer needed

* **Conditional prefetching:**
  - Added `bool prefetch` template parameter to `countRadixLoop` function
  - Prefetching is enabled (`prefetch = true`) only for global memory access
  - Prefetching is disabled (`prefetch = false`) for shared memory access, where it hurts performance
  - Uses `if constexpr` for compile-time optimization, ensuring zero runtime overhead

**Performance:**
Measured on AMD MI350 (gfx950) using single-block TopK operator.
- **Overall average improvement:** ~2.4% across all tested configurations
- **By data type:**
  - float16: ~3.5% average improvement (best gains)
  - bfloat16: ~2.0% average improvement
  - float32: ~1.6% average improvement
- **By input size:**
  - Small inputs (10K and 100K elements): ~4.6% improvement
  - Medium inputs (1M elements): ~2.4% improvement
  - Large inputs (10M and 100M elements): ~0.2% improvement
- The optimization provides the greatest benefit on smaller inputs and half-precision types, where the overhead of padding and unnecessary prefetching has a larger relative impact. Although some regressions occur (primarily on float32 inputs), the overall impact remains positive across all data types, with float32 still achieving a 1.6% average improvement.

<img width="2311" height="1537" alt="topk_latency_comparison" src="https://github.com/user-attachments/assets/7210a6f7-da51-4bb5-acc3-ce60257ff6e9" />

**Testing:**
- Verified correctness across multiple data types (float32, float16, bfloat16) and input shapes
- Tested with various K values and input sizes to ensure correct behavior
- Confirmed that warp-level operations (`WARP_BALLOT`) work correctly without padding

**Benchmark code:**
See [benchmark.py](https://github.com/user-attachments/files/24484540/benchmark.py) for the performance measurement script.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/174897
Approved by: https://github.com/jerrymannil, https://github.com/jeffdaily
---
 .../ATen/native/cuda/SortingRadixSelect.cuh   | 77 ++++++++++---------
 1 file changed, 40 insertions(+), 37 deletions(-)

diff --git a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh
index a82cff9f227d6..afde242cc98b7 100644
--- a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh
+++ b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh
@@ -318,6 +318,7 @@ template <
     typename CountType,
     int RadixSize,
     int RadixBits,
+    bool prefetch,
     typename DataAccessor>
 __device__ __forceinline__ void countRadixLoop(
     CountType counts[RadixSize], // counts[i] will be the number of matching
@@ -395,32 +396,38 @@ __device__ __forceinline__ void countRadixLoop(
 
   // phase 2: processing 1 element at an iteration.
 
-  // prefetching. This is specifically useful for global memory access.
-  scalar_t v = unroll_segment + threadIdx.x < loopBound
-      ? getData(unroll_segment + threadIdx.x)
-      : static_cast<scalar_t>(0);
-  // we pad loopbound to round_up(loopbound, warpSize) to make sure all threads
-  // in the warp participate in the ballot.
+  // prefetching pattern if prefetch is true.
+  // prefetching pattern is only useful for global memory access.
+  scalar_t v_curr;
+  if constexpr (prefetch) {
+    v_curr = unroll_segment + threadIdx.x < loopBound
+        ? getData(unroll_segment + threadIdx.x)
+        : static_cast<scalar_t>(0);
+  }
   for (index_t i = unroll_segment + threadIdx.x;
-       i < round_up(
-               static_cast<index_t>(loopBound), static_cast<index_t>(warpSize));
+       i < loopBound;
        i += blockDim.x) {
-    // prefetch the next element.
-    scalar_t v_next = i + blockDim.x < loopBound ? getData(i + blockDim.x)
-                                                 : static_cast<scalar_t>(0);
-
-    bool hasVal = false;
-    bitwise_t digitInRadix = static_cast<bitwise_t>(0);
-    if (i < loopBound) {
-      bitwise_t val = TopKTypeConfig<scalar_t>::convert(v);
-      // check if bit pattern matches the pattern we have already discovered for
-      // topk value v.
-      hasVal = ((val & desiredMask) == desired);
-      // get the bits [radixDigitPos, radixDigitPos+RADIX_BITS-1] of the value
-      // v.
-      digitInRadix = at::cuda::Bitfield<bitwise_t>::getBitfield(
-          val, radixDigitPos, RadixBits);
-    }
+        scalar_t v_local; // the current element.
+        scalar_t v_next; // the next element. Used for prefetching.
+
+        if constexpr (prefetch) {
+          // prefetch the next element.
+          v_local = v_curr;
+          v_next = i + blockDim.x < loopBound ? getData(i + blockDim.x)
+                                              : static_cast<scalar_t>(0);
+        }
+        else {
+          v_local = getData(i); // if no prefetching, just get the current element.
+        }
+
+        bitwise_t val = TopKTypeConfig<scalar_t>::convert(v_local);
+        // check if bit pattern matches the pattern we have already discovered for
+        // topk value v.
+        bool hasVal = ((val & desiredMask) == desired);
+        // get the bits [radixDigitPos, radixDigitPos+RADIX_BITS-1] of the value
+        // v.
+        bitwise_t digitInRadix = at::cuda::Bitfield<bitwise_t>::getBitfield(
+            val, radixDigitPos, RadixBits);
 
 // counting across the warp.
 #pragma unroll
@@ -432,7 +439,9 @@ __device__ __forceinline__ void countRadixLoop(
       counts[j] += __popcll(WARP_BALLOT(vote));
     }
 
-    v = v_next; // closing the prefetching loop.
+    if constexpr (prefetch) {
+      v_curr = v_next; // closing the prefetching loop.
+    }
   }
 }
 
@@ -556,7 +565,7 @@ __device__ void countRadixUsingMaskDataSmem(
   // current warp.
   if (dataSmemSize >
       0) { // if shared memory is filled, use dataSmem as the input data.
-    countRadixLoop<scalar_t, bitwise_t, index_t, int, RadixSize, RadixBits>(
+    countRadixLoop<scalar_t, bitwise_t, index_t, int, RadixSize, RadixBits, /*prefetch =*/ false>(
         counts,
         desired,
         desiredMask,
@@ -564,7 +573,7 @@ __device__ void countRadixUsingMaskDataSmem(
         dataSmemSize,
         [&](index_t i) -> scalar_t { return dataSmem[i]; });
   } else { // if shared memory is not filled, fall back to global memory.
-    countRadixLoop<scalar_t, bitwise_t, index_t, int, RadixSize, RadixBits>(
+    countRadixLoop<scalar_t, bitwise_t, index_t, int, RadixSize, RadixBits, /*prefetch =*/ true>(
         counts,
         desired,
         desiredMask,
@@ -787,21 +796,15 @@ __device__ __forceinline__ void fillDataSmem(
     scalar_t v = threadIdx.x < sliceSize
         ? doLdg(&data[threadIdx.x * withinSliceStride])
         : static_cast<scalar_t>(0);
-    // we pad sliceSize to round_up(sliceSize, warpSize) to make sure all
-    // threads in the warp participate in the ballot.
-    for (index_t i = threadIdx.x; i <
-         round_up(static_cast<index_t>(sliceSize),
-                  static_cast<index_t>(warpSize));
+
+    for (index_t i = threadIdx.x; i < sliceSize;
          i += blockDim.x) {
       scalar_t v_next = (i + blockDim.x) < sliceSize
           ? doLdg(&data[(i + blockDim.x) * withinSliceStride])
           : static_cast<scalar_t>(0);
 
-      bool match = false;
-      if (i < sliceSize) {
-        match =
-            ((TopKTypeConfig<scalar_t>::convert(v) & desiredMask) == desired);
-      }
+      bool match =
+          (TopKTypeConfig<scalar_t>::convert(v) & desiredMask) == desired;
 
       // Warp-level ballot
       uint64_t ballot = WARP_BALLOT(

From ccba6b0ee03e1b194bc3ffd0fd4d5a4a2a55a843 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenwang@meta.com>
Date: Thu, 12 Mar 2026 02:59:57 +0000
Subject: [PATCH 62/87] [ROCm] fix radixselect (#177149)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
https://github.com/pytorch/pytorch/pull/174837 (D94770109) introduced a race condition.
## symptom
When running in a reference service under high QPS, the service crashes with this error: `HSA_STATUS_ERROR_EXCEPTION: An HSAIL operation resulted in a hardware exception. code: 0x1016`

## The Race

After countRadixAggregateCounts Stage 3 (line 514):
  counts[i] = smem[buffer_offset + i];   // ALL threads read, buffer_offset could be 0
  // NO __syncthreads() — function returns immediately

Then in the radixSelect main loop:
  buffer_index ^= 1;                      // line 966, toggle
  // ... bucket loop evaluates counts[] ...
  found_unique(i, count) fires →
    findPatternDataSmem((scalar_t*)smem)   // line 1014
      smem[0] = (scalar_t)0;              // line 689 — WRITE to smem[0]
      smem[1] = (scalar_t)0;              // line 690 — WRITE to smem[1]
      __syncthreads();                     // line 693

When buffer_index = 0 was used for counting, Stage 3 reads from smem[0..3]. Then findPatternDataSmem writes smem[0] and smem[1] (cast to scalar_t*, same physical memory). There is no __syncthreads()
 between these reads and writes.

Since warps execute independently, warp 0 (containing thread 0) can reach line 689 and write smem[0] while a lagging warp is still at line 514 reading smem[0].

## Why This Can Be Dangerous

The corruption writes scalar_t(0) (bit pattern 0x00000000 for float) over the int count values. If the lagging warp reads a corrupted counts[0] or counts[1] (now 0 instead of the real count):

1. Divergent control flow: The lagging warp's found_unique(i, count) check uses the corrupted count. If the corrupted bucket was the one that should have triggered found_unique (count was 1, now reads as 0), that warp skips it and falls through the bucket loop without returning.
2. Mismatched __syncthreads(): Warp 0 is inside findPatternLoop which has __syncthreads() at lines 646 and 652. The lagging warp is NOT in findPatternLoop — it continued to the next digitPos iteration, hitting __syncthreads() inside fillDataSmem or countRadixAggregateCounts. Divergent __syncthreads() is undefined behavior on GPUs and can cause hangs or crashes.
3. Incorrect kth_value: Even if it doesn't hang, the lagging warp may compute a different kth_value entirely. When the kernel later uses this for the gather phases, it could:
    - Collect the wrong number of elements → CUDA_KERNEL_ASSERT(write_index < k) → ASSERT_TRAP (exactly matching the error in D94938108's description)
    - Or produce silently incorrect TopK results

## When Can This Happen?

  ┌───────────────────────────────────────────────┬───────────────────────────────────────────────────────────────────────────┐
  │                   Condition                   │                                Likelihood                                 │
  ├───────────────────────────────────────────────┼───────────────────────────────────────────────────────────────────────────┤
  │ buffer_index = 0 during final counting        │ 50% per radixSelect call                                                  │
  ├───────────────────────────────────────────────┼───────────────────────────────────────────────────────────────────────────┤
  │ found_unique fires (count == 1, kToFind == 1) │ Common — happens in final radix iterations                                │
  ├───────────────────────────────────────────────┼───────────────────────────────────────────────────────────────────────────┤
  │ Sufficient warp skew (~100+ instructions)     │ Rare normally, but increases under GPU memory pressure / high utilization │
  └───────────────────────────────────────────────┴───────────────────────────────────────────────────────────────────────────┘

Test Plan:
Hard to reproduce in a single test case.
When run in a service under high QPS, it crashes once the QPS becomes high enough.

with this fix, the job can succeed - https://www.internalfb.com/vanguard/serving_test_cases/909207988628161

Differential Revision: D96015407

Pull Request resolved: https://github.com/pytorch/pytorch/pull/177149
Approved by: https://github.com/jeanschmidt
---
 aten/src/ATen/native/cuda/SortingRadixSelect.cuh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh
index afde242cc98b7..faaf1b06d7989 100644
--- a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh
+++ b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh
@@ -513,6 +513,7 @@ __device__ __forceinline__ void countRadixAggregateCounts(
   for (uint32_t i = 0; i < RadixSize; ++i) {
     counts[i] = smem[buffer_offset + i];
   }
+  __syncthreads(); // Wait for all threads to finish reading the final counts.
 }
 
 // This function counts the distribution of all input values in a

From 7a10d22bd2662cd9113bb2cf88166875464035ff Mon Sep 17 00:00:00 2001
From: Arash Pakbin <arash.pakbin@amd.com>
Date: Tue, 31 Mar 2026 12:25:41 +0000
Subject: [PATCH 63/87] [ROCm] Reduce RadixSelect sync overhead by moving
 __syncthreads to findPatternDataSmem (#178188)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

PR #177149 fixed a race condition introduced by #174837: after `countRadixAggregateCounts` Stage 3 reads counts from smem, warp 0 may get ahead of lagging warps still in Stage 3 and call `findPatternDataSmem`, overwriting `smem[0]`/`smem[1]` while lagging warps are still reading `smem[buffer_offset + i]` (which overlaps with `smem[0]`/`smem[1]` when `buffer_offset == 0`). The fix placed a `__syncthreads()` at the end of Stage 3, which runs on every iteration of the radix digit loop, reintroducing part of the synchronization overhead that #174837 worked to eliminate.

This patch moves that sync to the **beginning of `findPatternDataSmem`** instead.

## Why this is correct

1. All threads evaluate the same `counts[]` values and all reach `found_unique()` together, so `__syncthreads()` inside `findPatternDataSmem` is collectively reachable by all threads in the block.
2. By the time any thread enters `findPatternDataSmem`, every thread has already finished reading Stage 3 (they all had to evaluate the bucket loop to get here), so syncing before the `smem[0]`/`smem[1]` writes is sufficient to prevent the race.

## Performance

`findPatternDataSmem` is called **at most once** per `radixSelect` invocation — only when `count == 1` (a unique element is identified), at which point the function returns immediately. The removed sync ran on every radix digit iteration (up to 16 times for float32). This saves up to 15 `__syncthreads()` calls in the common case.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/178188
Approved by: https://github.com/jeffdaily, https://github.com/jeanschmidt
---
 aten/src/ATen/native/cuda/SortingRadixSelect.cuh | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh
index faaf1b06d7989..c6e5f299a0bb3 100644
--- a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh
+++ b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh
@@ -513,7 +513,6 @@ __device__ __forceinline__ void countRadixAggregateCounts(
   for (uint32_t i = 0; i < RadixSize; ++i) {
     counts[i] = smem[buffer_offset + i];
   }
-  __syncthreads(); // Wait for all threads to finish reading the final counts.
 }
 
 // This function counts the distribution of all input values in a
@@ -684,6 +683,15 @@ __device__ scalar_t findPatternDataSmem(
     const scalar_t* dataSmem, // input data stored in shared memory.
     index_t dataSmemSize) { // input data size stored in shared memory.
 
+  // Ensure all threads have finished reading from smem before overwriting it.
+  // countRadixAggregateCounts Stage 3 reads from smem[buffer_offset + i];
+  // when buffer_offset == 0, those locations overlap with smem[0]/smem[1]
+  // written below. Warp 0 (which writes smem[0]/smem[1]) may get ahead of
+  // lagging warps still in Stage 3. Syncing here (rather than at the end of
+  // Stage 3) is cheaper because findPatternDataSmem is called at most once per
+  // radixSelect invocation, only when a unique element is found (count == 1).
+  __syncthreads();
+
   // initialize smem to 0.
   // smem[0] is a flag to indicate if a value has been found.
   // smem[1] is the found value.

From f5a3aa1b70625cfa410b8f37bbe06bde199ca46d Mon Sep 17 00:00:00 2001
From: Ethan Wee <Ethan.Wee@amd.com>
Date: Fri, 3 Apr 2026 07:14:29 -0700
Subject: [PATCH 64/87] [release/2.11] Use triton with updated pyproject.toml
 to use cmake 4 (#3124)

Fixes internal CI build failures on release/2.11 due to triton build.
Build was able to pass the point where triton failed previously. e.g.
https://ml-ci-internal.amd.com/blue/organizations/jenkins/pytorch%2Fpytorch-ci-pipeline/detail/release%2F2.10/31/pipeline
With our change to triton pin:
https://ml-ci-internal.amd.com/job/pytorch/job/pytorch-ci-pipeline/job/PR-3124/2/pipeline-overview/
---
 .ci/docker/ci_commit_pins/triton.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt
index 0a2a5f707f24f..f0849cc7d8f63 100644
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@@ -1 +1 @@
-b31789602ee0e40b06a1fbc6e63dfae6df7e131d
+4ed888920c5a0871957f1cf912e557bc79fbe56c

From a5c71cc3527efe43e3b3914ef3c2223be444f5b5 Mon Sep 17 00:00:00 2001
From: Xinya Zhang <Xinya.Zhang@amd.com>
Date: Fri, 3 Apr 2026 11:57:51 -0500
Subject: [PATCH 65/87] [release/2.11] [ROCm] Fix
 test/dynamo/test_repros.py::ReproTestsDeviceCUDA::test_flash_attn_backward_mixed_strides_cuda#179086
 (#3127)

`dv` tensor should be created with `empty_like(v)` rather than
`empty_like(k)`.

This fixes #168540, #168541, and supersedes #178499

This is cherry-picked from upstream PR
https://github.com/pytorch/pytorch/pull/179086
---
 .../ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip | 2 +-
 test/dynamo/test_repros.py                                      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip b/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip
index b96e80d5e5a9e..e809f23e61def 100644
--- a/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip
+++ b/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip
@@ -635,7 +635,7 @@ mha_bwd_aot(const at::Tensor &dout,  // batch_size x seqlen_q x num_heads, x hea
     TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension");
     CHECK_SHAPE(dv, batch_size, seqlen_k, num_heads_k, head_size);
   } else {
-    dv = at::empty_like(k);
+    dv = at::empty_like(v);
   }
 
   auto [needs_swa, window_left, window_right] = calculate_swa(window_size_left,
diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 65881c21b93c6..747fc7a03308f 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -7990,7 +7990,7 @@ def f():
 
     @skipIfHpu
     @unittest.skipIf(
-        TEST_WITH_ROCM or not PLATFORM_SUPPORTS_FLASH_ATTENTION,
+        not PLATFORM_SUPPORTS_FLASH_ATTENTION,
         "flash attention not supported",
     )
     def test_flash_attn_backward_mixed_strides(self, device):

From 8f4963dd3b6edc4f0e710cf4bdc2bd19fdc7a8a0 Mon Sep 17 00:00:00 2001
From: Ethan Wee <Ethan.Wee@amd.com>
Date: Fri, 3 Apr 2026 12:31:59 -0700
Subject: [PATCH 66/87] [release/2.11][CI] Add related_commits file (#3131)

Build validation:
http://rocm-ci.amd.com/job/pytorch2.11-manylinux-wheels_rel-7.2/7/ :
Connection issues

https://github.com/ROCm/TheRock/actions/runs/23953043418/job/69864879059
: Build succeeded

---------

Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
---
 related_commits | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 related_commits

diff --git a/related_commits b/related_commits
new file mode 100644
index 0000000000000..cbc1b48c2931b
--- /dev/null
+++ b/related_commits
@@ -0,0 +1,10 @@
+ubuntu|pytorch|apex|release/1.11.0|4fe55b966de2458e4591bed2b0c0f990ffcca683|https://github.com/ROCm/apex
+centos|pytorch|apex|release/1.11.0|4fe55b966de2458e4591bed2b0c0f990ffcca683|https://github.com/ROCm/apex
+ubuntu|pytorch|torchvision|release/0.26|336d36e8db990a905498c73933e35231876e28bc|https://github.com/pytorch/vision
+centos|pytorch|torchvision|release/0.26|336d36e8db990a905498c73933e35231876e28bc|https://github.com/pytorch/vision
+ubuntu|pytorch|torchdata|release/0.11|377e64c1be69a9be6649d14c9e3664070323e464|https://github.com/pytorch/data
+centos|pytorch|torchdata|release/0.11|377e64c1be69a9be6649d14c9e3664070323e464|https://github.com/pytorch/data
+ubuntu|pytorch|torchaudio|release/2.11|34c52a67e8941bbd8e6adaca0eb0b9eabec11d78|https://github.com/pytorch/audio
+centos|pytorch|torchaudio|release/2.11|34c52a67e8941bbd8e6adaca0eb0b9eabec11d78|https://github.com/pytorch/audio
+ubuntu|pytorch|ao|release/0.17.0|afb2844be99514f0d5ff42badd9c3ed0d1811d73|https://github.com/pytorch/ao
+centos|pytorch|ao|release/0.17.0|afb2844be99514f0d5ff42badd9c3ed0d1811d73|https://github.com/pytorch/ao

From 4e323059f79a8dd73a4770b9d6f9f234f865e64e Mon Sep 17 00:00:00 2001
From: "Nichols A. Romero" <nick.romero@amd.com>
Date: Tue, 24 Mar 2026 23:05:41 +0000
Subject: [PATCH 67/87] [ROCm] Require rocm_smi package (#175648)

Fixes #158725

This is essentially @AngryLoki's patch:
https://github.com/gentoo/gentoo/blob/8cdbe88fa388ce264d1d70047222fcad190fec3d/sci-ml/caffe2/files/caffe2-2.9.0-rocm-distributed-link.patch

Pull Request resolved: https://github.com/pytorch/pytorch/pull/175648
Approved by: https://github.com/jeffdaily, https://github.com/mlazos

(cherry picked from commit 9bff6e149a649234c146fdae8058fb035bfb43b7)
---
 cmake/Dependencies.cmake   | 7 +++++++
 cmake/public/LoadHIP.cmake | 1 +
 2 files changed, 8 insertions(+)

diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index dac0d1f41c3bd..203cdc7c029db 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1089,6 +1089,13 @@ if(USE_ROCM)
       )
     endif()
 
+    # ROCM-SMI needed to support symmetric memory
+    if(USE_DISTRIBUTED AND UNIX)
+      list(APPEND Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS
+        rocm_smi64
+      )
+    endif()
+
     # ---[ Kernel asserts
     # Kernel asserts is disabled for ROCm by default.
     # It can be turned on by turning on the env USE_ROCM_KERNEL_ASSERT to the build system.
diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
index 3ff7b3d2c1b36..78b8acfe9db9a 100644
--- a/cmake/public/LoadHIP.cmake
+++ b/cmake/public/LoadHIP.cmake
@@ -198,6 +198,7 @@ if(HIP_FOUND)
   if(UNIX)
     find_package_and_print_version(rccl)
     find_package_and_print_version(hsa-runtime64 REQUIRED)
+    find_package_and_print_version(rocm_smi REQUIRED)
   endif()
 
   # Optional components.

From 0446f7ba2fdcc4ffd2921949bf20f86d79677544 Mon Sep 17 00:00:00 2001
From: Yanyao Wang <yanywang@amd.com>
Date: Thu, 9 Apr 2026 11:42:50 -0500
Subject: [PATCH 68/87] [release/2.11] Fix numpy compatibility for Python 3.14
 (#3100) (#3143)

## Motivation

Fix numpy compatibility for Python 3.14 for release/2.11

## Technical Details

- `numpy==2.1.2` has no cp314 wheels on PyPI, causing Python 3.14 builds
in TheRock CI to fail with a meson/sccache error when pip falls back to
building numpy from source
- Add `python_version` markers to use `numpy==2.4.3` for Python 3.14+,
while keeping the existing `numpy==2.1.2` pin for older Python versions

## Submission Checklist

- [x] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.

Co-authored-by: Subodh Dubey <Subodh.Dubey@amd.com>
---
 requirements-build.txt | 3 ++-
 requirements.txt       | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/requirements-build.txt b/requirements-build.txt
index 88a80dfaf1b30..7ca3c2cb1cb9c 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -5,7 +5,8 @@ setuptools==79.0.1
 cmake==4.0.0
 ninja==1.11.1.4
 numpy==2.0.2 ; python_version == "3.9"
-numpy==2.1.2 ; python_version > "3.9"
+numpy==2.1.2 ; python_version > "3.9" and python_version < "3.14"
+numpy==2.4.3 ; python_version >= "3.14"
 packaging==25.0
 pyyaml==6.0.3
 requests==2.32.5
diff --git a/requirements.txt b/requirements.txt
index f8b6ebfd25ce1..ceb41d722e320 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,7 +14,8 @@ lintrunner==0.12.11 ; platform_machine != "s390x"
 networkx==2.8.8
 ninja==1.11.1.4
 numpy==2.0.2 ; python_version == "3.9"
-numpy==2.1.2 ; python_version > "3.9"
+numpy==2.1.2 ; python_version > "3.9" and python_version < "3.14"
+numpy==2.4.3 ; python_version >= "3.14"
 optree==0.13.0 ; python_version < "3.14"
 optree==0.17.0 ; python_version >= "3.14"
 psutil==7.2.2

From 8543095e3275db694084a6679bd5b61f7d2ece76 Mon Sep 17 00:00:00 2001
From: Ken <kechong@amd.com>
Date: Thu, 16 Apr 2026 11:38:05 -0500
Subject: [PATCH 69/87] [ROCm][CI][release/2.11] Backport checking existence of
 /etc/rocm_env.sh before sourcing  (#3163)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

Fixes the `pytorch_ut` failure introduced in PyTorch 2.11 where
`test.sh` exits immediately with code 1 before any tests run.

**Root cause:** PR pytorch/pytorch#168377 added `source
/etc/rocm_env.sh` to `.ci/pytorch/common.sh` targeting AMD's internal
Jenkins CI, which provisions this file. When cherry-picked into
`release/2.11`, this line breaks all TheRock Docker-based CI
environments that do **not** provision `/etc/rocm_env.sh`. Since `set
-e` is active in `test.sh`, the script exits before a single test runs —
causing 0-pass, 1-fail on every host.

**The fix:** Add a `[[ -f /etc/rocm_env.sh ]]` existence check so
environments without the file skip sourcing it gracefully, while Jenkins
CI (which does provision the file) continues working as before. This
matches the fix already present on `pytorch/pytorch main`.

```bash
# Before (broken):
if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
  source /etc/rocm_env.sh
fi

# After (fixed):
if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]] && [[ -f /etc/rocm_env.sh ]]; then
  source /etc/rocm_env.sh
fi
```

**Impact without this fix:**
- 86/97 `pytorch_ut` runs failed on TheRock build 7.13.0-1208
- Affects all GFX variants and Python versions (3.11, 3.12, 3.13)
- PyTorch 2.10 is unaffected (does not have `source /etc/rocm_env.sh`)

**References:**
- Jira: ROCM-21809
- Upstream issue: pytorch/pytorch#170983
- Regression introduced by: pytorch/pytorch#168377
---
 .ci/pytorch/common.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/pytorch/common.sh b/.ci/pytorch/common.sh
index 072b8da9b10c6..eae12816fe71e 100644
--- a/.ci/pytorch/common.sh
+++ b/.ci/pytorch/common.sh
@@ -6,7 +6,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
 set -ex -o pipefail
 
 # for ROCm environment variables
-if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
+if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]] && [[ -f /etc/rocm_env.sh ]]; then
   # shellcheck disable=SC1091
   source /etc/rocm_env.sh
 fi

From 520641b7cdcabd7dca4c3301fd054c5948c7ffae Mon Sep 17 00:00:00 2001
From: Jeff Daily <jeff.daily@amd.com>
Date: Mon, 20 Apr 2026 09:45:04 -0700
Subject: [PATCH 70/87] [release/2.11] Fix int4mm device memcpy error on
 Windows (#175410) (#3164)

On Windows with HIP/ROCm, std::memcpy is a __host__ function and cannot
be called from __device__ code. Use raw memcpy (which the HIP compiler
provides as a device builtin) when building on Windows.

This will allow builds of PyTorch for gfx942 on Windows. gfx950 is
yet to be tested, but it should likely build as well.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/175410
Approved by: https://github.com/jeffdaily

Co-authored-by: Aaryaman Vasishta <aaryaman.vasishta@amd.com>
---
 aten/src/ATen/native/cuda/int4mm.cu | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/aten/src/ATen/native/cuda/int4mm.cu b/aten/src/ATen/native/cuda/int4mm.cu
index ca00c944b3259..8765bed83345a 100644
--- a/aten/src/ATen/native/cuda/int4mm.cu
+++ b/aten/src/ATen/native/cuda/int4mm.cu
@@ -576,7 +576,14 @@ struct BLayout_TC_int4 {
           // type pun, the __nv_bfloat162 value in bf16x2x4 is a struct and
           // can't be used as a 32-bit asm register argument for `mma`
           static_assert(sizeof(bf16x2x4) == sizeof(out[0][0]), "");
+          // On Windows with ROCm, std::memcpy resolves to a __host__-only
+          // function and cannot be called from __device__ code. Use the raw
+          // memcpy which the HIP compiler provides as a __device__ builtin.
+#if defined(_WIN32) && defined(USE_ROCM)
+          memcpy(&out[i][j], &v, sizeof(bf16x2x4_u32));
+#else
           std::memcpy(&out[i][j], &v, sizeof(bf16x2x4_u32));
+#endif
         }
       }
     }

From 141ba657575b42e5d0869002b509af4a75899edc Mon Sep 17 00:00:00 2001
From: "Nichols A. Romero" <165712832+naromero77amd@users.noreply.github.com>
Date: Tue, 21 Apr 2026 10:06:00 -0500
Subject: [PATCH 71/87] [UP][release/2.11] [ROCm][TunableOp] Support FP64 on
 hipBLASLt (#178195) (#3169)

Cherry-pick of upstream https://github.com/pytorch/pytorch/pull/178195
into `release/2.11`.

Related PR:
- https://github.com/ROCm/pytorch/pull/3168

## Motivation

For MI350, FP64 is supported in hipBLASLt. This PR enables FP64 on
hipBLASLt in TunableOp and re-enables the FP64 unit test on MI350.

## Technical Details

- Map `double` GEMM to `HIPBLAS_COMPUTE_64F` via a new
`HipBlasComputeTypeFor<CT>()` helper (defaults to `HIPBLAS_COMPUTE_32F`,
specialized to `HIPBLAS_COMPUTE_64F` for `double`).
- Use `at::opmath_type<T>`-typed `alpha` / `beta` in the hipBLASLt path
so FP64 tuning and execution use consistent compute semantics.
- Set the matmul descriptor scale type with
`HipDataTypeFor<opmath_t>()`.
- Guard the TF32 override with `if constexpr (std::is_same_v<CT,
float>)` so FP64 doesn't get downgraded.
- Removes the MI350 skip on
`test_matmul_small_brute_force_tunableop_cuda_float64`.

The cherry-pick applied cleanly (no conflicts).

## Test Plan

Build PyTorch on MI350 with ROCm, then run:

```
PYTORCH_TEST_WITH_ROCM=1 python test/test_linalg.py -v -k tunableop
```

## Test Result

```
Ran 69 tests in 156.726s

OK (skipped=42)
```

All tunableop tests pass. Skipped tests are CPU-only variants and
gfx942-only variants (FP8/TF32).

Upstream PR: https://github.com/pytorch/pytorch/pull/178195
Upstream commit: 0550897ab3dcb3627dba1cfa43fd238fa4358418

Made with [Cursor](https://cursor.com)
---
 aten/src/ATen/cuda/tunable/GemmHipblaslt.h | 56 ++++++++++++++--------
 test/test_linalg.py                        |  3 --
 2 files changed, 36 insertions(+), 23 deletions(-)

diff --git a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h
index 29affa2d21ff1..29c15720f4a66 100644
--- a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h
+++ b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h
@@ -94,6 +94,16 @@ constexpr hipDataType HipDataTypeFor<c10::Float4_e2m1fn_x2>() {
 #endif
 }
 
+template <typename T>
+constexpr hipblasComputeType_t HipBlasComputeTypeFor() {
+  return HIPBLAS_COMPUTE_32F;
+}
+
+template <>
+constexpr hipblasComputeType_t HipBlasComputeTypeFor<double>() {
+  return HIPBLAS_COMPUTE_64F;
+}
+
 template <typename T>
 int GetBatchFromParams(const GemmParams<T>* params) {
   return 1;
@@ -175,43 +185,43 @@ int GetStrideCFromParams(const ScaledGemmParams<T>* params) {
 }
 
 template <typename T>
-float GetAlphaFromParams(const GemmParams<T>* params) {
+at::opmath_type<T> GetAlphaFromParams(const GemmParams<T>* params) {
   return params->alpha;
 }
 
 template <typename T>
-float GetAlphaFromParams(const GemmAndBiasParams<T>* params) {
+at::opmath_type<T> GetAlphaFromParams(const GemmAndBiasParams<T>* params) {
   return params->alpha;
 }
 
 template <typename T>
-float GetAlphaFromParams(const GemmStridedBatchedParams<T>* params) {
+at::opmath_type<T> GetAlphaFromParams(const GemmStridedBatchedParams<T>* params) {
   return params->alpha;
 }
 
 template <typename T>
-float GetAlphaFromParams(const ScaledGemmParams<T>* params) {
-  return 1.0;
+at::opmath_type<T> GetAlphaFromParams(const ScaledGemmParams<T>* params) {
+  return at::opmath_type<T>{1.0};
 }
 
 template <typename T>
-float GetBetaFromParams(const GemmParams<T>* params) {
+at::opmath_type<T> GetBetaFromParams(const GemmParams<T>* params) {
   return params->beta;
 }
 
 template <typename T>
-float GetBetaFromParams(const GemmAndBiasParams<T>* params) {
-  return 0.0;
+at::opmath_type<T> GetBetaFromParams(const GemmAndBiasParams<T>* params) {
+  return at::opmath_type<T>{0.0};
 }
 
 template <typename T>
-float GetBetaFromParams(const GemmStridedBatchedParams<T>* params) {
+at::opmath_type<T> GetBetaFromParams(const GemmStridedBatchedParams<T>* params) {
   return params->beta;
 }
 
 template <typename T>
-float GetBetaFromParams(const ScaledGemmParams<T>* params) {
-  return 0.0;
+at::opmath_type<T> GetBetaFromParams(const ScaledGemmParams<T>* params) {
+  return at::opmath_type<T>{0.0};
 }
 
 template <typename T>
@@ -467,8 +477,9 @@ class HipblasltGemmOp : public Callable<ParamsT> {
 
       TORCH_CHECK(transa_outer == opa && transb_outer == opb, "trans mismatch, shouldn't happen");
 
-      float alpha = GetAlphaFromParams<CT>(params);
-      float beta = GetBetaFromParams<CT>(params);
+      using opmath_t = at::opmath_type<CT>;
+      opmath_t alpha = GetAlphaFromParams<CT>(params);
+      opmath_t beta = GetBetaFromParams<CT>(params);
 
       hipblasLtMatrixLayout_t mat_a, mat_b, mat_c;
       if (opa == HIPBLAS_OP_N) {
@@ -505,11 +516,14 @@ class HipblasltGemmOp : public Callable<ParamsT> {
             mat_c, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_c, sizeof(stride_c)));
       }
 
-      hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F;
-      if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) {
-        computeType = HIPBLAS_COMPUTE_32F_FAST_TF32;
+      hipblasComputeType_t computeType = HipBlasComputeTypeFor<CT>();
+      if constexpr (std::is_same_v<CT, float>) {
+        if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) {
+          computeType = HIPBLAS_COMPUTE_32F_FAST_TF32;
+        }
       }
-      HipBlasLtMatmulDescriptor matmul(computeType, HIP_R_32F);
+      auto scale_type = HipDataTypeFor<opmath_t>();
+      HipBlasLtMatmulDescriptor matmul(computeType, scale_type);
       matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSA, opa);
       matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSB, opb);
 
@@ -630,9 +644,11 @@ auto GetHipBlasLtTypeStringAndOps() {
   }
 #endif
 
-  hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F;
-  if (at::globalContext().allowTF32CuBLAS()) {
-    computeType = HIPBLAS_COMPUTE_32F_FAST_TF32;
+  hipblasComputeType_t computeType = HipBlasComputeTypeFor<CT>();
+  if constexpr (std::is_same_v<CT, float>) {
+    if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) {
+      computeType = HIPBLAS_COMPUTE_32F_FAST_TF32;
+    }
   }
 
   hipblasLtHandle_t handle;
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 346a6c0204479..045da268fd99c 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -5116,9 +5116,6 @@ def test_matmul_small_brute_force_tunableop(self, device, dtype):
         # We set the TunableOp numerical check environment variable here because it is
         # possible to hit some invalid numerical solutions due to the small matrix sizes.
 
-        if torch.version.hip and isRocmArchAnyOf(MI350_ARCH) and dtype is torch.double:
-            self.skipTest("Currently hangs on rocm mi350")
-
         with self._tunableop_ctx():
             torch.cuda.tunable.set_rotating_buffer_size(0)
             # Numerical check adds significant overhead, unsure if this is needed

From 50bfde7c08dc92b69b71d2b76d3b2d3709cf28f6 Mon Sep 17 00:00:00 2001
From: "Nichols A. Romero" <165712832+naromero77amd@users.noreply.github.com>
Date: Wed, 22 Apr 2026 11:10:27 -0500
Subject: [PATCH 72/87] [release/2.11][ROCm][inductor] Additional GEMM,
 pointwise and reduction configs. (#3145)

New Inductor configs in support of a customer request. See
https://amd-hub.atlassian.net/browse/AIPYTORCH-373
---
 torch/_inductor/runtime/triton_heuristics.py  | 7 +++++++
 torch/_inductor/template_heuristics/triton.py | 3 +++
 2 files changed, 10 insertions(+)

diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py
index 8108636663b90..16bc308839b99 100644
--- a/torch/_inductor/runtime/triton_heuristics.py
+++ b/torch/_inductor/runtime/triton_heuristics.py
@@ -2897,6 +2897,12 @@ def pointwise(
                                 num_stages=2,
                                 waves_per_eu=1,  # 20% improvement
                             ),
+                            triton_config_with_settings(
+                                size_hints,
+                                512,
+                                num_warps=4,
+                                num_stages=4,  # 30% improvement
+                            ),
                         ]
                     )
                 if inductor_meta.get("atomic_add_found"):
@@ -3257,6 +3263,7 @@ def outer_config_opt():
             [
                 make_config(1024, 8, num_warps=4, num_stages=1, waves_per_eu=2),
                 make_config(512, 8, num_warps=4, num_stages=1, waves_per_eu=1),
+                make_config(32, 128, num_warps=1, num_stages=1),  # 30% improvement
             ]
         )
 
diff --git a/torch/_inductor/template_heuristics/triton.py b/torch/_inductor/template_heuristics/triton.py
index 96457817ff5e3..e15ff07ee5a4f 100644
--- a/torch/_inductor/template_heuristics/triton.py
+++ b/torch/_inductor/template_heuristics/triton.py
@@ -1436,6 +1436,9 @@ def __init__(self) -> None:
             ),
             ROCmGemmConfig(256, 128, 32, self.default_num_stages, 8, group_m=16),
             ROCmGemmConfig(256, 128, 64, self.default_num_stages, 8, group_m=4),
+            ROCmGemmConfig(256, 128, 64, self.default_num_stages, 8, group_m=16, matrix_instr_nonkdim=0),
+            ROCmGemmConfig(256, 128, 64, self.default_num_stages, 8, group_m=8, matrix_instr_nonkdim=0),
+            ROCmGemmConfig(128, 128, 64, self.default_num_stages, 8, group_m=4, matrix_instr_nonkdim=0),
             ROCmGemmConfig(256, 256, 64, self.default_num_stages, 8, group_m=4),
         ]
 

From 0320cc5b2fbba866c7ac1aa5deb8c14dd9a37b95 Mon Sep 17 00:00:00 2001
From: sohbodas <144367600+sohbodas@users.noreply.github.com>
Date: Thu, 23 Apr 2026 10:17:51 -0400
Subject: [PATCH 73/87] [release/2.11] Update Numba version constraints to
 support Python 3.14 (#3148)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- This PR updates the Numba version constraints to correctly handle
Python 3.14 and aligns the platform conditions with Numba’s current
support matrix.
- Add a new rule selecting numba==0.64.0 for Python ≥ 3.14

---------

Co-authored-by: sohbodas <Soham.Bodas@gmail.com>
Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
---
 .ci/docker/requirements-ci.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
index 24be093b31e7e..3a97ddf174e2a 100644
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -119,8 +119,8 @@ ninja==1.11.1.4
 
 numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x"
 numba==0.60.0 ; python_version == "3.9" and platform_machine != "s390x"
-numba==0.61.2 ; python_version > "3.9" and platform_machine != "s390x"
-
+numba==0.61.2 ; python_version >= "3.10" and python_version < "3.14" and platform_machine != "s390x"
+numba==0.64.0 ; python_version >= "3.14" and platform_machine != "s390x"
 #Description: Just-In-Time Compiler for Numerical Functions
 #Pinned versions: 0.55.2, 0.60.0
 #test that import: test_numba_integration.py

From 3aaa914af1e6fb268b242bfb871e614fbdb6c1bc Mon Sep 17 00:00:00 2001
From: Jeff Daily <jeff.daily@amd.com>
Date: Fri, 24 Apr 2026 17:53:00 -0700
Subject: [PATCH 74/87] [release/2.11] Fix MIOpen CTC loss crash on Windows
 (#179264) (#3181)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

<h2>Fix MIOpen CTC loss access violation on Windows discrete GPUs</h2>

<h3>Problem</h3>

<p>A unit test started failing on Windows a couple of weeks ago. A
missing <code>#include</code> was added in
[pytorch/pytorch#178284](https://github.com/pytorch/pytorch/pull/178284),
but CI on TheRock kept failing. That fix was tested on gfx1151 (APU),
where the test passed, but CI showed failures on gfx1100.</p>

<p><code>test_CTCLoss_no_batch_dim</code> (and any code path hitting
<code>miopen_ctc_loss</code>) crashes with a fatal access violation on
Windows systems with discrete AMD GPUs:</p>

<pre><code>Windows fatal exception: access violation
Exception Code: 0xC0000005
#0 miopen::CTCLossDescriptor::GetCTCLossWorkspaceSize (MIOpen.dll+0x14fde4)
#1 miopenGetCTCLossWorkspaceSize (MIOpen.dll+0x150912)
#2 at::native::miopen_ctc_loss (torch_hip.dll)
</code></pre>

<h3>Root Cause</h3>

<p><code>miopenGetCTCLossWorkspaceSize</code> and
<code>miopenCTCLoss</code> read the <code>labels</code>,
<code>label_lengths</code>, and <code>input_lengths</code> arrays
<strong>on the host side</strong> to plan the computation and calculate
workspace requirements. The existing code copies these arrays to GPU
memory and passes device pointers:</p>

<pre><code>Tensor labels_gpu = targets_t.to(Device(at::kCUDA), at::kInt);
// ... hipMemcpy to GPU ...
MIOPEN_CHECK(miopenGetCTCLossWorkspaceSize(...,
    labels_gpu.data_ptr&lt;int&gt;(),          // device pointer
    label_lengths_gpu.data_ptr&lt;int&gt;(),   // device pointer
    input_lengths_gpu.data_ptr&lt;int&gt;()    // device pointer
));
</code></pre>

<p>This works on:</p>
<ul>
<li><strong>Linux</strong> — HSA (Heterogeneous System Architecture)
maps GPU allocations into the process virtual address space, making
device pointers host-readable</li> <li><strong>Windows APUs</strong> —
CPU and iGPU share system RAM, so device pointers point to
host-accessible memory</li> </ul>

<p>This crashes on:</p>
<ul>
<li><strong>Windows dGPUs</strong> — GPU has dedicated VRAM across PCIe;
device pointers are opaque handles that cannot be dereferenced from host
code</li> </ul>

<h3>Verification</h3>

<p>Tested on gfx1201:</p>

<table border="1" cellpadding="6" cellspacing="0">
<tr><th>Check</th><th>Result</th></tr>

<tr><td><code>hipDeviceAttributeIntegrated</code></td><td><code>0</code>
(discrete GPU)</td></tr>
<tr><td><code>hipDeviceAttributeCanUseHostPointerForRegisteredMem</code></td><td><code>0</code></td></tr>
<tr><td><code>hipDeviceAttributeManagedMemory</code></td><td><code>0x7FFFFFFF</code>
(unsupported)</td></tr>
<tr><td><code>hipDeviceAttributeUnifiedAddressing</code></td><td><code>0x7FFFFFFF</code>
(unsupported)</td></tr> <tr><td>Host read of <code>hipMalloc</code>
pointer via <code>ctypes</code></td><td>Access violation</td></tr>
<tr><td>CTC loss with CPU pointers</td><td>Pass (forward +
backward)</td></tr> </table>

<h3>Fix</h3>

<p>Pass host pointers for the labels and length arrays, since that is
what MIOpen expects.</p>

<h3>Testing</h3>

<p>Run all existing CTCLoss unit tests.</p>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/179264
Approved by: https://github.com/jeffdaily

Co-authored-by: Milica Stankovic <mstankov@amd.com>
---
 .../src/ATen/native/miopen/LossCTC_miopen.cpp | 33 ++++---------------
 1 file changed, 7 insertions(+), 26 deletions(-)

diff --git a/aten/src/ATen/native/miopen/LossCTC_miopen.cpp b/aten/src/ATen/native/miopen/LossCTC_miopen.cpp
index 21797e7537d59..9c9ee2687aa99 100644
--- a/aten/src/ATen/native/miopen/LossCTC_miopen.cpp
+++ b/aten/src/ATen/native/miopen/LossCTC_miopen.cpp
@@ -206,35 +206,16 @@ std::tuple<Tensor, Tensor> miopen_ctc_loss(
   Tensor costs = at::empty({batch_size}, log_probs->options());
   Tensor grad = at::empty_like(log_probs_t, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
 
-  // MIOpen requires labels and lengths on GPU
-  Tensor labels_gpu = targets_t.to(Device(at::kCUDA), at::kInt);
-  Tensor label_lengths_gpu = at::empty(
-      {static_cast<int64_t>(target_lengths.size())},
-      at::TensorOptions().dtype(at::kInt).device(at::kCUDA));
-  Tensor input_lengths_gpu = at::empty(
-      {static_cast<int64_t>(input_lengths.size())},
-      at::TensorOptions().dtype(at::kInt).device(at::kCUDA));
-
-  C10_CUDA_CHECK(hipMemcpy(
-      label_lengths_gpu.data_ptr<int>(),
-      target_lengths.data(),
-      target_lengths.size() * sizeof(int),
-      hipMemcpyHostToDevice));
-  C10_CUDA_CHECK(hipMemcpy(
-      input_lengths_gpu.data_ptr<int>(),
-      input_lengths.data(),
-      input_lengths.size() * sizeof(int),
-      hipMemcpyHostToDevice));
-
+  // MIOpen reads labels/lengths on the host.
   size_t workspace_size;
   (void)deterministic; // MIOpen only supports deterministic algorithm
   MIOPEN_CHECK(miopenGetCTCLossWorkspaceSize(
       handle,
       probs_desc,
       grads_desc,
-      labels_gpu.data_ptr<int>(),
-      label_lengths_gpu.data_ptr<int>(),
-      input_lengths_gpu.data_ptr<int>(),
+      targets_t.data_ptr<int>(),
+      target_lengths.data(),
+      input_lengths.data(),
       MIOPEN_CTC_LOSS_ALGO_DETERMINISTIC,
       ctc_desc,
       &workspace_size));
@@ -245,9 +226,9 @@ std::tuple<Tensor, Tensor> miopen_ctc_loss(
       handle,
       probs_desc,
       log_probs_t.data_ptr(),
-      labels_gpu.data_ptr<int>(),
-      label_lengths_gpu.data_ptr<int>(),
-      input_lengths_gpu.data_ptr<int>(),
+      targets_t.data_ptr<int>(),
+      target_lengths.data(),
+      input_lengths.data(),
       costs.data_ptr(),
       grads_desc,
       grad.data_ptr(),

From 48211a7882d719c26bbeb9c3cca5c60a936bdc34 Mon Sep 17 00:00:00 2001
From: Tijana Vukovic <127323445+tvukovic-amd@users.noreply.github.com>
Date: Mon, 27 Apr 2026 09:03:15 -0700
Subject: [PATCH 75/87] [release/2.11] Fix Windows access violation in MIOpen
 CTC loss dispatch (#3161)

Cherry pick of https://github.com/pytorch/pytorch/pull/178284

Fixes https://github.com/ROCm/TheRock/issues/3987

Co-authored-by: Milica Stankovic <milica.stankovic@amd.com>
---
 aten/src/ATen/native/miopen/LossCTC_miopen.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/aten/src/ATen/native/miopen/LossCTC_miopen.cpp b/aten/src/ATen/native/miopen/LossCTC_miopen.cpp
index 9c9ee2687aa99..6200f7ede7df5 100644
--- a/aten/src/ATen/native/miopen/LossCTC_miopen.cpp
+++ b/aten/src/ATen/native/miopen/LossCTC_miopen.cpp
@@ -12,6 +12,7 @@
 #include <ATen/ops/empty_like.h>
 #include <ATen/ops/miopen_ctc_loss.h>
 #include <ATen/ops/miopen_ctc_loss_native.h>
+#include <ATen/ops/_use_miopen_ctc_loss_native.h>
 #endif
 
 // TODO: Remove the condition on AT_ROCM_ENABLED entirely,

From e16e349eb30bac8fd72b5c34ab220527fea5c58c Mon Sep 17 00:00:00 2001
From: Tijana Vukovic <127323445+tvukovic-amd@users.noreply.github.com>
Date: Mon, 27 Apr 2026 09:04:26 -0700
Subject: [PATCH 76/87] [release/2.11] Fix missing native header includes
 causing DLL export (#3160)

Cherry pick of https://github.com/pytorch/pytorch/pull/179138

Fixes:

https://github.com/ROCm/TheRock/issues/4086
https://github.com/ROCm/rocm-libraries/issues/5205
https://github.com/ROCm/TheRock/issues/4079

Co-authored-by: Stefan Sokolovic <stefan.sokolovic2@amd.com>
---
 aten/src/ATen/native/cuda/Blas.cpp        | 1 +
 aten/src/ATen/native/cuda/GroupedBlas.cpp | 1 +
 aten/src/ATen/native/cuda/ScaledBlas.cpp  | 1 +
 3 files changed, 3 insertions(+)

diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp
index 4a03faa02ef56..74d650463e8d2 100644
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@@ -34,6 +34,7 @@
 #else
 #include <ATen/ops/_addmm_activation_native.h>
 #include <ATen/ops/_efficientzerotensor.h>
+#include <ATen/ops/_int_mm_native.h>
 #include <ATen/ops/_scaled_mm_native.h>
 #include <ATen/ops/_unsafe_view_native.h>
 #include <ATen/ops/abs.h>
diff --git a/aten/src/ATen/native/cuda/GroupedBlas.cpp b/aten/src/ATen/native/cuda/GroupedBlas.cpp
index 5875c9a805724..70c33e27aa0a3 100644
--- a/aten/src/ATen/native/cuda/GroupedBlas.cpp
+++ b/aten/src/ATen/native/cuda/GroupedBlas.cpp
@@ -37,6 +37,7 @@
 #else
 #include <ATen/ops/_addmm_activation_native.h>
 #include <ATen/ops/_efficientzerotensor.h>
+#include <ATen/ops/_grouped_mm_native.h>
 #include <ATen/ops/_scaled_mm_native.h>
 #include <ATen/ops/_unsafe_view_native.h>
 #include <ATen/ops/abs.h>
diff --git a/aten/src/ATen/native/cuda/ScaledBlas.cpp b/aten/src/ATen/native/cuda/ScaledBlas.cpp
index cafcc28b3d2c1..223f10c53a318 100644
--- a/aten/src/ATen/native/cuda/ScaledBlas.cpp
+++ b/aten/src/ATen/native/cuda/ScaledBlas.cpp
@@ -35,6 +35,7 @@
 #include <ATen/ops/_addmm_activation_native.h>
 #include <ATen/ops/_efficientzerotensor.h>
 #include <ATen/ops/_scaled_mm_native.h>
+#include <ATen/ops/_scaled_mm_v2_native.h>
 #include <ATen/ops/_unsafe_view_native.h>
 #include <ATen/ops/abs.h>
 #include <ATen/ops/addmm_native.h>

From 1a6ad28e29f7825ae7e0367f1b0793d579c66d29 Mon Sep 17 00:00:00 2001
From: Tijana Vukovic <127323445+tvukovic-amd@users.noreply.github.com>
Date: Tue, 28 Apr 2026 08:54:05 -0700
Subject: [PATCH 77/87] [release/2.11] Windows specific test fixes (#176024)
 (#3182)

Cherry pick of https://github.com/pytorch/pytorch/pull/176024

Co-authored-by: nkhasbag <nkhasbag@nvidia.com>
Co-authored-by: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Co-authored-by: Nikita Shulga <nshulga@meta.com>
---
 test/cpp_extensions/setup.py         |   6 +-
 test/export/test_export_opinfo.py    |  10 ++
 test/export/test_serialize.py        |   5 +
 test/test_cuda.py                    | 234 ++++++++++++++++++++++++++-
 test/test_fx.py                      |  25 ++-
 test/torch_np/test_nep50_examples.py |  34 +++-
 6 files changed, 301 insertions(+), 13 deletions(-)

diff --git a/test/cpp_extensions/setup.py b/test/cpp_extensions/setup.py
index 35da0b4391884..2fd8f1b2667ff 100644
--- a/test/cpp_extensions/setup.py
+++ b/test/cpp_extensions/setup.py
@@ -42,6 +42,8 @@
     ),
 ]
 
+NVCC_FLAGS = ["-O2"] + (["-DUSE_CUDA"] if IS_WINDOWS else [])
+
 if torch.cuda.is_available() and (CUDA_HOME is not None or ROCM_HOME is not None):
     extension = CUDAExtension(
         "torch_test_cpp_extension.cuda",
@@ -50,7 +52,7 @@
             "cuda_extension_kernel.cu",
             "cuda_extension_kernel2.cu",
         ],
-        extra_compile_args={"cxx": CXX_FLAGS, "nvcc": ["-O2"]},
+        extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
     )
     ext_modules.append(extension)
 
@@ -58,7 +60,7 @@
     extension = CUDAExtension(
         "torch_test_cpp_extension.torch_library",
         ["torch_library.cu"],
-        extra_compile_args={"cxx": CXX_FLAGS, "nvcc": ["-O2"]},
+        extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
     )
     ext_modules.append(extension)
 
diff --git a/test/export/test_export_opinfo.py b/test/export/test_export_opinfo.py
index 361674a69c7ae..b33aeb45438a3 100644
--- a/test/export/test_export_opinfo.py
+++ b/test/export/test_export_opinfo.py
@@ -22,6 +22,7 @@
 )
 from torch.testing._internal.common_utils import (
     IS_FBCODE,
+    IS_WINDOWS,
     run_tests,
     skipIfRocm,
     TestCase,
@@ -152,6 +153,11 @@ class TestExportOnFakeCuda(TestCase):
     # We set CUDA_VISIBLE_DEVICES="" to simulate a CPU machine with cuda build
     # Running this on all ops in op_db is too slow, so we only run on a selected subset
     @onlyCUDA
+    @unittest.skipIf(
+        IS_WINDOWS,
+        'Subprocess with CUDA_VISIBLE_DEVICES="" imports op_db which triggers '
+        "get_device_capability(); 0 devices raises Invalid device id on Windows.",
+    )
     @ops(selected_op_db, allowed_dtypes=(torch.float,))
     def test_fake_export(self, device, dtype, op):
         test_script = f"""\
@@ -218,6 +224,10 @@ def forward(self, *args):
         self.assertEqual(r, "")
 
     @unittest.skipIf(not torch.backends.cuda.is_built(), "requires CUDA build")
+    @unittest.skipIf(
+        IS_WINDOWS,
+        "Failing on Windows, device_count() changes from 0 to 1 ",
+    )
     def test_preserve_original_behavior(self):
         test_script = f"""\
 import torch
diff --git a/test/export/test_serialize.py b/test/export/test_serialize.py
index 6e4d41fc1937a..7df7867e3613e 100644
--- a/test/export/test_serialize.py
+++ b/test/export/test_serialize.py
@@ -24,6 +24,11 @@
 
     from torch.library import wrap_triton
     from torch.utils._triton import has_triton
+else:
+
+    def has_triton():
+        return False
+
 
 import torch
 import torch._dynamo as torchdynamo
diff --git a/test/test_cuda.py b/test/test_cuda.py
index df9bdd5b0be11..56625d716244c 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -3,6 +3,7 @@
 
 import contextlib
 import ctypes
+import functools
 import gc
 import json
 import os
@@ -38,6 +39,7 @@
     _get_torch_cuda_version,
     PLATFORM_SUPPORTS_GREEN_CONTEXT,
     SM70OrLater,
+    SM89OrLater,
     TEST_CUDNN,
     TEST_MULTIGPU,
     tf32_on_and_off,
@@ -105,7 +107,8 @@
 load_tests = load_tests  # noqa: PLW0127
 
 try:
-    # import torchvision.models  # noqa: F401
+    import torchvision.models  # noqa: F401
+
     # from torchvision.models import resnet18  # noqa: F401
 
     HAS_TORCHVISION = True
@@ -130,6 +133,50 @@
 _cycles_per_ms = None
 
 
+_wait_for_cpu_kernel = None
+
+
+def skip_background_threads_on_windows(f):
+    @functools.wraps(f)
+    def wrapped(self, **kwargs):
+        if IS_WINDOWS and SM89OrLater and kwargs.get("use_background_threads"):
+            raise unittest.SkipTest("using background threads fails on Windows")
+        return f(self, **kwargs)
+
+    return wrapped
+
+
+def get_wait_for_cpu_kernel():
+    """Returns a compiled CUDA spin-wait kernel that blocks the GPU stream until
+    the host sets a pinned int32 flag to non-zero. Requires SM70+.
+
+    Usage::
+
+        kernel = get_wait_for_cpu_kernel()
+        flag = torch.zeros(1, dtype=torch.int32, device="cpu").pin_memory()
+        with torch.cuda.stream(s):
+            kernel(grid=(1, 1, 1), block=(1, 1, 1), args=[flag])
+        # stream s is now blocked until:
+        flag[0] = 1
+    """
+    global _wait_for_cpu_kernel
+    if _wait_for_cpu_kernel is None:
+        from torch.cuda import _compile_kernel
+
+        _wait_for_cpu_kernel = _compile_kernel(
+            r"""
+            __global__ void wait_for_cpu(int *pinned_cpu_flag) {
+                int flag = 0;
+                do {
+                    asm volatile("ld.relaxed.sys.global.s32 %0, [%1];" : "=r"(flag) : "l"(pinned_cpu_flag) : "memory");
+                } while (flag == 0);
+            }
+            """,
+            "wait_for_cpu",
+        )
+    return _wait_for_cpu_kernel
+
+
 @unittest.skipIf(not TEST_CUDA, "CUDA not available, skipping tests")
 @torch.testing._internal.common_utils.markDynamoStrictTest
 class TestCuda(TestCase):
@@ -290,6 +337,9 @@ def test_pinned_memory_empty_cache(self):
                 "pinned_use_cuda_host_register:False"
             )
 
+    # Pinned allocator background thread does not shut down cleanly on Windows
+    # Python process hangs
+    @unittest.skipIf(IS_WINDOWS and SM89OrLater, "Fails on windows with SM89+")
     def test_pinned_memory_use_background_threads(self):
         script = """
 import torch
@@ -432,6 +482,9 @@ def test_out_of_memory(self):
         tensor.fill_(1)
         self.assertTrue((tensor == 1).all())
 
+    # CUDA memory allocations on windows do not OOM on rtx even when they cross allowed memory
+    # Skip test until this is investigated
+    @unittest.skipIf(IS_WINDOWS and SM89OrLater, "Fails on windows with SM89+")
     @unittest.skipIf(
         TEST_CUDAMALLOCASYNC or IS_JETSON, "Segmentation fault (core dumped)"
     )
@@ -616,6 +669,9 @@ def test_serialization_array_with_storage(self):
         q_copy[1].fill_(10)
         self.assertEqual(q_copy[3], torch.cuda.IntStorage(10).fill_(10))
 
+    @unittest.skipIf(
+        IS_WINDOWS and SM89OrLater, "preferred_blas_library not supported on Windows"
+    )
     @unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, "Does not work in fbcode yet")
     @setBlasBackendsToDefaultFinally
     def test_preferred_blas_library_settings(self):
@@ -685,6 +741,9 @@ def _check_default():
             torch.backends.cuda.preferred_blas_library("default")
             _check_default()
 
+    @unittest.skipIf(
+        IS_WINDOWS and SM89OrLater, "preferred_blas_library not supported on Windows"
+    )
     @unittest.skipIf(TEST_CUDAMALLOCASYNC, "temporarily disabled for async")
     @setBlasBackendsToDefaultFinally
     def test_cublas_workspace_explicit_allocation(self):
@@ -4086,6 +4145,9 @@ def test_gds_fails_in_ci(self):
             with self.assertRaisesRegex(RuntimeError, error_msg):
                 torch.cuda.gds.GdsFile(f, os.O_CREAT | os.O_RDWR)
 
+    @unittest.skipIf(
+        IS_WINDOWS, "test relies on fork; Windows multiprocessing uses spawn"
+    )
     def test_is_pinned_no_context(self):
         test_script = """\
 import torch
@@ -5043,7 +5105,11 @@ def test_temperature(self):
     @unittest.skipIf(not TEST_PYNVML, "pynvml/amdsmi is not available")
     def test_device_memory_used(self):
         """
-        Verify used device memory in bytes
+        Verify used device memory in bytes.
+        On Windows the NVML used value has been observed not to increase after
+        a CUDA allocation (delta 0); we only assert API sanity there (non-negative,
+        non-decreasing after alloc, <= total memory). Need to investigate expected behavior
+        with Windows WDDM
         """
         torch.cuda.synchronize()
         gc.collect()
@@ -5054,9 +5120,20 @@ def test_device_memory_used(self):
         torch.cuda.synchronize()
         torch.cuda.empty_cache()
         b = torch.cuda.device_memory_used()
-        mem_bytes = b - a
-        # test the order of magnitude
-        self.assertTrue(num_bytes // 32 <= mem_bytes <= num_bytes * 32)
+        if IS_WINDOWS:
+            # NVML used memory does not reflect CUDA allocations on WDDM; only check API sanity
+            self.assertGreaterEqual(a, 0, "device_memory_used should be non-negative")
+            self.assertGreaterEqual(b, 0, "device_memory_used should be non-negative")
+            self.assertGreaterEqual(
+                b, a, "used memory should not decrease after allocation"
+            )
+            total = torch.cuda.get_device_properties(0).total_memory
+            self.assertLessEqual(a, total, "used should not exceed total device memory")
+            self.assertLessEqual(b, total, "used should not exceed total device memory")
+        else:
+            mem_bytes = b - a
+            # test the order of magnitude
+            self.assertTrue(num_bytes // 32 <= mem_bytes <= num_bytes * 32)
 
     @unittest.skipIf(not TEST_PYNVML, "pynvml/amdsmi is not available")
     def test_power_draw(self):
@@ -5748,6 +5825,7 @@ def test_pin_memory_use(self, use_cuda_host_register):
         "use_memory, delete_memory",
         [(True, True), (True, False), (False, True), (False, False)],
     )
+    @skip_background_threads_on_windows
     def test_two_graphs(
         self, use_background_threads, use_cuda_host_register, use_memory, delete_memory
     ):
@@ -6456,6 +6534,152 @@ def test_graph_capture_reclaim_4_streams(self):
             "graph_capture_record_stream_reuse:False"
         )
 
+
+    @unittest.skipIf(
+        not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
+    )
+    def test_graph_capture_reclaim_shared_pool(self):
+        torch.cuda.memory._set_allocator_settings(
+            "graph_capture_record_stream_reuse:True"
+        )
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+        shared_pool = torch.cuda.graph_pool_handle()
+        cap_stream = torch.cuda.Stream()
+        side_stream = torch.cuda.Stream()
+
+        g1 = torch.cuda.CUDAGraph()
+        g2 = torch.cuda.CUDAGraph()
+
+        numel = (8 * 1024 * 1024) // 4
+
+        with torch.cuda.stream(cap_stream):
+            g1.capture_begin(pool=shared_pool)
+            data = torch.empty(numel, device="cuda")
+            data_ptr = data.data_ptr()
+
+            side_stream.wait_stream(cap_stream)
+            with torch.cuda.stream(side_stream):
+                data.add_(1.0)
+                data.record_stream(side_stream)
+
+            cap_stream.wait_stream(side_stream)
+
+            del data
+            g1.capture_end()
+
+        torch.cuda.current_stream().wait_stream(cap_stream)
+        torch.cuda.synchronize()
+
+        with torch.cuda.stream(cap_stream):
+            g2.capture_begin(pool=shared_pool)
+            data2 = torch.empty(numel, device="cuda")
+            data2.fill_(42.0)
+            data2_ptr = data2.data_ptr()
+            g2.capture_end()
+
+        torch.cuda.current_stream().wait_stream(cap_stream)
+        torch.cuda.synchronize()
+
+        self.assertEqual(data_ptr, data2_ptr)
+
+        torch.cuda.memory._set_allocator_settings(
+            "graph_capture_record_stream_reuse:False"
+        )
+
+    @unittest.skipIf(
+        not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
+    )
+    @unittest.skipIf(TEST_WITH_ROCM, "ROCM does not support nvrtc")
+    @unittest.skipIf(
+        not SM70OrLater, "Compute capability >= SM70 required for relaxed ptx flag"
+    )
+    def test_graph_capture_pre_capture_stream_use(self):
+        # Tests that a block with pre-capture stream uses is correctly handled
+        # when freed during a subsequent capture on the same pool.
+        # Exercises the insert_events path in endAllocateToPool.
+        spin_wait_kernel = get_wait_for_cpu_kernel()
+
+        torch.cuda.memory._set_allocator_settings(
+            "graph_capture_record_stream_reuse:True"
+        )
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+        shared_pool = torch.cuda.graph_pool_handle()
+        cap_stream = torch.cuda.Stream()
+        side_stream = torch.cuda.Stream()
+        flag_cpu = torch.zeros(1, dtype=torch.int32, device="cpu").pin_memory()
+
+        g1 = torch.cuda.CUDAGraph()
+        g2 = torch.cuda.CUDAGraph()
+        g3 = torch.cuda.CUDAGraph()
+        g4 = torch.cuda.CUDAGraph()
+
+        numel = (8 * 1024 * 1024) // 4
+
+        # First capture: allocate data in the shared pool, keep it alive.
+        with torch.cuda.stream(cap_stream):
+            g1.capture_begin(pool=shared_pool)
+            data = torch.empty(numel, device="cuda")
+            data_ptr = data.data_ptr()
+            g1.capture_end()
+
+        torch.cuda.synchronize()
+
+        # Between captures: block side_stream with a spin-wait kernel
+        # (pre-capture stream use). The kernel holds the stream busy until
+        # we explicitly set the flag from the host.
+        with torch.cuda.stream(side_stream):
+            spin_wait_kernel(grid=(1, 1, 1), block=(1, 1, 1), args=[flag_cpu])
+            data.record_stream(side_stream)
+
+        # Second capture: free data during capture.
+        with torch.cuda.stream(cap_stream):
+            g2.capture_begin(pool=shared_pool)
+            del data
+            g2.capture_end()
+
+        # Trigger process_events. The spin kernel is still holding side_stream,
+        # so cudaEventQuery returns cudaErrorNotReady and the block stays pending.
+        torch.empty(1, device="cuda")
+
+        # Allocate from the same pool: block must NOT be reused yet.
+        with torch.cuda.stream(cap_stream):
+            g3.capture_begin(pool=shared_pool)
+            not_reused = torch.empty(numel, device="cuda")
+            not_reused_ptr = not_reused.data_ptr()
+            g3.capture_end()
+
+        self.assertNotEqual(data_ptr, not_reused_ptr)
+
+        # Release the spin kernel so side_stream can finish.
+        flag_cpu[0] = 1
+        torch.cuda.synchronize()
+
+        # Trigger process_events to reclaim the block.
+        torch.empty(1, device="cuda")
+
+        # Fourth capture: the block should now be reusable.
+        with torch.cuda.stream(cap_stream):
+            g4.capture_begin(pool=shared_pool)
+            reused = torch.empty(numel, device="cuda")
+            reused_ptr = reused.data_ptr()
+            g4.capture_end()
+
+        self.assertEqual(data_ptr, reused_ptr)
+
+        torch.cuda.memory._set_allocator_settings(
+            "graph_capture_record_stream_reuse:False"
+        )
+
+    # expandable_segments not supported (PYTORCH_C10_DRIVER_API_SUPPORTED not defined for windows builds)
+    @unittest.skipIf(
+        IS_WINDOWS and SM89OrLater,
+        "expandable_segments not supported (PYTORCH_C10_DRIVER_API_SUPPORTED not defined for windows builds)",
+    )
+    @skipIfRocm(msg="expandable_segments mode is not supported on ROCm")
     @unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, "Load_inline doesn't work in fbcode")
     def test_mempool_expandable(self):
         torch.cuda.empty_cache()
diff --git a/test/test_fx.py b/test/test_fx.py
index a9bbf8c22d699..36abe391485b1 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -4476,8 +4476,29 @@ def forward(self, x):
         else:
             kernel_event = "cudaLaunchKernel"
             kernel_event_relu = "cudaLaunchKernel"
-
-        expected = f"""\
+        if IS_WINDOWS:
+            expected = f"""\
+event=aten::t node=t stack_trace=return F.linear(input, self.weight, self.bias)
+event=aten::transpose node=t stack_trace=return F.linear(input, self.weight, self.bias)
+event=aten::as_strided node=t stack_trace=return F.linear(input, self.weight, self.bias)
+event=aten::addmm node=addmm stack_trace=return F.linear(input, self.weight, self.bias)
+event=aten::expand node=addmm stack_trace=return F.linear(input, self.weight, self.bias)
+event=aten::as_strided node=addmm stack_trace=return F.linear(input, self.weight, self.bias)
+event={kernel_event} node=addmm stack_trace=return F.linear(input, self.weight, self.bias)
+event={kernel_event} node=addmm stack_trace=return F.linear(input, self.weight, self.bias)
+event=aten::relu node=relu stack_trace=return F.relu(input, inplace=self.inplace)
+event=aten::clamp_min node=relu stack_trace=return F.relu(input, inplace=self.inplace)
+event={kernel_event_relu} node=relu stack_trace=return F.relu(input, inplace=self.inplace)
+event=aten::t node=t_1 stack_trace=return F.linear(input, self.weight, self.bias)
+event=aten::transpose node=t_1 stack_trace=return F.linear(input, self.weight, self.bias)
+event=aten::as_strided node=t_1 stack_trace=return F.linear(input, self.weight, self.bias)
+event=aten::addmm node=addmm_1 stack_trace=return F.linear(input, self.weight, self.bias)
+event=aten::expand node=addmm_1 stack_trace=return F.linear(input, self.weight, self.bias)
+event=aten::as_strided node=addmm_1 stack_trace=return F.linear(input, self.weight, self.bias)
+event={kernel_event} node=addmm_1 stack_trace=return F.linear(input, self.weight, self.bias)
+event={kernel_event} node=addmm_1 stack_trace=return F.linear(input, self.weight, self.bias)"""
+        else:
+            expected = f"""\
 event=aten::t node=t stack_trace=x = self.linear1(x)
 event=aten::transpose node=t stack_trace=x = self.linear1(x)
 event=aten::as_strided node=t stack_trace=x = self.linear1(x)
diff --git a/test/torch_np/test_nep50_examples.py b/test/torch_np/test_nep50_examples.py
index a3ad346bf9f1c..964683bd74c81 100644
--- a/test/torch_np/test_nep50_examples.py
+++ b/test/torch_np/test_nep50_examples.py
@@ -31,6 +31,7 @@
 from torch._numpy.testing import assert_allclose
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
+    IS_WINDOWS,
     parametrize,
     run_tests,
     TestCase,
@@ -216,11 +217,36 @@ def test_compare_ufuncs(self, name, scalar, array):
                 # TypeError: ufunc 'hypot' not supported for the input types
                 result_numpy = None
 
+            type_mismatch = False
+            expected_numpy_dtype = None
+            expected_torch_dtype = None
+
             if result is not None and result_numpy is not None:
-                if result.tensor.numpy().dtype != result_numpy.dtype:
-                    raise AssertionError(
-                        f"Expected result dtype == {result_numpy.dtype}, got {result.tensor.numpy().dtype}"
-                    )
+                expected_numpy_dtype = result_numpy.dtype
+                expected_torch_dtype = result.tensor.numpy().dtype
+                if IS_WINDOWS:
+                    if (
+                        array.tensor.numpy().dtype != _np.bool_
+                        and result.tensor.numpy().dtype != result_numpy.dtype
+                    ):
+                        type_mismatch = True
+
+                    if (
+                        array.tensor.numpy().dtype == _np.bool_
+                        and result_numpy.dtype == _np.int32
+                        and result.tensor.numpy().dtype != _np.int64
+                    ):
+                        expected_numpy_dtype = _np.int32
+                        expected_torch_dtype = tnp.int64
+                        type_mismatch = True
+                else:
+                    if result.tensor.numpy().dtype != result_numpy.dtype:
+                        type_mismatch = True
+
+            if type_mismatch:
+                raise AssertionError(
+                    f"Expected result numpy dtype == {expected_numpy_dtype}, torch dtype == {expected_torch_dtype}"
+                )
 
         finally:
             _np._set_promotion_state(state)

From 9413e9b96bcbeb8af1aa0280a3a9bc7dd048857e Mon Sep 17 00:00:00 2001
From: "rocm-repo-management-api-6[bot]"
 <212817015+rocm-repo-management-api-6[bot]@users.noreply.github.com>
Date: Tue, 28 Apr 2026 08:57:11 -0700
Subject: [PATCH 78/87] [AUTOGENERATED] [release/2.11]
 [UP][UT][ROCm][TunableOp] Fix test_call_count_tunableop to correctly extract
 kernel names for RDNA (#3185)

Cherry-pick of https://github.com/ROCm/pytorch/pull/2954

Co-authored-by: Uros Markovic <umarkovi@amd.com>
---
 test/test_linalg.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_linalg.py b/test/test_linalg.py
index 045da268fd99c..25a157343db15 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -6427,9 +6427,9 @@ def test_call_count_tunableop(self, device, dtype):
             # launched per PyTorch API. The kernels have string
             # that always starts with `Cijk*`
             mm_key = 'Cijk'
-            events = prof.key_averages()
+            events = prof.events()
             for evt in events:
-                if mm_key in evt.key:
+                if mm_key in evt.name:
                     self.assertEqual(evt.count, 1)
                     kernel_count = kernel_count + 1
 

From 7d37be22a5b0fa41bbedc7394d38de3df0a61a30 Mon Sep 17 00:00:00 2001
From: Harkirat Gill <harkirat.gill@amd.com>
Date: Tue, 28 Apr 2026 18:18:50 -0400
Subject: [PATCH 79/87] [release/2.11] Update composable_kernel submodule with
 gfx1033 support (#3144)

## Motivation

- Enabling gfx103X-all wheels in TheRock is currently blocked due to
PyTorch CI failures caused by a lack of `gfx1033` support in CK.
https://github.com/ROCm/rocm-libraries/pull/5141 resolves these issues.

## Technical Details

- The aforementioned fix has been cherrypicked into the
`pytorch/release/2.11/` branch of ROCm/composable_kernel - this PR bumps
the `third_party/composable_kernel` branch to pick up these changes.

## Test Plan

- Trigger a build and verify it passes

## Test Result

- Build succeeds for `cherrypick-gfx1033-CK-support-torch2.11` branch.
https://github.com/ROCm/TheRock/actions/runs/24195531659/job/70624339554

  - Testing
  Pasting offline comments from @harkgill-amd
> In
https://github.com/ROCm/TheRock/actions/runs/24906345786/job/72942139688
Pytorch 3.10  + release/2.11 -> Pass
Pytorch 3.11 + release/2.11 -> TestNN.test_Embedding_discontiguous_cuda
failed but this seems to be a known flaky test and will be disabled with
https://github.com/ROCm/TheRock/pull/4775
Pytorch 3.12  + release/2.11 -> Pass
Pytorch 3.13  + release/2.11 -> Pass
In
https://github.com/ROCm/TheRock/actions/runs/25002732513/job/73225027260
Pytorch 3.14 + release/2.11 -> The failing tests here all share the same
miopenStatusUnknownError message. These are the same failures as seen in
the main branch run here
https://github.com/ROCm/TheRock/actions/runs/24985367049 so they aren't
related to my PR

## Submission Checklist

- [X] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
---
 third_party/composable_kernel | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index fcc9372c009c8..7182f6d1391ed 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit fcc9372c009c8e0a23fece77b582da83b04a654f
+Subproject commit 7182f6d1391ed75fe0a9dd1328f2b2683a12d041

From 345ca6fbeb70f3a6edda9f13b8aafcadccbedd4e Mon Sep 17 00:00:00 2001
From: zichguan-amd <zichuan.guan@amd.com>
Date: Tue, 28 Apr 2026 18:37:21 -0400
Subject: [PATCH 80/87] Cleanup custom op polluting global state for subsequent
 tests (#3170)

`my_lib` in `test_storage_preserve_nonhermetic_in_hermetic_context`
leaks into the global op space after the test ends and affects
subsequent tests in the same process that use dynamo.

Without the fix, running any checkpoint/compile or other dynamo-related
test after `test_storage_preserve_nonhermetic_in_hermetic_context` in
the same process fails with
```
torch._dynamo.exc.BackendCompilerFailed: backend='aot_eager' raised:
TypeError: 'CustomDecompTable' object is not a mapping
```
e.g. `python -m pytest -v
pytorch/test/test_torch.py::TestTorch::test_storage_preserve_nonhermetic_in_hermetic_context
pytorch/test/test_autograd.py::TestAutograd::test_checkpoint_compile_no_recompile`

Upstream PR: https://github.com/pytorch/pytorch/pull/180998

Signed-off-by: zichguan-amd <zichuan.guan@amd.com>
---
 test/test_torch.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_torch.py b/test/test_torch.py
index fa254541d1ece..54a756b5940cf 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -10335,6 +10335,7 @@ def test_storage_preserve_nonhermetic_in_hermetic_context(self):
         global _my_storage
 
         my_lib = Library("my_lib", "DEF")  # noqa: TOR901
+        self.addCleanup(my_lib._destroy)
         my_lib.define('my_func() -> None')
 
         a = torch.tensor([1.])

From 443606eb94430d90554ab4c21202494576afedce Mon Sep 17 00:00:00 2001
From: "rocm-repo-management-api-6[bot]"
 <212817015+rocm-repo-management-api-6[bot]@users.noreply.github.com>
Date: Wed, 29 Apr 2026 09:55:15 -0700
Subject: [PATCH 81/87] [AUTOGENERATED] [release/2.11] Fix SIGSEGV on AMD RDNA
 due to reduction mask optimization #176269 (#3156)

Cherry-pick of https://github.com/ROCm/pytorch/pull/3055

Co-authored-by: Strahinja Stamenkovic <sstamenk@amd.com>
---
 test/inductor/test_torchinductor.py      | 35 ++++++++++++++++++++++++
 torch/_inductor/codegen/triton.py        | 12 +++++++-
 torch/_inductor/runtime/triton_compat.py |  7 ++---
 3 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 6c4f1b3f92890..60561f9dc812d 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -16528,6 +16528,41 @@ def test_has_constant_mask(self, block_multiple, ynumel_exceed_ygrid_size):
                 self.assertTrue("ymask = yindex < ynumel" in code)
                 self.assertTrue("xmask = xindex < xnumel" in code)
 
+        @parametrize(
+            "rnumel",
+            [16, 32],
+        )
+        @config.patch("triton.persistent_reductions", True)
+        def test_has_constant_mask_small_persistent_reduction(self, rnumel):
+            from torch._inductor.runtime.hints import DeviceProperties
+
+            def fn(x):
+                return x.sum(dim=-1)
+
+            x = torch.randn(1024, rnumel, device=GPU_TYPE)
+            opt_fn = torch.compile(fn)
+            code = run_and_get_triton_code(opt_fn, x)
+
+            device = torch.device(GPU_TYPE, 0)
+            warp_size = DeviceProperties.create(device).warp_size or 32
+
+            rblock = 1
+            while rblock < rnumel:
+                rblock *= 2
+
+            if rblock < warp_size:
+                self.assertTrue(
+                    "r0_index < r0_numel" in code or "rindex < rnumel" in code,
+                    f"Expected dynamic reduction mask for RBLOCK={rblock} < warp_size={warp_size}",
+                )
+            else:
+                self.assertTrue(
+                    "r0_mask = tl.full" in code or "rmask = tl.full" in code,
+                    f"Expected constant reduction mask for RBLOCK={rblock} >= warp_size={warp_size}",
+                )
+
+            self.assertEqual(fn(x), opt_fn(x))
+
         @config.patch("triton.native_matmul", False)
         def test_kernel_names_descriptive(self):
             @torch.compile(backend="inductor")
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index 039b53ee1f2fc..1fb7b7fdf3a7b 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -5886,12 +5886,22 @@ def _has_constant_mask(self, tree: IterationRangesRoot) -> bool:
                 return True
         elif not self.is_combo_kernel:
             if V.graph.sizevars.statically_known_equals(tree.numel, 1):
-                return True
+                if not (tree.is_reduction and self.persistent_reduction):
+                    return True
 
         # Masks are superfluous if numel is a multiple of BLOCK
         # (We use the fact that BLOCK is required by triton to be a power of 2)
         if tree.is_reduction and self.persistent_reduction:
             max_block = self._get_persistent_RBLOCK(tree.numel)
+            # Triton's auto-tuner can map a full hardware warp along the
+            # reduction axis.  When RBLOCK < warp_size the excess lanes
+            # would execute out-of-bounds global loads.  This results in
+            # faults on AMD hardware.  Keep the dynamic mask so that all
+            # hardware stays correct.
+            device = V.graph.get_current_device_or_throw()
+            warp_size = DeviceProperties.create(device).warp_size or 32
+            if isinstance(max_block, int) and max_block < warp_size:
+                return False
         elif tree.prefix == "x" and self.no_x_dim:
             max_block = 1
         else:
diff --git a/torch/_inductor/runtime/triton_compat.py b/torch/_inductor/runtime/triton_compat.py
index 49ceacb50bc3d..d237350a667c8 100644
--- a/torch/_inductor/runtime/triton_compat.py
+++ b/torch/_inductor/runtime/triton_compat.py
@@ -140,11 +140,10 @@ class JITFunction:  # type: ignore[no-redef]
 
 def cc_warp_size(cc: str | int) -> int:
     if torch.version.hip:
-        cc_str = str(cc)
-        if "gfx10" in cc_str or "gfx11" in cc_str:
-            return 32
-        else:
+        if "gfx9" in str(cc):
             return 64
+        else:
+            return 32
     else:
         return 32
 

From 5223630054ce5ecd7b774d0ea31f2a1b472fb9b3 Mon Sep 17 00:00:00 2001
From: "tom.jen" <tomjen12@amd.com>
Date: Thu, 7 May 2026 21:51:03 +0800
Subject: [PATCH 82/87] [Inductor] Fix ReinterpretView stride mismatch in
 TritonTemplateKernel (#3191)

Fixes a bug where FlexibleLayout on a ReinterpretView incorrectly
returns underlying physical buffer strides (e.g., 4D) instead of logical
view strides (3D).

This patch skips speculative layout and constraint tracking for
ReinterpretView nodes, forcing the use of node.get_stride() to prevent
Illegal Memory Access (IMA) on ROCm.

Manual backport from PyTorch 2.12.
Ref commit:
https://github.com/pytorch/pytorch/commit/0e1f56285ea65c0fc960ea110cc4088e92eab453

## Motivation

<!-- Explain the purpose of this PR and the goals it aims to achieve.
-->

## Technical Details

<!-- Explain the changes along with any relevant GitHub links. -->

## Test Plan

<!-- Explain any relevant testing done to verify this PR. -->

## Test Result

<!-- Briefly summarize test outcomes. -->

## Submission Checklist

- [ ] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
---
 torch/_inductor/select_algorithm.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/torch/_inductor/select_algorithm.py b/torch/_inductor/select_algorithm.py
index 9c02233204dbb..2fb2852eef785 100644
--- a/torch/_inductor/select_algorithm.py
+++ b/torch/_inductor/select_algorithm.py
@@ -1580,7 +1580,13 @@ def get_stride_and_maybe_freeze_layout(self, node) -> list[int]:
         layout = node.data.layout
         node_name = node.get_name()
 
-        if isinstance(layout, ir.FlexibleLayout):
+        # For ReinterpretView, the view's strides are already determined by its layout.
+        # We skip constraint tracking because node.get_name() returns the underlying
+        # buffer name, not the view's identity, so constraints would be incorrectly
+        # associated with the underlying buffer rather than the view.
+        if isinstance(layout, ir.FlexibleLayout) and not isinstance(
+            node, ir.ReinterpretView
+        ):
             if not use_aten_gemm_kernels():
                 # No ExternKernel fallback available, freeze immediately
                 node.data.freeze_layout()

From 96bfee122869125d32aa4ec9acc8c3597059188b Mon Sep 17 00:00:00 2001
From: Jack Taylor <108682042+jataylo@users.noreply.github.com>
Date: Tue, 12 May 2026 16:58:07 +0100
Subject: [PATCH 83/87] Dynamo/fix ignore logging functions (#178506) (#3206)

## PR Summary

Fixes #178455. `ignore_logger_methods` was renamed to
`ignore_logging_functions` in torch 2.11, but the new name was not added
to the blocklist in `_get_dynamo_config_for_logging()`.

## Repro
```
  import torch
  import torch._dynamo.config
  import torch._dynamo.utils

  torch._dynamo.config.ignore_logging_functions.add(print)
  torch._dynamo.utils._get_dynamo_config_for_logging()
```
## Changes

* Exclude `ignore_logging_functions` from the output of
`_get_dynamo_config_for_logging()` (consistent with the existing
`ignore_logger_methods` handling)
* Add a regression test to ensure no crash when the logging config
includes builtin functions

Added a test that:

* Inserts `print` into `ignore_logging_functions`
* Verifies `_get_dynamo_config_for_logging()` returns valid JSON without
errors

related issue: #178455

Pull Request resolved: https://github.com/pytorch/pytorch/pull/178506
Approved by: https://github.com/Lucaskabela


(cherry picked from commit 7eea8eacbb4457a503467a46eae47d93e39e49e8)

## Motivation

<!-- Explain the purpose of this PR and the goals it aims to achieve.
-->

## Technical Details

<!-- Explain the changes along with any relevant GitHub links. -->

## Test Plan

<!-- Explain any relevant testing done to verify this PR. -->

## Test Result

<!-- Briefly summarize test outcomes. -->

## Submission Checklist

- [ ] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.

Co-authored-by: vvvdwbvvv <vvvdwbvvv@gmail.com>
Co-authored-by: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
---
 test/dynamo/test_utils.py | 9 +++++++++
 torch/_dynamo/utils.py    | 1 +
 2 files changed, 10 insertions(+)

diff --git a/test/dynamo/test_utils.py b/test/dynamo/test_utils.py
index 22c3d26207ef1..b0f3d4a079d44 100644
--- a/test/dynamo/test_utils.py
+++ b/test/dynamo/test_utils.py
@@ -1,5 +1,6 @@
 # Owner(s): ["module: dynamo"]
 import dataclasses
+import json
 import os
 import pprint
 import sys
@@ -298,6 +299,14 @@ def test_reinplace_counters_use_trigger_name_not_enum_value(self):
             "Should not use enum value (integer) in key, should use trigger.name instead",
         )
 
+    def test_get_dynamo_config_for_logging_ignores_logging_functions(self):
+        with dynamo_config.patch(ignore_logging_functions={print}):
+            result = utils._get_dynamo_config_for_logging()
+            parsed = json.loads(result)
+
+        self.assertIsInstance(parsed, dict)
+        self.assertNotIn("ignore_logging_functions", parsed)
+
 
 class TestModel(torch.nn.Module):
     def __init__(self):
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index 210e289be2ff1..51837a6fa283d 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -1649,6 +1649,7 @@ def clean_for_json(d: dict[str, Any]) -> dict[str, Any]:
             "_autograd_backward_strict_mode_banned_ops",
             "reorderable_logging_functions",
             "ignore_logger_methods",
+            "ignore_logging_functions",
             "traceable_tensor_subclasses",
             "nontraceable_tensor_subclasses",
             "_custom_ops_profile",

From 0c210c5f20ff65125cf0c618dc5d76c6dc603238 Mon Sep 17 00:00:00 2001
From: zichguan-amd <zichuan.guan@amd.com>
Date: Tue, 19 May 2026 11:02:45 -0400
Subject: [PATCH 84/87] Cleanup custom op polluting global state for subsequent
 tests (#180998) (#3221)

Cherry pick to 2.11 release

`my_lib` in `test_storage_preserve_nonhermetic_in_hermetic_context`
leaks into the global op space after the test ends and affects
subsequent tests in the same process that use dynamo.

Without the fix, running any checkpoint/compile or other dynamo-related
test after `test_storage_preserve_nonhermetic_in_hermetic_context` in
the same process fails with
```
torch._dynamo.exc.BackendCompilerFailed: backend='aot_eager' raised:
TypeError: 'CustomDecompTable' object is not a mapping
```
e.g. `python -m pytest -v
pytorch/test/test_torch.py::TestTorch::test_storage_preserve_nonhermetic_in_hermetic_context
pytorch/test/test_autograd.py::TestAutograd::test_checkpoint_compile_no_recompile`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/180998
Approved by: https://github.com/albanD, https://github.com/ezyang

---------

Co-authored-by: Claude Opus 4 <noreply@anthropic.com>
---
 test/dynamo/test_compiler_bisector.py |   3 +-
 test/dynamo/test_decorators.py        |   6 +-
 test/functorch/test_aotdispatch.py    | 207 +++++++++++++-------------
 test/test_fake_tensor.py              |   8 +-
 test/test_fx_passes.py                |   2 +-
 test/test_proxy_tensor.py             |  51 ++++---
 test/test_torch.py                    |  39 +++--
 7 files changed, 154 insertions(+), 162 deletions(-)

diff --git a/test/dynamo/test_compiler_bisector.py b/test/dynamo/test_compiler_bisector.py
index c1bc667a02041..24116f41809d5 100644
--- a/test/dynamo/test_compiler_bisector.py
+++ b/test/dynamo/test_compiler_bisector.py
@@ -29,8 +29,7 @@ def tearDown(self):
         if hasattr(torch.ops, self.test_ns):
             delattr(torch.ops, self.test_ns)
         if hasattr(self, "lib"):
-            del self.lib.m
-            del self.lib
+            self.lib._destroy()
 
     def get_op(self, name):
         return getattr(getattr(torch.ops, self.test_ns), name).default
diff --git a/test/dynamo/test_decorators.py b/test/dynamo/test_decorators.py
index 58f5127499562..6f9b0c2765659 100644
--- a/test/dynamo/test_decorators.py
+++ b/test/dynamo/test_decorators.py
@@ -48,10 +48,8 @@ def fn(a):
 
     def test_disable_for_custom_op(self):
         import torch.library
-        from torch.library import Library
 
-        foo = Library("foo", "DEF")  # noqa: TOR901
-        try:
+        with torch.library._scoped_library("foo", "DEF") as foo:
             foo.define("custom(Tensor self) -> Tensor")
 
             # Dynamic shape data dependent operator. For static shape compilation, Dynamo
@@ -81,8 +79,6 @@ def fn(x):
                 self.assertEqual(ref, res)
             finally:
                 torch.ops.foo.custom = orig_custom
-        finally:
-            foo._destroy()
 
     def test_disable_ignores_outer_wraps(self):
         def orig_inner():
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 2375778a29e43..e523c671dc5fb 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -8183,122 +8183,125 @@ def unpack_cpu(x):
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA is unavailable")
     @unittest.skipIf(not SM80OrLater, "bfloat16, float8")
     def test_saved_tensors_hooks_params(self):
-        lib = torch.library.Library("_test_aotdispatch_lib", "FRAGMENT")
-        logged_shapes = []
-        logged_dtypes = []
-        lib.define("log(Tensor x) -> Tensor")
-
-        def log_impl(x):
-            logged_shapes.append(list(x.shape))
-            logged_dtypes.append(x.dtype)
-            return x.clone()
+        with torch.library._scoped_library("_test_aotdispatch_lib", "FRAGMENT") as lib:
+            logged_shapes = []
+            logged_dtypes = []
+            lib.define("log(Tensor x) -> Tensor")
+
+            def log_impl(x):
+                logged_shapes.append(list(x.shape))
+                logged_dtypes.append(x.dtype)
+                return x.clone()
 
-        def log_meta(x):
-            return x.clone()
+            def log_meta(x):
+                return x.clone()
 
-        for backend in ["CPU", "CUDA"]:
-            lib.impl(
-                "log",
-                log_impl,
-                backend,
-            )
-        lib.impl("log", log_meta, "Meta")
+            for backend in ["CPU", "CUDA"]:
+                lib.impl(
+                    "log",
+                    log_impl,
+                    backend,
+                )
+            lib.impl("log", log_meta, "Meta")
 
-        def pack_fp8_with_scale_and_log(x):
-            torch.ops._test_aotdispatch_lib.log(x)
-            return _pack_fp8_with_scale_wrap(x)
+            def pack_fp8_with_scale_and_log(x):
+                torch.ops._test_aotdispatch_lib.log(x)
+                return _pack_fp8_with_scale_wrap(x)
 
-        def unpack_fp8_with_scale_and_log(packed):
-            return _unpack_fp8_with_scale_wrap(packed)
+            def unpack_fp8_with_scale_and_log(packed):
+                return _unpack_fp8_with_scale_wrap(packed)
 
-        def m_inp_fn():
-            x = torch.ones(
-                2, 2, 2, device=device, dtype=torch.float64, requires_grad=True
-            )
-            torch._dynamo.mark_dynamic(x, 0)
-            torch._dynamo.mark_dynamic(x, 1)
-            return (x,)
+            def m_inp_fn():
+                x = torch.ones(
+                    2, 2, 2, device=device, dtype=torch.float64, requires_grad=True
+                )
+                torch._dynamo.mark_dynamic(x, 0)
+                torch._dynamo.mark_dynamic(x, 1)
+                return (x,)
 
-        class SAF0(torch.autograd.Function):
-            @staticmethod
-            def forward(ctx, x):
-                ctx.save_for_backward(x)
-                return x
+            class SAF0(torch.autograd.Function):
+                @staticmethod
+                def forward(ctx, x):
+                    ctx.save_for_backward(x)
+                    return x
 
-            @staticmethod
-            def backward(ctx, gx):
-                (saved_x,) = ctx.saved_tensors
-                return gx + saved_x
+                @staticmethod
+                def backward(ctx, gx):
+                    (saved_x,) = ctx.saved_tensors
+                    return gx + saved_x
 
-        class M(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.fc1 = nn.Linear(2, 2)
-                self.relu = nn.ReLU()
-                self.fc2 = nn.Linear(2, 2)
+            class M(torch.nn.Module):
+                def __init__(self):
+                    super().__init__()
+                    self.fc1 = nn.Linear(2, 2)
+                    self.relu = nn.ReLU()
+                    self.fc2 = nn.Linear(2, 2)
 
-            def forward(self, x):
-                x = SAF0.apply(x)
-                x = x.to(dtype=torch.float32)
-                x = self.fc1(x)
-                x = self.relu(x)
-                x = self.fc2(x)
-                return x
+                def forward(self, x):
+                    x = SAF0.apply(x)
+                    x = x.to(dtype=torch.float32)
+                    x = self.fc1(x)
+                    x = self.relu(x)
+                    x = self.fc2(x)
+                    return x
 
-        def _reset_logged():
-            logged_shapes.clear()
-            logged_dtypes.clear()
+            def _reset_logged():
+                logged_shapes.clear()
+                logged_dtypes.clear()
 
-        device = torch.device("cuda:0")
-        m = M().to(device=device)
+            device = torch.device("cuda:0")
+            m = M().to(device=device)
 
-        def _test_m():
-            self._test_pack_hooks(
-                m,
-                m_inp_fn,
-                [
-                    (
+            def _test_m():
+                self._test_pack_hooks(
+                    m,
+                    m_inp_fn,
+                    [
                         (
-                            pack_fp8_with_scale_and_log,
-                            unpack_fp8_with_scale_and_log,
-                        ),
-                        True,
-                    )
-                ],
-                pre_compile_fn=_reset_logged,
-                backend="aot_eager",
-            )
-
-        with patch(
-            "torch._functorch.config.saved_tensors_hooks_filtering_mode", "donated"
-        ):
-            _reset_logged()
-            _test_m()
-            # Check that hooks were not applied to Parameters
-            # parameters excluded
-            self.assertFalse([2, 2] in logged_shapes)
-            self.assertTrue([2, 2, 2] in logged_shapes)
-            # input excluded
-            self.assertFalse(torch.float64 in logged_dtypes)
+                            (
+                                pack_fp8_with_scale_and_log,
+                                unpack_fp8_with_scale_and_log,
+                            ),
+                            True,
+                        )
+                    ],
+                    pre_compile_fn=_reset_logged,
+                    backend="aot_eager",
+                )
 
-        with patch(
-            "torch._functorch.config.saved_tensors_hooks_filtering_mode", "no_static"
-        ):
-            _reset_logged()
-            _test_m()
-            # Check that hooks were not applied to Parameters
-            # parameters excluded
-            self.assertFalse([2, 2] in logged_shapes)
-            self.assertTrue([2, 2, 2] in logged_shapes)
-            self.assertTrue(torch.float64 in logged_dtypes)
-
-        with patch("torch._functorch.config.saved_tensors_hooks_filtering_mode", "all"):
-            _reset_logged()
-            _test_m()
-            # Check that hooks were applied to all saved tensors
-            self.assertTrue([2, 2] in logged_shapes)
-            self.assertTrue([2, 2, 2] in logged_shapes)
-            self.assertTrue(torch.float64 in logged_dtypes)
+            with patch(
+                "torch._functorch.config.saved_tensors_hooks_filtering_mode", "donated"
+            ):
+                _reset_logged()
+                _test_m()
+                # Check that hooks were not applied to Parameters
+                # parameters excluded
+                self.assertFalse([2, 2] in logged_shapes)
+                self.assertTrue([2, 2, 2] in logged_shapes)
+                # input excluded
+                self.assertFalse(torch.float64 in logged_dtypes)
+
+            with patch(
+                "torch._functorch.config.saved_tensors_hooks_filtering_mode",
+                "no_static",
+            ):
+                _reset_logged()
+                _test_m()
+                # Check that hooks were not applied to Parameters
+                # parameters excluded
+                self.assertFalse([2, 2] in logged_shapes)
+                self.assertTrue([2, 2, 2] in logged_shapes)
+                self.assertTrue(torch.float64 in logged_dtypes)
+
+            with patch(
+                "torch._functorch.config.saved_tensors_hooks_filtering_mode", "all"
+            ):
+                _reset_logged()
+                _test_m()
+                # Check that hooks were applied to all saved tensors
+                self.assertTrue([2, 2] in logged_shapes)
+                self.assertTrue([2, 2, 2] in logged_shapes)
+                self.assertTrue(torch.float64 in logged_dtypes)
 
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA is unavailable")
     @unittest.skipIf(not SM80OrLater, "bfloat16, float8")
diff --git a/test/test_fake_tensor.py b/test/test_fake_tensor.py
index 1ee0668c9dc63..a7f31ce07be77 100644
--- a/test/test_fake_tensor.py
+++ b/test/test_fake_tensor.py
@@ -115,10 +115,9 @@ def test_basic(self):
             self.assertTrue(isinstance(z, FakeTensor))
 
     def test_custom_op_fallback(self):
-        from torch.library import impl, Library
+        from torch.library import _scoped_library, impl
 
-        try:
-            test_lib = Library("my_test_op", "DEF")  # noqa: TOR901
+        with _scoped_library("my_test_op", "DEF") as test_lib:
             test_lib.define("foo(Tensor self) -> Tensor")
 
             @impl(test_lib, "foo", "CPU")
@@ -133,9 +132,6 @@ def foo_impl(self):
                     x = mode.from_tensor(x)
                     torch.ops.my_test_op.foo(x)
 
-        finally:
-            test_lib._destroy()
-
     def test_parameter_instantiation(self):
         with FakeTensorMode():
             x = torch.rand([4])
diff --git a/test/test_fx_passes.py b/test/test_fx_passes.py
index 4a48d0cd966aa..2f551338d7e63 100644
--- a/test/test_fx_passes.py
+++ b/test/test_fx_passes.py
@@ -855,7 +855,7 @@ def setup(cls):
 
     @classmethod
     def tearDown(cls):
-        del cls.quantization
+        cls.quantization._destroy()
 
     @staticmethod
     def forward(self, arg0_1, arg1_1):
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 8296f386f0977..b2c4a000fb980 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1017,40 +1017,39 @@ def _test_dynamic(self, fn, trace_inputs, test_inputs, assert_eq=True):
 
 
     def test_debug_interpreter(self):
-        import torch.library
-        from torch.library import Library
+        from torch.library import _scoped_library
 
-        foo = Library("foo", "DEF")  # noqa: TOR901
-        foo.define("foo(Tensor self) -> Tensor")
+        with _scoped_library("foo", "DEF") as foo:
+            foo.define("foo(Tensor self) -> Tensor")
 
-        # Operator where meta and cpu disagree on strides
-        @torch.library.impl(foo, "foo", "CPU")
-        def foo_cpu(x):
-            return x.clone().T
+            # Operator where meta and cpu disagree on strides
+            @torch.library.impl(foo, "foo", "CPU")
+            def foo_cpu(x):
+                return x.clone().T
 
-        @torch.library.impl(foo, "foo", "Meta")
-        def foo_meta(x):
-            return x.clone()
+            @torch.library.impl(foo, "foo", "Meta")
+            def foo_meta(x):
+                return x.clone()
 
-        def f(x):
-            return torch.ops.foo.foo.default(x)
+            def f(x):
+                return torch.ops.foo.foo.default(x)
 
-        gm = make_fx(f, tracing_mode="symbolic")(torch.randn(2, 2))
-        from torch._functorch.compilers import DebugInterpreter
+            gm = make_fx(f, tracing_mode="symbolic")(torch.randn(2, 2))
+            from torch._functorch.compilers import DebugInterpreter
 
-        interp = DebugInterpreter(gm)
+            interp = DebugInterpreter(gm)
 
-        # input mismatch is caught (indicates guard problem)
-        self.assertRaisesRegex(
-            AssertionError, r"3 != 1",
-            lambda: interp.run(torch.randn(3, 3).T),
-        )
+            # input mismatch is caught (indicates guard problem)
+            self.assertRaisesRegex(
+                AssertionError, r"3 != 1",
+                lambda: interp.run(torch.randn(3, 3).T),
+            )
 
-        # Catch the incorrect meta
-        self.assertRaisesRegex(
-            AssertionError, r"\(3, 1\) != \(1, 3\)",
-            lambda: interp.run(torch.randn(3, 3))
-        )
+            # Catch the incorrect meta
+            self.assertRaisesRegex(
+                AssertionError, r"\(3, 1\) != \(1, 3\)",
+                lambda: interp.run(torch.randn(3, 3))
+            )
 
     def test_int_input(self):
         def f(x, y):
diff --git a/test/test_torch.py b/test/test_torch.py
index 54a756b5940cf..66b021920594f 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -10331,33 +10331,32 @@ def __del__(self):
 
     @skipIfTorchDynamo("Not a suitable test for TorchDynamo")
     def test_storage_preserve_nonhermetic_in_hermetic_context(self):
-        from torch.library import Library, impl
+        from torch.library import _scoped_library, impl
         global _my_storage
 
-        my_lib = Library("my_lib", "DEF")  # noqa: TOR901
-        self.addCleanup(my_lib._destroy)
-        my_lib.define('my_func() -> None')
+        with _scoped_library("my_lib", "DEF") as my_lib:
+            my_lib.define('my_func() -> None')
 
-        a = torch.tensor([1.])
-        _my_storage = a.untyped_storage()
+            a = torch.tensor([1.])
+            _my_storage = a.untyped_storage()
 
-        m, t = Tracker.make()
-        _my_storage._tracker = t
-        del t
+            m, t = Tracker.make()
+            _my_storage._tracker = t
+            del t
 
-        @impl(my_lib, 'my_func', '')
-        def my_func():
-            global _my_storage
-            del _my_storage
+            @impl(my_lib, 'my_func', '')
+            def my_func():
+                global _my_storage
+                del _my_storage
 
-        self.assertFalse(m[0])
-        torch.ops.my_lib.my_func()
-        self.assertFalse(m[0])
+            self.assertFalse(m[0])
+            torch.ops.my_lib.my_func()
+            self.assertFalse(m[0])
 
-        s = a.untyped_storage()
-        del a
-        del s
-        self.assertTrue(m[0])
+            s = a.untyped_storage()
+            del a
+            del s
+            self.assertTrue(m[0])
 
     # FIXME: move to test_autograd?
     @skipIfTorchDynamo("TorchDynamo does not work well with hooks")

From 5f7b013bcacb1d6848bc1397450c7acc04d155e0 Mon Sep 17 00:00:00 2001
From: "Nichols A. Romero" <165712832+naromero77amd@users.noreply.github.com>
Date: Tue, 19 May 2026 17:29:51 -0500
Subject: [PATCH 85/87] [release/2.11][UP][ROCm][inductor] Use
 hipModuleLoadData in StaticCudaLauncher (#3238)

## Summary
- Backports upstream PyTorch PR pytorch/pytorch#183926 to ROCm
release/2.11.
- Uses `hipModuleLoadData` for ROCm static launcher module loading to
avoid retaining open HSACO file descriptors.
- Leaves the CUDA/NVIDIA path unchanged.
- Resolves Jira https://amd-hub.atlassian.net/browse/ROCM-24659,
https://amd-hub.atlassian.net/browse/ROCM-24664

Made with [Cursor](https://cursor.com)

Co-authored-by: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
---
 torch/csrc/inductor/static_launcher/cuda.cpp | 21 +++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/torch/csrc/inductor/static_launcher/cuda.cpp b/torch/csrc/inductor/static_launcher/cuda.cpp
index a2378b7c1a248..9687a57346106 100644
--- a/torch/csrc/inductor/static_launcher/cuda.cpp
+++ b/torch/csrc/inductor/static_launcher/cuda.cpp
@@ -12,6 +12,9 @@
 
 #if defined(USE_ROCM)
 #include <hip/hip_runtime_api.h>
+#include <fstream>
+#include <iterator>
+#include <vector>
 #endif
 
 /**
@@ -101,6 +104,19 @@ CUdeviceptr getPointer(PyObject* obj) {
 
 #define SHARED_MEM_STATIC_MAX 49152 // 48 KB
 
+#if defined(USE_ROCM)
+std::vector<char> readKernelImage(const std::string& filePath) {
+  std::ifstream file(filePath, std::ios::binary);
+  TORCH_CHECK(file, "Failed to open kernel image: ", filePath);
+
+  auto begin = std::istreambuf_iterator<char>(file);
+  auto end = std::istreambuf_iterator<char>();
+  std::vector<char> image(begin, end);
+  TORCH_CHECK(!image.empty(), "Kernel image is empty: ", filePath);
+  return image;
+}
+#endif
+
 CUfunction loadKernel(
     std::string filePath,
     const std::string& funcName,
@@ -116,7 +132,10 @@ CUfunction loadKernel(
   CUfunction func = nullptr;
 
 #if defined(USE_ROCM)
-  AT_CUDA_DRIVER_CHECK(hipModuleLoad(&mod, filePath.c_str()));
+  // Unlike cuModuleLoad, hipModuleLoad keeps a file descriptor for the loaded
+  // HSACO. Load from memory to avoid retaining one FD per static launcher.
+  auto image = readKernelImage(filePath);
+  AT_CUDA_DRIVER_CHECK(hipModuleLoadData(&mod, image.data()));
   AT_CUDA_DRIVER_CHECK(hipModuleGetFunction(&func, mod, funcName.c_str()));
   int shared_optin = 0;
   AT_CUDA_DRIVER_CHECK(hipDeviceGetAttribute(

From 6c683dd613ed22b748d513c35082ba94de91cd49 Mon Sep 17 00:00:00 2001
From: Umesh Chand <umesh.chand@amd.com>
Date: Wed, 20 May 2026 15:48:26 -0700
Subject: [PATCH 86/87] =?UTF-8?q?[Inductor]=20Fix=20flaky=20epilogue=20fus?=
 =?UTF-8?q?ion=20tests=20by=20adding=20missing=20tearDown=E2=80=A6=20(#324?=
 =?UTF-8?q?5)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Full upstream title: [Inductor] Fix flaky epilogue fusion tests by
adding missing tearDownClass (#180736)

TestPrologueFusion and TestEpilogueFusionStaticAnalysis both use
ExitStack in setUpClass to apply config.patch(), but neither defined
tearDownClass to close the stack. When TestPrologueFusion runs before
TestEpilogueFusionStaticAnalysis in the same process, config values like
max_autotune_gemm_backends="TRITON" leak through, removing the aten
kernel choice from autotuning and causing test failures.

Fixes #179693

Pull Request resolved: https://github.com/pytorch/pytorch/pull/180736
Approved by: https://github.com/Skylion007

## Motivation

<!-- Explain the purpose of this PR and the goals it aims to achieve.
-->

## Technical Details

<!-- Explain the changes along with any relevant GitHub links. -->

## Test Plan

<!-- Explain any relevant testing done to verify this PR. -->

## Test Result

<!-- Briefly summarize test outcomes. -->

## Submission Checklist

- [ ] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.

Co-authored-by: NikhilAPatel <nikhilap@meta.com>
---
 test/inductor/test_max_autotune.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py
index d94dd81b4673a..5d3855aa73e93 100644
--- a/test/inductor/test_max_autotune.py
+++ b/test/inductor/test_max_autotune.py
@@ -3450,6 +3450,11 @@ def setUpClass(cls):
             )
         )
 
+    @classmethod
+    def tearDownClass(cls):
+        cls._stack.close()
+        super().tearDownClass()
+
     def check_code(self, code_str, num_kernels, num_allocs, num_deallocs):
         FileCheck().check(get_func_call()).check_count(
             get_kernel_launch(),
@@ -3943,6 +3948,11 @@ def setUpClass(cls):
             )
         )
 
+    @classmethod
+    def tearDownClass(cls):
+        cls._stack.close()
+        super().tearDownClass()
+
     @contextlib.contextmanager
     def get_common_patches(self, async_compile: bool, persistent_tma: bool):
         common_patches = (

From 53679bf172c72d6ebb8135203930df6a199b8f79 Mon Sep 17 00:00:00 2001
From: Darren Lao <Darren.Lao@amd.com>
Date: Thu, 21 May 2026 11:27:07 -0400
Subject: [PATCH 87/87] [release/2.11] Free deferred record_stream blocks at
 graph capture end (#175817) (#3241)

## Motivation

Fixes the test `TestMemPool.test_graph_capture_reclaim_shared_pool`,
which fails in TheRock wheels:
https://github.com/ROCm/TheRock/issues/4925

The test was brought into `release/2.11` by the cherry-pick of upstream
pytorch/pytorch#176024 in #3182, but the allocator fix from upstream
pytorch/pytorch#175817 was not.

Without this fix, `endAllocateToPool` (called from
`CUDAGraph::capture_end`) does not reclaim `record_stream`-deferred
blocks, so a second graph capture into the same shared pool cannot reuse
the block freed in the first capture.

## Technical Details

Cherry-pick of upstream pytorch/pytorch#175817 (commit
`b55e5314fb72f1ea782f72a6c9728a40c12678ea`) on top of `release/2.11`.

## Test Plan

- Build PyTorch wheels from this branch and verify that the test
`TestMemPool.test_graph_capture_reclaim_shared_pool` is now passing.

## Test Result

- `TestMemPool.test_graph_capture_reclaim_shared_pool` passed for torch
2.11:
https://github.com/ROCm/TheRock/actions/runs/26116907093/job/76816330885

## Submission Checklist

- [x] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.

Co-authored-by: Frank Lin <eee4017@gmail.com>
---
 c10/cuda/CUDACachingAllocator.cpp | 66 +++++++++++++++++++++++++------
 test/test_cuda.py                 | 13 +-----
 2 files changed, 56 insertions(+), 23 deletions(-)

diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index 7a053b8134ef7..268d5fb70fdbc 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -2774,18 +2774,62 @@ class DeviceCachingAllocator {
   void endAllocateToPool(MempoolId_t mempool_id) {
     std::lock_guard<std::recursive_mutex> lock(mutex);
 
-    if (CUDAAllocatorConfig::graph_capture_record_stream_reuse() &&
-        !graph_reuse_context.empty()) {
-      auto capture_id = mempool_to_capture_id[mempool_id];
-      auto graph_context = graph_reuse_context[capture_id];
-      for (auto& [stream, _] : graph_context.visited) {
-        TORCH_INTERNAL_ASSERT(
-            stream_get_capture_info(stream).status ==
-                cudaStreamCaptureStatusNone,
-            "This stream should not be capturing when the capture is ended");
+    if (CUDAAllocatorConfig::graph_capture_record_stream_reuse()) {
+      // Remove stream reuse context and mapping for this capture, if present.
+      if (!graph_reuse_context.empty()) {
+        auto capture_id = mempool_to_capture_id[mempool_id];
+        auto graph_context = graph_reuse_context[capture_id];
+        for (auto& [stream, _] : graph_context.visited) {
+          TORCH_INTERNAL_ASSERT(
+              stream_get_capture_info(stream).status ==
+                  cudaStreamCaptureStatusNone,
+              "This stream should not be capturing when the capture is ended");
+        }
+        graph_reuse_context.erase(capture_id);
+        mempool_to_capture_id.erase(mempool_id);
+      }
+
+      // Free deferred blocks associated with the ended pool, if any.
+      // cudaStreamEndCapture would have failed if any stream used during
+      // capture hadn't been joined back, so all stream uses on these
+      // blocks are known to be complete and we can safely clear them.
+      if (!deferred_blocks.empty()) {
+        auto pool_it = graph_pools.find(mempool_id);
+        if (pool_it != graph_pools.end()) {
+          auto* private_pool = pool_it->second.get();
+          auto context = maybeGatherContext(RecordContext::ALL);
+          std::vector<Block*> blocks_to_erase;
+          for (auto& [block, markers] : deferred_blocks) {
+            if (block->pool->owner_PrivatePool == private_pool) {
+              // At capture end, handle blocks associated with non-capturing
+              // streams. Remove only stream uses introduced during capture
+              // (guaranteed complete), and for any leftover pre-capture uses,
+              // insert events to track their completion. This aligns with
+              // insert_events_deferred_until_no_capture semantics.
+              remove_cudagraph_stream_uses(block);
+              if (block->stream_uses.empty()) {
+                free_block(block, context);
+              } else {
+                // Pre-capture stream uses remain; record events so
+                // process_events can free the block once they complete.
+                insert_events(block);
+                // block->event_count should likely be non-zero here since
+                // block->stream_uses is not empty. Defensive: still free if
+                // event_count is zero, but this should be rare.
+                if (block->event_count == 0) {
+                  free_block(block, context);
+                }
+              }
+              // Must erase from deferred_blocks regardless of which branch we
+              // took.
+              blocks_to_erase.push_back(block);
+            }
+          }
+          for (auto* b : blocks_to_erase) {
+            deferred_blocks.erase(b);
+          }
+        }
       }
-      graph_reuse_context.erase(capture_id);
-      mempool_to_capture_id.erase(mempool_id);
     }
 
     for (auto it = captures_underway.begin(); it != captures_underway.end();
diff --git a/test/test_cuda.py b/test/test_cuda.py
index 56625d716244c..5dd2a7346c79b 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -132,7 +132,6 @@
 
 _cycles_per_ms = None
 
-
 _wait_for_cpu_kernel = None
 
 
@@ -8379,17 +8378,7 @@ class TestCudaDeviceParametrized(TestCase):
     def test_graph_external_wait_and_record(self):
         torch.cuda.empty_cache()
 
-        kernel_source = r"""
-        __global__ void wait_for_cpu(int *pinned_cpu_flag) {
-            int flag = 0;
-            do {
-                    asm volatile("ld.relaxed.sys.global.s32 %0, [%1];" : "=r"(flag) : "l"(pinned_cpu_flag) : "memory");
-            } while (flag == 0);
-        }
-        """
-        from torch.cuda import _compile_kernel
-
-        spin_wait_kernel = _compile_kernel(kernel_source, "wait_for_cpu")
+        spin_wait_kernel = get_wait_for_cpu_kernel()
 
         x = torch.ones(4, device="cuda")
         x_cpu = torch.zeros(x.shape, device="cpu").pin_memory()