84 commits
0ccb2bb
Fix gradient is ready with z2 (#7829)
sfc-gh-truwase Feb 3, 2026
bea50ef
add autoep
tohtana Feb 7, 2026
a44fb58
Fix AutoTP custom patterns: respect use_default_specs (#7827)
tohtana Feb 7, 2026
2c041db
add checkpointing
tohtana Feb 7, 2026
c2a89bc
fix format
tohtana Feb 7, 2026
fd07c93
add custom patterns
tohtana Feb 8, 2026
cabfebc
fix optimizer resumption
tohtana Feb 9, 2026
72b9465
Support new python 3.14 annotation handling (#7831)
sdvillal Feb 10, 2026
2c36283
fix: replace deprecated fractions.gcd with math.gcd (#7845)
Mr-Neutr0n Feb 11, 2026
1752c2a
Fix bf16 gradient norm divergence with ZeRO stage 0 (#7839)
tohtana Feb 12, 2026
d2ca6e7
Replace torch.jit.script with torch.compile (#7835) (#7840)
tohtana Feb 12, 2026
b74df16
Merge pull request #7850 from deepspeedai/loadams/update-post-0.18.6
loadams Feb 13, 2026
84af822
Z1/2 init: flatten params on device (#7828)
ksugama Feb 13, 2026
a2ab10d
autoep: fix post-dispatch local expert permutation grouping
tohtana Feb 13, 2026
7f49367
Enable shm_comm support for arm (#7800)
phalani-paladugu Feb 15, 2026
8a63a2a
Add news entry for DeepSpeed updates (#7854)
PKUWZP Feb 16, 2026
f3a9819
Add EXAONE 4.0 model support for Inference V2 (#7853)
Bias92 Feb 17, 2026
dbc1b07
Fix ROCm BF16 conversion intrinsics in inference v2 (#7843) (#7846)
tohtana Feb 18, 2026
c9e652c
Fix compilation of Evoformer (#7862)
Flamefire Feb 21, 2026
57b10d5
Throw error when parameter is modified in GatheredParameters (#7832)
tohtana Feb 21, 2026
93524c8
Fix Zero-3 static scale assertion in fp16 test (#7866)
tohtana Feb 22, 2026
0416cf6
Schedule nightly full test (#7870)
tohtana Feb 24, 2026
efc0b49
Fix broken links and add AutoTP Training tutorial to sidebar nav (#7874)
tohtana Feb 25, 2026
ae21699
fix: replace 35 bare except clauses with except Exception (#7873)
haosenwang1018 Feb 26, 2026
a15e557
perf: use deque for FIFO queues in sequence parallel, superoffload, a…
giulio-leone Mar 1, 2026
116dbe2
Fix: only add parameter with grads to parameter group (#7869)
delock Mar 1, 2026
bffaf45
Fix no-grad grad-fn lookup in ZeRO hook counting on PyTorch 2.3 (#783…
tohtana Mar 1, 2026
04d69cc
Fix import deepspeed crash on PyTorch v2.3 + Python 3.12 (#7875)
tohtana Mar 2, 2026
d8e15da
XPU use stock pytorch instead of Intel Extension for PyTorch (#7877)
delock Mar 2, 2026
a41a96b
Remove amp() from abstract accelerator (#7879)
delock Mar 2, 2026
4dba1e2
Add document section explaining autocast nesting (#7883)
tohtana Mar 4, 2026
6c59d54
Fix hook count performance regression from v0.18.5 (#7886)
tohtana Mar 5, 2026
285cae3
Suppress see_memory_usage logs (#7891)
sfc-gh-truwase Mar 8, 2026
d9a4aad
[Bloom] Fix hangs of bloom test (#7890)
k-artem Mar 10, 2026
b6346bf
double reduction user-friendly error (#7895)
stas00 Mar 10, 2026
f88d0f8
Fix async_io ops building error on Huawei Ascend NPU (#7894)
huangyifan0610 Mar 12, 2026
784cc26
Fix Evoformer's multi-arch dispatch root cause (#7881)
tohtana Mar 13, 2026
63eeb11
fix: Validate fp16.loss_scale is finite and non-negative (#7889)
nathon-lee Mar 13, 2026
49a82a0
Add AGENTS.md and CLAUDE.md with project rules for AI coding agents (…
delock Mar 13, 2026
be60451
fix(zero3): use current_stream() instead of default_stream() for grad…
michaelroyzen Mar 13, 2026
5f7b687
Update version (#7903)
loadams Mar 13, 2026
38bd11a
Respect `$TRITON_HOME` (#7907)
Flamefire Mar 23, 2026
f2bb1ec
Add Feature Universal Checkpoint for AutoTP (#7908)
nathon-lee Mar 24, 2026
5ce3abb
fix: remove unnecessary shell=True in ROCm GPU architecture detection…
instantraaamen Mar 24, 2026
26c954f
Don't detect local GPU if `$DS_IGNORE_CUDA_DETECTION` is set (#7896)
Flamefire Mar 24, 2026
a240c4d
Add HuggingFace tp_plan support for AutoTP (#7901)
delock Mar 25, 2026
f887b98
fix: handle non-existent path in is_nfs_path for Triton autotune cach…
Krishnachaitanyakc Mar 25, 2026
138f20d
Fix backward compatibility of torch.amp.custom_fwd for PyTorch < 2.4 …
tohtana Mar 25, 2026
956ec6f
Extending Muon Optimizer Support for ZeRO Stage 3 (#7919)
PKUWZP Mar 26, 2026
62c3e6d
Add news item for ASPLOS 2026 Best Paper Award (#7923)
PKUWZP Mar 26, 2026
729df6c
fix(superoffload) preserve multi-group updates with shared cpu buffer…
xylian86 Mar 28, 2026
abb88ce
AGENTS.md: Add pre-commit command to existing CI requirements line (#…
delock Mar 29, 2026
2bae360
Update README with latest news from DeepSpeed (#7931)
PKUWZP Mar 30, 2026
5efb24a
Merging AutoSP into DeepSpeed (#7860)
neeldani Mar 30, 2026
36f0b0c
Add fallback to full test (#7933)
tohtana Mar 30, 2026
9486ab7
Remove Microsoft Corporation copyright from AGENTS.md and CLAUDE.md (…
PKUWZP Mar 30, 2026
8c93851
Update version.txt for latest incoming release 0.18.9 (#7935)
loadams Mar 30, 2026
607b55f
Update version after latest release (v0.18.9) (#7936)
loadams Mar 30, 2026
89bf0d2
Refactor consolidate transpose (#7934)
nathon-lee Mar 30, 2026
e79c2e8
Merge branch 'master' into tohtana/add_autoep
tohtana Mar 31, 2026
3bdebc0
Fix/fix autotp universal checkpoint ci (#7937)
tohtana Mar 31, 2026
2f0924a
Fix process hang in process-group shutdown (#7941)
Flamefire Mar 31, 2026
5dce124
Zero3 defragment utility (#7940)
nathon-lee Mar 31, 2026
046db04
fix(autoep): restore ep_count helper
tohtana Apr 1, 2026
71a0a36
test(autoep): make checkpoint tests cpu-safe
tohtana Apr 1, 2026
bf0126b
[SP] add SP deny list instead of allow (#7887)
kashif Apr 1, 2026
fae0276
Fix AutoEP ZeRO-2 expert gradient scaling
tohtana Apr 3, 2026
37e232f
fix(zero): detach flat buffer to prevent autograd inplace error on CP…
delock Apr 3, 2026
5f7dc1e
fix(autoep): preserve manual backward parity
tohtana Apr 5, 2026
3a5df51
Fix FPQuantizer build (#7963)
Flamefire Apr 8, 2026
90f86c7
fix(autoep): align combine path with grouped-mm baseline
tohtana Apr 10, 2026
b207c5e
feat(autoep): add selectable combine implementations
tohtana Apr 11, 2026
9d632f1
Fix zero 1 and 2 CPU-offloaded gradient norm (#7967)
alek6kun Apr 11, 2026
dac1525
Fix overlap-comm buffer lifetimes (#7965)
tohtana Apr 11, 2026
ecb26a5
Fix DeepCompile+Z3 on PyTorch v2.9/2.10 (#7951)
tohtana Apr 11, 2026
0e872cc
Merge branch 'master' into tohtana/add_autoep
tohtana Apr 11, 2026
3fd762c
Fix WarmupCosineLR multi-group initialization (#7969)
tohtana Apr 12, 2026
0ba2352
Enable PyTorch version selection for full test (#7968)
tohtana Apr 12, 2026
3ba3dc9
fix(fp_quantizer): fix UB and negative shift warnings in fp_quantize_…
Cursx Apr 15, 2026
893c6d2
fix(op_builder): avoid duplicate/wrong -gencode flags (#7974)
Cursx Apr 15, 2026
dc0fd29
Rename dequantization template parameters (#7976)
Flamefire Apr 15, 2026
6b65486
Merge branch 'master' into tohtana/add_autoep
tohtana Apr 16, 2026
2c873a9
test(autoep): update combine_from_routed calls
tohtana Apr 16, 2026
240495c
fix(autoep): support llama4 fused experts
tohtana Apr 16, 2026
175 changes: 165 additions & 10 deletions .github/workflows/aws-torch-latest-full.yml
@@ -2,25 +2,76 @@
# DeepSpeed CI - AWS L40S GPU Full Tests (PyTorch Latest)
#
# Runs the full DeepSpeed unit test suite on AWS self-hosted runners.
# Uses 4x NVIDIA L40S GPUs on g6e.12xlarge instances.
# Prefers 4x NVIDIA L40S GPUs on g6e.12xlarge instances, with AWS-side
# fallback to 8x A100 nodes when L40S capacity is unavailable.
#
# This workflow runs:
# - Parallel tests with pytest-xdist (-n 8)
# - Sequential tests marked with @pytest.mark.sequential
# - Nightly schedule: skips if no new commits since last successful run
################################################################################

name: aws-torch-latest-full

on:
schedule:
- cron: '0 8 * * *' # Daily at 08:00 UTC (midnight PST)
workflow_dispatch:
inputs:
torch_preset:
description: PyTorch preset to install for manual runs
required: false
default: '2.7.1-cu126'
type: choice
options:
- '2.7.1-cu126'
- '2.8.0-cu126'
- '2.9.1-cu126'
- '2.10.0-cu126'
- '2.11.0-cu126'

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
check-changes:
name: Check for new commits
runs-on: ubuntu-latest
if: github.event_name == 'schedule'
outputs:
has_changes: ${{ steps.check.outputs.has_changes }}
steps:
- name: Check for commits since last successful run
id: check
env:
GH_TOKEN: ${{ github.token }}
run: |
default_branch="${{ github.event.repository.default_branch }}"

last_sha=$(gh api \
"repos/${{ github.repository }}/actions/workflows/aws-torch-latest-full.yml/runs?status=success&event=schedule&branch=${default_branch}&per_page=1" \
--jq '.workflow_runs[0].head_sha // empty')

current_sha="${{ github.sha }}"

if [ -z "$last_sha" ]; then
echo "No previous successful run found - running tests"
echo "has_changes=true" >> "$GITHUB_OUTPUT"
elif [ "$last_sha" = "$current_sha" ]; then
echo "No new commits since last successful run ($last_sha) - skipping"
echo "has_changes=false" >> "$GITHUB_OUTPUT"
else
echo "New commits detected: $last_sha -> $current_sha - running tests"
echo "has_changes=true" >> "$GITHUB_OUTPUT"
fi
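The skip decision in the shell script above reduces to a simple SHA comparison. A sketch of the same logic in Python (the helper name is invented for illustration; it is not part of the workflow):

```python
def should_run(last_successful_sha, current_sha):
    """Decide whether the nightly run has new commits to test.

    Mirrors the shell logic above: run when there is no previous
    successful scheduled run, or when HEAD has moved since it.
    """
    if not last_successful_sha:
        return True  # no baseline run found: run the full suite
    return last_successful_sha != current_sha

print(should_run("", "89bf0d2"))         # no previous run -> True
print(should_run("89bf0d2", "89bf0d2"))  # unchanged HEAD -> False
print(should_run("89bf0d2", "240495c"))  # new commits -> True
```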

unit-tests:
name: Unit Tests (Full)
needs: [check-changes]
if: |
always() &&
(github.event_name == 'workflow_dispatch' || needs.check-changes.outputs.has_changes == 'true')
runs-on: [self-hosted, gpu-ci, gpu-l40s, l40s-4gpu, aws]
timeout-minutes: 180

@@ -30,11 +81,10 @@ jobs:
options: --gpus all --shm-size "32G" -v /mnt/aio:/mnt/aio

env:
TORCH_VER: "2.7"
CUDA_VER: "12.6"
DEFAULT_TORCH_PRESET: '2.7.1-cu126'
CUTLASS_PATH: /opt/cutlass
# Disable reuse_dist_env to prevent pool worker cleanup hangs in full test runs
DS_DISABLE_REUSE_DIST_ENV: "1"
DS_DISABLE_REUSE_DIST_ENV: '1'

steps:
- name: Install system dependencies
@@ -48,6 +98,79 @@
with:
lfs: true

- name: Resolve PyTorch preset
env:
GITHUB_EVENT_NAME: ${{ github.event_name }}
MANUAL_TORCH_PRESET: ${{ github.event.inputs.torch_preset || '' }}
run: |
if [ "$GITHUB_EVENT_NAME" = 'workflow_dispatch' ] && [ -n "$MANUAL_TORCH_PRESET" ]; then
selected_preset="$MANUAL_TORCH_PRESET"
else
selected_preset="$DEFAULT_TORCH_PRESET"
fi

case "$selected_preset" in
'2.7.1-cu126')
torch_install_version='2.7.1'
torchvision_install_version='0.22.1'
torchaudio_install_version='2.7.1'
torch_test_version='2.7'
cuda_test_version='12.6'
pytorch_index_url='https://download.pytorch.org/whl/cu126'
;;
'2.8.0-cu126')
torch_install_version='2.8.0'
torchvision_install_version='0.23.0'
torchaudio_install_version='2.8.0'
torch_test_version='2.8'
cuda_test_version='12.6'
pytorch_index_url='https://download.pytorch.org/whl/cu126'
;;
'2.9.1-cu126')
torch_install_version='2.9.1'
torchvision_install_version='0.24.1'
torchaudio_install_version='2.9.1'
torch_test_version='2.9'
cuda_test_version='12.6'
pytorch_index_url='https://download.pytorch.org/whl/cu126'
;;
'2.10.0-cu126')
torch_install_version='2.10.0'
torchvision_install_version='0.25.0'
torchaudio_install_version='2.10.0'
torch_test_version='2.10'
cuda_test_version='12.6'
pytorch_index_url='https://download.pytorch.org/whl/cu126'
;;
'2.11.0-cu126')
torch_install_version='2.11.0'
torchvision_install_version='0.26.0'
torchaudio_install_version='2.11.0'
torch_test_version='2.11'
cuda_test_version='12.6'
pytorch_index_url='https://download.pytorch.org/whl/cu126'
;;
*)
echo "Unsupported torch_preset: $selected_preset" >&2
exit 1
;;
esac

{
echo "SELECTED_TORCH_PRESET=$selected_preset"
echo "TORCH_INSTALL_VERSION=$torch_install_version"
echo "TORCHVISION_INSTALL_VERSION=$torchvision_install_version"
echo "TORCHAUDIO_INSTALL_VERSION=$torchaudio_install_version"
echo "TORCH_TEST_VERSION=$torch_test_version"
echo "CUDA_TEST_VERSION=$cuda_test_version"
echo "PYTORCH_INDEX_URL=$pytorch_index_url"
} >> "$GITHUB_ENV"

echo "Selected preset: $selected_preset"
echo "Resolved install tuple: torch==$torch_install_version torchvision==$torchvision_install_version torchaudio==$torchaudio_install_version"
echo "Resolved test expectations: torch=$torch_test_version cuda=$cuda_test_version"
echo "Resolved PyTorch index: $pytorch_index_url"
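The `case` statement above is a straight table lookup; the same mapping could be expressed as a dictionary (version numbers are copied from the workflow; the helper name is invented):

```python
# Preset -> (torch, torchvision, torchaudio, test torch, test cuda),
# matching the case statement in the workflow step above.
TORCH_PRESETS = {
    "2.7.1-cu126": ("2.7.1", "0.22.1", "2.7.1", "2.7", "12.6"),
    "2.8.0-cu126": ("2.8.0", "0.23.0", "2.8.0", "2.8", "12.6"),
    "2.9.1-cu126": ("2.9.1", "0.24.1", "2.9.1", "2.9", "12.6"),
    "2.10.0-cu126": ("2.10.0", "0.25.0", "2.10.0", "2.10", "12.6"),
    "2.11.0-cu126": ("2.11.0", "0.26.0", "2.11.0", "2.11", "12.6"),
}

def resolve_preset(name):
    """Return the install/test version tuple for a preset, or fail loudly."""
    if name not in TORCH_PRESETS:
        raise ValueError(f"Unsupported torch_preset: {name}")
    return TORCH_PRESETS[name]
```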

- name: Install CUTLASS
run: |
git clone --depth 1 --branch v3.5.1 https://github.com/NVIDIA/cutlass.git /opt/cutlass
@@ -56,7 +179,11 @@

- name: Install PyTorch
run: |
pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu126
pip install \
torch=="$TORCH_INSTALL_VERSION" \
torchvision=="$TORCHVISION_INSTALL_VERSION" \
torchaudio=="$TORCHAUDIO_INSTALL_VERSION" \
--index-url "$PYTORCH_INDEX_URL"

- name: Install transformers
run: |
@@ -75,6 +202,12 @@

- name: Check environment
run: |
echo "=== Selected PyTorch Preset ==="
echo "Preset: $SELECTED_TORCH_PRESET"
echo "Install tuple: torch==$TORCH_INSTALL_VERSION torchvision==$TORCHVISION_INSTALL_VERSION torchaudio==$TORCHAUDIO_INSTALL_VERSION"
echo "PyTorch index URL: $PYTORCH_INDEX_URL"
echo "Expected test versions: torch=$TORCH_TEST_VERSION cuda=$CUDA_TEST_VERSION"
echo ""
echo "=== GPU Information ==="
nvidia-smi
echo ""
@@ -90,10 +223,32 @@
echo ""
echo "=== CUTLASS ==="
echo "CUTLASS_PATH: $CUTLASS_PATH"
ls -la $CUTLASS_PATH/include/ | head -5
ls -la "$CUTLASS_PATH"/include/ | head -5

- name: Detect GPU architecture
run: |
python - <<'PY'
import os
import torch

torch.cuda.init()
major, minor = torch.cuda.get_device_capability(0)
arch = f"{major}.{minor}"
gpu_count = torch.cuda.device_count()
gpu_name = torch.cuda.get_device_name(0)

with open(os.environ["GITHUB_ENV"], "a", encoding="utf-8") as env_file:
env_file.write(f"TORCH_CUDA_ARCH_LIST={arch}\n")
env_file.write(f"GPU_COUNT={gpu_count}\n")

print(f"Detected GPU: {gpu_name}")
print(f"Detected compute capability: {arch}")
print(f"Detected GPU count: {gpu_count}")
PY

- name: Install DeepSpeed
run: |
echo "Using TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
# Initialize CUDA before install so setup.py can detect NCCL version
python -c "import torch; torch.cuda.init(); print(f'NCCL version: {torch.cuda.nccl.version()}')"
# Use --no-build-isolation so setup.py can access pre-installed PyTorch
@@ -106,7 +261,7 @@

- name: Unit tests (parallel)
run: |
export TORCH_CUDA_ARCH_LIST="8.9"
echo "Running parallel tests with TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST on $GPU_COUNT GPUs"
cd tests
# Skip tests requiring unavailable hardware or known issues:
# - nvme checkpointing: no nvme device
@@ -120,11 +275,11 @@
--ignore=unit/launcher/test_user_args.py \
--ignore=unit/runtime/zenflow \
--ignore=unit/ops/adam/test_zf_torch_adam.py \
--torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}
--torch_ver="$TORCH_TEST_VERSION" --cuda_ver="$CUDA_TEST_VERSION"

- name: Unit tests (sequential)
run: |
export TORCH_CUDA_ARCH_LIST="8.9"
echo "Running sequential tests with TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST on $GPU_COUNT GPUs"
cd tests
rm -rf /mnt/aio/pytest
pytest --instafail --timeout 600 --forked -m 'sequential' --basetemp=/mnt/aio/pytest unit/ \
@@ -134,4 +289,4 @@
--ignore=unit/runtime/zenflow \
--ignore=unit/ops/adam/test_zf_torch_adam.py \
--ignore=unit/ops/deepspeed4science/test_DS4Sci_EvoformerAttention.py \
--torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}
--torch_ver="$TORCH_TEST_VERSION" --cuda_ver="$CUDA_TEST_VERSION"
15 changes: 12 additions & 3 deletions .github/workflows/nv-pre-compile-ops.yml
@@ -23,11 +23,20 @@ jobs:
unit-tests:
runs-on: ubuntu-24.04
container:
image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116
image: nvidia/cuda:12.6.3-devel-ubuntu22.04

steps:
- name: Install system dependencies
run: |
apt-get update && apt-get install -y git python3 python3-pip libaio-dev ninja-build
ln -sf /usr/bin/python3 /usr/bin/python

- uses: actions/checkout@v4

- name: Install PyTorch
run: |
pip install torch==2.10.0 --index-url https://download.pytorch.org/whl/cu126

- name: environment
run: |
which python
@@ -36,7 +45,7 @@
#python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Compile DeepSpeed Ops
run: |
DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 DS_BUILD_DEEP_COMPILE=0 pip3 install .
DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 DS_BUILD_DEEP_COMPILE=0 pip3 install .
- name: DS Report
run: |
ds_report
DS_ACCELERATOR=cuda ds_report
10 changes: 3 additions & 7 deletions .github/workflows/xpu-compile.yml
@@ -20,7 +20,7 @@
compile-tests:
runs-on: [self-hosted, intel, xpu]
container:
image: intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04
image: intel/oneapi-basekit:2025.0.2-0-devel-ubuntu22.04
ports:
- 80
options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL
@@ -31,11 +31,7 @@
run: |
apt-get update
apt-get install clinfo libaio-dev python3-pip -y
pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/
pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/
pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/
pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/
pip install https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v3.0.0b2/triton_xpu-3.0.0b2-cp310-cp310-linux_x86_64.whl
pip install torch==2.10.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
pip install py-cpuinfo numpy
pip install .[dev,autotuning]

@@ -44,7 +40,7 @@
ldd --version
ds_report
python3 -c "import torch; print('torch:', torch.__version__, torch)"
python3 -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())"
python3 -c "import torch; print('XPU available:', torch.xpu.is_available())"
python3 -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)"
pip list

5 changes: 2 additions & 3 deletions .github/workflows/xpu-max1100.yml
@@ -50,8 +50,7 @@
apt-get install -y python3.11 python3.11-dev python3-pip clinfo libaio-dev
pip install --upgrade pip
pip install py-cpuinfo
pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu
pip install intel-extension-for-pytorch==2.7.10+xpu oneccl_bind_pt==2.7.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us --trusted-host pytorch-extension.intel.com
pip install torch==2.10.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
pip install .[dev,autotuning]

- name: Check container state
@@ -60,7 +59,7 @@
ldd --version
ds_report
python3 -c "import torch; print('torch:', torch.__version__, torch)"
python3 -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())"
python3 -c "import torch; print('XPU available:', torch.xpu.is_available())"
python3 -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)"
pip list

32 changes: 32 additions & 0 deletions AGENTS.md
@@ -0,0 +1,32 @@
<!-- This file is duplicated as CLAUDE.md and AGENTS.md. Keep them in sync. -->
# AGENTS.md — Workspace-level instructions for AI coding agents

## DeepSpeed Project Rules

### Commit & CI requirements

- All commits MUST have a `Signed-off-by` line (use `--signoff`). Get the name and email from `git config user.name` / `git config user.email`.
- Formatting: yapf (column_limit=119, `.style.yapf`) + flake8 (`.flake8`).
- Always verify changed files pass pre-commit checks before committing: `pre-commit run --files <changed_files>`. Only check modified files, not the entire codebase. Config: `.pre-commit-config.yaml`.
- `check-torchdist` hook: NEVER directly import torch's distributed module. Use `import deepspeed.comm as dist` instead.
- New files require license header:
```
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
```
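A quick way to verify that a new file carries the required header (a hypothetical helper for illustration, not a hook that exists in the repo):

```python
REQUIRED_HEADER_LINES = (
    "# SPDX-License-Identifier: Apache-2.0",
    "# DeepSpeed Team",
)

def has_license_header(text):
    """Check that both required header lines appear near the top of a file."""
    head = text.splitlines()[:6]  # allow a shebang/encoding line above the header
    return all(line in head for line in REQUIRED_HEADER_LINES)

good = "# SPDX-License-Identifier: Apache-2.0\n# DeepSpeed Team\n\nimport os\n"
assert has_license_header(good)
assert not has_license_header("import os\n")
```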

### Code change discipline

- NEVER make cosmetic/formatting-only changes to existing code. Only add/modify lines that are functionally necessary. Minimizing diff noise is critical for code review.
- Delete dead code decisively — if code is unused at runtime (only referenced in tests), remove it along with its tests.
- Prefer consolidating tests over proliferating test files.
- Blend in: when modifying code, read the surrounding context and match the style of neighboring code (naming, spacing, patterns, idioms).
- Write beginner-friendly code: avoid deeply nested expressions or chained logic. Break complex expressions into clear, named intermediate steps.
- Comments should explain **why**, not **what**. Describe the purpose and reasoning, not the mechanics that the code already shows.
- New features must include corresponding tests and documentation updates.

## Tool Caveats

### Edit tool auto-formatter

The Edit tool has a hidden auto-formatter that silently changes quotes, whitespace, blank lines, and line wrapping. For format-sensitive modifications (e.g., when exact formatting matters for pre-commit), use `bash` with `sed`, `python`, or `cat` instead.
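For example, an exact, format-preserving replacement can be scripted in Python instead of going through the Edit tool (the file contents below are illustrative):

```python
import os
import tempfile
from pathlib import Path

def replace_exact(path, old, new):
    """Byte-for-byte replacement that preserves quotes, whitespace, and line endings."""
    data = Path(path).read_text()
    if old not in data:
        raise ValueError(f"pattern not found in {path}")
    Path(path).write_text(data.replace(old, new, 1))

# Demonstrate on a throwaway file with format-sensitive content.
fd, tmp = tempfile.mkstemp(suffix=".yml")
os.close(fd)
Path(tmp).write_text('TORCH_VER: "2.7"\n')
replace_exact(tmp, '"2.7"', '"2.10"')
assert Path(tmp).read_text() == 'TORCH_VER: "2.10"\n'
os.unlink(tmp)
```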