Update torch pin nightly to 20260312

mergennachin · mergennachin · commit edb7b6854c80 · 2026-03-16T17:26:56.000-04:00
diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-659af3c353e49b35c191cdd2dba3b3c79d0e6822
+08b6f48d871affbc7abe9277020aed882fdf110a
diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh
@@ -19,12 +19,9 @@ Arguments:
   hf_model    HuggingFace model ID (required)
               Supported models:
                 - mistralai/Voxtral-Mini-3B-2507
-                - nvidia/diar_streaming_sortformer_4spk-v2
                 - openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
                 - google/gemma-3-4b-it
-                - Qwen/Qwen3-0.6B
                 - nvidia/parakeet-tdt
-                - facebook/dinov2-small-imagenet1k-1-layer
                 - mistralai/Voxtral-Mini-4B-Realtime-2602
 
   quant_name  Quantization type (required)
@@ -46,7 +43,6 @@ Arguments:
 Examples:
   test_model_e2e.sh metal "openai/whisper-small" "non-quantized"
   test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
-  test_model_e2e.sh cuda "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./model_output"
   test_model_e2e.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./model_output"
   test_model_e2e.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./model_output"
   test_model_e2e.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "." "vr-streaming"
@@ -155,18 +151,6 @@ case "$HF_MODEL" in
     AUDIO_FILE=""
     IMAGE_PATH="docs/source/_static/img/et-logo.png"
     ;;
-  Qwen/Qwen3-0.6B)
-    MODEL_NAME="qwen3"
-    RUNNER_TARGET="llama_main"
-    RUNNER_PATH="llama"
-    EXPECTED_OUTPUT="Paris"
-    PREPROCESSOR=""
-    TOKENIZER_URL="https://huggingface.co/Qwen/Qwen3-0.6B/resolve/main" # @lint-ignore
-    TOKENIZER_FILE=""
-    AUDIO_URL=""
-    AUDIO_FILE=""
-    IMAGE_PATH=""
-    ;;
   nvidia/parakeet-tdt)
     MODEL_NAME="parakeet"
     RUNNER_TARGET="parakeet_runner"
@@ -179,31 +163,6 @@ case "$HF_MODEL" in
     AUDIO_FILE="test_audio.wav"
     IMAGE_PATH=""
     ;;
-  nvidia/diar_streaming_sortformer_4spk-v2)
-    MODEL_NAME="sortformer"
-    RUNNER_TARGET="sortformer_runner"
-    RUNNER_PATH="sortformer"
-    EXPECTED_OUTPUT="Speaker 1"
-    PREPROCESSOR=""
-    TOKENIZER_URL=""
-    TOKENIZER_FILE=""
-    AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
-    AUDIO_FILE="poem.wav"
-    IMAGE_PATH=""
-    ;;
-  facebook/dinov2-small-imagenet1k-1-layer)
-    MODEL_NAME="dinov2"
-    RUNNER_TARGET="dinov2_runner"
-    RUNNER_PATH="dinov2"
-    EXPECTED_OUTPUT="Samoyed"
-    PREPROCESSOR=""
-    TOKENIZER_URL=""
-    TOKENIZER_FILE=""
-    AUDIO_URL=""
-    AUDIO_FILE=""
-    IMAGE_URL="https://github.com/pytorch/hub/raw/master/images/dog.jpg"
-    IMAGE_PATH=""
-    ;;
   mistralai/Voxtral-Mini-4B-Realtime-2602)
     MODEL_NAME="voxtral_realtime"
     RUNNER_TARGET="voxtral_realtime_runner"
@@ -218,7 +177,7 @@ case "$HF_MODEL" in
     ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, nvidia/parakeet-tdt"
     exit 1
     ;;
 esac
@@ -231,8 +190,8 @@ echo "::endgroup::"
 echo "::group::Prepare $MODEL_NAME Artifacts"
 
 
-# Download tokenizer files (skip for models that bundle tokenizer in export or do not use one)
-if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ] && [ "$MODEL_NAME" != "sortformer" ] && [ "$MODEL_NAME" != "dinov2" ]; then
+# Download tokenizer files (skip for parakeet and voxtral_realtime which bundle tokenizer in export)
+if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ]; then
   if [ "$TOKENIZER_FILE" != "" ]; then
     curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE
   else
@@ -248,15 +207,10 @@ if [ "$AUDIO_URL" != "" ]; then
 elif [[ "$MODEL_NAME" == *whisper* ]] || [ "$MODEL_NAME" = "voxtral_realtime" ]; then
   conda install -y -c conda-forge "ffmpeg<8"
   pip install datasets soundfile
-  pip install torchcodec==0.11.0.dev20260217 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+  pip install torchcodec==0.11.0.dev20260312 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
   python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])"
 fi
 
-# Download test image for vision models
-if [ -n "${IMAGE_URL:-}" ]; then
-  curl -L "$IMAGE_URL" -o "${MODEL_DIR}/test_image.jpg"
-fi
-
 ls -al
 echo "::endgroup::"
 
@@ -292,14 +246,9 @@ if [ "$(uname -s)" = "Darwin" ] && [ -f "$RUNNER_BIN" ]; then
     install_name_tool -change /opt/llvm-openmp/lib/libomp.dylib @rpath/libomp.dylib "$RUNNER_BIN"
   fi
 fi
-# For CUDA, add named data argument (Metal embeds data in .pte).
-# Llama runner uses --data_paths, other runners use --data_path.
+# For CUDA, add data_path argument (Metal embeds data in .pte)
 if [ "$DEVICE" = "cuda" ]; then
-  if [ "$RUNNER_PATH" = "llama" ]; then
-    RUNNER_ARGS="$RUNNER_ARGS --data_paths ${MODEL_DIR}/aoti_cuda_blob.ptd"
-  else
-    RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
-  fi
+  RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
 fi
 
 # Add model-specific arguments
@@ -313,44 +262,26 @@ case "$MODEL_NAME" in
   gemma3)
     RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --image_path $IMAGE_PATH"
     ;;
-  qwen3)
-    PROMPT_FILE="${MODEL_DIR}/qwen3_prompt.txt"
-    cat > "${PROMPT_FILE}" << 'EOF'
-<|im_start|>user
-What is the capital of France?<|im_end|>
-<|im_start|>assistant
-EOF
-    RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --prompt_file ${PROMPT_FILE}"
-    ;;
   parakeet)
     RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --audio_path ${MODEL_DIR}/$AUDIO_FILE --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE"
     # For CUDA, add data_path argument (Metal embeds data in .pte)
     if [ "$DEVICE" = "cuda" ]; then
       RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
     fi
     ;;
-  sortformer)
-    RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --audio_path ${MODEL_DIR}/$AUDIO_FILE"
-    if [ "$DEVICE" = "cuda" ]; then
-      RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
-    fi
-    ;;
-  dinov2)
-    RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --image_path ${MODEL_DIR}/test_image.jpg"
-    if [ "$DEVICE" = "cuda" ]; then
-      RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
-    fi
-    ;;
   voxtral_realtime)
     RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0"
-    # Add CUDA data path if present
-    if [ "$DEVICE" = "cuda" ] && [ -f "${MODEL_DIR}/aoti_cuda_blob.ptd" ]; then
-      RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
-    fi
     # Determine streaming mode based on MODE parameter
-    USE_STREAMING="true"
-    if [ "$MODE" = "vr-offline" ]; then
+    USE_STREAMING="false"
+    if [ "$MODE" = "vr-streaming" ]; then
+      USE_STREAMING="true"
+    elif [ "$MODE" = "vr-offline" ]; then
       USE_STREAMING="false"
+    elif [ -z "$MODE" ]; then
+      # Auto-detect: XNNPACK uses streaming, others use offline
+      if [ "$DEVICE" = "xnnpack" ]; then
+        USE_STREAMING="true"
+      fi
     fi
     # Add streaming flag if needed
     if [ "$USE_STREAMING" = "true" ]; then
diff --git a/.ci/scripts/test_wheel_package_qnn.sh b/.ci/scripts/test_wheel_package_qnn.sh
@@ -18,9 +18,6 @@ import argparse
 
 import torch
 from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer
-from executorch.backends.qualcomm.serialization.qc_schema import (
-    QnnExecuTorchBackendType,
-)
 from executorch.backends.qualcomm.utils.utils import (
     generate_htp_compiler_spec,
     generate_qnn_executorch_compiler_spec,
@@ -53,7 +50,7 @@ def main() -> None:
     example_inputs = model.get_example_inputs()
 
     if args.quantization:
-        quantizer = QnnQuantizer(backend=QnnExecuTorchBackendType.kHtpBackend, soc_model=get_soc_to_chipset_map()[args.soc])
+        quantizer = QnnQuantizer()
         m = torch.export.export(model.eval(), example_inputs, strict=True).module()
         if args.quantization == "qat":
             m = prepare_qat_pt2e(m, quantizer)
@@ -89,14 +86,6 @@ EOF
 # ----------------------------
 echo "=== Building Wheel Package ==="
 source .ci/scripts/utils.sh
-
-# Ensure QNN SDK is available so setup.py auto-detects it.
-source backends/qualcomm/scripts/install_qnn_sdk.sh
-install_qnn
-
-# Make QNN SDK libraries available for runtime loading (e.g. libQnnHtp.so)
-export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang/:${LD_LIBRARY_PATH:-}"
-
 install_executorch
 EXECUTORCH_BUILDING_WHEEL=1 python setup.py bdist_wheel
 unset EXECUTORCH_BUILDING_WHEEL
diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh
@@ -141,9 +141,9 @@ install_pytorch_and_domains() {
 
   dedupe_macos_loader_path_rpaths
   # Grab the pinned audio and vision commits from PyTorch
-  TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
+  TORCHAUDIO_VERSION=release/2.11
   export TORCHAUDIO_VERSION
-  TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt)
+  TORCHVISION_VERSION=release/0.26
   export TORCHVISION_VERSION
 
   install_domains
diff --git a/examples/models/moshi/mimi/install_requirements.sh b/examples/models/moshi/mimi/install_requirements.sh
@@ -8,7 +8,7 @@
 set -x
 
 sudo apt install ffmpeg -y
-pip install torchcodec==0.11.0.dev20260217 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+pip install torchcodec==0.11.0.dev20260312 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
 pip install moshi==0.2.11
 pip install bitsandbytes soundfile einops
 # Run llama2/install requirements for torchao deps
diff --git a/exir/sym_util.py b/exir/sym_util.py
@@ -25,7 +25,10 @@ def eval_expr(symint: Union[int, torch.SymInt]) -> Optional[int]:
     shape_env = node.shape_env
     expr = node.expr
     try:
-        output = shape_env.size_hint(expr)
+        if hasattr(shape_env, "guarding_hint_or_throw"):
+            output = shape_env.guarding_hint_or_throw(expr)
+        else:
+            output = shape_env.size_hint(expr)
     except torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode:
         return None
     return int(output)
diff --git a/runtime/core/portable_type/c10/c10/util/complex_math.h b/runtime/core/portable_type/c10/c10/util/complex_math.h
@@ -86,6 +86,41 @@ C10_HOST_DEVICE inline c10::complex<T> pow(
 #endif
 }
 
+// Regression in ROCm 7.2. See https://github.com/ROCm/rocm-libraries/pull/3836.
+// Specialized version for complex<float> on AMD GPUs to use FMA-based
+// multiplication
+#if defined(__HIPCC__)
+namespace detail {
+// FMA-aware complex multiplication for float precision on AMD GPUs.
+// This prevents the SLP vectorizer from breaking FMA formation, which causes
+// numerical precision loss in complex arithmetic.
+// The issue occurs when the vectorizer packs scalar multiplies before the
+// backend can form FMA instructions, causing double rounding instead of single.
+C10_HOST_DEVICE inline thrust::complex<float> complex_mul_fma(
+    thrust::complex<float> a,
+    thrust::complex<float> b) {
+  // Complex multiplication: (a.r + a.i*i) * (b.r + b.i*i)
+  // = (a.r*b.r - a.i*b.i) + (a.r*b.i + a.i*b.r)*i
+  // Using __builtin_fmaf ensures FMA at source level:
+  // real: a.r*b.r + (-(a.i*b.i)) = FMA(a.r, b.r, -(a.i*b.i))
+  // imag: a.i*b.r + a.r*b.i = FMA(a.r, b.i, a.i*b.r)
+  float real_part = __builtin_fmaf(a.real(), b.real(), -(a.imag() * b.imag()));
+  float imag_part = __builtin_fmaf(a.real(), b.imag(), a.imag() * b.real());
+  return thrust::complex<float>(real_part, imag_part);
+}
+} // namespace detail
+
+template <>
+C10_HOST_DEVICE inline c10::complex<float> pow(
+    const c10::complex<float>& x,
+    const c10::complex<float>& y) {
+  auto log_x = thrust::log(static_cast<thrust::complex<float>>(x));
+  auto y_log_x =
+      detail::complex_mul_fma(static_cast<thrust::complex<float>>(y), log_x);
+  return static_cast<c10::complex<float>>(thrust::exp(y_log_x));
+}
+#endif
+
 template <typename T>
 C10_HOST_DEVICE inline c10::complex<T> pow(
     const c10::complex<T>& x,
diff --git a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
@@ -629,7 +629,7 @@ __host__ __device__
 // This macro is used to find older C++ compilers
 // that don't support move optimization for return values.
 
-#if (defined(__GNUC__) && __GNUC__ < 13) || \
+#if (defined(__GNUC__) && __GNUC__ < 13 && __cplusplus < 202002L) || \
     (defined(__clang_major__) && __clang_major__ < 13)
 #define C10_RETURN_MOVE_IF_OLD_COMPILER 1
 #else
diff --git a/torch_pin.py b/torch_pin.py
@@ -1,2 +1,2 @@
 TORCH_VERSION = "2.11.0"
-NIGHTLY_VERSION = "dev20260215"
+NIGHTLY_VERSION = "dev20260312"

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-659af3c353e49b35c191cdd2dba3b3c79d0e6822`
	`1`	`+08b6f48d871affbc7abe9277020aed882fdf110a`
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`TORCH_VERSION = "2.11.0"`
`2`		`-NIGHTLY_VERSION = "dev20260215"`
	`2`	`+NIGHTLY_VERSION = "dev20260312"`