ROCm · rkayaith · Mar 25, 2026 · Mar 25, 2026 · Mar 25, 2026 · Mar 30, 2026
diff --git a/.gitignore b/.gitignore
@@ -106,6 +106,7 @@ torch/lib/*.lib
 torch/lib/*.pdb
 torch/lib/*.so*
 torch/lib/protobuf*.pc
+torch/lib/aotriton.images/
 torch/lib/build
 torch/lib/caffe2/
 torch/lib/cmake

diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
@@ -102,6 +102,8 @@ file(GLOB hip_nvrtc_stub_h "hip/nvrtc_stub/*.h")
 file(GLOB hip_nvrtc_stub_cpp "hip/nvrtc_stub/*.cpp")
 file(GLOB miopen_h "miopen/*.h")
 file(GLOB miopen_cpp "miopen/*.cpp")
+file(GLOB hipdnn_h "hipdnn/*.h")
+file(GLOB hipdnn_cpp "hipdnn/*.cpp")
 
 file(GLOB mkl_cpp "mkl/*.cpp")
 file(GLOB mkldnn_cpp "mkldnn/*.cpp")
@@ -179,7 +181,7 @@ file(GLOB native_hip_hip "native/hip/*.hip" "native/hip/bgemm_kernels/*.hip")
 file(GLOB native_hip_cpp "native/hip/*.cpp")
 file(GLOB native_hip_linalg_cpp "native/hip/linalg/*.cpp")
 file(GLOB native_miopen_cpp "native/miopen/*.cpp")
-file(GLOB native_cudnn_hip_cpp "native/cudnn/hip/*.cpp")
+file(GLOB native_hipdnn_cpp "native/hipdnn/*.cpp")
 file(GLOB native_nested_hip_hip "native/nested/hip/*.hip")
 file(GLOB native_nested_hip_cpp "native/nested/hip/*.cpp")
 file(GLOB native_sparse_hip_hip "native/sparse/hip/*.hip")
@@ -568,6 +570,7 @@ if(USE_CUDA)
     ${native_cuda_cpp}
     ${native_cudnn_cpp}
     ${native_miopen_cpp}
+    ${native_hipdnn_cpp}
     ${native_nested_cuda_cpp}
     ${native_quantized_cuda_cpp}
     ${native_quantized_cudnn_cpp}
@@ -640,17 +643,21 @@ if(USE_ROCM)
     ${native_sparse_hip_cpp}
     ${native_quantized_hip_cpp}
     ${native_transformers_hip_cpp}
-    ${native_quantized_cudnn_hip_cpp}
+    ${native_quantized_cudnn_cpp}
     ${hip_cpp}
     ${native_hip_cpp}
     ${native_hip_linalg_cpp}
     ${cuda_generated_sources}
     ${ATen_HIP_SRCS}
     ${native_miopen_cpp}
-    ${native_cudnn_hip_cpp}
+    ${native_hipdnn_cpp}
+    ${native_cudnn_cpp}
     ${miopen_cpp}
     ${all_hip_cpp}
   )
+  if(USE_HIPDNN)
+    list(APPEND all_hip_cpp ${hipdnn_cpp})
+  endif()
 endif()
 
 if(USE_XPU)
@@ -937,6 +944,9 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake"
 set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS} ${native_nested_h} ${ATen_TRANSFORMER_HEADERS})
 if(NOT INTERN_BUILD_MOBILE)
   list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${native_mtia_h} ${cudnn_h} ${hip_h} ${mtia_h} ${xpu_h} ${mps_h} ${native_kleidiai_h} ${native_mps_h} ${native_utils_h} ${miopen_h})
+  if(USE_HIPDNN)
+    list(APPEND INSTALL_HEADERS ${hipdnn_h})
+  endif()
   # Metal
   if(USE_PYTORCH_METAL_EXPORT)
     # Add files needed from exporting metal models(optimized_for_mobile)

diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp
@@ -319,6 +319,14 @@ void Context::setImmediateMiopen(bool b) {
   immediate_miopen = b;
 }
 
+bool Context::userEnabledHipdnn() const {
+  return enabled_hipdnn;
+}
+
+void Context::setUserEnabledHipdnn(bool e) {
+  enabled_hipdnn = e;
+}
+
 bool Context::allowTF32CuBLAS() const {
   bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
   bool allow_tf32_new = float32Precision(Float32Backend::CUDA, Float32Op::MATMUL) == Float32Precision::TF32;

diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h
@@ -244,6 +244,8 @@ class TORCH_API Context {
   void setBenchmarkLimitCuDNN(int /*b*/);
   bool immediateMiopen() const;
   void setImmediateMiopen(bool /*b*/);
+  bool userEnabledHipdnn() const;
+  void setUserEnabledHipdnn(bool e);
   bool deterministicCuDNN() const;
   void setDeterministicCuDNN(bool /*b*/);
   bool deterministicMkldnn() const;
@@ -478,6 +480,7 @@ class TORCH_API Context {
   bool allow_fp16_bf16_reduction_mathSDP = false;
   bool benchmark_cudnn = false;
   bool immediate_miopen = false;
+  bool enabled_hipdnn = false;
   Float32MatmulPrecision float32_matmul_precision =
       c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true
       ? at::Float32MatmulPrecision::HIGH

diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
@@ -276,6 +276,14 @@ bool CUDAHooks::compiledWithMIOpen() const {
   return AT_ROCM_ENABLED();
 }
 
+bool CUDAHooks::compiledWithHipDNN() const {
+#ifdef USE_HIPDNN
+  return true;
+#else
+  return false;
+#endif
+}
+
 bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const {
 #if AT_CUDNN_ENABLED()
   if (!hasCUDA()) {

diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h
@@ -42,6 +42,7 @@ struct CUDAHooks : public at::CUDAHooksInterface {
   Allocator* getPinnedMemoryAllocator() const override;
   bool compiledWithCuDNN() const override;
   bool compiledWithMIOpen() const override;
+  bool compiledWithHipDNN() const override;
   bool supportsDilatedConvolutionWithCuDNN() const override;
   bool supportsDepthwiseConvolutionWithCuDNN() const override;
   bool supportsBFloat16ConvolutionWithCuDNNv8() const override;

diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h
@@ -154,6 +154,10 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
     return false;
   }
 
+  virtual bool compiledWithHipDNN() const {
+    return false;
+  }
+
   virtual bool supportsDilatedConvolutionWithCuDNN() const {
     return false;
   }

diff --git a/aten/src/ATen/functorch/BatchRulesNorm.cpp b/aten/src/ATen/functorch/BatchRulesNorm.cpp
@@ -815,6 +815,60 @@ struct MiopenBatchNormBackwardBatchRuleHelper {
       decltype(&fn),\
       &fn>::apply)
 
+template <typename F, F Func>
+struct HipdnnBatchNormBatchRuleHelper {
+  static std::tuple<Tensor, std::optional<int64_t>,Tensor, std::optional<int64_t>,Tensor, std::optional<int64_t>> apply(
+    const Tensor& input, std::optional<int64_t> input_bdim,
+    const Tensor& weight_opt, std::optional<int64_t> weight_bdim,
+    const std::optional<Tensor>& bias_opt, std::optional<int64_t> bias_bdim,
+    const std::optional<Tensor>& running_mean_opt, std::optional<int64_t> running_mean_bdim,
+    const std::optional<Tensor>& running_var_opt, std::optional<int64_t> running_var_bdim,
+    bool training, double momentum, double eps) {
+    return batch_norm_batch_rule<F, Func>(
+        input, input_bdim, weight_opt, weight_bdim, bias_opt, bias_bdim,
+        running_mean_opt, running_mean_bdim, running_var_opt, running_var_bdim, training, momentum, eps);
+  }
+};
+
+template <typename F, F Func>
+struct HipdnnBatchNormBackwardBatchRuleHelper {
+  static std::tuple<Tensor,Tensor,Tensor> apply(
+    const at::Tensor & input,
+    const at::Tensor & grad_out,
+    const at::Tensor & weight,
+    const std::optional<at::Tensor> & running_mean_opt,
+    const std::optional<at::Tensor> & running_var_opt,
+    const std::optional<at::Tensor> & save_mean_opt,
+    const std::optional<at::Tensor> & save_rstd_opt,
+    double eps) {
+
+    auto maybe_layer = maybeCurrentDynamicLayer();
+    vmap_check_escaped(maybe_layer, "HipdnnBatchNormBackwardBatchRuleHelper.apply");
+    // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
+    int64_t cur_level = maybe_layer->layerId();
+
+    if (!areAnyBatchedAtLevel({input, grad_out, weight, running_mean_opt,
+          running_var_opt, save_mean_opt, save_rstd_opt}, cur_level)) {
+      c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
+      return at::hipdnn_batch_norm_backward(input, grad_out, weight,
+          running_mean_opt, running_var_opt, save_mean_opt, save_rstd_opt, eps);
+    }
+
+    return batch_norm_backward_plumbing<F, Func>(
+        grad_out, input, weight, running_mean_opt, running_var_opt, save_mean_opt, save_rstd_opt, true, eps, {true, true, true});
+  }
+};
+
+#define HIPDNN_BATCH_NORM_BATCH_RULE(fn) SINGLE_ARG(\
+    HipdnnBatchNormBatchRuleHelper<\
+      decltype(&ATEN_FN(fn)),\
+      &ATEN_FN(fn)>::apply)
+
+#define HIPDNN_BATCH_NORM_BACKWARD_BATCH_RULE(fn) SINGLE_ARG(\
+    HipdnnBatchNormBackwardBatchRuleHelper<\
+      decltype(&fn),\
+      &fn>::apply)
+
 static std::tuple<at::Tensor,at::Tensor,at::Tensor> cudnn_batch_norm_backward_wrapper(
     const at::Tensor & grad_out,
     const at::Tensor & input,
@@ -846,6 +900,21 @@ static std::tuple<at::Tensor,at::Tensor,at::Tensor> miopen_batch_norm_backward_w
     return at::miopen_batch_norm_backward(input, grad_out, weight_opt, running_mean_opt, running_var_opt, save_mean_opt, save_rstd_opt, eps);
   }
 
+static std::tuple<at::Tensor,at::Tensor,at::Tensor> hipdnn_batch_norm_backward_wrapper(
+    const at::Tensor & grad_out,
+    const at::Tensor & input,
+    const at::Tensor& weight_opt,
+    const std::optional<at::Tensor> & running_mean_opt,
+    const std::optional<at::Tensor> & running_var_opt,
+    const std::optional<at::Tensor> & save_mean_opt,
+    const std::optional<at::Tensor> & save_rstd_opt,
+    bool training,
+    double eps,
+    std::array<bool,3> output_mask) {
+    TORCH_INTERNAL_ASSERT(!training);
+    return at::hipdnn_batch_norm_backward(input, grad_out, weight_opt, running_mean_opt, running_var_opt, save_mean_opt, save_rstd_opt, eps);
+  }
+
 // NB: This is NOT good. In the ideal world, we do NOT want to convert the new legit op back into native_batch_norm
 // as native_batch_norm has a problematic schema--it promises it is functional when it is not. However, vmap doesn't
 // work with dynamo anyway so we gain some buffer room to do wrong things here. The (reasonable) hope is that we will
@@ -866,11 +935,13 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   VMAP_SUPPORT(native_batch_norm, NATIVE_BATCH_NORM_BATCH_RULE(native_batch_norm));
   VMAP_SUPPORT(cudnn_batch_norm, CUDNN_BATCH_NORM_BATCH_RULE(cudnn_batch_norm));
   VMAP_SUPPORT(miopen_batch_norm, MIOPEN_BATCH_NORM_BATCH_RULE(miopen_batch_norm));
+  VMAP_SUPPORT(hipdnn_batch_norm, HIPDNN_BATCH_NORM_BATCH_RULE(hipdnn_batch_norm));
   m.impl("_native_batch_norm_legit", _native_batch_norm_legit_batch);
   m.impl("_native_batch_norm_legit.no_stats", _native_batch_norm_legit_no_stats_batch);
   m.impl("native_batch_norm_backward", NATIVE_BATCH_NORM_BACKWARD_BATCH_RULE(native_batch_norm_backward));
   m.impl("cudnn_batch_norm_backward", CUDNN_BATCH_NORM_BACKWARD_BATCH_RULE(at::functorch::cudnn_batch_norm_backward_wrapper));
   m.impl("miopen_batch_norm_backward", MIOPEN_BATCH_NORM_BACKWARD_BATCH_RULE(at::functorch::miopen_batch_norm_backward_wrapper));
+  m.impl("hipdnn_batch_norm_backward", HIPDNN_BATCH_NORM_BACKWARD_BATCH_RULE(at::functorch::hipdnn_batch_norm_backward_wrapper));
   m.impl("native_group_norm", native_group_norm_plumbing);
   m.impl("native_group_norm_backward", native_group_norm_backward_plumbing);
   VMAP_SUPPORT(native_layer_norm, native_layer_norm_batch_rule);

diff --git a/aten/src/ATen/hip_compat/include/ATen/cuda/Exceptions.h b/aten/src/ATen/hip_compat/include/ATen/cuda/Exceptions.h
@@ -0,0 +1,8 @@
+#pragma once
+
+// Shim of `<ATen/cuda/Exceptions.h>` for HIP builds. Defines
+// AT_CUDNN_FRONTEND_CHECK in terms of hipDNN's check macro.
+
+#include <ATen/hipdnn/Exceptions.h>
+
+#define AT_CUDNN_FRONTEND_CHECK(e) HIPDNN_FE_CHECK(e)
diff --git a/aten/src/ATen/hip_compat/include/ATen/cudnn/Handle.h b/aten/src/ATen/hip_compat/include/ATen/cudnn/Handle.h
@@ -0,0 +1,13 @@
+#pragma once
+
+// Shim of `<ATen/cudnn/Handle.h>` for HIP builds. Forwards the cuDNN handle
+// symbols to their hipDNN equivalents so non-hipified files compile against
+// the cuDNN-named API.
+
+#include <ATen/hipdnn/Handle.h>
+#include <ATen/hipdnn/hipdnn-wrapper.h>
+
+using cudnnHandle_t = hipdnnHandle_t;
+inline cudnnHandle_t getCudnnHandle() {
+  return at::native::getHipdnnHandle();
+}
diff --git a/aten/src/ATen/hip_compat/include/c10/cuda/impl/cuda_cmake_macros.h b/aten/src/ATen/hip_compat/include/c10/cuda/impl/cuda_cmake_macros.h
@@ -0,0 +1,5 @@
+#pragma once
+
+// CMake-generated cuda_cmake_macros.h doesn't exist on HIP builds; forward
+// to its hip equivalent so c10/cuda/CUDAMacros.h transitive-includes work.
+#include <c10/hip/impl/hip_cmake_macros.h>
diff --git a/aten/src/ATen/hip_compat/include/cuda.h b/aten/src/ATen/hip_compat/include/cuda.h
@@ -0,0 +1,5 @@
+#pragma once
+
+// CUDA driver API header; on HIP, forward to hip_runtime which exposes the
+// equivalent driver entry points used by c10/cuda/CUDAException.h.
+#include <hip/hip_runtime.h>
diff --git a/aten/src/ATen/hip_compat/include/cuda_runtime.h b/aten/src/ATen/hip_compat/include/cuda_runtime.h
@@ -0,0 +1,6 @@
+#pragma once
+
+// `cuda_runtime.h` is the catch-all CUDA SDK header; on HIP builds, forward
+// to the equivalent. cuda_runtime_api.h carries the type/function aliases.
+#include <hip/hip_runtime.h>
+#include <cuda_runtime_api.h>
diff --git a/aten/src/ATen/hip_compat/include/cuda_runtime_api.h b/aten/src/ATen/hip_compat/include/cuda_runtime_api.h
@@ -0,0 +1,36 @@
+#pragma once
+
+// Drop-in shim so non-hipified files including <cuda_runtime_api.h> on a
+// HIP build compile. Forwards to hip_runtime_api.h and aliases the cuda*
+// types/enums/functions used by c10/cuda/* headers to their hip*
+// equivalents — mirrors the source-level rewrites that hipify performs.
+
+#include <hip/hip_runtime_api.h>
+
+using cudaStream_t = hipStream_t;
+using cudaError_t = hipError_t;
+using cudaMemcpyKind = hipMemcpyKind;
+using cudaStreamCaptureMode = hipStreamCaptureMode;
+using cudaStreamCaptureStatus = hipStreamCaptureStatus;
+
+// Enum values are accessed via `cudaStreamCaptureStatus::cudaStreamCaptureStatusNone`
+// (C++ scope resolution into the enum), which becomes
+// `hipStreamCaptureStatus::cudaStreamCaptureStatusNone` after the type alias.
+// The inner identifier needs to be a macro that substitutes to the hip-named
+// value so the lookup hits the enum's actual member.
+#define cudaSuccess hipSuccess
+#define cudaStreamCaptureStatusNone hipStreamCaptureStatusNone
+#define cudaStreamCaptureStatusActive hipStreamCaptureStatusActive
+#define cudaStreamCaptureStatusInvalidated hipStreamCaptureStatusInvalidated
+
+#define cudaMemGetInfo hipMemGetInfo
+#define cudaMallocAsync hipMallocAsync
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamGetPriority hipStreamGetPriority
+#define cudaStreamIsCapturing hipStreamIsCapturing
+#define cudaStreamGetCaptureInfo hipStreamGetCaptureInfo
+#define cudaThreadExchangeStreamCaptureMode hipThreadExchangeStreamCaptureMode
+#define cudaGetLastError hipGetLastError
+#define cudaGetErrorString hipGetErrorString
+#define cudaDeviceGetStreamPriorityRange hipDeviceGetStreamPriorityRange
diff --git a/aten/src/ATen/hip_compat/include/cudnn_frontend.h b/aten/src/ATen/hip_compat/include/cudnn_frontend.h
@@ -0,0 +1,62 @@
+#pragma once
+
+// Shim of cuDNN's `<cudnn_frontend.h>` for ROCm/hipDNN builds. Forwards
+// `cudnn_frontend` symbols to `hipdnn_frontend` with cuDNN-style API shims
+// layered on top (Graph::check_support(handle), HeurMode_t::A, etc.), and
+// also forwards cuDNN-side `cudnnHandle_t`/`getCudnnHandle()`/
+// `AT_CUDNN_FRONTEND_CHECK` to hipDNN equivalents — so non-hipified cuDNN
+// call sites compile unchanged on HIP.
+
+// TODO: drop this define once hipDNN exposes SDPA unconditionally and
+// pytorch's LoadHIP.cmake propagates hipdnn_frontend's
+// INTERFACE_COMPILE_DEFINITIONS.
+#define HIPDNN_ENABLE_SDPA
+
+#include <ATen/hipdnn/Exceptions.h>
+#include <ATen/hipdnn/Handle.h>
+#include <ATen/hipdnn/hipdnn-wrapper.h>
+#include <hipdnn_frontend.hpp>
+
+namespace at::native::hipdnn_compat {
+
+using namespace hipdnn_frontend;
+
+namespace graph {
+using namespace hipdnn_frontend::graph;
+
+class Graph : public hipdnn_frontend::graph::Graph {
+ public:
+  // cuDNN's check_support / build_plans take a handle; hipDNN's don't (the
+  // handle is bound at execute time). Add overloads that ignore the handle
+  // and forward to the no-arg APIs.
+  using hipdnn_frontend::graph::Graph::check_support;
+  using hipdnn_frontend::graph::Graph::build_plans;
+  auto check_support(hipdnnHandle_t /*handle*/) { return check_support(); }
+  auto build_plans(hipdnnHandle_t /*handle*/) { return build_plans(); }
+
+  // cuDNN exposes a per-uid query via an out-parameter. hipDNN only offers
+  // a one-shot {uid -> shared_ptr<Tensor_attributes>} map; wrap it.
+  hipdnn_frontend::error_t query_tensor_attributes_of_uid(
+      int64_t uid,
+      hipdnn_frontend::graph::Tensor_attributes& attrs) const {
+    auto graph_tensors = getTensorsByUid();
+    auto it = graph_tensors.find(uid);
+    if (it == graph_tensors.end()) {
+      return {hipdnn_frontend::error_code_t::ATTRIBUTE_NOT_SET,
+              "tensor uid not in graph"};
+    }
+    attrs = *it->second;
+    return {hipdnn_frontend::error_code_t::OK, ""};
+  }
+};
+
+} // namespace graph
+
+// Map cuDNN's HeurMode_t::A (recommended heuristic) to FALLBACK on hipDNN.
+struct HeurMode_t {
+  static constexpr auto A = hipdnn_frontend::HeurMode_t::FALLBACK;
+};
+
+} // namespace at::native::hipdnn_compat
+
+namespace cudnn_frontend = at::native::hipdnn_compat;