Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ torch/lib/*.lib
torch/lib/*.pdb
torch/lib/*.so*
torch/lib/protobuf*.pc
torch/lib/aotriton.images/
torch/lib/build
torch/lib/caffe2/
torch/lib/cmake
Expand Down
16 changes: 13 additions & 3 deletions aten/src/ATen/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ file(GLOB hip_nvrtc_stub_h "hip/nvrtc_stub/*.h")
file(GLOB hip_nvrtc_stub_cpp "hip/nvrtc_stub/*.cpp")
file(GLOB miopen_h "miopen/*.h")
file(GLOB miopen_cpp "miopen/*.cpp")
file(GLOB hipdnn_h "hipdnn/*.h")
file(GLOB hipdnn_cpp "hipdnn/*.cpp")

file(GLOB mkl_cpp "mkl/*.cpp")
file(GLOB mkldnn_cpp "mkldnn/*.cpp")
Expand Down Expand Up @@ -179,7 +181,7 @@ file(GLOB native_hip_hip "native/hip/*.hip" "native/hip/bgemm_kernels/*.hip")
file(GLOB native_hip_cpp "native/hip/*.cpp")
file(GLOB native_hip_linalg_cpp "native/hip/linalg/*.cpp")
file(GLOB native_miopen_cpp "native/miopen/*.cpp")
file(GLOB native_cudnn_hip_cpp "native/cudnn/hip/*.cpp")
file(GLOB native_hipdnn_cpp "native/hipdnn/*.cpp")
file(GLOB native_nested_hip_hip "native/nested/hip/*.hip")
file(GLOB native_nested_hip_cpp "native/nested/hip/*.cpp")
file(GLOB native_sparse_hip_hip "native/sparse/hip/*.hip")
Expand Down Expand Up @@ -568,6 +570,7 @@ if(USE_CUDA)
${native_cuda_cpp}
${native_cudnn_cpp}
${native_miopen_cpp}
${native_hipdnn_cpp}
${native_nested_cuda_cpp}
${native_quantized_cuda_cpp}
${native_quantized_cudnn_cpp}
Expand Down Expand Up @@ -640,17 +643,21 @@ if(USE_ROCM)
${native_sparse_hip_cpp}
${native_quantized_hip_cpp}
${native_transformers_hip_cpp}
${native_quantized_cudnn_hip_cpp}
${native_quantized_cudnn_cpp}
${hip_cpp}
${native_hip_cpp}
${native_hip_linalg_cpp}
${cuda_generated_sources}
${ATen_HIP_SRCS}
${native_miopen_cpp}
${native_cudnn_hip_cpp}
${native_hipdnn_cpp}
${native_cudnn_cpp}
${miopen_cpp}
${all_hip_cpp}
)
if(USE_HIPDNN)
list(APPEND all_hip_cpp ${hipdnn_cpp})
endif()
endif()

if(USE_XPU)
Expand Down Expand Up @@ -937,6 +944,9 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake"
set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS} ${native_nested_h} ${ATen_TRANSFORMER_HEADERS})
if(NOT INTERN_BUILD_MOBILE)
list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${native_mtia_h} ${cudnn_h} ${hip_h} ${mtia_h} ${xpu_h} ${mps_h} ${native_kleidiai_h} ${native_mps_h} ${native_utils_h} ${miopen_h})
if(USE_HIPDNN)
list(APPEND INSTALL_HEADERS ${hipdnn_h})
endif()
# Metal
if(USE_PYTORCH_METAL_EXPORT)
# Add files needed from exporting metal models(optimized_for_mobile)
Expand Down
8 changes: 8 additions & 0 deletions aten/src/ATen/Context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,14 @@ void Context::setImmediateMiopen(bool b) {
immediate_miopen = b;
}

bool Context::userEnabledHipdnn() const {
return enabled_hipdnn;
}

void Context::setUserEnabledHipdnn(bool e) {
enabled_hipdnn = e;
}

bool Context::allowTF32CuBLAS() const {
bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
bool allow_tf32_new = float32Precision(Float32Backend::CUDA, Float32Op::MATMUL) == Float32Precision::TF32;
Expand Down
3 changes: 3 additions & 0 deletions aten/src/ATen/Context.h
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,8 @@ class TORCH_API Context {
void setBenchmarkLimitCuDNN(int /*b*/);
bool immediateMiopen() const;
void setImmediateMiopen(bool /*b*/);
bool userEnabledHipdnn() const;
void setUserEnabledHipdnn(bool e);
bool deterministicCuDNN() const;
void setDeterministicCuDNN(bool /*b*/);
bool deterministicMkldnn() const;
Expand Down Expand Up @@ -478,6 +480,7 @@ class TORCH_API Context {
bool allow_fp16_bf16_reduction_mathSDP = false;
bool benchmark_cudnn = false;
bool immediate_miopen = false;
bool enabled_hipdnn = false;
Float32MatmulPrecision float32_matmul_precision =
c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true
? at::Float32MatmulPrecision::HIGH
Expand Down
8 changes: 8 additions & 0 deletions aten/src/ATen/cuda/detail/CUDAHooks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,14 @@ bool CUDAHooks::compiledWithMIOpen() const {
return AT_ROCM_ENABLED();
}

bool CUDAHooks::compiledWithHipDNN() const {
#ifdef USE_HIPDNN
return true;
#else
return false;
#endif
}

bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const {
#if AT_CUDNN_ENABLED()
if (!hasCUDA()) {
Expand Down
1 change: 1 addition & 0 deletions aten/src/ATen/cuda/detail/CUDAHooks.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ struct CUDAHooks : public at::CUDAHooksInterface {
Allocator* getPinnedMemoryAllocator() const override;
bool compiledWithCuDNN() const override;
bool compiledWithMIOpen() const override;
bool compiledWithHipDNN() const override;
bool supportsDilatedConvolutionWithCuDNN() const override;
bool supportsDepthwiseConvolutionWithCuDNN() const override;
bool supportsBFloat16ConvolutionWithCuDNNv8() const override;
Expand Down
4 changes: 4 additions & 0 deletions aten/src/ATen/detail/CUDAHooksInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,10 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
return false;
}

virtual bool compiledWithHipDNN() const {
return false;
}

virtual bool supportsDilatedConvolutionWithCuDNN() const {
return false;
}
Expand Down
71 changes: 71 additions & 0 deletions aten/src/ATen/functorch/BatchRulesNorm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -815,6 +815,60 @@ struct MiopenBatchNormBackwardBatchRuleHelper {
decltype(&fn),\
&fn>::apply)

template <typename F, F Func>
struct HipdnnBatchNormBatchRuleHelper {
static std::tuple<Tensor, std::optional<int64_t>,Tensor, std::optional<int64_t>,Tensor, std::optional<int64_t>> apply(
const Tensor& input, std::optional<int64_t> input_bdim,
const Tensor& weight_opt, std::optional<int64_t> weight_bdim,
const std::optional<Tensor>& bias_opt, std::optional<int64_t> bias_bdim,
const std::optional<Tensor>& running_mean_opt, std::optional<int64_t> running_mean_bdim,
const std::optional<Tensor>& running_var_opt, std::optional<int64_t> running_var_bdim,
bool training, double momentum, double eps) {
return batch_norm_batch_rule<F, Func>(
input, input_bdim, weight_opt, weight_bdim, bias_opt, bias_bdim,
running_mean_opt, running_mean_bdim, running_var_opt, running_var_bdim, training, momentum, eps);
}
};

template <typename F, F Func>
struct HipdnnBatchNormBackwardBatchRuleHelper {
static std::tuple<Tensor,Tensor,Tensor> apply(
const at::Tensor & input,
const at::Tensor & grad_out,
const at::Tensor & weight,
const std::optional<at::Tensor> & running_mean_opt,
const std::optional<at::Tensor> & running_var_opt,
const std::optional<at::Tensor> & save_mean_opt,
const std::optional<at::Tensor> & save_rstd_opt,
double eps) {

auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "HipdnnBatchNormBackwardBatchRuleHelper.apply");
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();

if (!areAnyBatchedAtLevel({input, grad_out, weight, running_mean_opt,
running_var_opt, save_mean_opt, save_rstd_opt}, cur_level)) {
c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
return at::hipdnn_batch_norm_backward(input, grad_out, weight,
running_mean_opt, running_var_opt, save_mean_opt, save_rstd_opt, eps);
}

return batch_norm_backward_plumbing<F, Func>(
grad_out, input, weight, running_mean_opt, running_var_opt, save_mean_opt, save_rstd_opt, true, eps, {true, true, true});
}
};

#define HIPDNN_BATCH_NORM_BATCH_RULE(fn) SINGLE_ARG(\
HipdnnBatchNormBatchRuleHelper<\
decltype(&ATEN_FN(fn)),\
&ATEN_FN(fn)>::apply)

#define HIPDNN_BATCH_NORM_BACKWARD_BATCH_RULE(fn) SINGLE_ARG(\
HipdnnBatchNormBackwardBatchRuleHelper<\
decltype(&fn),\
&fn>::apply)

static std::tuple<at::Tensor,at::Tensor,at::Tensor> cudnn_batch_norm_backward_wrapper(
const at::Tensor & grad_out,
const at::Tensor & input,
Expand Down Expand Up @@ -846,6 +900,21 @@ static std::tuple<at::Tensor,at::Tensor,at::Tensor> miopen_batch_norm_backward_w
return at::miopen_batch_norm_backward(input, grad_out, weight_opt, running_mean_opt, running_var_opt, save_mean_opt, save_rstd_opt, eps);
}

static std::tuple<at::Tensor,at::Tensor,at::Tensor> hipdnn_batch_norm_backward_wrapper(
const at::Tensor & grad_out,
const at::Tensor & input,
const at::Tensor& weight_opt,
const std::optional<at::Tensor> & running_mean_opt,
const std::optional<at::Tensor> & running_var_opt,
const std::optional<at::Tensor> & save_mean_opt,
const std::optional<at::Tensor> & save_rstd_opt,
bool training,
double eps,
std::array<bool,3> output_mask) {
TORCH_INTERNAL_ASSERT(!training);
return at::hipdnn_batch_norm_backward(input, grad_out, weight_opt, running_mean_opt, running_var_opt, save_mean_opt, save_rstd_opt, eps);
}

// NB: This is NOT good. In the ideal world, we do NOT want to convert the new legit op back into native_batch_norm
// as native_batch_norm has a problematic schema--it promises it is functional when it is not. However, vmap doesn't
// work with dynamo anyway so we gain some buffer room to do wrong things here. The (reasonable) hope is that we will
Expand All @@ -866,11 +935,13 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
VMAP_SUPPORT(native_batch_norm, NATIVE_BATCH_NORM_BATCH_RULE(native_batch_norm));
VMAP_SUPPORT(cudnn_batch_norm, CUDNN_BATCH_NORM_BATCH_RULE(cudnn_batch_norm));
VMAP_SUPPORT(miopen_batch_norm, MIOPEN_BATCH_NORM_BATCH_RULE(miopen_batch_norm));
VMAP_SUPPORT(hipdnn_batch_norm, HIPDNN_BATCH_NORM_BATCH_RULE(hipdnn_batch_norm));
m.impl("_native_batch_norm_legit", _native_batch_norm_legit_batch);
m.impl("_native_batch_norm_legit.no_stats", _native_batch_norm_legit_no_stats_batch);
m.impl("native_batch_norm_backward", NATIVE_BATCH_NORM_BACKWARD_BATCH_RULE(native_batch_norm_backward));
m.impl("cudnn_batch_norm_backward", CUDNN_BATCH_NORM_BACKWARD_BATCH_RULE(at::functorch::cudnn_batch_norm_backward_wrapper));
m.impl("miopen_batch_norm_backward", MIOPEN_BATCH_NORM_BACKWARD_BATCH_RULE(at::functorch::miopen_batch_norm_backward_wrapper));
m.impl("hipdnn_batch_norm_backward", HIPDNN_BATCH_NORM_BACKWARD_BATCH_RULE(at::functorch::hipdnn_batch_norm_backward_wrapper));
m.impl("native_group_norm", native_group_norm_plumbing);
m.impl("native_group_norm_backward", native_group_norm_backward_plumbing);
VMAP_SUPPORT(native_layer_norm, native_layer_norm_batch_rule);
Expand Down
8 changes: 8 additions & 0 deletions aten/src/ATen/hip_compat/include/ATen/cuda/Exceptions.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#pragma once

// Shim of `<ATen/cuda/Exceptions.h>` for HIP builds. Defines
// AT_CUDNN_FRONTEND_CHECK in terms of hipDNN's check macro.

#include <ATen/hipdnn/Exceptions.h>

#define AT_CUDNN_FRONTEND_CHECK(e) HIPDNN_FE_CHECK(e)
13 changes: 13 additions & 0 deletions aten/src/ATen/hip_compat/include/ATen/cudnn/Handle.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#pragma once

// Shim of `<ATen/cudnn/Handle.h>` for HIP builds. Forwards the cuDNN handle
// symbols to their hipDNN equivalents so non-hipified files compile against
// the cuDNN-named API.

#include <ATen/hipdnn/Handle.h>
#include <ATen/hipdnn/hipdnn-wrapper.h>

using cudnnHandle_t = hipdnnHandle_t;
inline cudnnHandle_t getCudnnHandle() {
return at::native::getHipdnnHandle();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#pragma once

// CMake-generated cuda_cmake_macros.h doesn't exist on HIP builds; forward
// to its hip equivalent so c10/cuda/CUDAMacros.h transitive-includes work.
#include <c10/hip/impl/hip_cmake_macros.h>
5 changes: 5 additions & 0 deletions aten/src/ATen/hip_compat/include/cuda.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#pragma once

// CUDA driver API header; on HIP, forward to hip_runtime which exposes the
// equivalent driver entry points used by c10/cuda/CUDAException.h.
#include <hip/hip_runtime.h>
6 changes: 6 additions & 0 deletions aten/src/ATen/hip_compat/include/cuda_runtime.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#pragma once

// `cuda_runtime.h` is the catch-all CUDA SDK header; on HIP builds, forward
// to the equivalent. cuda_runtime_api.h carries the type/function aliases.
#include <hip/hip_runtime.h>
#include <cuda_runtime_api.h>
36 changes: 36 additions & 0 deletions aten/src/ATen/hip_compat/include/cuda_runtime_api.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#pragma once

// Drop-in shim so non-hipified files including <cuda_runtime_api.h> on a
// HIP build compile. Forwards to hip_runtime_api.h and aliases the cuda*
// types/enums/functions used by c10/cuda/* headers to their hip*
// equivalents — mirrors the source-level rewrites that hipify performs.

#include <hip/hip_runtime_api.h>

using cudaStream_t = hipStream_t;
using cudaError_t = hipError_t;
using cudaMemcpyKind = hipMemcpyKind;
using cudaStreamCaptureMode = hipStreamCaptureMode;
using cudaStreamCaptureStatus = hipStreamCaptureStatus;

// Enum values are accessed via `cudaStreamCaptureStatus::cudaStreamCaptureStatusNone`
// (C++ scope resolution into the enum), which becomes
// `hipStreamCaptureStatus::cudaStreamCaptureStatusNone` after the type alias.
// The inner identifier needs to be a macro that substitutes to the hip-named
// value so the lookup hits the enum's actual member.
#define cudaSuccess hipSuccess
#define cudaStreamCaptureStatusNone hipStreamCaptureStatusNone
#define cudaStreamCaptureStatusActive hipStreamCaptureStatusActive
#define cudaStreamCaptureStatusInvalidated hipStreamCaptureStatusInvalidated

#define cudaMemGetInfo hipMemGetInfo
#define cudaMallocAsync hipMallocAsync
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamGetPriority hipStreamGetPriority
#define cudaStreamIsCapturing hipStreamIsCapturing
#define cudaStreamGetCaptureInfo hipStreamGetCaptureInfo
#define cudaThreadExchangeStreamCaptureMode hipThreadExchangeStreamCaptureMode
#define cudaGetLastError hipGetLastError
#define cudaGetErrorString hipGetErrorString
#define cudaDeviceGetStreamPriorityRange hipDeviceGetStreamPriorityRange
62 changes: 62 additions & 0 deletions aten/src/ATen/hip_compat/include/cudnn_frontend.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#pragma once

// Shim of cuDNN's `<cudnn_frontend.h>` for ROCm/hipDNN builds. Forwards
// `cudnn_frontend` symbols to `hipdnn_frontend` with cuDNN-style API shims
// layered on top (Graph::check_support(handle), HeurMode_t::A, etc.), and
// also forwards cuDNN-side `cudnnHandle_t`/`getCudnnHandle()`/
// `AT_CUDNN_FRONTEND_CHECK` to hipDNN equivalents — so non-hipified cuDNN
// call sites compile unchanged on HIP.

// TODO: drop this define once hipDNN exposes SDPA unconditionally and
// pytorch's LoadHIP.cmake propagates hipdnn_frontend's
// INTERFACE_COMPILE_DEFINITIONS.
#define HIPDNN_ENABLE_SDPA

#include <ATen/hipdnn/Exceptions.h>
#include <ATen/hipdnn/Handle.h>
#include <ATen/hipdnn/hipdnn-wrapper.h>
#include <hipdnn_frontend.hpp>

namespace at::native::hipdnn_compat {

using namespace hipdnn_frontend;

namespace graph {
using namespace hipdnn_frontend::graph;

class Graph : public hipdnn_frontend::graph::Graph {
public:
// cuDNN's check_support / build_plans take a handle; hipDNN's don't (the
// handle is bound at execute time). Add overloads that ignore the handle
// and forward to the no-arg APIs.
using hipdnn_frontend::graph::Graph::check_support;
using hipdnn_frontend::graph::Graph::build_plans;
auto check_support(hipdnnHandle_t /*handle*/) { return check_support(); }
auto build_plans(hipdnnHandle_t /*handle*/) { return build_plans(); }

// cuDNN exposes a per-uid query via an out-parameter. hipDNN only offers
// a one-shot {uid -> shared_ptr<Tensor_attributes>} map; wrap it.
hipdnn_frontend::error_t query_tensor_attributes_of_uid(
int64_t uid,
hipdnn_frontend::graph::Tensor_attributes& attrs) const {
auto graph_tensors = getTensorsByUid();
auto it = graph_tensors.find(uid);
if (it == graph_tensors.end()) {
return {hipdnn_frontend::error_code_t::ATTRIBUTE_NOT_SET,
"tensor uid not in graph"};
}
attrs = *it->second;
return {hipdnn_frontend::error_code_t::OK, ""};
}
};

} // namespace graph

// Map cuDNN's HeurMode_t::A (recommended heuristic) to FALLBACK on hipDNN.
struct HeurMode_t {
static constexpr auto A = hipdnn_frontend::HeurMode_t::FALLBACK;
};

} // namespace at::native::hipdnn_compat

namespace cudnn_frontend = at::native::hipdnn_compat;
Loading