diff --git a/cupy_backends/cuda/libs/cudnn.pxd b/cupy_backends/cuda/libs/cudnn.pxd index 5f7430ab3f6..530f1ff215c 100644 --- a/cupy_backends/cuda/libs/cudnn.pxd +++ b/cupy_backends/cuda/libs/cudnn.pxd @@ -6,19 +6,10 @@ from libc.stdint cimport intptr_t ############################################################################### cpdef enum: - CUDNN_DATA_FLOAT = 0 - CUDNN_DATA_DOUBLE = 1 - CUDNN_DATA_HALF = 2 - - CUDNN_DEFAULT_MATH = 0 - CUDNN_TENSOR_OP_MATH = 1 - + CUDNN_NOT_PROPAGATE_NAN = 0 CUDNN_PROPAGATE_NAN = 1 - CUDNN_NON_DETERMINISTIC = 0 - CUDNN_DETERMINISTIC = 1 - CUDNN_TENSOR_NCHW = 0 CUDNN_TENSOR_NHWC = 1 @@ -26,8 +17,6 @@ cpdef enum: CUDNN_OP_TENSOR_MUL = 1 CUDNN_OP_TENSOR_MIN = 2 CUDNN_OP_TENSOR_MAX = 3 - CUDNN_OP_TENSOR_SQRT = 4 - CUDNN_OP_TENSOR_NOT = 5 CUDNN_REDUCE_TENSOR_ADD = 0 CUDNN_REDUCE_TENSOR_MUL = 1 @@ -37,7 +26,6 @@ cpdef enum: CUDNN_REDUCE_TENSOR_AVG = 5 CUDNN_REDUCE_TENSOR_NORM1 = 6 CUDNN_REDUCE_TENSOR_NORM2 = 7 - CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8 CUDNN_REDUCE_TENSOR_NO_INDICES = 0 CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1 @@ -47,51 +35,10 @@ cpdef enum: CUDNN_16BIT_INDICES = 2 CUDNN_8BIT_INDICES = 3 - CUDNN_ADD_IMAGE = 0 - CUDNN_ADD_SAME_HW = 0 - CUDNN_ADD_FEATURE_MAP = 1 - CUDNN_ADD_SAME_CHW = 1 - CUDNN_ADD_SAME_C = 2 - CUDNN_ADD_FULL_TENSOR = 3 - + # TODO Confirm from miopen team CUDNN_CONVOLUTION = 0 CUDNN_CROSS_CORRELATION = 1 - CUDNN_CONVOLUTION_FWD_NO_WORKSPACE = 0 - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST = 1 - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT = 2 - - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0 - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1 - CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2 - CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3 - CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4 - CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5 - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6 - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7 - - CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE = 0 - CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST = 1 - 
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT = 2 - - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5 - - CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE = 0 - CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST = 1 - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = 2 - - CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5 - CUDNN_SOFTMAX_FAST = 0 CUDNN_SOFTMAX_ACCURATE = 1 CUDNN_SOFTMAX_LOG = 2 @@ -99,32 +46,10 @@ cpdef enum: CUDNN_SOFTMAX_MODE_INSTANCE = 0 CUDNN_SOFTMAX_MODE_CHANNEL = 1 - CUDNN_POOLING_MAX = 0 - CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1 - CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2 - CUDNN_POOLING_MAX_DETERMINISTIC = 3 - - CUDNN_ACTIVATION_SIGMOID = 0 - CUDNN_ACTIVATION_RELU = 1 - CUDNN_ACTIVATION_TANH = 2 - CUDNN_ACTIVATION_CLIPPED_RELU = 3 - CUDNN_ACTIVATION_ELU = 4 - CUDNN_ACTIVATION_IDENTITY = 5 - - CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0 - - CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0 - CUDNN_BATCHNORM_PER_ACTIVATION = 0 CUDNN_BATCHNORM_SPATIAL = 1 - CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2 CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0 - CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 - - CUDNN_BATCHNORM_OPS_BN = 0 - CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1 - CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2 CUDNN_RNN_RELU = 0 CUDNN_RNN_TANH = 1 @@ -134,645 +59,756 @@ cpdef enum: CUDNN_UNIDIRECTIONAL = 0 CUDNN_BIDIRECTIONAL = 1 - CUDNN_RNN_ALGO_STANDARD = 0 - CUDNN_RNN_ALGO_PERSIST_STATIC = 1 - CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2 - - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0 - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1 - 
CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2 - CUDNN_RNN_PADDED_IO_DISABLED = 0 CUDNN_RNN_PADDED_IO_ENABLED = 1 CUDNN_LINEAR_INPUT = 0 CUDNN_SKIP_INPUT = 1 - CUDNN_SAMPLER_BILINEAR = 0 - CUDNN_STATUS_SUCCESS = 0 - CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11 - CUDNN_STATUS_RUNTIME_IN_PROGRESS = 12 - CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 13 - - CUDNN_ERRQUERY_RAWCODE = 0 - CUDNN_ERRQUERY_NONBLOCKING = 1 - CUDNN_ERRQUERY_BLOCKING = 2 - - # cudnnFusedOps_t - CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0 - CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1 - CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2 - CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3 - CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4 - CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5 - CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6 - - # cudnnFusedOpsConstParamLabel_t - CUDNN_PARAM_XDESC = 0 - CUDNN_PARAM_XDATA_PLACEHOLDER = 1 - CUDNN_PARAM_BN_MODE = 2 - CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3 - CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4 - CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5 - CUDNN_PARAM_ACTIVATION_DESC = 6 - CUDNN_PARAM_CONV_DESC = 7 - CUDNN_PARAM_WDESC = 8 - CUDNN_PARAM_WDATA_PLACEHOLDER = 9 - CUDNN_PARAM_DWDESC = 10 - CUDNN_PARAM_DWDATA_PLACEHOLDER = 11 - CUDNN_PARAM_YDESC = 12 - CUDNN_PARAM_YDATA_PLACEHOLDER = 13 - CUDNN_PARAM_DYDESC = 14 - CUDNN_PARAM_DYDATA_PLACEHOLDER = 15 - CUDNN_PARAM_YSTATS_DESC = 16 - CUDNN_PARAM_YSUM_PLACEHOLDER = 17 - CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18 - CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19 - CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20 - CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21 - CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22 - CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23 - CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24 - CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25 - CUDNN_PARAM_ZDESC = 26 - CUDNN_PARAM_ZDATA_PLACEHOLDER = 27 - CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28 - CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29 - CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30 - 
CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31 - CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32 - CUDNN_PARAM_DXDESC = 33 - CUDNN_PARAM_DXDATA_PLACEHOLDER = 34 - CUDNN_PARAM_DZDESC = 35 - CUDNN_PARAM_DZDATA_PLACEHOLDER = 36 - CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37 - CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38 - - # cudnnFusedOpsPointerPlaceHolder_t - CUDNN_PTR_NULL = 0 - CUDNN_PTR_ELEM_ALIGNED = 1 - CUDNN_PTR_16B_ALIGNED = 2 - - # cudnnFusedOpsVariantParamLabel_t - CUDNN_PTR_XDATA = 0 - CUDNN_PTR_BN_EQSCALE = 1 - CUDNN_PTR_BN_EQBIAS = 2 - CUDNN_PTR_WDATA = 3 - CUDNN_PTR_DWDATA = 4 - CUDNN_PTR_YDATA = 5 - CUDNN_PTR_DYDATA = 6 - CUDNN_PTR_YSUM = 7 - CUDNN_PTR_YSQSUM = 8 - CUDNN_PTR_WORKSPACE = 9 - CUDNN_PTR_BN_SCALE = 10 - CUDNN_PTR_BN_BIAS = 11 - CUDNN_PTR_BN_SAVED_MEAN = 12 - CUDNN_PTR_BN_SAVED_INVSTD = 13 - CUDNN_PTR_BN_RUNNING_MEAN = 14 - CUDNN_PTR_BN_RUNNING_VAR = 15 - CUDNN_PTR_ZDATA = 16 - CUDNN_PTR_BN_Z_EQSCALE = 17 - CUDNN_PTR_BN_Z_EQBIAS = 18 - CUDNN_PTR_ACTIVATION_BITMASK = 19 - CUDNN_PTR_DXDATA = 20 - CUDNN_PTR_DZDATA = 21 - CUDNN_PTR_BN_DSCALE = 22 - CUDNN_PTR_BN_DBIAS = 23 - CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100 - CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101 - CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102 - CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103 - - -############################################################################### -# Class -############################################################################### - -cdef class CuDNNAlgoPerf: - cdef: - int algo - int status - float time - size_t memory - int determinism - int mathType - - -############################################################################### -# Version -############################################################################### - -cpdef size_t getVersion() except? 
0 - -############################################################################### -# Runtime error checking -############################################################################### -cpdef queryRuntimeError(intptr_t handle, int mode) - -############################################################################### -# Initialization and CUDA cooperation -############################################################################### - -cpdef intptr_t create() except? 0 -cpdef destroy(intptr_t handle) -cpdef setStream(intptr_t handle, size_t stream) -cpdef size_t getStream(intptr_t handle) except? 0 - - -############################################################################### -# Tensor manipulation -############################################################################### - -cpdef size_t createTensorDescriptor() except? 0 -cpdef setTensor4dDescriptor(size_t tensorDesc, int format, int dataType, - int n, int c, int h, int w) -cpdef setTensor4dDescriptorEx(size_t tensorDesc, int dataType, - int n, int c, int h, int w, int nStride, - int cStride, int hStride, int wStride) -cpdef tuple getTensor4dDescriptor(size_t tensorDesc) -cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, - size_t dimA, size_t strideA) -cpdef destroyTensorDescriptor(size_t tensorDesc) -cpdef addTensor_v3(intptr_t handle, size_t alpha, size_t bDesc, - size_t b, size_t beta, size_t yDesc, size_t y) - - -############################################################################### -# Tensor operations -############################################################################### - -cpdef size_t createOpTensorDescriptor() except? 
0 -cpdef setOpTensorDescriptor(size_t opTensorDesc, int opTensorOp, - int opTensorCompType, int opTensorNanOpt) -cpdef getOpTensorDescriptor(size_t opTensorDesc) -cpdef destroyOpTensorDescriptor(size_t opTensorDesc) -cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, - size_t aDesc, size_t A, size_t alpha2, size_t bDesc, - size_t B, size_t beta, size_t cDesc, size_t C) - - -############################################################################### -# Tensor reductions -############################################################################### - -cpdef size_t createReduceTensorDescriptor() except? 0 -cpdef setReduceTensorDescriptor( - size_t reduceTensorDesc, int reduceTensorOp, - int reduceTensorCompType, int reduceTensorNanOpt, - int reduceTensorIndices, int reduceTensorIndicesType) -cpdef getReduceTensorDescriptor(size_t reduceTensorDesc) -cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc) -cpdef size_t getReductionIndicesSize( - intptr_t handle, size_t reduceTensorDesc, size_t aDesc, - size_t cDesc) except? 0 -cpdef size_t getReductionWorkspaceSize( - intptr_t handle, size_t reduceTensorDesc, size_t aDesc, - size_t cDesc) except? 0 -cpdef reduceTensor( - intptr_t handle, size_t reduceTensorDesc, size_t indices, - size_t indicesSizeInBytes, size_t workspace, - size_t workspaceSizeInBytes, size_t alpha, size_t aDesc, - size_t A, size_t beta, size_t cDesc, size_t C) -cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr) -cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha) - - -############################################################################### -# Filter manipulation -############################################################################### - -cpdef size_t createFilterDescriptor() except? 
0 -cpdef setFilter4dDescriptor_v4( - size_t filterDesc, int dataType, int format, int k, int c, int h, int w) -cpdef setFilterNdDescriptor_v4( - size_t filterDesc, int dataType, int format, int nbDims, size_t filterDimA) -cpdef getFilterNdDescriptor(size_t wDesc, int nbDimsRequested) -cpdef destroyFilterDescriptor(size_t filterDesc) - - -############################################################################### -# Convolution -############################################################################### - -cpdef size_t createConvolutionDescriptor() except? 0 -cpdef setConvolutionMathType( - size_t convDesc, size_t mathType) -cpdef size_t getConvolutionMathType(size_t convDesc) except? 0 -cpdef setConvolutionGroupCount( - size_t convDesc, int groupCount) -cpdef int getConvolutionGroupCount(size_t convDesc) except? -1 -cpdef setConvolution2dDescriptor_v4( - size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, - int dilation_w, int mode) -cpdef setConvolution2dDescriptor_v5( - size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, - int dilation_w, int mode, size_t computeType) -cpdef setConvolutionNdDescriptor_v3( - size_t convDesc, int arrayLength, size_t padA, size_t filterStrideA, - size_t dilationA, int mode, int dataType) -cpdef destroyConvolutionDescriptor(size_t convDesc) -cpdef findConvolutionForwardAlgorithm( - intptr_t handle, size_t xDesc, size_t wDesc, size_t convDesc, size_t yDesc, - int requestedAlgoCount) -cpdef list findConvolutionForwardAlgorithmEx( - intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, - size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes) -cpdef list findConvolutionForwardAlgorithmEx_v7( - intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, - size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes) -cpdef int getConvolutionForwardAlgorithm_v6( - intptr_t 
handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int preference, size_t memoryLimitInbytes) except? -1 -cpdef list getConvolutionForwardAlgorithm_v7( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int requestedAlgoCount) -cpdef Py_ssize_t getConvolutionForwardWorkspaceSize( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int algo) except? -1 -cpdef convolutionForward( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t filterDesc, size_t filterData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t destDesc, size_t destData) -cpdef convolutionBackwardBias( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t beta, size_t destDesc, size_t destData) -cpdef findConvolutionBackwardFilterAlgorithm( - intptr_t handle, size_t xDesc, size_t dyDesc, size_t convDesc, - size_t dwDesc, int requestedAlgoCount) -cpdef list findConvolutionBackwardFilterAlgorithmEx( - intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, - size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes) -cpdef list findConvolutionBackwardFilterAlgorithmEx_v7( - intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, - size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes) -cpdef int getConvolutionBackwardFilterAlgorithm_v6( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t filterDesc, int preference, size_t memoryLimitInbytes) except? 
-1 -cpdef list getConvolutionBackwardFilterAlgorithm_v7( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, int requestedAlgoCount) -cpdef Py_ssize_t getConvolutionBackwardFilterWorkspaceSize( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t filterDesc, int algo) except? -1 -cpdef convolutionBackwardFilter_v3( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t diffDesc, size_t diffData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t gradDesc, size_t gradData) -cpdef findConvolutionBackwardDataAlgorithm( - intptr_t handle, size_t wDesc, size_t dyDesc, size_t convDesc, - size_t dxDesc, int requestedAlgoCount) -cpdef list findConvolutionBackwardDataAlgorithmEx( - intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, - size_t convDesc, size_t dxDesc, size_t dx, - int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes) -cpdef list findConvolutionBackwardDataAlgorithmEx_v7( - intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, - size_t convDesc, size_t dxDesc, size_t dx, - int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes) -cpdef int getConvolutionBackwardDataAlgorithm_v6( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, size_t preference, - size_t memoryLimitInbytes) except? -1 -cpdef list getConvolutionBackwardDataAlgorithm_v7( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, int requestedAlgoCount) -cpdef Py_ssize_t getConvolutionBackwardDataWorkspaceSize( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, int algo) except? 
-1 -cpdef convolutionBackwardData_v3( - intptr_t handle, size_t alpha, size_t filterDesc, size_t filterData, - size_t diffDesc, size_t diffData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t gradDesc, size_t gradData) - - -############################################################################### -# Pooling -############################################################################### - -cpdef size_t createPoolingDescriptor() except? 0 -cpdef setPooling2dDescriptor_v4( - size_t poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, - int windowWidth, int verticalPadding, int horizontalPadding, - int verticalStride, int horizontalStride) -cpdef setPoolingNdDescriptor_v4( - size_t poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, - size_t windowDimA, size_t paddingA, size_t strideA) -cpdef destroyPoolingDescriptor(size_t poolingDesc) -cpdef poolingForward( - intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData) -cpdef poolingBackward( - intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, - size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, - size_t destDiffData) - -############################################################################### -# Batch Normalization -############################################################################### - -cpdef deriveBNTensorDescriptor( - size_t derivedBnDesc, size_t xDesc, int mode) - -cpdef batchNormalizationForwardTraining( - intptr_t handle, int mode, - size_t alpha, size_t beta, size_t xDesc, - size_t x, size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, size_t bnScale, - size_t bnBias, double exponentialAverageFactor, - size_t resultRunningMean, size_t resultRunningVariance, - double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance) - -cpdef 
batchNormalizationForwardInference( - intptr_t handle, int mode, - size_t alpha, size_t beta, size_t xDesc, - size_t x, size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, size_t bnScale, - size_t bnBias, size_t estimatedMean, size_t estimatedVariance, - double epsilon) - -cpdef batchNormalizationBackward( - intptr_t handle, int mode, - size_t alphaDataDiff, size_t betaDataDiff, - size_t alphaParamDiff, size_t betaParamDiff, - size_t xDesc, size_t x, size_t dyDesc, - size_t dy, size_t dxDesc, size_t dx, - size_t dBnScaleBiasDesc, size_t bnScale, - size_t dBnScaleResult, size_t dBnBiasResult, - double epsilon, size_t savedMean, size_t savedInvVariance) - -cpdef batchNormalizationForwardTrainingEx( - intptr_t handle, int mode, int bnOps, - size_t alpha, size_t beta, - size_t xDesc, size_t x, - size_t zDesc, size_t z, - size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, - size_t bnScale, size_t bnBias, - double exponentialAverageFactor, - size_t resultRunningMean, size_t resultRunningVariance, - double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance, - size_t activationDesc, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) - -cpdef size_t getBatchNormalizationForwardTrainingExWorkspaceSize( - intptr_t handle, int mode, int bnOps, - size_t xDesc, - size_t zDesc, - size_t yDesc, - size_t bnScaleBiasMeanVarDesc, - size_t activationDesc) except? 
0 - -cpdef batchNormalizationBackwardEx( - intptr_t handle, int mode, int bnops, - size_t alphaDataDiff, size_t betaDataDiff, - size_t alphaParamDiff, size_t betaParamDiff, - size_t xDesc, size_t x, - size_t yDesc, size_t y, - size_t dyDesc, size_t dy, - size_t dzDesc, size_t dz, - size_t dxDesc, size_t dx, - size_t dBnScaleBiasDesc, - size_t bnScaleData, size_t bnBiasData, - size_t dBnScaleData, size_t dBnBiasData, - double epsilon, - size_t savedMean, size_t savedInvVariance, - size_t activationDesc, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) - -cpdef size_t getBatchNormalizationBackwardExWorkspaceSize( - intptr_t handle, int mode, int bnOps, - size_t xDesc, - size_t yDesc, - size_t dyDesc, - size_t dzDesc, - size_t dxDesc, - size_t dBnScaleBiasDesc, - size_t activationDesc) except? 0 - -cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( - intptr_t handle, int mode, int bnOps, - size_t activationDesc, - size_t xDesc) except? 0 - - -############################################################################### -# Activation -############################################################################### - -cpdef size_t createActivationDescriptor() except? 
0 -cpdef setActivationDescriptor( - size_t activationDesc, int mode, int reluNanOpt, double reluCeiling) -cpdef destroyActivationDescriptor(size_t activationDesc) -cpdef softmaxForward( - intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData) -cpdef softmaxBackward( - intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, size_t beta, - size_t destDiffDesc, size_t destDiffData) -cpdef activationForward_v4( - intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData) -cpdef activationBackward_v4( - intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, - size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, - size_t destDiffData) - - -############################################################################### -# Dropout -############################################################################### -cpdef size_t createDropoutDescriptor() except? 0 -cpdef destroyDropoutDescriptor(size_t dropoutDesc) -cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? -1 -cpdef setDropoutDescriptor( - size_t dropoutDesc, intptr_t handle, float dropout, - size_t states, size_t stateSizeInBytes, unsigned long long seed) -cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 
0 -cpdef dropoutForward( - intptr_t handle, size_t dropoutDesc, - size_t srcDesc, size_t srcData, - size_t dstDesc, size_t dstData, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) -cpdef dropoutBackward( - intptr_t handle, size_t dropoutDesc, - size_t dyDesc, size_t dyData, - size_t dxtDesc, size_t dxData, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) - - -############################################################################### -# CTC -############################################################################### - -cpdef size_t createCTCLossDescriptor() except? 0 -cpdef destroyCTCLossDescriptor(size_t ctcLossDesc) -cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType) -cpdef getCTCLossDescriptor(size_t ctcLossDesc) -cpdef size_t getCTCLossWorkspaceSize( - intptr_t handle, size_t probsDesc, size_t gradientsDesc, - size_t labels, size_t labelLengths, size_t inputLengths, - int algo, size_t ctcLossDesc) except? 0 -cpdef CTCLoss( - intptr_t handle, size_t probsDesc, - size_t probs, size_t labels, size_t labelLengths, size_t inputLengths, - size_t costs, size_t gradientsDesc, size_t gradients, int algo, - size_t ctcLossDesc, size_t workspace, size_t workSpaceSizeInBytes) - - -############################################################################### -# RNN -############################################################################### - -cpdef size_t createRNNDescriptor() except? 0 -cpdef destroyRNNDescriptor(size_t rnnDesc) -cpdef size_t createPersistentRNNPlan( - size_t rnnDesc, int minibatch, int dataType) except? 
0 -cpdef setPersistentRNNPlan(size_t rnnDesc, size_t plan) -cpdef destroyPersistentRNNPlan(size_t plan) -cpdef setRNNDescriptor_v5( - size_t rnnDesc, int hiddenSize, int numLayers, - size_t dropoutDesc, int inputMode, int direction, int mode, - int dataType) -cpdef setRNNDescriptor_v6( - intptr_t handle, size_t rnnDesc, int hiddenSize, int numLayers, - size_t dropoutDesc, int inputMode, int direction, int mode, - int algo, int dataType) -cpdef setRNNPaddingMode(size_t rnnDesc, int paddingMode) -cpdef getRNNPaddingMode(size_t rnnDesc) -cpdef size_t createRNNDataDescriptor() except? 0 -cpdef destroyRNNDataDescriptor(size_t RNNDataDesc) -cpdef setRNNDataDescriptor( - size_t RNNDataDesc, int dataType, size_t layout, - int maxSeqLength, int batchSize, int vectorSize, - size_t seqLengthArray, size_t paddingFill) -cpdef getRNNDataDescriptor( - size_t RNNDataDesc, size_t dataType, - size_t layout, size_t maxSeqLength, size_t batchSize, - size_t vectorSize, int arrayLengthRequested, size_t seqLengthArray, - size_t paddingFill) -cpdef getRNNWorkspaceSize( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc) -cpdef getRNNTrainingReserveSize( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc) -cpdef getRNNParamsSize( - intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType) -cpdef getRNNLinLayerMatrixParams( - intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, - size_t w, int linLayerID, size_t linLayerMatDesc, size_t linLayerMat) -cpdef getRNNLinLayerBiasParams( - intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, - size_t w, int linLayerID, size_t linLayerBiasDesc, - size_t linLayerBias) -cpdef RNNForwardInference( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, - size_t x, size_t hxDesc, size_t hx, size_t cxDesc, - size_t cx, size_t wDesc, size_t w, size_t yDesc, - size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t workspace, size_t workSpaceSizeInBytes) -cpdef 
RNNForwardTraining( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, - size_t wDesc, size_t w, size_t yDesc, size_t y, - size_t hyDesc, size_t hy, size_t cyDesc, size_t cy, - size_t workspace, size_t workSpaceSizeInBytes, size_t reserveSpace, - size_t reserveSpaceSizeInBytes) -cpdef RNNBackwardData( - intptr_t handle, size_t rnnDesc, int seqLength, size_t yDesc, size_t y, - size_t dyDesc, size_t dy, size_t dhyDesc, size_t dhy, - size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, - size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, - size_t dxDesc, size_t dx, size_t dhxDesc, size_t dhx, - size_t dcxDesc, size_t dcx, size_t workspace, - size_t workSpaceSizeInBytes, size_t reserveSpace, - size_t reserveSpaceSizeInBytes) -cpdef RNNBackwardWeights( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t yDesc, size_t y, - size_t workspace, size_t workSpaceSizeInBytes, size_t dwDesc, - size_t dw, size_t reserveSpace, size_t reserveSpaceSizeInBytes) -cpdef RNNForwardInferenceEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, - size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, - size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, - size_t workSpace, size_t workSpaceSizeInBytes) -cpdef RNNForwardTrainingEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, - size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, - size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) -cpdef RNNBackwardDataEx( - 
intptr_t handle, size_t rnnDesc, size_t yDesc, size_t y, size_t dyDesc, - size_t dy, size_t dcDesc, size_t dcAttn, size_t dhyDesc, size_t dhy, - size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t dxDesc, size_t dx, - size_t dhxDesc, size_t dhx, size_t dcxDesc, size_t dcx, - size_t dkDesc, size_t dkeys, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) -cpdef RNNBackwardWeightsEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t yDesc, size_t y, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t dwDesc, size_t dw, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) - - -############################################################################### -# Spatial Transformer -############################################################################### - -cpdef size_t createSpatialTransformerDescriptor() except? 0 -cpdef destroySpatialTransformerDescriptor(size_t stDesc) -cpdef setSpatialTransformerDescriptor( - size_t stDesc, size_t samplerType, int dataType, - int nbDims, size_t dimA) -cpdef spatialTfGridGeneratorForward( - intptr_t handle, size_t stDesc, size_t theta, size_t grid) -cpdef spatialTfGridGeneratorBackward( - intptr_t handle, size_t stDesc, size_t dgrid, size_t dtheta) -cpdef spatialTfSamplerForward( - intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, - size_t x, size_t grid, size_t beta, size_t yDesc, size_t y) -cpdef spatialTfSamplerBackward( - intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, - size_t x, size_t beta, size_t dxDesc, size_t dx, size_t alphaDgrid, - size_t dyDesc, size_t dy, size_t grid, size_t betaDgrid, size_t dgrid) - -############################################################################### -# Fused Ops -############################################################################### - -cpdef createFusedOpsConstParamPack(int ops) -cpdef 
destroyFusedOpsConstParamPack(size_t constPack) -cpdef setFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, - size_t param) -cpdef getFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, - size_t param) -cpdef createFusedOpsVariantParamPack(int ops) -cpdef destroyFusedOpsVariantParamPack(size_t varPack) -cpdef setFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, - size_t ptr) -cpdef getFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, - size_t ptr) -cpdef createFusedOpsPlan(int ops) -cpdef destroyFusedOpsPlan(size_t plan) -cpdef makeFusedOpsPlan(intptr_t handle, size_t plan, size_t constPack) -cpdef fusedOpsExecute(intptr_t handle, size_t plan, size_t varPack) +IF CUPY_HIP_VERSION > 0: + cpdef enum: + CUDNN_DATA_FLOAT = 1 + CUDNN_DATA_DOUBLE = 6 + CUDNN_DATA_HALF = 0 + + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 5 + CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2 + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 1 + CUDNN_CONVOLUTION_FWD_ALGO_FFT = 2 + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 3 + + CUDNN_POOLING_MAX = 0 + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 2 + CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 1 + + CUDNN_ACTIVATION_RELU = 3 + CUDNN_ACTIVATION_TANH = 2 + CUDNN_ACTIVATION_CLIPPED_RELU = 7 + CUDNN_ACTIVATION_ELU = 9 + CUDNN_ACTIVATION_IDENTITY = 0 + + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 1 + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 2 + CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 3 +ELSE: + cpdef enum: + CUDNN_DATA_FLOAT = 0 + CUDNN_DATA_DOUBLE = 1 + CUDNN_DATA_HALF = 2 + + CUDNN_DEFAULT_MATH = 0 + CUDNN_TENSOR_OP_MATH = 1 + + CUDNN_NON_DETERMINISTIC = 0 + CUDNN_DETERMINISTIC = 1 + + CUDNN_OP_TENSOR_SQRT = 4 + CUDNN_OP_TENSOR_NOT = 5 + + CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8 + + CUDNN_ADD_IMAGE = 0 + CUDNN_ADD_SAME_HW = 0 + CUDNN_ADD_FEATURE_MAP = 1 + CUDNN_ADD_SAME_CHW = 1 + CUDNN_ADD_SAME_C = 2 + CUDNN_ADD_FULL_TENSOR = 3 + + CUDNN_CONVOLUTION = 0 + CUDNN_CROSS_CORRELATION = 1 + + 
CUDNN_CONVOLUTION_FWD_NO_WORKSPACE = 0 + CUDNN_CONVOLUTION_FWD_PREFER_FASTEST = 1 + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT = 2 + + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0 + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1 + CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2 + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3 + CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4 + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5 + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6 + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7 + + CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE = 0 + CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST = 1 + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT = 2 + + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5 + + CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE = 0 + CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST = 1 + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = 2 + + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5 + + CUDNN_POOLING_MAX = 0 + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1 + CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2 + CUDNN_POOLING_MAX_DETERMINISTIC = 3 + + CUDNN_ACTIVATION_SIGMOID = 0 + CUDNN_ACTIVATION_RELU = 1 + CUDNN_ACTIVATION_TANH = 2 + CUDNN_ACTIVATION_CLIPPED_RELU = 3 + CUDNN_ACTIVATION_ELU = 4 + CUDNN_ACTIVATION_IDENTITY = 5 + + CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0 + + CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0 + + CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2 + + CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 + + CUDNN_BATCHNORM_OPS_BN = 0 + CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1 + CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2 + + 
CUDNN_RNN_ALGO_STANDARD = 0 + CUDNN_RNN_ALGO_PERSIST_STATIC = 1 + CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2 + + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0 + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1 + CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2 + + CUDNN_SAMPLER_BILINEAR = 0 + + CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11 + CUDNN_STATUS_RUNTIME_IN_PROGRESS = 12 + CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 13 + + CUDNN_ERRQUERY_RAWCODE = 0 + CUDNN_ERRQUERY_NONBLOCKING = 1 + CUDNN_ERRQUERY_BLOCKING = 2 + + # cudnnFusedOps_t + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0 + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1 + CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2 + CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3 + CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4 + CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5 + CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6 + + # cudnnFusedOpsConstParamLabel_t + CUDNN_PARAM_XDESC = 0 + CUDNN_PARAM_XDATA_PLACEHOLDER = 1 + CUDNN_PARAM_BN_MODE = 2 + CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3 + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4 + CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5 + CUDNN_PARAM_ACTIVATION_DESC = 6 + CUDNN_PARAM_CONV_DESC = 7 + CUDNN_PARAM_WDESC = 8 + CUDNN_PARAM_WDATA_PLACEHOLDER = 9 + CUDNN_PARAM_DWDESC = 10 + CUDNN_PARAM_DWDATA_PLACEHOLDER = 11 + CUDNN_PARAM_YDESC = 12 + CUDNN_PARAM_YDATA_PLACEHOLDER = 13 + CUDNN_PARAM_DYDESC = 14 + CUDNN_PARAM_DYDATA_PLACEHOLDER = 15 + CUDNN_PARAM_YSTATS_DESC = 16 + CUDNN_PARAM_YSUM_PLACEHOLDER = 17 + CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18 + CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19 + CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20 + CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21 + CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22 + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23 + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24 + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25 + CUDNN_PARAM_ZDESC = 26 + CUDNN_PARAM_ZDATA_PLACEHOLDER = 27 + CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28 + CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29 + 
CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30 + CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31 + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32 + CUDNN_PARAM_DXDESC = 33 + CUDNN_PARAM_DXDATA_PLACEHOLDER = 34 + CUDNN_PARAM_DZDESC = 35 + CUDNN_PARAM_DZDATA_PLACEHOLDER = 36 + CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37 + CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38 + + # cudnnFusedOpsPointerPlaceHolder_t + CUDNN_PTR_NULL = 0 + CUDNN_PTR_ELEM_ALIGNED = 1 + CUDNN_PTR_16B_ALIGNED = 2 + + # cudnnFusedOpsVariantParamLabel_t + CUDNN_PTR_XDATA = 0 + CUDNN_PTR_BN_EQSCALE = 1 + CUDNN_PTR_BN_EQBIAS = 2 + CUDNN_PTR_WDATA = 3 + CUDNN_PTR_DWDATA = 4 + CUDNN_PTR_YDATA = 5 + CUDNN_PTR_DYDATA = 6 + CUDNN_PTR_YSUM = 7 + CUDNN_PTR_YSQSUM = 8 + CUDNN_PTR_WORKSPACE = 9 + CUDNN_PTR_BN_SCALE = 10 + CUDNN_PTR_BN_BIAS = 11 + CUDNN_PTR_BN_SAVED_MEAN = 12 + CUDNN_PTR_BN_SAVED_INVSTD = 13 + CUDNN_PTR_BN_RUNNING_MEAN = 14 + CUDNN_PTR_BN_RUNNING_VAR = 15 + CUDNN_PTR_ZDATA = 16 + CUDNN_PTR_BN_Z_EQSCALE = 17 + CUDNN_PTR_BN_Z_EQBIAS = 18 + CUDNN_PTR_ACTIVATION_BITMASK = 19 + CUDNN_PTR_DXDATA = 20 + CUDNN_PTR_DZDATA = 21 + CUDNN_PTR_BN_DSCALE = 22 + CUDNN_PTR_BN_DBIAS = 23 + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100 + CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101 + CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102 + CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103 + +IF CUPY_HIP_VERSION == 0: + ############################################################################### + # Class + ############################################################################### + + cdef class CuDNNAlgoPerf: + cdef: + int algo + int status + float time + size_t memory + int determinism + int mathType + + + ############################################################################### + # Version + ############################################################################### + + cpdef size_t getVersion() except? 
0 + + ############################################################################### + # Runtime error checking + ############################################################################### + cpdef queryRuntimeError(intptr_t handle, int mode) + + ############################################################################### + # Initialization and CUDA cooperation + ############################################################################### + + cpdef intptr_t create() except? 0 + cpdef destroy(intptr_t handle) + cpdef setStream(intptr_t handle, size_t stream) + cpdef size_t getStream(intptr_t handle) except? 0 + + + ############################################################################### + # Tensor manipulation + ############################################################################### + + cpdef size_t createTensorDescriptor() except? 0 + cpdef setTensor4dDescriptor(size_t tensorDesc, int format, int dataType, + int n, int c, int h, int w) + cpdef setTensor4dDescriptorEx(size_t tensorDesc, int dataType, + int n, int c, int h, int w, int nStride, + int cStride, int hStride, int wStride) + cpdef tuple getTensor4dDescriptor(size_t tensorDesc) + cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, + size_t dimA, size_t strideA) + cpdef destroyTensorDescriptor(size_t tensorDesc) + cpdef addTensor_v3(intptr_t handle, size_t alpha, size_t bDesc, + size_t b, size_t beta, size_t yDesc, size_t y) + + + ############################################################################### + # Tensor operations + ############################################################################### + + cpdef size_t createOpTensorDescriptor() except? 
0 + cpdef setOpTensorDescriptor(size_t opTensorDesc, int opTensorOp, + int opTensorCompType, int opTensorNanOpt) + cpdef getOpTensorDescriptor(size_t opTensorDesc) + cpdef destroyOpTensorDescriptor(size_t opTensorDesc) + cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, + size_t aDesc, size_t A, size_t alpha2, size_t bDesc, + size_t B, size_t beta, size_t cDesc, size_t C) + + + ############################################################################### + # Tensor reductions + ############################################################################### + + cpdef size_t createReduceTensorDescriptor() except? 0 + cpdef setReduceTensorDescriptor( + size_t reduceTensorDesc, int reduceTensorOp, + int reduceTensorCompType, int reduceTensorNanOpt, + int reduceTensorIndices, int reduceTensorIndicesType) + cpdef getReduceTensorDescriptor(size_t reduceTensorDesc) + cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc) + cpdef size_t getReductionIndicesSize( + intptr_t handle, size_t reduceTensorDesc, size_t aDesc, + size_t cDesc) except? 0 + cpdef size_t getReductionWorkspaceSize( + intptr_t handle, size_t reduceTensorDesc, size_t aDesc, + size_t cDesc) except? 0 + cpdef reduceTensor( + intptr_t handle, size_t reduceTensorDesc, size_t indices, + size_t indicesSizeInBytes, size_t workspace, + size_t workspaceSizeInBytes, size_t alpha, size_t aDesc, + size_t A, size_t beta, size_t cDesc, size_t C) + cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr) + cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha) + + + ############################################################################### + # Filter manipulation + ############################################################################### + + cpdef size_t createFilterDescriptor() except? 
0 + cpdef setFilter4dDescriptor_v4( + size_t filterDesc, int dataType, int format, int k, int c, int h, int w) + cpdef setFilterNdDescriptor_v4( + size_t filterDesc, int dataType, int format, int nbDims, size_t filterDimA) + cpdef getFilterNdDescriptor(size_t wDesc, int nbDimsRequested) + cpdef destroyFilterDescriptor(size_t filterDesc) + + + ############################################################################### + # Convolution + ############################################################################### + + cpdef size_t createConvolutionDescriptor() except? 0 + cpdef setConvolutionMathType( + size_t convDesc, size_t mathType) + cpdef size_t getConvolutionMathType(size_t convDesc) except? 0 + cpdef setConvolutionGroupCount( + size_t convDesc, int groupCount) + cpdef int getConvolutionGroupCount(size_t convDesc) except? -1 + cpdef setConvolution2dDescriptor_v4( + size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, + int dilation_w, int mode) + cpdef setConvolution2dDescriptor_v5( + size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, + int dilation_w, int mode, size_t computeType) + cpdef setConvolutionNdDescriptor_v3( + size_t convDesc, int arrayLength, size_t padA, size_t filterStrideA, + size_t dilationA, int mode, int dataType) + cpdef destroyConvolutionDescriptor(size_t convDesc) + cpdef findConvolutionForwardAlgorithm( + intptr_t handle, size_t xDesc, size_t wDesc, size_t convDesc, size_t yDesc, + int requestedAlgoCount) + cpdef list findConvolutionForwardAlgorithmEx( + intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, + size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef list findConvolutionForwardAlgorithmEx_v7( + intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, + size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef int 
getConvolutionForwardAlgorithm_v6( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int preference, size_t memoryLimitInbytes) except? -1 + cpdef list getConvolutionForwardAlgorithm_v7( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int requestedAlgoCount) + cpdef Py_ssize_t getConvolutionForwardWorkspaceSize( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int algo) except? -1 + cpdef convolutionForward( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t filterDesc, size_t filterData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t destDesc, size_t destData) + cpdef convolutionBackwardBias( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t beta, size_t destDesc, size_t destData) + cpdef findConvolutionBackwardFilterAlgorithm( + intptr_t handle, size_t xDesc, size_t dyDesc, size_t convDesc, + size_t dwDesc, int requestedAlgoCount) + cpdef list findConvolutionBackwardFilterAlgorithmEx( + intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, + size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef list findConvolutionBackwardFilterAlgorithmEx_v7( + intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, + size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef int getConvolutionBackwardFilterAlgorithm_v6( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t filterDesc, int preference, size_t memoryLimitInbytes) except? 
-1 + cpdef list getConvolutionBackwardFilterAlgorithm_v7( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int requestedAlgoCount) + cpdef Py_ssize_t getConvolutionBackwardFilterWorkspaceSize( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t filterDesc, int algo) except? -1 + cpdef convolutionBackwardFilter_v3( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t diffDesc, size_t diffData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t gradDesc, size_t gradData) + cpdef findConvolutionBackwardDataAlgorithm( + intptr_t handle, size_t wDesc, size_t dyDesc, size_t convDesc, + size_t dxDesc, int requestedAlgoCount) + cpdef list findConvolutionBackwardDataAlgorithmEx( + intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, + size_t convDesc, size_t dxDesc, size_t dx, + int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes) + cpdef list findConvolutionBackwardDataAlgorithmEx_v7( + intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, + size_t convDesc, size_t dxDesc, size_t dx, + int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes) + cpdef int getConvolutionBackwardDataAlgorithm_v6( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, size_t preference, + size_t memoryLimitInbytes) except? -1 + cpdef list getConvolutionBackwardDataAlgorithm_v7( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int requestedAlgoCount) + cpdef Py_ssize_t getConvolutionBackwardDataWorkspaceSize( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int algo) except? 
-1 + cpdef convolutionBackwardData_v3( + intptr_t handle, size_t alpha, size_t filterDesc, size_t filterData, + size_t diffDesc, size_t diffData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t gradDesc, size_t gradData) + + + ############################################################################### + # Pooling + ############################################################################### + + cpdef size_t createPoolingDescriptor() except? 0 + cpdef setPooling2dDescriptor_v4( + size_t poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, + int windowWidth, int verticalPadding, int horizontalPadding, + int verticalStride, int horizontalStride) + cpdef setPoolingNdDescriptor_v4( + size_t poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, + size_t windowDimA, size_t paddingA, size_t strideA) + cpdef destroyPoolingDescriptor(size_t poolingDesc) + cpdef poolingForward( + intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData) + cpdef poolingBackward( + intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, + size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, + size_t destDiffData) + + ############################################################################### + # Batch Normalization + ############################################################################### + + cpdef deriveBNTensorDescriptor( + size_t derivedBnDesc, size_t xDesc, int mode) + + cpdef batchNormalizationForwardTraining( + intptr_t handle, int mode, + size_t alpha, size_t beta, size_t xDesc, + size_t x, size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, size_t bnScale, + size_t bnBias, double exponentialAverageFactor, + size_t resultRunningMean, size_t resultRunningVariance, + double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance) + + cpdef 
batchNormalizationForwardInference( + intptr_t handle, int mode, + size_t alpha, size_t beta, size_t xDesc, + size_t x, size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, size_t bnScale, + size_t bnBias, size_t estimatedMean, size_t estimatedVariance, + double epsilon) + + cpdef batchNormalizationBackward( + intptr_t handle, int mode, + size_t alphaDataDiff, size_t betaDataDiff, + size_t alphaParamDiff, size_t betaParamDiff, + size_t xDesc, size_t x, size_t dyDesc, + size_t dy, size_t dxDesc, size_t dx, + size_t dBnScaleBiasDesc, size_t bnScale, + size_t dBnScaleResult, size_t dBnBiasResult, + double epsilon, size_t savedMean, size_t savedInvVariance) + + cpdef batchNormalizationForwardTrainingEx( + intptr_t handle, int mode, int bnOps, + size_t alpha, size_t beta, + size_t xDesc, size_t x, + size_t zDesc, size_t z, + size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, + size_t bnScale, size_t bnBias, + double exponentialAverageFactor, + size_t resultRunningMean, size_t resultRunningVariance, + double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance, + size_t activationDesc, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + cpdef size_t getBatchNormalizationForwardTrainingExWorkspaceSize( + intptr_t handle, int mode, int bnOps, + size_t xDesc, + size_t zDesc, + size_t yDesc, + size_t bnScaleBiasMeanVarDesc, + size_t activationDesc) except? 
0 + + cpdef batchNormalizationBackwardEx( + intptr_t handle, int mode, int bnops, + size_t alphaDataDiff, size_t betaDataDiff, + size_t alphaParamDiff, size_t betaParamDiff, + size_t xDesc, size_t x, + size_t yDesc, size_t y, + size_t dyDesc, size_t dy, + size_t dzDesc, size_t dz, + size_t dxDesc, size_t dx, + size_t dBnScaleBiasDesc, + size_t bnScaleData, size_t bnBiasData, + size_t dBnScaleData, size_t dBnBiasData, + double epsilon, + size_t savedMean, size_t savedInvVariance, + size_t activationDesc, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + cpdef size_t getBatchNormalizationBackwardExWorkspaceSize( + intptr_t handle, int mode, int bnOps, + size_t xDesc, + size_t yDesc, + size_t dyDesc, + size_t dzDesc, + size_t dxDesc, + size_t dBnScaleBiasDesc, + size_t activationDesc) except? 0 + + cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( + intptr_t handle, int mode, int bnOps, + size_t activationDesc, + size_t xDesc) except? 0 + + + ############################################################################### + # Activation + ############################################################################### + + cpdef size_t createActivationDescriptor() except? 
0 + cpdef setActivationDescriptor( + size_t activationDesc, int mode, int reluNanOpt, double reluCeiling) + cpdef destroyActivationDescriptor(size_t activationDesc) + cpdef softmaxForward( + intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData) + cpdef softmaxBackward( + intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, size_t beta, + size_t destDiffDesc, size_t destDiffData) + cpdef activationForward_v4( + intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData) + cpdef activationBackward_v4( + intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, + size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, + size_t destDiffData) + + + ############################################################################### + # Dropout + ############################################################################### + cpdef size_t createDropoutDescriptor() except? 0 + cpdef destroyDropoutDescriptor(size_t dropoutDesc) + cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? -1 + cpdef setDropoutDescriptor( + size_t dropoutDesc, intptr_t handle, float dropout, + size_t states, size_t stateSizeInBytes, unsigned long long seed) + cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 
0 + cpdef dropoutForward( + intptr_t handle, size_t dropoutDesc, + size_t srcDesc, size_t srcData, + size_t dstDesc, size_t dstData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + cpdef dropoutBackward( + intptr_t handle, size_t dropoutDesc, + size_t dyDesc, size_t dyData, + size_t dxtDesc, size_t dxData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + + ############################################################################### + # CTC + ############################################################################### + + cpdef size_t createCTCLossDescriptor() except? 0 + cpdef destroyCTCLossDescriptor(size_t ctcLossDesc) + cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType) + cpdef getCTCLossDescriptor(size_t ctcLossDesc) + cpdef size_t getCTCLossWorkspaceSize( + intptr_t handle, size_t probsDesc, size_t gradientsDesc, + size_t labels, size_t labelLengths, size_t inputLengths, + int algo, size_t ctcLossDesc) except? 0 + cpdef CTCLoss( + intptr_t handle, size_t probsDesc, + size_t probs, size_t labels, size_t labelLengths, size_t inputLengths, + size_t costs, size_t gradientsDesc, size_t gradients, int algo, + size_t ctcLossDesc, size_t workspace, size_t workSpaceSizeInBytes) + + + ############################################################################### + # RNN + ############################################################################### + + cpdef size_t createRNNDescriptor() except? 0 + cpdef destroyRNNDescriptor(size_t rnnDesc) + cpdef size_t createPersistentRNNPlan( + size_t rnnDesc, int minibatch, int dataType) except? 
0 + cpdef setPersistentRNNPlan(size_t rnnDesc, size_t plan) + cpdef destroyPersistentRNNPlan(size_t plan) + cpdef setRNNDescriptor_v5( + size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int dataType) + cpdef setRNNDescriptor_v6( + intptr_t handle, size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int algo, int dataType) + cpdef setRNNPaddingMode(size_t rnnDesc, int paddingMode) + cpdef getRNNPaddingMode(size_t rnnDesc) + cpdef size_t createRNNDataDescriptor() except? 0 + cpdef destroyRNNDataDescriptor(size_t RNNDataDesc) + cpdef setRNNDataDescriptor( + size_t RNNDataDesc, int dataType, size_t layout, + int maxSeqLength, int batchSize, int vectorSize, + size_t seqLengthArray, size_t paddingFill) + cpdef getRNNDataDescriptor( + size_t RNNDataDesc, size_t dataType, + size_t layout, size_t maxSeqLength, size_t batchSize, + size_t vectorSize, int arrayLengthRequested, size_t seqLengthArray, + size_t paddingFill) + cpdef getRNNWorkspaceSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc) + cpdef getRNNTrainingReserveSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc) + cpdef getRNNParamsSize( + intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType) + cpdef getRNNLinLayerMatrixParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t linLayerMatDesc, size_t linLayerMat) + cpdef getRNNLinLayerBiasParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t linLayerBiasDesc, + size_t linLayerBias) + cpdef RNNForwardInference( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, + size_t x, size_t hxDesc, size_t hx, size_t cxDesc, + size_t cx, size_t wDesc, size_t w, size_t yDesc, + size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t workspace, size_t 
workSpaceSizeInBytes) + cpdef RNNForwardTraining( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t wDesc, size_t w, size_t yDesc, size_t y, + size_t hyDesc, size_t hy, size_t cyDesc, size_t cy, + size_t workspace, size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes) + cpdef RNNBackwardData( + intptr_t handle, size_t rnnDesc, int seqLength, size_t yDesc, size_t y, + size_t dyDesc, size_t dy, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t dxDesc, size_t dx, size_t dhxDesc, size_t dhx, + size_t dcxDesc, size_t dcx, size_t workspace, + size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes) + cpdef RNNBackwardWeights( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workspace, size_t workSpaceSizeInBytes, size_t dwDesc, + size_t dw, size_t reserveSpace, size_t reserveSpaceSizeInBytes) + cpdef RNNForwardInferenceEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef RNNForwardTrainingEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t 
reserveSpaceSizeInBytes) + cpdef RNNBackwardDataEx( + intptr_t handle, size_t rnnDesc, size_t yDesc, size_t y, size_t dyDesc, + size_t dy, size_t dcDesc, size_t dcAttn, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t dxDesc, size_t dx, + size_t dhxDesc, size_t dhx, size_t dcxDesc, size_t dcx, + size_t dkDesc, size_t dkeys, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + cpdef RNNBackwardWeightsEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t dwDesc, size_t dw, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + + ############################################################################### + # Spatial Transformer + ############################################################################### + + cpdef size_t createSpatialTransformerDescriptor() except? 
0 + cpdef destroySpatialTransformerDescriptor(size_t stDesc) + cpdef setSpatialTransformerDescriptor( + size_t stDesc, size_t samplerType, int dataType, + int nbDims, size_t dimA) + cpdef spatialTfGridGeneratorForward( + intptr_t handle, size_t stDesc, size_t theta, size_t grid) + cpdef spatialTfGridGeneratorBackward( + intptr_t handle, size_t stDesc, size_t dgrid, size_t dtheta) + cpdef spatialTfSamplerForward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t grid, size_t beta, size_t yDesc, size_t y) + cpdef spatialTfSamplerBackward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t beta, size_t dxDesc, size_t dx, size_t alphaDgrid, + size_t dyDesc, size_t dy, size_t grid, size_t betaDgrid, size_t dgrid) + + ############################################################################### + # Fused Ops + ############################################################################### + + cpdef createFusedOpsConstParamPack(int ops) + cpdef destroyFusedOpsConstParamPack(size_t constPack) + cpdef setFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param) + cpdef getFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param) + cpdef createFusedOpsVariantParamPack(int ops) + cpdef destroyFusedOpsVariantParamPack(size_t varPack) + cpdef setFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr) + cpdef getFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr) + cpdef createFusedOpsPlan(int ops) + cpdef destroyFusedOpsPlan(size_t plan) + cpdef makeFusedOpsPlan(intptr_t handle, size_t plan, size_t constPack) + cpdef fusedOpsExecute(intptr_t handle, size_t plan, size_t varPack) diff --git a/cupy_backends/cuda/libs/cudnn.pyx b/cupy_backends/cuda/libs/cudnn.pyx index 464c59d8a00..2af12623dc6 100644 --- a/cupy_backends/cuda/libs/cudnn.pyx +++ b/cupy_backends/cuda/libs/cudnn.pyx @@ -8,2534 +8,2536 @@ from libcpp cimport 
vector from cupy_backends.cuda.api cimport driver from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module - -############################################################################### -# Extern -############################################################################### - -cdef extern from '../../cupy_cudnn.h' nogil: - # Types - ctypedef int ActivationMode 'cudnnActivationMode_t' - ctypedef int AddMode 'cudnnAddMode_t' - ctypedef int BatchNormMode 'cudnnBatchNormMode_t' - ctypedef int BatchNormOps 'cudnnBatchNormOps_t' - ctypedef int ConvolutionBwdDataAlgo 'cudnnConvolutionBwdDataAlgo_t' - ctypedef int ConvolutionBwdDataPreference \ - 'cudnnConvolutionBwdDataPreference_t' - ctypedef struct ConvolutionBwdDataAlgoPerf \ - 'cudnnConvolutionBwdDataAlgoPerf_t': # NOQA: E125 - int algo - int status - float time - size_t memory - ctypedef struct ConvolutionBwdDataAlgoPerf_v7 \ - 'cudnnConvolutionBwdDataAlgoPerf_v7_t': # NOQA: E125 - int algo - int status - float time - size_t memory - int determinism - int mathType - ctypedef int ConvolutionBwdFilterAlgo 'cudnnConvolutionBwdFilterAlgo_t' - ctypedef int ConvolutionBwdFilterPreference \ - 'cudnnConvolutionBwdFilterPreference_t' - ctypedef struct ConvolutionBwdFilterAlgoPerf \ - 'cudnnConvolutionBwdFilterAlgoPerf_t': # NOQA: E125 - int algo - int status - float time - size_t memory - ctypedef struct ConvolutionBwdFilterAlgoPerf_v7 \ - 'cudnnConvolutionBwdFilterAlgoPerf_v7_t': # NOQA: E125 - int algo - int status - float time - size_t memory - int determinism - int mathType - ctypedef int ConvolutionFwdAlgo 'cudnnConvolutionFwdAlgo_t' - ctypedef int ConvolutionFwdPreference 'cudnnConvolutionFwdPreference_t' - ctypedef struct ConvolutionFwdAlgoPerf 'cudnnConvolutionFwdAlgoPerf_t': - int algo - int status - float time - size_t memory - ctypedef struct ConvolutionFwdAlgoPerf_v7 \ - 'cudnnConvolutionFwdAlgoPerf_v7_t': # NOQA: E125 - int algo - int status - float time - size_t 
memory - int determinism - int mathType - ctypedef int ConvolutionMode 'cudnnConvolutionMode_t' - ctypedef int DataType 'cudnnDataType_t' - ctypedef int MathType 'cudnnMathType_t' - ctypedef int DirectionMode 'cudnnDirectionMode_t' - ctypedef int NanPropagation 'cudnnNanPropagation_t' - ctypedef int PoolingMode 'cudnnPoolingMode_t' - ctypedef int RNNInputMode 'cudnnRNNInputMode_t' - ctypedef int CTCLossAlgo 'cudnnCTCLossAlgo_t' - ctypedef int RNNMode 'cudnnRNNMode_t' - ctypedef int RNNAlgo 'cudnnRNNAlgo_t' - ctypedef int RNNDataLayout 'cudnnRNNDataLayout_t' - ctypedef int RNNPaddingMode 'cudnnRNNPaddingMode_t' - ctypedef int SoftmaxAlgorithm 'cudnnSoftmaxAlgorithm_t' - ctypedef int SoftmaxMode 'cudnnSoftmaxMode_t' - ctypedef int Status 'cudnnStatus_t' - ctypedef int TensorFormat 'cudnnTensorFormat_t' - ctypedef int OpTensorOp 'cudnnOpTensorOp_t' - ctypedef int ReduceTensorOp 'cudnnReduceTensorOp_t' - ctypedef int ReduceTensorIndices 'cudnnReduceTensorIndices_t' - ctypedef int IndicesType 'cudnnIndicesType_t' - ctypedef int ErrQueryMode 'cudnnErrQueryMode_t' - ctypedef int FusedOps 'cudnnFusedOps_t' - ctypedef int FusedOpsConstParamLabel 'cudnnFusedOpsConstParamLabel_t' - ctypedef int FusedOpsPointerPlaceHolder 'cudnnFusedOpsPointerPlaceHolder_t' - ctypedef int FusedOpsVariantParamLabel 'cudnnFusedOpsVariantParamLabel_t' - ctypedef struct RuntimeTag 'cudnnRuntimeTag_t' - - ctypedef void* ActivationDescriptor 'cudnnActivationDescriptor_t' - ctypedef void* ConvolutionDescriptor 'cudnnConvolutionDescriptor_t' - ctypedef void* DropoutDescriptor 'cudnnDropoutDescriptor_t' - ctypedef void* FilterDescriptor 'cudnnFilterDescriptor_t' - ctypedef void* Handle 'cudnnHandle_t' - ctypedef void* PoolingDescriptor 'cudnnPoolingDescriptor_t' - ctypedef void* CTCLossDescriptor 'cudnnCTCLossDescriptor_t' - ctypedef void* RNNDescriptor 'cudnnRNNDescriptor_t' - ctypedef void* RNNDataDescriptor 'cudnnRNNDataDescriptor_t' - ctypedef void* PersistentRNNPlan 'cudnnPersistentRNNPlan_t' - 
ctypedef void* TensorDescriptor 'cudnnTensorDescriptor_t' - ctypedef void* OpTensorDescriptor 'cudnnOpTensorDescriptor_t' - ctypedef void* ReduceTensorDescriptor 'cudnnReduceTensorDescriptor_t' - ctypedef void* SpatialTransformerDescriptor \ - 'cudnnSpatialTransformerDescriptor_t' - ctypedef void* SamplerType 'cudnnSamplerType_t' - ctypedef void* FusedOpsConstParamPack 'cudnnFusedOpsConstParamPack_t' - ctypedef void* FusedOpsVariantParamPack 'cudnnFusedOpsVariantParamPack_t' - ctypedef void* FusedOpsPlan 'cudnnFusedOpsPlan_t' - +IF CUPY_USE_GEN_HIP_CODE: + from cupy_backends.cuda.libs.miopen import * +ELSE: + ############################################################################### + # Extern + ############################################################################### + + cdef extern from '../../cupy_cudnn.h' nogil: + # Types + ctypedef int ActivationMode 'cudnnActivationMode_t' + ctypedef int AddMode 'cudnnAddMode_t' + ctypedef int BatchNormMode 'cudnnBatchNormMode_t' + ctypedef int BatchNormOps 'cudnnBatchNormOps_t' + ctypedef int ConvolutionBwdDataAlgo 'cudnnConvolutionBwdDataAlgo_t' + ctypedef int ConvolutionBwdDataPreference \ + 'cudnnConvolutionBwdDataPreference_t' + ctypedef struct ConvolutionBwdDataAlgoPerf \ + 'cudnnConvolutionBwdDataAlgoPerf_t': # NOQA: E125 + int algo + int status + float time + size_t memory + ctypedef struct ConvolutionBwdDataAlgoPerf_v7 \ + 'cudnnConvolutionBwdDataAlgoPerf_v7_t': # NOQA: E125 + int algo + int status + float time + size_t memory + int determinism + int mathType + ctypedef int ConvolutionBwdFilterAlgo 'cudnnConvolutionBwdFilterAlgo_t' + ctypedef int ConvolutionBwdFilterPreference \ + 'cudnnConvolutionBwdFilterPreference_t' + ctypedef struct ConvolutionBwdFilterAlgoPerf \ + 'cudnnConvolutionBwdFilterAlgoPerf_t': # NOQA: E125 + int algo + int status + float time + size_t memory + ctypedef struct ConvolutionBwdFilterAlgoPerf_v7 \ + 'cudnnConvolutionBwdFilterAlgoPerf_v7_t': # NOQA: E125 + int algo + int status + 
float time + size_t memory + int determinism + int mathType + ctypedef int ConvolutionFwdAlgo 'cudnnConvolutionFwdAlgo_t' + ctypedef int ConvolutionFwdPreference 'cudnnConvolutionFwdPreference_t' + ctypedef struct ConvolutionFwdAlgoPerf 'cudnnConvolutionFwdAlgoPerf_t': + int algo + int status + float time + size_t memory + ctypedef struct ConvolutionFwdAlgoPerf_v7 \ + 'cudnnConvolutionFwdAlgoPerf_v7_t': # NOQA: E125 + int algo + int status + float time + size_t memory + int determinism + int mathType + ctypedef int ConvolutionMode 'cudnnConvolutionMode_t' + ctypedef int DataType 'cudnnDataType_t' + ctypedef int MathType 'cudnnMathType_t' + ctypedef int DirectionMode 'cudnnDirectionMode_t' + ctypedef int NanPropagation 'cudnnNanPropagation_t' + ctypedef int PoolingMode 'cudnnPoolingMode_t' + ctypedef int RNNInputMode 'cudnnRNNInputMode_t' + ctypedef int CTCLossAlgo 'cudnnCTCLossAlgo_t' + ctypedef int RNNMode 'cudnnRNNMode_t' + ctypedef int RNNAlgo 'cudnnRNNAlgo_t' + ctypedef int RNNDataLayout 'cudnnRNNDataLayout_t' + ctypedef int RNNPaddingMode 'cudnnRNNPaddingMode_t' + ctypedef int SoftmaxAlgorithm 'cudnnSoftmaxAlgorithm_t' + ctypedef int SoftmaxMode 'cudnnSoftmaxMode_t' + ctypedef int Status 'cudnnStatus_t' + ctypedef int TensorFormat 'cudnnTensorFormat_t' + ctypedef int OpTensorOp 'cudnnOpTensorOp_t' + ctypedef int ReduceTensorOp 'cudnnReduceTensorOp_t' + ctypedef int ReduceTensorIndices 'cudnnReduceTensorIndices_t' + ctypedef int IndicesType 'cudnnIndicesType_t' + ctypedef int ErrQueryMode 'cudnnErrQueryMode_t' + ctypedef int FusedOps 'cudnnFusedOps_t' + ctypedef int FusedOpsConstParamLabel 'cudnnFusedOpsConstParamLabel_t' + ctypedef int FusedOpsPointerPlaceHolder 'cudnnFusedOpsPointerPlaceHolder_t' + ctypedef int FusedOpsVariantParamLabel 'cudnnFusedOpsVariantParamLabel_t' + ctypedef struct RuntimeTag 'cudnnRuntimeTag_t' + + ctypedef void* ActivationDescriptor 'cudnnActivationDescriptor_t' + ctypedef void* ConvolutionDescriptor 'cudnnConvolutionDescriptor_t' + 
ctypedef void* DropoutDescriptor 'cudnnDropoutDescriptor_t' + ctypedef void* FilterDescriptor 'cudnnFilterDescriptor_t' + ctypedef void* Handle 'cudnnHandle_t' + ctypedef void* PoolingDescriptor 'cudnnPoolingDescriptor_t' + ctypedef void* CTCLossDescriptor 'cudnnCTCLossDescriptor_t' + ctypedef void* RNNDescriptor 'cudnnRNNDescriptor_t' + ctypedef void* RNNDataDescriptor 'cudnnRNNDataDescriptor_t' + ctypedef void* PersistentRNNPlan 'cudnnPersistentRNNPlan_t' + ctypedef void* TensorDescriptor 'cudnnTensorDescriptor_t' + ctypedef void* OpTensorDescriptor 'cudnnOpTensorDescriptor_t' + ctypedef void* ReduceTensorDescriptor 'cudnnReduceTensorDescriptor_t' + ctypedef void* SpatialTransformerDescriptor \ + 'cudnnSpatialTransformerDescriptor_t' + ctypedef void* SamplerType 'cudnnSamplerType_t' + ctypedef void* FusedOpsConstParamPack 'cudnnFusedOpsConstParamPack_t' + ctypedef void* FusedOpsVariantParamPack 'cudnnFusedOpsVariantParamPack_t' + ctypedef void* FusedOpsPlan 'cudnnFusedOpsPlan_t' + + # Error handling + const char* cudnnGetErrorString(Status status) + + # Version + size_t cudnnGetVersion() + + # Runtime error checking + int cudnnQueryRuntimeError(Handle handle, Status *rstatus, + ErrQueryMode mode, RuntimeTag *tag) + + # Initialization and CUDA cooperation + int cudnnCreate(Handle* handle) + int cudnnDestroy(Handle handle) + int cudnnSetStream(Handle handle, driver.Stream stream) + int cudnnGetStream(Handle handle, driver.Stream* stream) + + # Tensor manipulation + int cudnnCreateTensorDescriptor(TensorDescriptor* descriptor) + int cudnnSetTensor4dDescriptor( + TensorDescriptor tensorDesc, TensorFormat format, + DataType dataType, int n, int c, int h, int w) + int cudnnSetTensor4dDescriptorEx( + TensorDescriptor tensorDesc, DataType dataType, + int n, int c, int h, int w, + int nStride, int cStride, int hStride, int wStride) + int cudnnGetTensor4dDescriptor( + TensorDescriptor tensorDesc, DataType* dataType, + int* n, int* c, int* h, int* w, + int* nStride, int* 
cStride, int* hStride, int* wStride) + int cudnnSetTensorNdDescriptor( + TensorDescriptor tensorDesc, DataType dataType, int nbDims, + int* dimA, int* strideA) + int cudnnDestroyTensorDescriptor(TensorDescriptor tensorDesc) + int cudnnAddTensor_v3( + Handle handle, void* alpha, TensorDescriptor bDesc, + void* b, void* beta, TensorDescriptor yDesc, void* y) + + # Tensor operations + int cudnnCreateOpTensorDescriptor(OpTensorDescriptor* opTensorDesc) + int cudnnSetOpTensorDescriptor( + OpTensorDescriptor opTensorDesc, OpTensorOp opTensorOp, + DataType opTensorCompType, NanPropagation opTensorNanOpt) + int cudnnGetOpTensorDescriptor( + OpTensorDescriptor opTensorDesc, OpTensorOp* opTensorOp, + DataType* opTensorCompType, NanPropagation* opTensorNanOpt) + int cudnnDestroyOpTensorDescriptor(OpTensorDescriptor opTensorDesc) + int cudnnOpTensor( + Handle handle, OpTensorDescriptor opTensorDesc, void* alpha1, + TensorDescriptor aDesc, void* A, void* alpha2, + TensorDescriptor bDesc, void* B, void* beta, + TensorDescriptor cDesc, void* C) + + # Tensor reductions + int cudnnCreateReduceTensorDescriptor( + ReduceTensorDescriptor* reduceTensorDesc) + int cudnnSetReduceTensorDescriptor( + ReduceTensorDescriptor reduceTensorDesc, ReduceTensorOp reduceTensorOp, + DataType reduceTensorCompType, NanPropagation reduceTensorNanOpt, + ReduceTensorIndices reduceTensorIndices, + IndicesType reduceTensorIndicesType) + int cudnnGetReduceTensorDescriptor( + ReduceTensorDescriptor reduceTensorDesc, + ReduceTensorOp* reduceTensorOp, DataType* reduceTensorCompType, + NanPropagation* reduceTensorNanOpt, + ReduceTensorIndices* reduceTensorIndices, + IndicesType* reduceTensorIndicesType) + int cudnnDestroyReduceTensorDescriptor( + ReduceTensorDescriptor reduceTensorDesc) + int cudnnGetReductionIndicesSize( + Handle handle, ReduceTensorDescriptor reduceTensorDesc, + TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* sizeInBytes) + int cudnnGetReductionWorkspaceSize( + Handle handle, 
ReduceTensorDescriptor reduceTensorDesc, + TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* sizeInBytes) + int cudnnReduceTensor( + Handle handle, ReduceTensorDescriptor reduceTensorDesc, void* indices, + size_t indicesSizeInBytes, void* workspace, + size_t workspaceSizeInBytes, void* alpha, TensorDescriptor aDesc, + void* A, void* beta, TensorDescriptor cDesc, void* c) + int cudnnSetTensor( + Handle handle, TensorDescriptor yDesc, void* y, void* valuePtr) + int cudnnScaleTensor( + Handle handle, TensorDescriptor yDesc, void* y, void* alpha) + + # Filter manipulation + int cudnnCreateFilterDescriptor(FilterDescriptor* filterDesc) + int cudnnSetFilter4dDescriptor_v4( + FilterDescriptor filterDesc, DataType dataType, + TensorFormat format, int k, int c, int h, int w) + int cudnnSetFilterNdDescriptor_v4( + FilterDescriptor filterDesc, DataType dataType, + TensorFormat format, int nbDims, const int filterDimA[]) + int cudnnGetFilterNdDescriptor_v4( + FilterDescriptor wDesc, int nbDimsRequested, DataType* dataType, + TensorFormat* format, int* nbDims, int filterDimA[]) + int cudnnDestroyFilterDescriptor(FilterDescriptor filterDesc) + + # Convolution + int cudnnCreateConvolutionDescriptor(ConvolutionDescriptor* convDesc) + int cudnnSetConvolutionMathType( + ConvolutionDescriptor convDesc, MathType mathType) + int cudnnGetConvolutionMathType( + ConvolutionDescriptor convDesc, MathType *mathType) + int cudnnSetConvolutionGroupCount( + ConvolutionDescriptor convDesc, int groupCount) + int cudnnGetConvolutionGroupCount( + ConvolutionDescriptor convDesc, int *groupCount) + int cudnnSetConvolution2dDescriptor_v4( + ConvolutionDescriptor convDesc, int pad_h, int pad_w, int u, + int v, int dilation_h, int dilation_w, ConvolutionMode mode) + int cudnnSetConvolution2dDescriptor_v5( + ConvolutionDescriptor convDesc, int pad_h, int pad_w, int u, + int v, int dilation_h, int dilation_w, ConvolutionMode mode, + DataType computeType) + int cudnnSetConvolutionNdDescriptor_v3( + 
ConvolutionDescriptor convDesc, int arrayLength, int* padA, + int* filterStrideA, int* dilationA, ConvolutionMode mode, + DataType dataType) + int cudnnDestroyConvolutionDescriptor(ConvolutionDescriptor conDesc) + int cudnnFindConvolutionForwardAlgorithm( + Handle handle, TensorDescriptor xDesc, FilterDescriptor wDesc, + ConvolutionDescriptor convDesc, TensorDescriptor yDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionFwdAlgoPerf* perfResults) + int cudnnFindConvolutionForwardAlgorithmEx( + Handle handle, TensorDescriptor xDesc, void* x, + FilterDescriptor wDesc, void* w, ConvolutionDescriptor convDesc, + TensorDescriptor yDesc, void* y, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionFwdAlgoPerf* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnFindConvolutionForwardAlgorithmEx_v7( + Handle handle, TensorDescriptor xDesc, void* x, + FilterDescriptor wDesc, void* w, ConvolutionDescriptor convDesc, + TensorDescriptor yDesc, void* y, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionFwdAlgoPerf_v7* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnGetConvolutionForwardAlgorithm_v6( + Handle handle, TensorDescriptor srcDesc, + FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, + TensorDescriptor destDesc, ConvolutionFwdPreference preference, + size_t memoryLimitInbytes, ConvolutionFwdAlgo* algo) + int cudnnGetConvolutionForwardAlgorithm_v7( + Handle handle, TensorDescriptor srcDesc, + FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, + TensorDescriptor destDesc, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionFwdAlgoPerf_v7* perfResults) + int cudnnGetConvolutionForwardWorkspaceSize( + Handle handle, TensorDescriptor srcDesc, + FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, + TensorDescriptor destDesc, ConvolutionFwdAlgo algo, + size_t* sizeInBytes) + int cudnnConvolutionForward( + Handle handle, void* alpha, TensorDescriptor 
srcDesc, + void* srcData, FilterDescriptor filterDesc, void* filterData, + ConvolutionDescriptor convDesc, ConvolutionFwdAlgo algo, + void* workSpace, size_t workSpaceSizeInBytes, void* beta, + TensorDescriptor destDesc, void* destData) + int cudnnConvolutionBackwardBias( + Handle handle, void* alpha, + TensorDescriptor srcDesc, void* srcData, void* beta, + TensorDescriptor destDesc, void* destData) + int cudnnFindConvolutionBackwardFilterAlgorithm( + Handle handle, TensorDescriptor xDesc, TensorDescriptor dyDesc, + ConvolutionDescriptor convDesc, FilterDescriptor dwDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionBwdFilterAlgoPerf* perfResults) + int cudnnFindConvolutionBackwardFilterAlgorithmEx( + Handle handle, TensorDescriptor xDesc, void* x, + TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, + FilterDescriptor dwDesc, void* dw, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionBwdFilterAlgoPerf* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnFindConvolutionBackwardFilterAlgorithmEx_v7( + Handle handle, TensorDescriptor xDesc, void* x, + TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, + FilterDescriptor dwDesc, void* dw, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionBwdFilterAlgoPerf_v7* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnGetConvolutionBackwardFilterAlgorithm_v6( + Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, FilterDescriptor filterDesc, + ConvolutionBwdFilterPreference preference, + size_t memoryLimitInbytes, ConvolutionBwdFilterAlgo* algo) + int cudnnGetConvolutionBackwardFilterAlgorithm_v7( + Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, FilterDescriptor gradDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionBwdFilterAlgoPerf_v7* perfResults) + int 
cudnnGetConvolutionBackwardFilterWorkspaceSize( + Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, FilterDescriptor filterDesc, + ConvolutionBwdFilterAlgo algo, size_t* sizeInBytes) + int cudnnConvolutionBackwardFilter_v3( + Handle handle, void* alpha, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor diffDesc, void* diffData, + ConvolutionDescriptor convDesc, ConvolutionBwdFilterAlgo algo, + void* workSpace, size_t workSpaceSizeInBytes, void* beta, + FilterDescriptor gradDesc, void* gradData) + int cudnnGetConvolutionBackwardDataAlgorithm_v6( + Handle handle, FilterDescriptor filterDesc, + TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, TensorDescriptor gradDesc, + ConvolutionBwdDataPreference preference, + size_t memoryLimitInbytes, ConvolutionBwdDataAlgo* algo) + int cudnnGetConvolutionBackwardDataAlgorithm_v7( + Handle handle, TensorDescriptor filterDesc, TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, FilterDescriptor gradDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionBwdDataAlgoPerf_v7* perfResults) + int cudnnFindConvolutionBackwardDataAlgorithm( + Handle handle, TensorDescriptor wDesc, TensorDescriptor dyDesc, + ConvolutionDescriptor convDesc, FilterDescriptor dxDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionBwdDataAlgoPerf* perfResults) + int cudnnFindConvolutionBackwardDataAlgorithmEx( + Handle handle, FilterDescriptor wDesc, void* w, + TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, + TensorDescriptor dxDesc, void* dx, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionBwdDataAlgoPerf* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnFindConvolutionBackwardDataAlgorithmEx_v7( + Handle handle, FilterDescriptor wDesc, void* w, + TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, + TensorDescriptor dxDesc, void* dx, int requestedAlgoCount, + int* 
returnedAlgoCount, ConvolutionBwdDataAlgoPerf_v7* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnGetConvolutionBackwardDataWorkspaceSize( + Handle handle, FilterDescriptor filterDesc, + TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, TensorDescriptor gradDesc, + ConvolutionBwdDataAlgo algo, size_t* sizeInBytes) + int cudnnConvolutionBackwardData_v3( + Handle handle, void* alpha, + FilterDescriptor filterDesc, void* filterData, + TensorDescriptor diffDesc, void* diffData, + ConvolutionDescriptor convDesc, ConvolutionBwdDataAlgo algo, + void* workSpace, size_t workSpaceSizeInBytes, void* beta, + TensorDescriptor gradDesc, void* gradData) + + # Pooling + int cudnnCreatePoolingDescriptor(PoolingDescriptor* desc) + int cudnnSetPooling2dDescriptor_v4( + PoolingDescriptor poolingDesc, PoolingMode mode, + NanPropagation maxpoolingNanOpt, int windowHeight, int windowWidth, + int verticalPadding, int horizontalPadding, int verticalStride, + int horizontalStride) + int cudnnSetPoolingNdDescriptor_v4( + PoolingDescriptor poolingDesc, PoolingMode mode, + NanPropagation maxpoolingNanOpt, int nbDims, + int* windowDimA, int* paddingA, int* strideA) + int cudnnDestroyPoolingDescriptor(PoolingDescriptor poolingDesc) + int cudnnPoolingForward( + Handle handle, PoolingDescriptor poolingDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, void* beta, + TensorDescriptor dstDesc, void* dstData) + int cudnnPoolingBackward( + Handle handle, PoolingDescriptor poolingDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor srcDiffDesc, void* srcDiffData, + TensorDescriptor destDesc, void* destData, void* beta, + TensorDescriptor destDiffDesc, void* destDiffData) + + # Batch Normalization + int cudnnDeriveBNTensorDescriptor( + TensorDescriptor derivedBnDesc, TensorDescriptor xDesc, + BatchNormMode mode) + int cudnnBatchNormalizationForwardTraining( + Handle handle, BatchNormMode mode, + void* alpha, void* beta, 
TensorDescriptor xDesc, + void* x, TensorDescriptor yDesc, void* y, + TensorDescriptor bnScaleBiasMeanVarDesc, void* bnScale, + void* bnBias, double exponentialAverageFactor, + void* resultRunningMean, void* resultRunningVariance, + double epsilon, void* resultSaveMean, + void* resultSaveInvVariance) + int cudnnBatchNormalizationForwardInference( + Handle handle, BatchNormMode mode, + void* alpha, void* beta, TensorDescriptor xDesc, + void* x, TensorDescriptor yDesc, void* y, + TensorDescriptor bnScaleBiasMeanVarDesc, void* bnScale, + void* bnBias, void* estimatedMean, void* estimatedVariance, + double epsilon) + int cudnnBatchNormalizationBackward( + Handle handle, BatchNormMode mode, + void* alphaDataDiff, void* betaDataDiff, + void* alphaParamDiff, void* betaParamDiff, + TensorDescriptor xDesc, void* x, + TensorDescriptor dyDesc, void* dy, + TensorDescriptor dxDesc, void* dx, + TensorDescriptor dBnScaleBiasDesc, void* bnScale, + void* dBnScaleResult, void* dBnBiasResult, + double epsilon, void* savedMean, void* savedInvVariance) + + int cudnnBatchNormalizationForwardTrainingEx( + Handle handle, + BatchNormMode mode, BatchNormOps bnOps, + void* alpha, void* beta, + TensorDescriptor xDesc, void* x, + TensorDescriptor zDesc, void* z, + TensorDescriptor yDesc, void* y, + TensorDescriptor bnScaleBiasMeanVarDesc, + void* bnScale, void* bnBias, + double exponentialAverageFactor, + void* resultRunningMean, void* resultRunningVariance, + double epsilon, + void* resultSaveMean, void* resultSaveInvVariance, + ActivationDescriptor activationDesc, + void* workspace, size_t workSpaceSizeInBytes, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + Handle handle, + BatchNormMode mode, BatchNormOps bnOps, + TensorDescriptor xDesc, + TensorDescriptor zDesc, + TensorDescriptor yDesc, + TensorDescriptor bnScaleBiasMeanVarDesc, + ActivationDescriptor activationDesc, + size_t* sizeInBytes) + int 
cudnnBatchNormalizationBackwardEx( + Handle handle, + BatchNormMode mode, BatchNormOps bnops, + void* alphaDataDiff, void* betaDataDiff, + void* alphaParamDiff, void* betaParamDiff, + TensorDescriptor xDesc, void* x, + TensorDescriptor yDesc, void* y, + TensorDescriptor dyDesc, void* dy, + TensorDescriptor dzDesc, void* dz, + TensorDescriptor dxDesc, void* dx, + TensorDescriptor dBnScaleBiasDesc, + void* bnScaleData, void* bnBiasData, + void* dBnScaleData, void* dBnBiasData, + double epsilon, + void* savedMean, void* savedInvVariance, + ActivationDescriptor activationDesc, + void* workspace, size_t workSpaceSizeInBytes, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnGetBatchNormalizationBackwardExWorkspaceSize( + Handle handle, + BatchNormMode mode, + BatchNormOps bnOps, + TensorDescriptor xDesc, + TensorDescriptor yDesc, + TensorDescriptor dyDesc, + TensorDescriptor dzDesc, + TensorDescriptor dxDesc, + TensorDescriptor dBnScaleBiasDesc, + ActivationDescriptor activationDesc, + size_t* sizeInBytes) + int cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + Handle handle, + BatchNormMode mode, + BatchNormOps bnOps, + ActivationDescriptor activationDesc, + TensorDescriptor xDesc, + size_t* sizeInBytes) + + # Activation + int cudnnCreateActivationDescriptor( + ActivationDescriptor* activationDesc) + int cudnnSetActivationDescriptor( + ActivationDescriptor activationDesc, ActivationMode mode, + NanPropagation reluNanOpt, double reluCeiling) + int cudnnDestroyActivationDescriptor( + ActivationDescriptor activationDesc) + int cudnnSoftmaxForward( + Handle handle, SoftmaxAlgorithm algorithm, SoftmaxMode mode, + void* alpha, TensorDescriptor srcDesc, void* srcData, + void* beta, TensorDescriptor dstDesc, void* dstData) + int cudnnSoftmaxBackward( + Handle handle, SoftmaxAlgorithm algorithm, SoftmaxMode mode, + void* alpha, TensorDescriptor srcDesc, void* srcData, + TensorDescriptor srcDiffDesc, void* srcDiffData, void* beta, + TensorDescriptor 
destDiffDesc, void* destDiffData) + int cudnnActivationForward_v4( + Handle handle, ActivationDescriptor activationDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, void* beta, + TensorDescriptor dstDesc, void* dstData) + int cudnnActivationBackward_v4( + Handle handle, ActivationDescriptor activationDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor srcDiffDesc, void* srcDiffData, + TensorDescriptor destDesc, void* destData, void* beta, + TensorDescriptor destDiffDesc, void* destDiffData) + + # Dropout + int cudnnCreateDropoutDescriptor(DropoutDescriptor* desc) + int cudnnDestroyDropoutDescriptor(DropoutDescriptor dropoutDesc) + int cudnnDropoutGetStatesSize(Handle handle, size_t* sizeInBytes) + int cudnnDropoutGetReserveSpaceSize( + TensorDescriptor xDesc, size_t* sizeInBytes) + int cudnnSetDropoutDescriptor( + DropoutDescriptor dropoutDesc, Handle handle, float dropout, + void* states, size_t stateSizeInBytes, unsigned long long seed) + int cudnnDropoutForward( + Handle handle, DropoutDescriptor dropoutDesc, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor dstDesc, void* dstData, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnDropoutBackward( + Handle handle, DropoutDescriptor dropoutDesc, + TensorDescriptor dydesc, void* dy, TensorDescriptor dxdesc, + void* dx, void* reserveSpace, size_t reserveSpaceSizeInBytes) + + # CTC + int cudnnCreateCTCLossDescriptor(CTCLossDescriptor* ctcLossDesc) + int cudnnDestroyCTCLossDescriptor(CTCLossDescriptor ctcLossDesc) + int cudnnSetCTCLossDescriptor( + CTCLossDescriptor ctcLossDesc, DataType dataType) + int cudnnGetCTCLossDescriptor( + CTCLossDescriptor ctcLossDesc, DataType* dataType) + int cudnnGetCTCLossWorkspaceSize( + Handle handle, TensorDescriptor probsDesc, + TensorDescriptor gradientsDesc, int* labels, + int* labelLengths, int* inputLengths, CTCLossAlgo algo, + CTCLossDescriptor ctcLossDesc, size_t* sizeInBytes) + int cudnnCTCLoss( + Handle 
handle, TensorDescriptor probsDesc, + void* probs, int* labels, int* labelLengths, int* inputLengths, + void* costs, TensorDescriptor gradientsDesc, void* gradients, + CTCLossAlgo algo, CTCLossDescriptor ctcLossDesc, + void* workspace, size_t workSpaceSizeInBytes) + # RNN + int cudnnCreateRNNDescriptor(RNNDescriptor* rnnDesc) + int cudnnDestroyRNNDescriptor(RNNDescriptor rnnDesc) + int cudnnCreatePersistentRNNPlan( + RNNDescriptor rnnDesc, + const int minibatch, DataType dataType, + PersistentRNNPlan* plan) + int cudnnSetPersistentRNNPlan( + RNNDescriptor rnnDesc, PersistentRNNPlan plan) + int cudnnDestroyPersistentRNNPlan(PersistentRNNPlan plan) + int cudnnSetRNNDescriptor_v5( + RNNDescriptor rnnDesc, int hiddenSize, + int numLayers, DropoutDescriptor dropoutDesc, RNNInputMode inputMode, + DirectionMode direction, RNNMode mode, DataType dataType) + int cudnnSetRNNDescriptor_v6( + Handle handle, RNNDescriptor rnnDesc, int hiddenSize, + int numLayers, DropoutDescriptor dropoutDesc, RNNInputMode inputMode, + DirectionMode direction, RNNMode mode, RNNAlgo algo, DataType dataType) + int cudnnSetRNNPaddingMode( + RNNDescriptor rnnDesc, RNNPaddingMode paddingMode) + int cudnnGetRNNPaddingMode( + RNNDescriptor rnnDesc, RNNPaddingMode* paddingMode) + int cudnnCreateRNNDataDescriptor(RNNDataDescriptor* RNNDataDesc) + int cudnnDestroyRNNDataDescriptor(RNNDataDescriptor RNNDataDesc) + int cudnnSetRNNDataDescriptor( + RNNDataDescriptor RNNDataDesc, DataType dataType, RNNDataLayout layout, + int maxSeqLength, int batchSize, int vectorSize, + const int seqLengthArray[], void *paddingFill) + int cudnnGetRNNDataDescriptor( + RNNDataDescriptor RNNDataDesc, DataType* dataType, + RNNDataLayout* layout, int* maxSeqLength, int* batchSize, + int* vectorSize, int arrayLengthRequested, int seqLengthArray[], + void* paddingFill) + int cudnnGetRNNWorkspaceSize( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, size_t* sizeInBytes) + int 
cudnnGetRNNTrainingReserveSize( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, size_t* sizeInBytes) + int cudnnGetRNNParamsSize( + Handle handle, RNNDescriptor rnnDesc, TensorDescriptor xDesc, + size_t* sizeInBytes, DataType dataType) + int cudnnGetRNNLinLayerMatrixParams( + Handle handle, RNNDescriptor rnnDesc, int layer, + TensorDescriptor xDesc, FilterDescriptor wDesc, void* w, + int linLayerID, FilterDescriptor linLayerMatDesc, + void** linLayerMat) + int cudnnGetRNNLinLayerBiasParams( + Handle handle, RNNDescriptor rnnDesc, int layer, + TensorDescriptor xDesc, FilterDescriptor wDesc, void* w, + int linLayerID, FilterDescriptor linLayerBiasDesc, + void** linLayerBias) + int cudnnRNNForwardInference( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, + void* x, TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc, + void* cx, FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc, + void* y, TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, + void* cy, void* workspace, size_t workSpaceSizeInBytes) + int cudnnRNNForwardTraining( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, void* x, + TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc, void* cx, + FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc, void* y, + TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, void* cy, + void* workspace, size_t workSpaceSizeInBytes, void* reserveSpace, + size_t reserveSpaceSizeInBytes) + int cudnnRNNBackwardData( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* yDesc, void* y, + TensorDescriptor* dyDesc, void* dy, + TensorDescriptor dhyDesc, void* dhy, + TensorDescriptor dcyDesc, void* dcy, + FilterDescriptor wDesc, void* w, + TensorDescriptor hxDesc, void* hx, + TensorDescriptor cxDesc, void* cx, + TensorDescriptor* dxDesc, void* dx, + TensorDescriptor dhxDesc, void* dhx, + TensorDescriptor dcxDesc, void* dcx, 
void* workspace, + size_t workSpaceSizeInBytes, void* reserveSpace, + size_t reserveSpaceSizeInBytes) + int cudnnRNNBackwardWeights( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, void* x, TensorDescriptor hxDesc, void* hx, + TensorDescriptor* yDesc, void* y, + void* workspace, size_t workSpaceSizeInBytes, FilterDescriptor dwDesc, + void* dw, void* reserveSpace, size_t reserveSpaceSizeInBytes) + + int cudnnRNNForwardInferenceEx( + Handle handle, RNNDescriptor rnnDesc, + RNNDataDescriptor xDesc, const void* x, + TensorDescriptor hxDesc, const void* hx, + TensorDescriptor cxDesc, const void* cx, + FilterDescriptor wDesc, const void* w, + RNNDataDescriptor yDesc, void* y, + TensorDescriptor hyDesc, void* hy, + TensorDescriptor cyDesc, void* cy, + RNNDataDescriptor kDesc, const void* keys, + RNNDataDescriptor cDesc, void* cAttn, + RNNDataDescriptor iDesc, void* iAttn, + RNNDataDescriptor qDesc, void* queries, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnRNNForwardTrainingEx( + Handle handle, RNNDescriptor rnnDesc, + RNNDataDescriptor xDesc, const void* x, + TensorDescriptor hxDesc, const void* hx, + TensorDescriptor cxDesc, const void* cx, + FilterDescriptor wDesc, const void* w, + RNNDataDescriptor yDesc, void* y, + TensorDescriptor hyDesc, void* hy, + TensorDescriptor cyDesc, void* cy, + RNNDataDescriptor kDesc, const void* keys, + RNNDataDescriptor cDesc, void* cAttn, + RNNDataDescriptor iDesc, void* iAttn, + RNNDataDescriptor qDesc, void* queries, + void* workSpace, size_t workSpaceSizeInBytes, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnRNNBackwardDataEx( + Handle handle, RNNDescriptor rnnDesc, + RNNDataDescriptor yDesc, const void* y, + RNNDataDescriptor dyDesc, const void* dy, + RNNDataDescriptor dcDesc, const void* dcAttn, + TensorDescriptor dhyDesc, const void* dhy, + TensorDescriptor dcyDesc, const void* dcy, + FilterDescriptor wDesc, const void* w, + TensorDescriptor hxDesc, const void* 
hx, + TensorDescriptor cxDesc, const void* cx, + RNNDataDescriptor dxDesc, void* dx, + TensorDescriptor dhxDesc, void* dhx, + TensorDescriptor dcxDesc, void* dcx, + RNNDataDescriptor dkDesc, void* dkeys, + void* workSpace, size_t workSpaceSizeInBytes, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnRNNBackwardWeightsEx( + Handle handle, RNNDescriptor rnnDesc, + RNNDataDescriptor xDesc, const void* x, + TensorDescriptor hxDesc, const void* hx, + RNNDataDescriptor yDesc, const void* y, + void* workSpace, size_t workSpaceSizeInBytes, + FilterDescriptor dwDesc, void* dw, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + + # Spatial Transformer + int cudnnCreateSpatialTransformerDescriptor( + SpatialTransformerDescriptor* stDesc) + int cudnnDestroySpatialTransformerDescriptor( + SpatialTransformerDescriptor stDesc) + int cudnnSetSpatialTransformerNdDescriptor( + SpatialTransformerDescriptor stDesc, SamplerType samplerType, + DataType dataType, int nbDims, int dimA[]) + int cudnnSpatialTfGridGeneratorForward( + Handle handle, SpatialTransformerDescriptor stDesc, + void* theta, void* grid) + int cudnnSpatialTfGridGeneratorBackward( + Handle handle, SpatialTransformerDescriptor stDesc, + void* dgrid, void* dtheta) + int cudnnSpatialTfSamplerForward( + Handle handle, SpatialTransformerDescriptor stDesc, + void* alpha, TensorDescriptor xDesc, void* x, + void* grid, void* beta, TensorDescriptor yDesc, void* y) + int cudnnSpatialTfSamplerBackward( + Handle handle, SpatialTransformerDescriptor stDesc, + void* alpha, TensorDescriptor xDesc, void* x, void* beta, + TensorDescriptor dxDesc, void* dx, void* alphaDgrid, + TensorDescriptor dyDesc, void* dy, void* grid, + void* betaDgrid, void* dgrid) + + # Fused Ops + int cudnnCreateFusedOpsConstParamPack( + FusedOpsConstParamPack* constPack, int ops) + int cudnnDestroyFusedOpsConstParamPack(FusedOpsConstParamPack constPack) + int cudnnSetFusedOpsConstParamPackAttribute( + FusedOpsConstParamPack constPack, 
FusedOpsConstParamLabel paramLabel, + const void *param) + int cudnnGetFusedOpsConstParamPackAttribute( + const FusedOpsConstParamPack constPack, + FusedOpsConstParamLabel paramLabel, void *param, int *isNULL) + int cudnnCreateFusedOpsVariantParamPack( + FusedOpsVariantParamPack *varPack, FusedOps ops) + int cudnnDestroyFusedOpsVariantParamPack(FusedOpsVariantParamPack varPack) + int cudnnSetFusedOpsVariantParamPackAttribute( + FusedOpsVariantParamPack varPack, FusedOpsVariantParamLabel paramLabel, + void *ptr) + int cudnnGetFusedOpsVariantParamPackAttribute( + const FusedOpsVariantParamPack varPack, + FusedOpsVariantParamLabel paramLabel, void *ptr) + int cudnnCreateFusedOpsPlan(FusedOpsPlan *plan, FusedOps ops) + int cudnnDestroyFusedOpsPlan(FusedOpsPlan plan) + int cudnnMakeFusedOpsPlan( + Handle handle, FusedOpsPlan plan, + const FusedOpsConstParamPack constPack, size_t *workspaceSizeInBytes) + int cudnnFusedOpsExecute( + Handle handle, const FusedOpsPlan plan, + FusedOpsVariantParamPack varPack) + + # Build-time version + int CUDNN_VERSION + + # Constants + double _CUDNN_BN_MIN_EPSILON 'CUDNN_BN_MIN_EPSILON' + + """ + cdef class CuDNNAlgoPerf: + + def __init__(self, algo, status, time, memory, determinism, mathType): + self.algo = algo + self.status = status + self.time = time + self.memory = memory + self.determinism = determinism + self.mathType = mathType + """ + + ############################################################################### # Error handling - const char* cudnnGetErrorString(Status status) - + ############################################################################### + + class CuDNNError(RuntimeError): + + def __init__(self, int status): + self.status = status + msg = cudnnGetErrorString(status) + super(CuDNNError, self).__init__( + 'cuDNN Error: {}'.format(msg.decode())) + self._infos = [] + + def add_info(self, info): + assert isinstance(info, str) + self._infos.append(info) + + def add_infos(self, infos): + assert 
isinstance(infos, list) + self._infos.extend(infos) + + def __str__(self): + base = super(CuDNNError, self).__str__() + return base + ''.join( + '\n ' + info for info in self._infos) + + def __reduce__(self): + return (type(self), (self.status,)) + + + @cython.profile(False) + cpdef inline check_status(int status): + if status != 0: + raise CuDNNError(status) + + + ############################################################################### + # Build-time version + ############################################################################### + + def get_build_version(): + return CUDNN_VERSION + + + ############################################################################### # Version - size_t cudnnGetVersion() - + ############################################################################### + + cpdef size_t getVersion() except? 0: + return cudnnGetVersion() + + + ############################################################################### # Runtime error checking - int cudnnQueryRuntimeError(Handle handle, Status *rstatus, - ErrQueryMode mode, RuntimeTag *tag) - + ############################################################################### + + cpdef queryRuntimeError(intptr_t handle, int mode): + cdef Status rstatus + with nogil: + status = cudnnQueryRuntimeError(handle, &rstatus, + mode, 0) + check_status(status) + return rstatus + + + ############################################################################### # Initialization and CUDA cooperation - int cudnnCreate(Handle* handle) - int cudnnDestroy(Handle handle) - int cudnnSetStream(Handle handle, driver.Stream stream) - int cudnnGetStream(Handle handle, driver.Stream* stream) - + ############################################################################### + + cpdef intptr_t create() except? 
0: + cdef Handle handle + with nogil: + status = cudnnCreate(&handle) + check_status(status) + return handle + + + cpdef destroy(intptr_t handle): + with nogil: + status = cudnnDestroy(handle) + check_status(status) + + + cpdef setStream(intptr_t handle, size_t stream): + # TODO(leofang): The support of stream capture is not mentioned at all in + # the cuDNN docs (as of CUDA 11.5), so we disable this functionality. + if not runtime._is_hip_environment and runtime.streamIsCapturing(stream): + raise NotImplementedError( + 'calling cuDNN API during stream capture is currently ' + 'unsupported') + + status = cudnnSetStream(handle, stream) + check_status(status) + + + cpdef size_t getStream(intptr_t handle) except? 0: + cdef driver.Stream stream + status = cudnnGetStream(handle, &stream) + check_status(status) + return stream + + + cdef _setStream(intptr_t handle): + """Set current stream""" + setStream(handle, stream_module.get_current_stream_ptr()) + + ############################################################################### # Tensor manipulation - int cudnnCreateTensorDescriptor(TensorDescriptor* descriptor) - int cudnnSetTensor4dDescriptor( - TensorDescriptor tensorDesc, TensorFormat format, - DataType dataType, int n, int c, int h, int w) - int cudnnSetTensor4dDescriptorEx( - TensorDescriptor tensorDesc, DataType dataType, - int n, int c, int h, int w, - int nStride, int cStride, int hStride, int wStride) - int cudnnGetTensor4dDescriptor( - TensorDescriptor tensorDesc, DataType* dataType, - int* n, int* c, int* h, int* w, - int* nStride, int* cStride, int* hStride, int* wStride) - int cudnnSetTensorNdDescriptor( - TensorDescriptor tensorDesc, DataType dataType, int nbDims, - int* dimA, int* strideA) - int cudnnDestroyTensorDescriptor(TensorDescriptor tensorDesc) - int cudnnAddTensor_v3( - Handle handle, void* alpha, TensorDescriptor bDesc, - void* b, void* beta, TensorDescriptor yDesc, void* y) - + 
############################################################################### + + cpdef size_t createTensorDescriptor() except? 0: + cdef TensorDescriptor descriptor + status = cudnnCreateTensorDescriptor(&descriptor) + check_status(status) + return descriptor + + + cpdef setTensor4dDescriptor(size_t tensorDesc, int format, int dataType, + int n, int c, int h, int w): + status = cudnnSetTensor4dDescriptor( + tensorDesc, format, + dataType, n, c, h, w) + check_status(status) + + + cpdef setTensor4dDescriptorEx(size_t tensorDesc, int dataType, + int n, int c, int h, int w, int nStride, + int cStride, int hStride, int wStride): + status = cudnnSetTensor4dDescriptorEx( + tensorDesc, dataType, n, c, h, w, + nStride, cStride, hStride, wStride) + check_status(status) + + + cpdef tuple getTensor4dDescriptor(size_t tensorDesc): + cdef DataType dataType + cdef int n, c, h, w, nStride, cStride, hStride, wStride + status = cudnnGetTensor4dDescriptor( + tensorDesc, &dataType, + &n, &c, &h, &w, &nStride, &cStride, &hStride, &wStride) + check_status(status) + return dataType, n, c, h, w, nStride, cStride, hStride, wStride + + + cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, + size_t dimA, size_t strideA): + status = cudnnSetTensorNdDescriptor( + tensorDesc, dataType, nbDims, + dimA, strideA) + check_status(status) + + + cpdef destroyTensorDescriptor(size_t tensorDesc): + status = cudnnDestroyTensorDescriptor(tensorDesc) + check_status(status) + + + cpdef addTensor_v3(intptr_t handle, size_t alpha, size_t bDesc, + size_t b, size_t beta, size_t yDesc, size_t y): + _setStream(handle) + with nogil: + status = cudnnAddTensor_v3( + handle, alpha, bDesc, + b, beta, yDesc, y) + check_status(status) + + + ############################################################################### # Tensor operations - int cudnnCreateOpTensorDescriptor(OpTensorDescriptor* opTensorDesc) - int cudnnSetOpTensorDescriptor( - OpTensorDescriptor opTensorDesc, OpTensorOp 
opTensorOp, - DataType opTensorCompType, NanPropagation opTensorNanOpt) - int cudnnGetOpTensorDescriptor( - OpTensorDescriptor opTensorDesc, OpTensorOp* opTensorOp, - DataType* opTensorCompType, NanPropagation* opTensorNanOpt) - int cudnnDestroyOpTensorDescriptor(OpTensorDescriptor opTensorDesc) - int cudnnOpTensor( - Handle handle, OpTensorDescriptor opTensorDesc, void* alpha1, - TensorDescriptor aDesc, void* A, void* alpha2, - TensorDescriptor bDesc, void* B, void* beta, - TensorDescriptor cDesc, void* C) - + ############################################################################### + + cpdef size_t createOpTensorDescriptor() except? 0: + cdef OpTensorDescriptor opTensorDesc + status = cudnnCreateOpTensorDescriptor(&opTensorDesc) + check_status(status) + return opTensorDesc + + + cpdef setOpTensorDescriptor(size_t opTensorDesc, int opTensorOp, + int opTensorCompType, int opTensorNanOpt): + status = cudnnSetOpTensorDescriptor( + opTensorDesc, opTensorOp, + opTensorCompType, opTensorNanOpt) + check_status(status) + + + cpdef getOpTensorDescriptor(size_t opTensorDesc): + cdef OpTensorOp opTensorOp + cdef DataType opTensorCompType + cdef NanPropagation opTensorNanOpt + status = cudnnGetOpTensorDescriptor( + opTensorDesc, &opTensorOp, &opTensorCompType, + &opTensorNanOpt) + check_status(status) + return opTensorOp, opTensorCompType, opTensorNanOpt + + + cpdef destroyOpTensorDescriptor(size_t opTensorDesc): + status = cudnnDestroyOpTensorDescriptor(opTensorDesc) + check_status(status) + + + cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, + size_t aDesc, size_t A, size_t alpha2, size_t bDesc, + size_t B, size_t beta, size_t cDesc, size_t C): + _setStream(handle) + with nogil: + status = cudnnOpTensor( + handle, opTensorDesc, alpha1, + aDesc, A, alpha2, + bDesc, B, beta, + cDesc, C) + check_status(status) + + + ############################################################################### # Tensor reductions - int 
cudnnCreateReduceTensorDescriptor( - ReduceTensorDescriptor* reduceTensorDesc) - int cudnnSetReduceTensorDescriptor( - ReduceTensorDescriptor reduceTensorDesc, ReduceTensorOp reduceTensorOp, - DataType reduceTensorCompType, NanPropagation reduceTensorNanOpt, - ReduceTensorIndices reduceTensorIndices, - IndicesType reduceTensorIndicesType) - int cudnnGetReduceTensorDescriptor( - ReduceTensorDescriptor reduceTensorDesc, - ReduceTensorOp* reduceTensorOp, DataType* reduceTensorCompType, - NanPropagation* reduceTensorNanOpt, - ReduceTensorIndices* reduceTensorIndices, - IndicesType* reduceTensorIndicesType) - int cudnnDestroyReduceTensorDescriptor( - ReduceTensorDescriptor reduceTensorDesc) - int cudnnGetReductionIndicesSize( - Handle handle, ReduceTensorDescriptor reduceTensorDesc, - TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* sizeInBytes) - int cudnnGetReductionWorkspaceSize( - Handle handle, ReduceTensorDescriptor reduceTensorDesc, - TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* sizeInBytes) - int cudnnReduceTensor( - Handle handle, ReduceTensorDescriptor reduceTensorDesc, void* indices, - size_t indicesSizeInBytes, void* workspace, - size_t workspaceSizeInBytes, void* alpha, TensorDescriptor aDesc, - void* A, void* beta, TensorDescriptor cDesc, void* c) - int cudnnSetTensor( - Handle handle, TensorDescriptor yDesc, void* y, void* valuePtr) - int cudnnScaleTensor( - Handle handle, TensorDescriptor yDesc, void* y, void* alpha) - + ############################################################################### + + cpdef size_t createReduceTensorDescriptor() except? 
0: + cdef ReduceTensorDescriptor reduceTensorDesc + status = cudnnCreateReduceTensorDescriptor(&reduceTensorDesc) + check_status(status) + return reduceTensorDesc + + cpdef setReduceTensorDescriptor( + size_t reduceTensorDesc, int reduceTensorOp, int reduceTensorCompType, + int reduceTensorNanOpt, int reduceTensorIndices, + int reduceTensorIndicesType): + status = cudnnSetReduceTensorDescriptor( + reduceTensorDesc, + reduceTensorOp, + reduceTensorCompType, reduceTensorNanOpt, + reduceTensorIndices, + reduceTensorIndicesType) + check_status(status) + + + cpdef getReduceTensorDescriptor(size_t reduceTensorDesc): + cdef ReduceTensorOp redOp + cdef DataType redCompType + cdef NanPropagation redNanOpt + cdef ReduceTensorIndices redIndices + cdef IndicesType redIndicesType + status = cudnnGetReduceTensorDescriptor( + reduceTensorDesc, &redOp, + &redCompType, &redNanOpt, &redIndices, &redIndicesType) + check_status(status) + return redOp, redCompType, redNanOpt, redIndices, redIndicesType + + + cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc): + status = cudnnDestroyReduceTensorDescriptor( + reduceTensorDesc) + check_status(status) + + + cpdef size_t getReductionIndicesSize(intptr_t handle, size_t reduceTensorDesc, + size_t aDesc, size_t cDesc) except? 0: + cdef size_t sizeInBytes + status = cudnnGetReductionIndicesSize( + handle, reduceTensorDesc, + aDesc, cDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef size_t getReductionWorkspaceSize(intptr_t handle, + size_t reduceTensorDesc, + size_t aDesc, size_t cDesc) except? 
0: + cdef size_t sizeInBytes + status = cudnnGetReductionWorkspaceSize( + handle, reduceTensorDesc, + aDesc, cDesc, + &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef reduceTensor(intptr_t handle, size_t reduceTensorDesc, size_t indices, + size_t indicesSizeInBytes, size_t workspace, + size_t workspaceSizeInBytes, size_t alpha, size_t aDesc, + size_t A, size_t beta, size_t cDesc, size_t C): + _setStream(handle) + with nogil: + status = cudnnReduceTensor( + handle, reduceTensorDesc, + indices, indicesSizeInBytes, workspace, + workspaceSizeInBytes, alpha, aDesc, + A, beta, cDesc, C) + check_status(status) + + + cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr): + _setStream(handle) + with nogil: + status = cudnnSetTensor( + handle, yDesc, y, + valuePtr) + check_status(status) + + + cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha): + _setStream(handle) + with nogil: + status = cudnnScaleTensor( + handle, yDesc, y, + alpha) + check_status(status) + + + ############################################################################### # Filter manipulation - int cudnnCreateFilterDescriptor(FilterDescriptor* filterDesc) - int cudnnSetFilter4dDescriptor_v4( - FilterDescriptor filterDesc, DataType dataType, - TensorFormat format, int k, int c, int h, int w) - int cudnnSetFilterNdDescriptor_v4( - FilterDescriptor filterDesc, DataType dataType, - TensorFormat format, int nbDims, const int filterDimA[]) - int cudnnGetFilterNdDescriptor_v4( - FilterDescriptor wDesc, int nbDimsRequested, DataType* dataType, - TensorFormat* format, int* nbDims, int filterDimA[]) - int cudnnDestroyFilterDescriptor(FilterDescriptor filterDesc) - + ############################################################################### + + cpdef size_t createFilterDescriptor() except? 
0: + cdef FilterDescriptor desc + status = cudnnCreateFilterDescriptor(&desc) + check_status(status) + return desc + + + cpdef setFilter4dDescriptor_v4( + size_t filterDesc, int dataType, + int format, int k, int c, int h, int w): + status = cudnnSetFilter4dDescriptor_v4( + filterDesc, dataType, + format, k, c, h, w) + check_status(status) + + + cpdef setFilterNdDescriptor_v4( + size_t filterDesc, int dataType, + int format, int nbDims, size_t filterDimA): + status = cudnnSetFilterNdDescriptor_v4( + filterDesc, dataType, + format, nbDims, filterDimA) + check_status(status) + + + cpdef getFilterNdDescriptor(size_t wDesc, int nbDimsRequested): + cdef DataType dataType + cdef TensorFormat format + cdef int nbDims + cdef vector.vector[int] filterDimA + filterDimA.resize(nbDimsRequested) + + status = cudnnGetFilterNdDescriptor_v4( + wDesc, nbDimsRequested, &dataType, + &format, &nbDims, filterDimA.data()) + check_status(status) + return dataType, format, nbDims, tuple(filterDimA) + + + cpdef destroyFilterDescriptor(size_t filterDesc): + status = cudnnDestroyFilterDescriptor(filterDesc) + check_status(status) + + + ############################################################################### # Convolution - int cudnnCreateConvolutionDescriptor(ConvolutionDescriptor* convDesc) - int cudnnSetConvolutionMathType( - ConvolutionDescriptor convDesc, MathType mathType) - int cudnnGetConvolutionMathType( - ConvolutionDescriptor convDesc, MathType *mathType) - int cudnnSetConvolutionGroupCount( - ConvolutionDescriptor convDesc, int groupCount) - int cudnnGetConvolutionGroupCount( - ConvolutionDescriptor convDesc, int *groupCount) - int cudnnSetConvolution2dDescriptor_v4( - ConvolutionDescriptor convDesc, int pad_h, int pad_w, int u, - int v, int dilation_h, int dilation_w, ConvolutionMode mode) - int cudnnSetConvolution2dDescriptor_v5( - ConvolutionDescriptor convDesc, int pad_h, int pad_w, int u, - int v, int dilation_h, int dilation_w, ConvolutionMode mode, - DataType 
computeType) - int cudnnSetConvolutionNdDescriptor_v3( - ConvolutionDescriptor convDesc, int arrayLength, int* padA, - int* filterStrideA, int* dilationA, ConvolutionMode mode, - DataType dataType) - int cudnnDestroyConvolutionDescriptor(ConvolutionDescriptor conDesc) - int cudnnFindConvolutionForwardAlgorithm( - Handle handle, TensorDescriptor xDesc, FilterDescriptor wDesc, - ConvolutionDescriptor convDesc, TensorDescriptor yDesc, - int requestedAlgoCount, int* returnedAlgoCount, - ConvolutionFwdAlgoPerf* perfResults) - int cudnnFindConvolutionForwardAlgorithmEx( - Handle handle, TensorDescriptor xDesc, void* x, - FilterDescriptor wDesc, void* w, ConvolutionDescriptor convDesc, - TensorDescriptor yDesc, void* y, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionFwdAlgoPerf* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnFindConvolutionForwardAlgorithmEx_v7( - Handle handle, TensorDescriptor xDesc, void* x, - FilterDescriptor wDesc, void* w, ConvolutionDescriptor convDesc, - TensorDescriptor yDesc, void* y, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionFwdAlgoPerf_v7* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnGetConvolutionForwardAlgorithm_v6( - Handle handle, TensorDescriptor srcDesc, - FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, - TensorDescriptor destDesc, ConvolutionFwdPreference preference, - size_t memoryLimitInbytes, ConvolutionFwdAlgo* algo) - int cudnnGetConvolutionForwardAlgorithm_v7( - Handle handle, TensorDescriptor srcDesc, - FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, - TensorDescriptor destDesc, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionFwdAlgoPerf_v7* perfResults) - int cudnnGetConvolutionForwardWorkspaceSize( - Handle handle, TensorDescriptor srcDesc, - FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, - TensorDescriptor destDesc, ConvolutionFwdAlgo algo, - size_t* sizeInBytes) - int 
cudnnConvolutionForward( - Handle handle, void* alpha, TensorDescriptor srcDesc, - void* srcData, FilterDescriptor filterDesc, void* filterData, - ConvolutionDescriptor convDesc, ConvolutionFwdAlgo algo, - void* workSpace, size_t workSpaceSizeInBytes, void* beta, - TensorDescriptor destDesc, void* destData) - int cudnnConvolutionBackwardBias( - Handle handle, void* alpha, - TensorDescriptor srcDesc, void* srcData, void* beta, - TensorDescriptor destDesc, void* destData) - int cudnnFindConvolutionBackwardFilterAlgorithm( - Handle handle, TensorDescriptor xDesc, TensorDescriptor dyDesc, - ConvolutionDescriptor convDesc, FilterDescriptor dwDesc, - int requestedAlgoCount, int* returnedAlgoCount, - ConvolutionBwdFilterAlgoPerf* perfResults) - int cudnnFindConvolutionBackwardFilterAlgorithmEx( - Handle handle, TensorDescriptor xDesc, void* x, - TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, - FilterDescriptor dwDesc, void* dw, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionBwdFilterAlgoPerf* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnFindConvolutionBackwardFilterAlgorithmEx_v7( - Handle handle, TensorDescriptor xDesc, void* x, - TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, - FilterDescriptor dwDesc, void* dw, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionBwdFilterAlgoPerf_v7* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnGetConvolutionBackwardFilterAlgorithm_v6( - Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, FilterDescriptor filterDesc, - ConvolutionBwdFilterPreference preference, - size_t memoryLimitInbytes, ConvolutionBwdFilterAlgo* algo) - int cudnnGetConvolutionBackwardFilterAlgorithm_v7( - Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, FilterDescriptor gradDesc, - int requestedAlgoCount, int* returnedAlgoCount, - 
ConvolutionBwdFilterAlgoPerf_v7* perfResults) - int cudnnGetConvolutionBackwardFilterWorkspaceSize( - Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, FilterDescriptor filterDesc, - ConvolutionBwdFilterAlgo algo, size_t* sizeInBytes) - int cudnnConvolutionBackwardFilter_v3( - Handle handle, void* alpha, - TensorDescriptor srcDesc, void* srcData, - TensorDescriptor diffDesc, void* diffData, - ConvolutionDescriptor convDesc, ConvolutionBwdFilterAlgo algo, - void* workSpace, size_t workSpaceSizeInBytes, void* beta, - FilterDescriptor gradDesc, void* gradData) - int cudnnGetConvolutionBackwardDataAlgorithm_v6( - Handle handle, FilterDescriptor filterDesc, - TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, TensorDescriptor gradDesc, - ConvolutionBwdDataPreference preference, - size_t memoryLimitInbytes, ConvolutionBwdDataAlgo* algo) - int cudnnGetConvolutionBackwardDataAlgorithm_v7( - Handle handle, TensorDescriptor filterDesc, TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, FilterDescriptor gradDesc, - int requestedAlgoCount, int* returnedAlgoCount, - ConvolutionBwdDataAlgoPerf_v7* perfResults) - int cudnnFindConvolutionBackwardDataAlgorithm( - Handle handle, TensorDescriptor wDesc, TensorDescriptor dyDesc, - ConvolutionDescriptor convDesc, FilterDescriptor dxDesc, - int requestedAlgoCount, int* returnedAlgoCount, - ConvolutionBwdDataAlgoPerf* perfResults) - int cudnnFindConvolutionBackwardDataAlgorithmEx( - Handle handle, FilterDescriptor wDesc, void* w, - TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, - TensorDescriptor dxDesc, void* dx, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionBwdDataAlgoPerf* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnFindConvolutionBackwardDataAlgorithmEx_v7( - Handle handle, FilterDescriptor wDesc, void* w, - TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, - TensorDescriptor dxDesc, 
void* dx, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionBwdDataAlgoPerf_v7* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnGetConvolutionBackwardDataWorkspaceSize( - Handle handle, FilterDescriptor filterDesc, - TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, TensorDescriptor gradDesc, - ConvolutionBwdDataAlgo algo, size_t* sizeInBytes) - int cudnnConvolutionBackwardData_v3( - Handle handle, void* alpha, - FilterDescriptor filterDesc, void* filterData, - TensorDescriptor diffDesc, void* diffData, - ConvolutionDescriptor convDesc, ConvolutionBwdDataAlgo algo, - void* workSpace, size_t workSpaceSizeInBytes, void* beta, - TensorDescriptor gradDesc, void* gradData) - + ############################################################################### + + cpdef size_t createConvolutionDescriptor() except? 0: + cdef ConvolutionDescriptor desc + status = cudnnCreateConvolutionDescriptor(&desc) + check_status(status) + return desc + + + cpdef setConvolutionMathType(size_t convDesc, size_t mathType): + status = cudnnSetConvolutionMathType( + convDesc, mathType) + check_status(status) + + + cpdef size_t getConvolutionMathType(size_t convDesc) except? 0: + cdef MathType mathType + status = cudnnGetConvolutionMathType( + convDesc, &mathType) + check_status(status) + return mathType + + + cpdef setConvolutionGroupCount(size_t convDesc, int groupCount): + status = cudnnSetConvolutionGroupCount( + convDesc, groupCount) + check_status(status) + + + cpdef int getConvolutionGroupCount(size_t convDesc) except? 
-1: + cdef int groupCount + status = cudnnGetConvolutionGroupCount( + convDesc, &groupCount) + check_status(status) + return groupCount + + + cpdef setConvolution2dDescriptor_v4( + size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, + int dilation_w, int mode): + status = cudnnSetConvolution2dDescriptor_v4( + convDesc, pad_h, pad_w, u, v, dilation_h, + dilation_w, mode) + check_status(status) + + + cpdef setConvolution2dDescriptor_v5( + size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, + int dilation_w, int mode, size_t computeType): + status = cudnnSetConvolution2dDescriptor_v5( + convDesc, pad_h, pad_w, u, v, dilation_h, + dilation_w, mode, computeType) + check_status(status) + + + cpdef setConvolutionNdDescriptor_v3( + size_t convDesc, int arrayLength, size_t padA, size_t filterStrideA, + size_t dilationA, int mode, int dataType): + status = cudnnSetConvolutionNdDescriptor_v3( + convDesc, arrayLength, padA, + filterStrideA, dilationA, mode, + dataType) + check_status(status) + + + cpdef destroyConvolutionDescriptor(size_t convDesc): + status = cudnnDestroyConvolutionDescriptor( + convDesc) + check_status(status) + + + cpdef findConvolutionForwardAlgorithm( + intptr_t handle, size_t xDesc, size_t wDesc, size_t convDesc, + size_t yDesc, int requestedAlgoCount): + cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionForwardAlgorithm( + handle, xDesc, wDesc, + convDesc, yDesc, + requestedAlgoCount, &returnedAlgoCount, perfResults.data()) + check_status(status) + perfResults.resize(returnedAlgoCount) + return perfResults + + """ + cpdef list findConvolutionForwardAlgorithmEx( + intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, + size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes): + cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults + cdef int 
returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionForwardAlgorithmEx( + handle, xDesc, x, + wDesc, w, convDesc, + yDesc, y, requestedAlgoCount, + &returnedAlgoCount, perfResults.data(), workSpace, + workSpaceSizeInBytes) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) + for p in perfResults] + + + cpdef list findConvolutionForwardAlgorithmEx_v7( + intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, + size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes): + cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionForwardAlgorithmEx_v7( + handle, xDesc, x, + wDesc, w, convDesc, + yDesc, y, requestedAlgoCount, + &returnedAlgoCount, perfResults.data(), workSpace, + workSpaceSizeInBytes) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, + p.determinism, p.mathType) + for p in perfResults] + """ + + cpdef int getConvolutionForwardAlgorithm_v6( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int preference, size_t memoryLimitInbytes) except? 
-1: + cdef ConvolutionFwdAlgo algo + status = cudnnGetConvolutionForwardAlgorithm_v6( + handle, srcDesc, + filterDesc, convDesc, + destDesc, preference, + memoryLimitInbytes, &algo) + check_status(status) + return algo + + """ + cpdef list getConvolutionForwardAlgorithm_v7( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int requestedAlgoCount): + cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnGetConvolutionForwardAlgorithm_v7( + handle, srcDesc, + filterDesc, convDesc, + destDesc, requestedAlgoCount, + &returnedAlgoCount, perfResults.data()) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, + p.determinism, p.mathType) + for p in perfResults] + """ + + cpdef Py_ssize_t getConvolutionForwardWorkspaceSize( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int algo) except? 
-1: + cdef size_t sizeInBytes + status = cudnnGetConvolutionForwardWorkspaceSize( + handle, srcDesc, + filterDesc, convDesc, + destDesc, algo, &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef convolutionForward( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t filterDesc, size_t filterData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t destDesc, size_t destData): + _setStream(handle) + with nogil: + status = cudnnConvolutionForward( + handle, alpha, + srcDesc, srcData, + filterDesc, filterData, + convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, + destDesc, destData) + check_status(status) + + + cpdef convolutionBackwardBias( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t beta, size_t destDesc, size_t destData): + _setStream(handle) + with nogil: + status = cudnnConvolutionBackwardBias( + handle, alpha, + srcDesc, srcData, beta, + destDesc, destData) + check_status(status) + + + cpdef findConvolutionBackwardFilterAlgorithm( + intptr_t handle, size_t xDesc, size_t dyDesc, size_t convDesc, + size_t dwDesc, int requestedAlgoCount): + cdef vector.vector[ConvolutionBwdFilterAlgoPerf] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionBackwardFilterAlgorithm( + handle, xDesc, dyDesc, + convDesc, dwDesc, + requestedAlgoCount, &returnedAlgoCount, perfResults.data()) + check_status(status) + perfResults.resize(returnedAlgoCount) + return perfResults + + """ + cpdef list findConvolutionBackwardFilterAlgorithmEx( + intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, + size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes): + cdef vector.vector[ConvolutionBwdFilterAlgoPerf] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionBackwardFilterAlgorithmEx( 
+ handle, xDesc, x, + dyDesc, dy, convDesc, + dwDesc, dw, + requestedAlgoCount, &returnedAlgoCount, perfResults.data(), + workSpace, workSpaceSizeInBytes) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) + for p in perfResults] + + + cpdef list findConvolutionBackwardFilterAlgorithmEx_v7( + intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, + size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes): + cdef vector.vector[ConvolutionBwdFilterAlgoPerf_v7] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionBackwardFilterAlgorithmEx_v7( + handle, xDesc, x, + dyDesc, dy, convDesc, + dwDesc, dw, + requestedAlgoCount, &returnedAlgoCount, perfResults.data(), + workSpace, workSpaceSizeInBytes) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, + p.determinism, p.mathType) + for p in perfResults] + """ + + cpdef int getConvolutionBackwardFilterAlgorithm_v6( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t filterDesc, int preference, + size_t memoryLimitInbytes) except? 
-1: + cdef ConvolutionBwdFilterAlgo algo + status = cudnnGetConvolutionBackwardFilterAlgorithm_v6( + handle, srcDesc, + diffDesc, convDesc, + filterDesc, + preference, + memoryLimitInbytes, &algo) + check_status(status) + return algo + + """ + cpdef list getConvolutionBackwardFilterAlgorithm_v7( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int requestedAlgoCount): + cdef vector.vector[ConvolutionBwdFilterAlgoPerf_v7] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnGetConvolutionBackwardFilterAlgorithm_v7( + handle, srcDesc, diffDesc, + convDesc, gradDesc, + requestedAlgoCount, &returnedAlgoCount, perfResults.data()) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, + p.determinism, p.mathType) + for p in perfResults] + """ + + cpdef Py_ssize_t getConvolutionBackwardFilterWorkspaceSize( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t filterDesc, int algo) except? 
-1:
    # Body of getConvolutionBackwardFilterWorkspaceSize; its signature line
    # precedes this chunk.
    cdef size_t sizeInBytes
    status = cudnnGetConvolutionBackwardFilterWorkspaceSize(
        handle, srcDesc, diffDesc, convDesc, filterDesc, algo, &sizeInBytes)
    check_status(status)
    return sizeInBytes


cpdef convolutionBackwardFilter_v3(
        intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData,
        size_t diffDesc, size_t diffData, size_t convDesc, int algo,
        size_t workSpace, size_t workSpaceSizeInBytes, size_t beta,
        size_t gradDesc, size_t gradData):
    """Compute the gradient of a convolution w.r.t. its filter weights."""
    _setStream(handle)
    with nogil:
        status = cudnnConvolutionBackwardFilter_v3(
            handle, alpha, srcDesc, srcData, diffDesc, diffData,
            convDesc, algo, workSpace, workSpaceSizeInBytes, beta,
            gradDesc, gradData)
    check_status(status)


cpdef findConvolutionBackwardDataAlgorithm(
        intptr_t handle, size_t wDesc, size_t dyDesc, size_t convDesc,
        size_t dxDesc, int requestedAlgoCount):
    """Benchmark up to ``requestedAlgoCount`` backward-data algorithms.

    Returns the perf results trimmed to the number of algorithms the
    library actually evaluated.
    """
    cdef vector.vector[ConvolutionBwdDataAlgoPerf] perfResults
    cdef int returnedAlgoCount
    perfResults.resize(requestedAlgoCount)
    status = cudnnFindConvolutionBackwardDataAlgorithm(
        handle, wDesc, dyDesc, convDesc, dxDesc,
        requestedAlgoCount, &returnedAlgoCount, perfResults.data())
    check_status(status)
    perfResults.resize(returnedAlgoCount)
    return perfResults


# NOTE(review): the Ex variants below are deliberately disabled (kept inside
# a string literal) -- presumably until the backend supports them; confirm
# with the porting team before enabling.
"""
cpdef list findConvolutionBackwardDataAlgorithmEx(
        intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy,
        size_t convDesc, size_t dxDesc, size_t dx,
        int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes):
    cdef vector.vector[ConvolutionBwdDataAlgoPerf] perfResults
    cdef int returnedAlgoCount
    perfResults.resize(requestedAlgoCount)
    status = cudnnFindConvolutionBackwardDataAlgorithmEx(
        handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx,
        requestedAlgoCount, &returnedAlgoCount, perfResults.data(),
        workSpace, workSpaceSizeInBytes)
    check_status(status)
    perfResults.resize(returnedAlgoCount)
    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1)
            for p in perfResults]


cpdef list findConvolutionBackwardDataAlgorithmEx_v7(
        intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy,
        size_t convDesc, size_t dxDesc, size_t dx,
        int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes):
    cdef vector.vector[ConvolutionBwdDataAlgoPerf_v7] perfResults
    cdef int returnedAlgoCount
    perfResults.resize(requestedAlgoCount)
    status = cudnnFindConvolutionBackwardDataAlgorithmEx_v7(
        handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx,
        requestedAlgoCount, &returnedAlgoCount, perfResults.data(),
        workSpace, workSpaceSizeInBytes)
    check_status(status)
    perfResults.resize(returnedAlgoCount)
    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory,
                          p.determinism, p.mathType)
            for p in perfResults]
"""


cpdef int getConvolutionBackwardDataAlgorithm_v6(
        intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc,
        size_t gradDesc, size_t preference,
        size_t memoryLimitInbytes) except? -1:
    """Select a backward-data algorithm honoring a workspace preference."""
    cdef ConvolutionBwdDataAlgo algo
    status = cudnnGetConvolutionBackwardDataAlgorithm_v6(
        handle, filterDesc, diffDesc, convDesc, gradDesc, preference,
        memoryLimitInbytes, &algo)
    check_status(status)
    return algo


# NOTE(review): disabled like the Ex variants above.
"""
cpdef list getConvolutionBackwardDataAlgorithm_v7(
        intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc,
        size_t gradDesc, int requestedAlgoCount):
    cdef vector.vector[ConvolutionBwdDataAlgoPerf_v7] perfResults
    cdef int returnedAlgoCount
    perfResults.resize(requestedAlgoCount)
    status = cudnnGetConvolutionBackwardDataAlgorithm_v7(
        handle, filterDesc, diffDesc, convDesc, gradDesc,
        requestedAlgoCount, &returnedAlgoCount, perfResults.data())
    check_status(status)
    perfResults.resize(returnedAlgoCount)
    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory,
                          p.determinism, p.mathType)
            for p in perfResults]
"""


cpdef Py_ssize_t getConvolutionBackwardDataWorkspaceSize(
        intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc,
        size_t gradDesc, int algo) except? -1:
    """Return the workspace size in bytes needed by a backward-data algo."""
    cdef size_t sizeInBytes
    status = cudnnGetConvolutionBackwardDataWorkspaceSize(
        handle, filterDesc, diffDesc, convDesc, gradDesc, algo, &sizeInBytes)
    check_status(status)
    return sizeInBytes


cpdef convolutionBackwardData_v3(
        intptr_t handle, size_t alpha, size_t filterDesc, size_t filterData,
        size_t diffDesc, size_t diffData, size_t convDesc, int algo,
        size_t workSpace, size_t workSpaceSizeInBytes, size_t beta,
        size_t gradDesc, size_t gradData):
    """Compute the gradient of a convolution w.r.t. its input data."""
    _setStream(handle)
    with nogil:
        status = cudnnConvolutionBackwardData_v3(
            handle, alpha, filterDesc, filterData, diffDesc, diffData,
            convDesc, algo, workSpace, workSpaceSizeInBytes, beta,
            gradDesc, gradData)
    check_status(status)


###############################################################################
# Pooling
###############################################################################

cpdef size_t createPoolingDescriptor() except? 0:
    """Create a pooling descriptor and return its opaque handle."""
    cdef PoolingDescriptor desc
    status = cudnnCreatePoolingDescriptor(&desc)
    check_status(status)
    return desc


cpdef setPooling2dDescriptor_v4(
        size_t poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight,
        int windowWidth, int verticalPadding, int horizontalPadding,
        int verticalStride, int horizontalStride):
    """Configure a 2-D pooling descriptor (window, padding, stride)."""
    status = cudnnSetPooling2dDescriptor_v4(
        poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth,
        verticalPadding, horizontalPadding, verticalStride, horizontalStride)
    check_status(status)


cpdef setPoolingNdDescriptor_v4(
        size_t poolingDesc, int mode, int maxpoolingNanOpt, int nbDims,
        size_t windowDimA, size_t paddingA, size_t strideA):
    """Configure an N-D pooling descriptor.

    windowDimA / paddingA / strideA are raw pointers to int arrays of
    length ``nbDims`` passed as integers.
    """
    status = cudnnSetPoolingNdDescriptor_v4(
        poolingDesc, mode, maxpoolingNanOpt, nbDims,
        windowDimA, paddingA, strideA)
    check_status(status)


cpdef destroyPoolingDescriptor(size_t poolingDesc):
    """Release a pooling descriptor."""
    status = cudnnDestroyPoolingDescriptor(poolingDesc)
    check_status(status)


cpdef poolingForward(
        intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc,
        size_t srcData, size_t beta, size_t dstDesc, size_t dstData):
    """Run the pooling forward pass on the current stream."""
    _setStream(handle)
    with nogil:
        status = cudnnPoolingForward(
            handle, poolingDesc, alpha, srcDesc, srcData, beta,
            dstDesc, dstData)
    check_status(status)


cpdef poolingBackward(
        intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc,
        size_t srcData, size_t srcDiffDesc, size_t srcDiffData,
        size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc,
        size_t destDiffData):
    """Run the pooling backward pass on the current stream."""
    _setStream(handle)
    with nogil:
        status = cudnnPoolingBackward(
            handle, poolingDesc, alpha, srcDesc, srcData,
            srcDiffDesc, srcDiffData, destDesc, destData, beta,
            destDiffDesc, destDiffData)
    check_status(status)
###############################################################################
# Batch Normalization
###############################################################################

# Public re-export of the build-time minimum-epsilon constant.
CUDNN_BN_MIN_EPSILON = _CUDNN_BN_MIN_EPSILON


cpdef deriveBNTensorDescriptor(size_t derivedBnDesc, size_t xDesc, int mode):
    """Derive the scale/bias/mean/var descriptor from an input descriptor."""
    status = cudnnDeriveBNTensorDescriptor(derivedBnDesc, xDesc, mode)
    check_status(status)


cpdef batchNormalizationForwardTraining(
        intptr_t handle, int mode,
        size_t alpha, size_t beta, size_t xDesc,
        size_t x, size_t yDesc, size_t y,
        size_t bnScaleBiasMeanVarDesc, size_t bnScale,
        size_t bnBias, double exponentialAverageFactor,
        size_t resultRunningMean, size_t resultRunningVariance,
        double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance):
    """Forward batch normalization (training); updates the running stats."""
    _setStream(handle)
    with nogil:
        status = cudnnBatchNormalizationForwardTraining(
            handle, mode, alpha, beta, xDesc, x, yDesc, y,
            bnScaleBiasMeanVarDesc, bnScale, bnBias,
            exponentialAverageFactor,
            resultRunningMean, resultRunningVariance,
            epsilon, resultSaveMean, resultSaveInvVariance)
    check_status(status)


cpdef batchNormalizationForwardInference(
        intptr_t handle, int mode,
        size_t alpha, size_t beta, size_t xDesc,
        size_t x, size_t yDesc, size_t y,
        size_t bnScaleBiasMeanVarDesc, size_t bnScale,
        size_t bnBias, size_t estimatedMean, size_t estimatedVariance,
        double epsilon):
    """Forward batch normalization (inference) using precomputed stats."""
    _setStream(handle)
    with nogil:
        status = cudnnBatchNormalizationForwardInference(
            handle, mode, alpha, beta, xDesc, x, yDesc, y,
            bnScaleBiasMeanVarDesc, bnScale, bnBias,
            estimatedMean, estimatedVariance, epsilon)
    check_status(status)


cpdef batchNormalizationBackward(
        intptr_t handle, int mode,
        size_t alphaDataDiff, size_t betaDataDiff,
        size_t alphaParamDiff, size_t betaParamDiff,
        size_t xDesc, size_t x, size_t dyDesc,
        size_t dy, size_t dxDesc, size_t dx,
        size_t dBnScaleBiasDesc, size_t bnScale,
        size_t dBnScaleResult, size_t dBnBiasResult,
        double epsilon, size_t savedMean, size_t savedInvVariance):
    """Backward batch normalization: data and scale/bias gradients."""
    _setStream(handle)
    with nogil:
        status = cudnnBatchNormalizationBackward(
            handle, mode,
            alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff,
            xDesc, x, dyDesc, dy, dxDesc, dx,
            dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult,
            epsilon, savedMean, savedInvVariance)
    check_status(status)


cpdef batchNormalizationForwardTrainingEx(
        intptr_t handle, int mode, int bnOps,
        size_t alpha, size_t beta,
        size_t xDesc, size_t x,
        size_t zDesc, size_t z,
        size_t yDesc, size_t y,
        size_t bnScaleBiasMeanVarDesc,
        size_t bnScale, size_t bnBias,
        double exponentialAverageFactor,
        size_t resultRunningMean, size_t resultRunningVariance,
        double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance,
        size_t activationDesc,
        size_t workSpace, size_t workSpaceSizeInBytes,
        size_t reserveSpace, size_t reserveSpaceSizeInBytes):
    """Extended forward BN training with optional add + activation fusion."""
    _setStream(handle)
    with nogil:
        status = cudnnBatchNormalizationForwardTrainingEx(
            handle, mode, bnOps,
            alpha, beta,
            xDesc, x, zDesc, z, yDesc, y,
            bnScaleBiasMeanVarDesc, bnScale, bnBias,
            exponentialAverageFactor,
            resultRunningMean, resultRunningVariance,
            epsilon, resultSaveMean, resultSaveInvVariance,
            activationDesc,
            workSpace, workSpaceSizeInBytes,
            reserveSpace, reserveSpaceSizeInBytes)
    check_status(status)


cpdef size_t getBatchNormalizationForwardTrainingExWorkspaceSize(
        intptr_t handle, int mode, int bnOps,
        size_t xDesc,
        size_t zDesc,
        size_t yDesc,
        size_t bnScaleBiasMeanVarDesc,
        size_t activationDesc) except? 0:
    """Return the workspace size in bytes for the extended forward pass."""
    cdef size_t sizeInBytes
    status = cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
        handle, mode, bnOps,
        xDesc, zDesc, yDesc,
        bnScaleBiasMeanVarDesc, activationDesc,
        &sizeInBytes)
    check_status(status)
    return sizeInBytes


cpdef batchNormalizationBackwardEx(
        intptr_t handle, int mode, int bnops,
        size_t alphaDataDiff, size_t betaDataDiff,
        size_t alphaParamDiff, size_t betaParamDiff,
        size_t xDesc, size_t x,
        size_t yDesc, size_t y,
        size_t dyDesc, size_t dy,
        size_t dzDesc, size_t dz,
        size_t dxDesc, size_t dx,
        size_t dBnScaleBiasDesc,
        size_t bnScaleData, size_t bnBiasData,
        size_t dBnScaleData, size_t dBnBiasData,
        double epsilon,
        size_t savedMean, size_t savedInvVariance,
        size_t activationDesc,
        size_t workSpace, size_t workSpaceSizeInBytes,
        size_t reserveSpace, size_t reserveSpaceSizeInBytes):
    """Extended backward BN matching batchNormalizationForwardTrainingEx."""
    _setStream(handle)
    with nogil:
        status = cudnnBatchNormalizationBackwardEx(
            handle, mode, bnops,
            alphaDataDiff, betaDataDiff,
            alphaParamDiff, betaParamDiff,
            xDesc, x, yDesc, y, dyDesc, dy,
            dzDesc, dz, dxDesc, dx,
            dBnScaleBiasDesc,
            bnScaleData, bnBiasData,
            dBnScaleData, dBnBiasData,
            epsilon,
            savedMean, savedInvVariance,
            activationDesc,
            workSpace, workSpaceSizeInBytes,
            reserveSpace, reserveSpaceSizeInBytes)
    check_status(status)


cpdef size_t getBatchNormalizationBackwardExWorkspaceSize(
        intptr_t handle, int mode, int bnOps,
        size_t xDesc,
        size_t yDesc,
        size_t dyDesc,
        size_t dzDesc,
        size_t dxDesc,
        size_t dBnScaleBiasDesc,
        size_t activationDesc) except? 0:
    """Return the workspace size in bytes for the extended backward pass."""
    cdef size_t sizeInBytes
    status = cudnnGetBatchNormalizationBackwardExWorkspaceSize(
        handle, mode, bnOps,
        xDesc, yDesc, dyDesc, dzDesc, dxDesc,
        dBnScaleBiasDesc, activationDesc,
        &sizeInBytes)
    check_status(status)
    return sizeInBytes


cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize(
        intptr_t handle, int mode, int bnOps,
        size_t activationDesc,
        size_t xDesc) except? 0:
    """Return the reserve-space size in bytes for extended BN training."""
    cdef size_t sizeInBytes
    status = cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
        handle, mode, bnOps,
        activationDesc, xDesc,
        &sizeInBytes)
    check_status(status)
    return sizeInBytes


###############################################################################
# Activation
###############################################################################

cpdef size_t createActivationDescriptor() except? 0:
    """Create an activation descriptor and return its opaque handle."""
    cdef ActivationDescriptor activationDesc
    status = cudnnCreateActivationDescriptor(&activationDesc)
    check_status(status)
    return activationDesc


cpdef setActivationDescriptor(
        size_t activationDesc, int mode, int reluNanOpt, double reluCeiling):
    """Configure an activation descriptor (mode, NaN option, clip ceiling)."""
    status = cudnnSetActivationDescriptor(
        activationDesc, mode, reluNanOpt, reluCeiling)
    check_status(status)


cpdef destroyActivationDescriptor(size_t activationDesc):
    """Release an activation descriptor."""
    status = cudnnDestroyActivationDescriptor(activationDesc)
    check_status(status)


cpdef softmaxForward(
        intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc,
        size_t srcData, size_t beta, size_t dstDesc, size_t dstData):
    """Run the softmax forward pass on the current stream."""
    _setStream(handle)
    with nogil:
        status = cudnnSoftmaxForward(
            handle, algorithm, mode,
            alpha, srcDesc, srcData,
            beta, dstDesc, dstData)
    check_status(status)


cpdef softmaxBackward(
        intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc,
        size_t srcData, size_t srcDiffDesc, size_t srcDiffData, size_t beta,
        size_t destDiffDesc, size_t destDiffData):
    """Run the softmax backward pass on the current stream."""
    _setStream(handle)
    with nogil:
        status = cudnnSoftmaxBackward(
            handle, algorithm, mode,
            alpha, srcDesc, srcData,
            srcDiffDesc, srcDiffData, beta,
            destDiffDesc, destDiffData)
    check_status(status)


cpdef activationForward_v4(
        intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc,
        size_t srcData, size_t beta, size_t dstDesc, size_t dstData):
    """Apply the configured activation forward on the current stream."""
    _setStream(handle)
    with nogil:
        status = cudnnActivationForward_v4(
            handle, activationDesc, alpha,
            srcDesc, srcData, beta,
            dstDesc, dstData)
    check_status(status)


cpdef activationBackward_v4(
        intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc,
        size_t srcData, size_t srcDiffDesc, size_t srcDiffData,
        size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc,
        size_t destDiffData):
    """Apply the configured activation backward on the current stream."""
    _setStream(handle)
    with nogil:
        status = cudnnActivationBackward_v4(
            handle, activationDesc, alpha,
            srcDesc, srcData,
            srcDiffDesc, srcDiffData,
            destDesc, destData, beta,
            destDiffDesc, destDiffData)
    check_status(status)


###############################################################################
# Dropout
###############################################################################

cpdef size_t createDropoutDescriptor() except? 0:
    """Create a dropout descriptor and return its opaque handle."""
    cdef DropoutDescriptor desc
    status = cudnnCreateDropoutDescriptor(&desc)
    check_status(status)
    return desc


cpdef destroyDropoutDescriptor(size_t dropoutDesc):
    """Release a dropout descriptor."""
    status = cudnnDestroyDropoutDescriptor(dropoutDesc)
    check_status(status)


cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? -1:
    """Return the size in bytes of the RNG state buffer for dropout."""
    cdef size_t sizeInBytes
    status = cudnnDropoutGetStatesSize(handle, &sizeInBytes)
    check_status(status)
    return sizeInBytes


cpdef setDropoutDescriptor(
        size_t dropoutDesc, intptr_t handle, float dropout,
        size_t states, size_t stateSizeInBytes, unsigned long long seed):
    """Configure a dropout descriptor with its rate, state buffer and seed."""
    status = cudnnSetDropoutDescriptor(
        dropoutDesc, handle, dropout,
        states, stateSizeInBytes, seed)
    check_status(status)


cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 0:
    """Return the reserve-space size in bytes for dropout on ``xDesc``."""
    cdef size_t sizeInBytes
    status = cudnnDropoutGetReserveSpaceSize(xDesc, &sizeInBytes)
    check_status(status)
    return sizeInBytes


cpdef dropoutForward(
        intptr_t handle, size_t dropoutDesc,
        size_t srcDesc, size_t srcData,
        size_t dstDesc, size_t dstData,
        size_t reserveSpace, size_t reserveSpaceSizeInBytes):
    """Run the dropout forward pass on the current stream."""
    _setStream(handle)
    with nogil:
        status = cudnnDropoutForward(
            handle, dropoutDesc,
            srcDesc, srcData,
            dstDesc, dstData,
            reserveSpace, reserveSpaceSizeInBytes)
    check_status(status)


cpdef dropoutBackward(
        intptr_t handle, size_t dropoutDesc,
        size_t dyDesc, size_t dyData,
        size_t dxDesc, size_t dxData,
        size_t reserveSpace, size_t reserveSpaceSizeInBytes):
    """Run the dropout backward pass on the current stream."""
    _setStream(handle)
    with nogil:
        status = cudnnDropoutBackward(
            handle, dropoutDesc,
            dyDesc, dyData,
            dxDesc, dxData,
            reserveSpace, reserveSpaceSizeInBytes)
    check_status(status)


###############################################################################
# CTC
###############################################################################

cpdef size_t createCTCLossDescriptor() except? 0:
    """Create a CTC loss descriptor and return its opaque handle."""
    cdef CTCLossDescriptor desc
    status = cudnnCreateCTCLossDescriptor(&desc)
    check_status(status)
    return desc


cpdef destroyCTCLossDescriptor(size_t ctcLossDesc):
    """Release a CTC loss descriptor."""
    status = cudnnDestroyCTCLossDescriptor(ctcLossDesc)
    check_status(status)


cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType):
    """Set the compute data type of a CTC loss descriptor."""
    status = cudnnSetCTCLossDescriptor(ctcLossDesc, dataType)
    check_status(status)


cpdef getCTCLossDescriptor(size_t ctcLossDesc):
    """Return the compute data type stored in a CTC loss descriptor."""
    cdef DataType compType
    status = cudnnGetCTCLossDescriptor(ctcLossDesc, &compType)
    check_status(status)
    return compType


cpdef size_t getCTCLossWorkspaceSize(
        intptr_t handle, size_t probsDesc, size_t gradientsDesc,
        size_t labels, size_t labelLengths, size_t inputLengths,
        int algo, size_t ctcLossDesc) except? 0:
    """Return the workspace size in bytes needed by CTCLoss."""
    cdef size_t sizeInBytes
    status = cudnnGetCTCLossWorkspaceSize(
        handle, probsDesc, gradientsDesc,
        labels, labelLengths, inputLengths,
        algo, ctcLossDesc, &sizeInBytes)
    check_status(status)
    return sizeInBytes


cpdef CTCLoss(
        intptr_t handle, size_t probsDesc,
        size_t probs, size_t labels, size_t labelLengths, size_t inputLengths,
        size_t costs, size_t gradientsDesc, size_t gradients,
        int algo, size_t ctcLossDesc,
        size_t workspace, size_t workSpaceSizeInBytes):
    """Compute the CTC loss (and gradients) on the current stream.

    Fix: bind the handle to the current stream via ``_setStream`` before
    launching, matching every other execution wrapper in this module
    (e.g. ``poolingForward``, ``dropoutForward``); previously this wrapper
    ran on whatever stream was last bound to the handle.
    NOTE(review): the other execution wrappers also release the GIL with
    ``with nogil:`` around the call -- do the same here once the extern
    declaration of ``cudnnCTCLoss`` is confirmed to be ``nogil``.
    """
    _setStream(handle)
    status = cudnnCTCLoss(
        handle, probsDesc, probs,
        labels, labelLengths, inputLengths,
        costs, gradientsDesc, gradients,
        algo, ctcLossDesc,
        workspace, workSpaceSizeInBytes)
    check_status(status)


###############################################################################
# RNN
RNNDataLayout layout, - int maxSeqLength, int batchSize, int vectorSize, - const int seqLengthArray[], void *paddingFill) - int cudnnGetRNNDataDescriptor( - RNNDataDescriptor RNNDataDesc, DataType* dataType, - RNNDataLayout* layout, int* maxSeqLength, int* batchSize, - int* vectorSize, int arrayLengthRequested, int seqLengthArray[], - void* paddingFill) - int cudnnGetRNNWorkspaceSize( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* xDesc, size_t* sizeInBytes) - int cudnnGetRNNTrainingReserveSize( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* xDesc, size_t* sizeInBytes) - int cudnnGetRNNParamsSize( - Handle handle, RNNDescriptor rnnDesc, TensorDescriptor xDesc, - size_t* sizeInBytes, DataType dataType) - int cudnnGetRNNLinLayerMatrixParams( - Handle handle, RNNDescriptor rnnDesc, int layer, - TensorDescriptor xDesc, FilterDescriptor wDesc, void* w, - int linLayerID, FilterDescriptor linLayerMatDesc, - void** linLayerMat) - int cudnnGetRNNLinLayerBiasParams( - Handle handle, RNNDescriptor rnnDesc, int layer, - TensorDescriptor xDesc, FilterDescriptor wDesc, void* w, - int linLayerID, FilterDescriptor linLayerBiasDesc, - void** linLayerBias) - int cudnnRNNForwardInference( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* xDesc, - void* x, TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc, - void* cx, FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc, - void* y, TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, - void* cy, void* workspace, size_t workSpaceSizeInBytes) - int cudnnRNNForwardTraining( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* xDesc, void* x, - TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc, void* cx, - FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc, void* y, - TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, void* cy, - void* workspace, size_t workSpaceSizeInBytes, void* reserveSpace, 
- size_t reserveSpaceSizeInBytes) - int cudnnRNNBackwardData( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* yDesc, void* y, - TensorDescriptor* dyDesc, void* dy, - TensorDescriptor dhyDesc, void* dhy, - TensorDescriptor dcyDesc, void* dcy, - FilterDescriptor wDesc, void* w, - TensorDescriptor hxDesc, void* hx, - TensorDescriptor cxDesc, void* cx, - TensorDescriptor* dxDesc, void* dx, - TensorDescriptor dhxDesc, void* dhx, - TensorDescriptor dcxDesc, void* dcx, void* workspace, - size_t workSpaceSizeInBytes, void* reserveSpace, - size_t reserveSpaceSizeInBytes) - int cudnnRNNBackwardWeights( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* xDesc, void* x, TensorDescriptor hxDesc, void* hx, - TensorDescriptor* yDesc, void* y, - void* workspace, size_t workSpaceSizeInBytes, FilterDescriptor dwDesc, - void* dw, void* reserveSpace, size_t reserveSpaceSizeInBytes) - - int cudnnRNNForwardInferenceEx( - Handle handle, RNNDescriptor rnnDesc, - RNNDataDescriptor xDesc, const void* x, - TensorDescriptor hxDesc, const void* hx, - TensorDescriptor cxDesc, const void* cx, - FilterDescriptor wDesc, const void* w, - RNNDataDescriptor yDesc, void* y, - TensorDescriptor hyDesc, void* hy, - TensorDescriptor cyDesc, void* cy, - RNNDataDescriptor kDesc, const void* keys, - RNNDataDescriptor cDesc, void* cAttn, - RNNDataDescriptor iDesc, void* iAttn, - RNNDataDescriptor qDesc, void* queries, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnRNNForwardTrainingEx( - Handle handle, RNNDescriptor rnnDesc, - RNNDataDescriptor xDesc, const void* x, - TensorDescriptor hxDesc, const void* hx, - TensorDescriptor cxDesc, const void* cx, - FilterDescriptor wDesc, const void* w, - RNNDataDescriptor yDesc, void* y, - TensorDescriptor hyDesc, void* hy, - TensorDescriptor cyDesc, void* cy, - RNNDataDescriptor kDesc, const void* keys, - RNNDataDescriptor cDesc, void* cAttn, - RNNDataDescriptor iDesc, void* iAttn, - RNNDataDescriptor qDesc, 
void* queries, - void* workSpace, size_t workSpaceSizeInBytes, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - int cudnnRNNBackwardDataEx( - Handle handle, RNNDescriptor rnnDesc, - RNNDataDescriptor yDesc, const void* y, - RNNDataDescriptor dyDesc, const void* dy, - RNNDataDescriptor dcDesc, const void* dcAttn, - TensorDescriptor dhyDesc, const void* dhy, - TensorDescriptor dcyDesc, const void* dcy, - FilterDescriptor wDesc, const void* w, - TensorDescriptor hxDesc, const void* hx, - TensorDescriptor cxDesc, const void* cx, - RNNDataDescriptor dxDesc, void* dx, - TensorDescriptor dhxDesc, void* dhx, - TensorDescriptor dcxDesc, void* dcx, - RNNDataDescriptor dkDesc, void* dkeys, - void* workSpace, size_t workSpaceSizeInBytes, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - int cudnnRNNBackwardWeightsEx( - Handle handle, RNNDescriptor rnnDesc, - RNNDataDescriptor xDesc, const void* x, - TensorDescriptor hxDesc, const void* hx, - RNNDataDescriptor yDesc, const void* y, - void* workSpace, size_t workSpaceSizeInBytes, - FilterDescriptor dwDesc, void* dw, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - - # Spatial Transformer - int cudnnCreateSpatialTransformerDescriptor( - SpatialTransformerDescriptor* stDesc) - int cudnnDestroySpatialTransformerDescriptor( - SpatialTransformerDescriptor stDesc) - int cudnnSetSpatialTransformerNdDescriptor( - SpatialTransformerDescriptor stDesc, SamplerType samplerType, - DataType dataType, int nbDims, int dimA[]) - int cudnnSpatialTfGridGeneratorForward( - Handle handle, SpatialTransformerDescriptor stDesc, - void* theta, void* grid) - int cudnnSpatialTfGridGeneratorBackward( - Handle handle, SpatialTransformerDescriptor stDesc, - void* dgrid, void* dtheta) - int cudnnSpatialTfSamplerForward( - Handle handle, SpatialTransformerDescriptor stDesc, - void* alpha, TensorDescriptor xDesc, void* x, - void* grid, void* beta, TensorDescriptor yDesc, void* y) - int cudnnSpatialTfSamplerBackward( - Handle handle, 
SpatialTransformerDescriptor stDesc, - void* alpha, TensorDescriptor xDesc, void* x, void* beta, - TensorDescriptor dxDesc, void* dx, void* alphaDgrid, - TensorDescriptor dyDesc, void* dy, void* grid, - void* betaDgrid, void* dgrid) - - # Fused Ops - int cudnnCreateFusedOpsConstParamPack( - FusedOpsConstParamPack* constPack, int ops) - int cudnnDestroyFusedOpsConstParamPack(FusedOpsConstParamPack constPack) - int cudnnSetFusedOpsConstParamPackAttribute( - FusedOpsConstParamPack constPack, FusedOpsConstParamLabel paramLabel, - const void *param) - int cudnnGetFusedOpsConstParamPackAttribute( - const FusedOpsConstParamPack constPack, - FusedOpsConstParamLabel paramLabel, void *param, int *isNULL) - int cudnnCreateFusedOpsVariantParamPack( - FusedOpsVariantParamPack *varPack, FusedOps ops) - int cudnnDestroyFusedOpsVariantParamPack(FusedOpsVariantParamPack varPack) - int cudnnSetFusedOpsVariantParamPackAttribute( - FusedOpsVariantParamPack varPack, FusedOpsVariantParamLabel paramLabel, - void *ptr) - int cudnnGetFusedOpsVariantParamPackAttribute( - const FusedOpsVariantParamPack varPack, - FusedOpsVariantParamLabel paramLabel, void *ptr) - int cudnnCreateFusedOpsPlan(FusedOpsPlan *plan, FusedOps ops) - int cudnnDestroyFusedOpsPlan(FusedOpsPlan plan) - int cudnnMakeFusedOpsPlan( - Handle handle, FusedOpsPlan plan, - const FusedOpsConstParamPack constPack, size_t *workspaceSizeInBytes) - int cudnnFusedOpsExecute( - Handle handle, const FusedOpsPlan plan, - FusedOpsVariantParamPack varPack) - - # Build-time version - int CUDNN_VERSION - - # Constants - double _CUDNN_BN_MIN_EPSILON 'CUDNN_BN_MIN_EPSILON' - - -cdef class CuDNNAlgoPerf: - - def __init__(self, algo, status, time, memory, determinism, mathType): - self.algo = algo - self.status = status - self.time = time - self.memory = memory - self.determinism = determinism - self.mathType = mathType - - -############################################################################### -# Error handling 
-############################################################################### - -class CuDNNError(RuntimeError): - - def __init__(self, int status): - self.status = status - msg = cudnnGetErrorString(status) - super(CuDNNError, self).__init__( - 'cuDNN Error: {}'.format(msg.decode())) - self._infos = [] - - def add_info(self, info): - assert isinstance(info, str) - self._infos.append(info) - - def add_infos(self, infos): - assert isinstance(infos, list) - self._infos.extend(infos) - - def __str__(self): - base = super(CuDNNError, self).__str__() - return base + ''.join( - '\n ' + info for info in self._infos) - - def __reduce__(self): - return (type(self), (self.status,)) - - -@cython.profile(False) -cpdef inline check_status(int status): - if status != 0: - raise CuDNNError(status) - - -############################################################################### -# Build-time version -############################################################################### - -def get_build_version(): - return CUDNN_VERSION - - -############################################################################### -# Version -############################################################################### - -cpdef size_t getVersion() except? 0: - return cudnnGetVersion() - - -############################################################################### -# Runtime error checking -############################################################################### - -cpdef queryRuntimeError(intptr_t handle, int mode): - cdef Status rstatus - with nogil: - status = cudnnQueryRuntimeError(handle, &rstatus, - mode, 0) - check_status(status) - return rstatus - - -############################################################################### -# Initialization and CUDA cooperation -############################################################################### - -cpdef intptr_t create() except? 
0: - cdef Handle handle - with nogil: - status = cudnnCreate(&handle) - check_status(status) - return handle - - -cpdef destroy(intptr_t handle): - with nogil: - status = cudnnDestroy(handle) - check_status(status) - - -cpdef setStream(intptr_t handle, size_t stream): - # TODO(leofang): The support of stream capture is not mentioned at all in - # the cuDNN docs (as of CUDA 11.5), so we disable this functionality. - if not runtime._is_hip_environment and runtime.streamIsCapturing(stream): - raise NotImplementedError( - 'calling cuDNN API during stream capture is currently ' - 'unsupported') - - status = cudnnSetStream(handle, stream) - check_status(status) - - -cpdef size_t getStream(intptr_t handle) except? 0: - cdef driver.Stream stream - status = cudnnGetStream(handle, &stream) - check_status(status) - return stream - - -cdef _setStream(intptr_t handle): - """Set current stream""" - setStream(handle, stream_module.get_current_stream_ptr()) - -############################################################################### -# Tensor manipulation -############################################################################### - -cpdef size_t createTensorDescriptor() except? 
0: - cdef TensorDescriptor descriptor - status = cudnnCreateTensorDescriptor(&descriptor) - check_status(status) - return descriptor - - -cpdef setTensor4dDescriptor(size_t tensorDesc, int format, int dataType, - int n, int c, int h, int w): - status = cudnnSetTensor4dDescriptor( - tensorDesc, format, - dataType, n, c, h, w) - check_status(status) - - -cpdef setTensor4dDescriptorEx(size_t tensorDesc, int dataType, - int n, int c, int h, int w, int nStride, - int cStride, int hStride, int wStride): - status = cudnnSetTensor4dDescriptorEx( - tensorDesc, dataType, n, c, h, w, - nStride, cStride, hStride, wStride) - check_status(status) - - -cpdef tuple getTensor4dDescriptor(size_t tensorDesc): - cdef DataType dataType - cdef int n, c, h, w, nStride, cStride, hStride, wStride - status = cudnnGetTensor4dDescriptor( - tensorDesc, &dataType, - &n, &c, &h, &w, &nStride, &cStride, &hStride, &wStride) - check_status(status) - return dataType, n, c, h, w, nStride, cStride, hStride, wStride - - -cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, - size_t dimA, size_t strideA): - status = cudnnSetTensorNdDescriptor( - tensorDesc, dataType, nbDims, - dimA, strideA) - check_status(status) - - -cpdef destroyTensorDescriptor(size_t tensorDesc): - status = cudnnDestroyTensorDescriptor(tensorDesc) - check_status(status) - - -cpdef addTensor_v3(intptr_t handle, size_t alpha, size_t bDesc, - size_t b, size_t beta, size_t yDesc, size_t y): - _setStream(handle) - with nogil: - status = cudnnAddTensor_v3( - handle, alpha, bDesc, - b, beta, yDesc, y) - check_status(status) - - -############################################################################### -# Tensor operations -############################################################################### - -cpdef size_t createOpTensorDescriptor() except? 
0: - cdef OpTensorDescriptor opTensorDesc - status = cudnnCreateOpTensorDescriptor(&opTensorDesc) - check_status(status) - return opTensorDesc - - -cpdef setOpTensorDescriptor(size_t opTensorDesc, int opTensorOp, - int opTensorCompType, int opTensorNanOpt): - status = cudnnSetOpTensorDescriptor( - opTensorDesc, opTensorOp, - opTensorCompType, opTensorNanOpt) - check_status(status) - - -cpdef getOpTensorDescriptor(size_t opTensorDesc): - cdef OpTensorOp opTensorOp - cdef DataType opTensorCompType - cdef NanPropagation opTensorNanOpt - status = cudnnGetOpTensorDescriptor( - opTensorDesc, &opTensorOp, &opTensorCompType, - &opTensorNanOpt) - check_status(status) - return opTensorOp, opTensorCompType, opTensorNanOpt - - -cpdef destroyOpTensorDescriptor(size_t opTensorDesc): - status = cudnnDestroyOpTensorDescriptor(opTensorDesc) - check_status(status) - - -cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, - size_t aDesc, size_t A, size_t alpha2, size_t bDesc, - size_t B, size_t beta, size_t cDesc, size_t C): - _setStream(handle) - with nogil: - status = cudnnOpTensor( - handle, opTensorDesc, alpha1, - aDesc, A, alpha2, - bDesc, B, beta, - cDesc, C) - check_status(status) - - -############################################################################### -# Tensor reductions -############################################################################### - -cpdef size_t createReduceTensorDescriptor() except? 
0: - cdef ReduceTensorDescriptor reduceTensorDesc - status = cudnnCreateReduceTensorDescriptor(&reduceTensorDesc) - check_status(status) - return reduceTensorDesc - -cpdef setReduceTensorDescriptor( - size_t reduceTensorDesc, int reduceTensorOp, int reduceTensorCompType, - int reduceTensorNanOpt, int reduceTensorIndices, - int reduceTensorIndicesType): - status = cudnnSetReduceTensorDescriptor( - reduceTensorDesc, - reduceTensorOp, - reduceTensorCompType, reduceTensorNanOpt, - reduceTensorIndices, - reduceTensorIndicesType) - check_status(status) - - -cpdef getReduceTensorDescriptor(size_t reduceTensorDesc): - cdef ReduceTensorOp redOp - cdef DataType redCompType - cdef NanPropagation redNanOpt - cdef ReduceTensorIndices redIndices - cdef IndicesType redIndicesType - status = cudnnGetReduceTensorDescriptor( - reduceTensorDesc, &redOp, - &redCompType, &redNanOpt, &redIndices, &redIndicesType) - check_status(status) - return redOp, redCompType, redNanOpt, redIndices, redIndicesType - - -cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc): - status = cudnnDestroyReduceTensorDescriptor( - reduceTensorDesc) - check_status(status) - - -cpdef size_t getReductionIndicesSize(intptr_t handle, size_t reduceTensorDesc, - size_t aDesc, size_t cDesc) except? 0: - cdef size_t sizeInBytes - status = cudnnGetReductionIndicesSize( - handle, reduceTensorDesc, - aDesc, cDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef size_t getReductionWorkspaceSize(intptr_t handle, - size_t reduceTensorDesc, - size_t aDesc, size_t cDesc) except? 
0: - cdef size_t sizeInBytes - status = cudnnGetReductionWorkspaceSize( - handle, reduceTensorDesc, - aDesc, cDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef reduceTensor(intptr_t handle, size_t reduceTensorDesc, size_t indices, - size_t indicesSizeInBytes, size_t workspace, - size_t workspaceSizeInBytes, size_t alpha, size_t aDesc, - size_t A, size_t beta, size_t cDesc, size_t C): - _setStream(handle) - with nogil: - status = cudnnReduceTensor( - handle, reduceTensorDesc, - indices, indicesSizeInBytes, workspace, - workspaceSizeInBytes, alpha, aDesc, - A, beta, cDesc, C) - check_status(status) - - -cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr): - _setStream(handle) - with nogil: - status = cudnnSetTensor( - handle, yDesc, y, - valuePtr) - check_status(status) - - -cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha): - _setStream(handle) - with nogil: - status = cudnnScaleTensor( - handle, yDesc, y, - alpha) - check_status(status) - - -############################################################################### -# Filter manipulation -############################################################################### - -cpdef size_t createFilterDescriptor() except? 
0: - cdef FilterDescriptor desc - status = cudnnCreateFilterDescriptor(&desc) - check_status(status) - return desc - - -cpdef setFilter4dDescriptor_v4( - size_t filterDesc, int dataType, - int format, int k, int c, int h, int w): - status = cudnnSetFilter4dDescriptor_v4( - filterDesc, dataType, - format, k, c, h, w) - check_status(status) - - -cpdef setFilterNdDescriptor_v4( - size_t filterDesc, int dataType, - int format, int nbDims, size_t filterDimA): - status = cudnnSetFilterNdDescriptor_v4( - filterDesc, dataType, - format, nbDims, filterDimA) - check_status(status) - - -cpdef getFilterNdDescriptor(size_t wDesc, int nbDimsRequested): - cdef DataType dataType - cdef TensorFormat format - cdef int nbDims - cdef vector.vector[int] filterDimA - filterDimA.resize(nbDimsRequested) - - status = cudnnGetFilterNdDescriptor_v4( - wDesc, nbDimsRequested, &dataType, - &format, &nbDims, filterDimA.data()) - check_status(status) - return dataType, format, nbDims, tuple(filterDimA) - - -cpdef destroyFilterDescriptor(size_t filterDesc): - status = cudnnDestroyFilterDescriptor(filterDesc) - check_status(status) - - -############################################################################### -# Convolution -############################################################################### - -cpdef size_t createConvolutionDescriptor() except? 0: - cdef ConvolutionDescriptor desc - status = cudnnCreateConvolutionDescriptor(&desc) - check_status(status) - return desc - - -cpdef setConvolutionMathType(size_t convDesc, size_t mathType): - status = cudnnSetConvolutionMathType( - convDesc, mathType) - check_status(status) - - -cpdef size_t getConvolutionMathType(size_t convDesc) except? 
0: - cdef MathType mathType - status = cudnnGetConvolutionMathType( - convDesc, &mathType) - check_status(status) - return mathType - - -cpdef setConvolutionGroupCount(size_t convDesc, int groupCount): - status = cudnnSetConvolutionGroupCount( - convDesc, groupCount) - check_status(status) - - -cpdef int getConvolutionGroupCount(size_t convDesc) except? -1: - cdef int groupCount - status = cudnnGetConvolutionGroupCount( - convDesc, &groupCount) - check_status(status) - return groupCount - - -cpdef setConvolution2dDescriptor_v4( - size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, - int dilation_w, int mode): - status = cudnnSetConvolution2dDescriptor_v4( - convDesc, pad_h, pad_w, u, v, dilation_h, - dilation_w, mode) - check_status(status) - - -cpdef setConvolution2dDescriptor_v5( - size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, - int dilation_w, int mode, size_t computeType): - status = cudnnSetConvolution2dDescriptor_v5( - convDesc, pad_h, pad_w, u, v, dilation_h, - dilation_w, mode, computeType) - check_status(status) - - -cpdef setConvolutionNdDescriptor_v3( - size_t convDesc, int arrayLength, size_t padA, size_t filterStrideA, - size_t dilationA, int mode, int dataType): - status = cudnnSetConvolutionNdDescriptor_v3( - convDesc, arrayLength, padA, - filterStrideA, dilationA, mode, - dataType) - check_status(status) - - -cpdef destroyConvolutionDescriptor(size_t convDesc): - status = cudnnDestroyConvolutionDescriptor( - convDesc) - check_status(status) - - -cpdef findConvolutionForwardAlgorithm( - intptr_t handle, size_t xDesc, size_t wDesc, size_t convDesc, - size_t yDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionForwardAlgorithm( - handle, xDesc, wDesc, - convDesc, yDesc, - requestedAlgoCount, &returnedAlgoCount, perfResults.data()) - check_status(status) - 
perfResults.resize(returnedAlgoCount) - return perfResults - - -cpdef list findConvolutionForwardAlgorithmEx( - intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, - size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionForwardAlgorithmEx( - handle, xDesc, x, - wDesc, w, convDesc, - yDesc, y, requestedAlgoCount, - &returnedAlgoCount, perfResults.data(), workSpace, - workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) - for p in perfResults] - - -cpdef list findConvolutionForwardAlgorithmEx_v7( - intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, - size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionForwardAlgorithmEx_v7( - handle, xDesc, x, - wDesc, w, convDesc, - yDesc, y, requestedAlgoCount, - &returnedAlgoCount, perfResults.data(), workSpace, - workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef int getConvolutionForwardAlgorithm_v6( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int preference, size_t memoryLimitInbytes) except? 
-1: - cdef ConvolutionFwdAlgo algo - status = cudnnGetConvolutionForwardAlgorithm_v6( - handle, srcDesc, - filterDesc, convDesc, - destDesc, preference, - memoryLimitInbytes, &algo) - check_status(status) - return algo - - -cpdef list getConvolutionForwardAlgorithm_v7( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnGetConvolutionForwardAlgorithm_v7( - handle, srcDesc, - filterDesc, convDesc, - destDesc, requestedAlgoCount, - &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef Py_ssize_t getConvolutionForwardWorkspaceSize( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int algo) except? 
-1: - cdef size_t sizeInBytes - status = cudnnGetConvolutionForwardWorkspaceSize( - handle, srcDesc, - filterDesc, convDesc, - destDesc, algo, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef convolutionForward( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t filterDesc, size_t filterData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t destDesc, size_t destData): - _setStream(handle) - with nogil: - status = cudnnConvolutionForward( - handle, alpha, - srcDesc, srcData, - filterDesc, filterData, - convDesc, algo, - workSpace, workSpaceSizeInBytes, beta, - destDesc, destData) - check_status(status) - - -cpdef convolutionBackwardBias( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t beta, size_t destDesc, size_t destData): - _setStream(handle) - with nogil: - status = cudnnConvolutionBackwardBias( - handle, alpha, - srcDesc, srcData, beta, - destDesc, destData) - check_status(status) - - -cpdef findConvolutionBackwardFilterAlgorithm( - intptr_t handle, size_t xDesc, size_t dyDesc, size_t convDesc, - size_t dwDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionBwdFilterAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardFilterAlgorithm( - handle, xDesc, dyDesc, - convDesc, dwDesc, - requestedAlgoCount, &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return perfResults - - -cpdef list findConvolutionBackwardFilterAlgorithmEx( - intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, - size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionBwdFilterAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardFilterAlgorithmEx( - 
handle, xDesc, x, - dyDesc, dy, convDesc, - dwDesc, dw, - requestedAlgoCount, &returnedAlgoCount, perfResults.data(), - workSpace, workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) - for p in perfResults] - - -cpdef list findConvolutionBackwardFilterAlgorithmEx_v7( - intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, - size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionBwdFilterAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardFilterAlgorithmEx_v7( - handle, xDesc, x, - dyDesc, dy, convDesc, - dwDesc, dw, - requestedAlgoCount, &returnedAlgoCount, perfResults.data(), - workSpace, workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef int getConvolutionBackwardFilterAlgorithm_v6( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t filterDesc, int preference, - size_t memoryLimitInbytes) except? 
-1: - cdef ConvolutionBwdFilterAlgo algo - status = cudnnGetConvolutionBackwardFilterAlgorithm_v6( - handle, srcDesc, - diffDesc, convDesc, - filterDesc, - preference, - memoryLimitInbytes, &algo) - check_status(status) - return algo - - -cpdef list getConvolutionBackwardFilterAlgorithm_v7( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionBwdFilterAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnGetConvolutionBackwardFilterAlgorithm_v7( - handle, srcDesc, diffDesc, - convDesc, gradDesc, - requestedAlgoCount, &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef Py_ssize_t getConvolutionBackwardFilterWorkspaceSize( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t filterDesc, int algo) except? 
-1: - cdef size_t sizeInBytes - status = cudnnGetConvolutionBackwardFilterWorkspaceSize( - handle, srcDesc, - diffDesc, convDesc, - filterDesc, algo, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef convolutionBackwardFilter_v3( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t diffDesc, size_t diffData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t gradDesc, size_t gradData): - _setStream(handle) - with nogil: - status = cudnnConvolutionBackwardFilter_v3( - handle, alpha, - srcDesc, srcData, - diffDesc, diffData, - convDesc, algo, - workSpace, workSpaceSizeInBytes, beta, - gradDesc, gradData) - check_status(status) - - -cpdef findConvolutionBackwardDataAlgorithm( - intptr_t handle, size_t wDesc, size_t dyDesc, size_t convDesc, - size_t dxDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionBwdDataAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardDataAlgorithm( - handle, wDesc, dyDesc, - convDesc, dxDesc, - requestedAlgoCount, &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return perfResults - - -cpdef list findConvolutionBackwardDataAlgorithmEx( - intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, - size_t convDesc, size_t dxDesc, size_t dx, - int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionBwdDataAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardDataAlgorithmEx( - handle, wDesc, w, - dyDesc, dy, convDesc, - dxDesc, dx, - requestedAlgoCount, &returnedAlgoCount, perfResults.data(), - workSpace, workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) - for p in perfResults] - - 
-cpdef list findConvolutionBackwardDataAlgorithmEx_v7( - intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, - size_t convDesc, size_t dxDesc, size_t dx, - int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionBwdDataAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardDataAlgorithmEx_v7( - handle, wDesc, w, - dyDesc, dy, convDesc, - dxDesc, dx, - requestedAlgoCount, &returnedAlgoCount, perfResults.data(), - workSpace, workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef int getConvolutionBackwardDataAlgorithm_v6( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, size_t preference, - size_t memoryLimitInbytes) except? -1: - cdef ConvolutionBwdDataAlgo algo - status = cudnnGetConvolutionBackwardDataAlgorithm_v6( - handle, filterDesc, - diffDesc, convDesc, - gradDesc, preference, - memoryLimitInbytes, &algo) - check_status(status) - return algo - - -cpdef list getConvolutionBackwardDataAlgorithm_v7( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionBwdDataAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnGetConvolutionBackwardDataAlgorithm_v7( - handle, filterDesc, - diffDesc, convDesc, - gradDesc, requestedAlgoCount, - &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef Py_ssize_t getConvolutionBackwardDataWorkspaceSize( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t 
gradDesc, int algo) except? -1: - cdef size_t sizeInBytes - status = cudnnGetConvolutionBackwardDataWorkspaceSize( - handle, filterDesc, - diffDesc, - convDesc, gradDesc, - algo, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef convolutionBackwardData_v3( - intptr_t handle, size_t alpha, size_t filterDesc, size_t filterData, - size_t diffDesc, size_t diffData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t gradDesc, size_t gradData): - _setStream(handle) - with nogil: - status = cudnnConvolutionBackwardData_v3( - handle, alpha, - filterDesc, filterData, - diffDesc, diffData, - convDesc, algo, - workSpace, workSpaceSizeInBytes, beta, - gradDesc, gradData) - check_status(status) - -############################################################################### -# Pooling -############################################################################### - -cpdef size_t createPoolingDescriptor() except? 0: - cdef PoolingDescriptor desc - status = cudnnCreatePoolingDescriptor(&desc) - check_status(status) - return desc - - -cpdef setPooling2dDescriptor_v4( - size_t poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, - int windowWidth, int verticalPadding, int horizontalPadding, - int verticalStride, int horizontalStride): - status = cudnnSetPooling2dDescriptor_v4( - poolingDesc, mode, - maxpoolingNanOpt, windowHeight, windowWidth, - verticalPadding, horizontalPadding, verticalStride, horizontalStride) - check_status(status) - - -cpdef setPoolingNdDescriptor_v4( - size_t poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, - size_t windowDimA, size_t paddingA, size_t strideA): - status = cudnnSetPoolingNdDescriptor_v4( - poolingDesc, mode, - maxpoolingNanOpt, nbDims, - windowDimA, paddingA, strideA) - check_status(status) - - -cpdef destroyPoolingDescriptor(size_t poolingDesc): - status = cudnnDestroyPoolingDescriptor(poolingDesc) - check_status(status) - - -cpdef poolingForward( - 
intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData): - _setStream(handle) - with nogil: - status = cudnnPoolingForward( - handle, poolingDesc, alpha, - srcDesc, srcData, beta, - dstDesc, dstData) - check_status(status) - - -cpdef poolingBackward( - intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, - size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, - size_t destDiffData): - _setStream(handle) - with nogil: - status = cudnnPoolingBackward( - handle, poolingDesc, alpha, - srcDesc, srcData, - srcDiffDesc, srcDiffData, - destDesc, destData, beta, - destDiffDesc, destDiffData) - check_status(status) - -############################################################################### -# Batch Normalization -############################################################################### - -CUDNN_BN_MIN_EPSILON = _CUDNN_BN_MIN_EPSILON - -cpdef deriveBNTensorDescriptor( - size_t derivedBnDesc, size_t xDesc, int mode): - status = cudnnDeriveBNTensorDescriptor( - derivedBnDesc, xDesc, - mode) - check_status(status) - - -cpdef batchNormalizationForwardTraining( - intptr_t handle, int mode, - size_t alpha, size_t beta, size_t xDesc, - size_t x, size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, size_t bnScale, - size_t bnBias, double exponentialAverageFactor, - size_t resultRunningMean, size_t resultRunningVariance, - double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance): - _setStream(handle) - with nogil: - status = cudnnBatchNormalizationForwardTraining( - handle, mode, - alpha, beta, xDesc, - x, yDesc, y, - bnScaleBiasMeanVarDesc, bnScale, - bnBias, exponentialAverageFactor, - resultRunningMean, resultRunningVariance, - epsilon, resultSaveMean, resultSaveInvVariance) - check_status(status) - - -cpdef batchNormalizationForwardInference( - intptr_t handle, int mode, - size_t alpha, size_t 
beta, size_t xDesc, - size_t x, size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, size_t bnScale, - size_t bnBias, size_t estimatedMean, size_t estimatedVariance, - double epsilon): - _setStream(handle) - with nogil: - status = cudnnBatchNormalizationForwardInference( - handle, mode, - alpha, beta, xDesc, - x, yDesc, y, - bnScaleBiasMeanVarDesc, bnScale, - bnBias, estimatedMean, estimatedVariance, - epsilon) - check_status(status) - - -cpdef batchNormalizationBackward( - intptr_t handle, int mode, - size_t alphaDataDiff, size_t betaDataDiff, - size_t alphaParamDiff, size_t betaParamDiff, - size_t xDesc, size_t x, size_t dyDesc, - size_t dy, size_t dxDesc, size_t dx, - size_t dBnScaleBiasDesc, size_t bnScale, - size_t dBnScaleResult, size_t dBnBiasResult, - double epsilon, size_t savedMean, size_t savedInvVariance): - _setStream(handle) - with nogil: - status = cudnnBatchNormalizationBackward( - handle, mode, - alphaDataDiff, betaDataDiff, - alphaParamDiff, betaParamDiff, - xDesc, x, - dyDesc, dy, - dxDesc, dx, - dBnScaleBiasDesc, bnScale, - dBnScaleResult, dBnBiasResult, - epsilon, savedMean, savedInvVariance) - check_status(status) - - -cpdef batchNormalizationForwardTrainingEx( - intptr_t handle, int mode, int bnOps, - size_t alpha, size_t beta, - size_t xDesc, size_t x, - size_t zDesc, size_t z, - size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, - size_t bnScale, size_t bnBias, - double exponentialAverageFactor, - size_t resultRunningMean, size_t resultRunningVariance, - double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance, - size_t activationDesc, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnBatchNormalizationForwardTrainingEx( - handle, mode, bnOps, - alpha, beta, - xDesc, x, - zDesc, z, - yDesc, y, - bnScaleBiasMeanVarDesc, - bnScale, bnBias, - exponentialAverageFactor, - resultRunningMean, resultRunningVariance, - 
epsilon, resultSaveMean, resultSaveInvVariance, - activationDesc, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef size_t getBatchNormalizationForwardTrainingExWorkspaceSize( - intptr_t handle, int mode, int bnOps, - size_t xDesc, - size_t zDesc, - size_t yDesc, - size_t bnScaleBiasMeanVarDesc, - size_t activationDesc) except? 0: - cdef size_t sizeInBytes - status = cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - handle, - mode, bnOps, - xDesc, - zDesc, - yDesc, - bnScaleBiasMeanVarDesc, - activationDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef batchNormalizationBackwardEx( - intptr_t handle, int mode, int bnops, - size_t alphaDataDiff, size_t betaDataDiff, - size_t alphaParamDiff, size_t betaParamDiff, - size_t xDesc, size_t x, - size_t yDesc, size_t y, - size_t dyDesc, size_t dy, - size_t dzDesc, size_t dz, - size_t dxDesc, size_t dx, - size_t dBnScaleBiasDesc, - size_t bnScaleData, size_t bnBiasData, - size_t dBnScaleData, size_t dBnBiasData, - double epsilon, - size_t savedMean, size_t savedInvVariance, - size_t activationDesc, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnBatchNormalizationBackwardEx( - handle, - mode, bnops, - alphaDataDiff, betaDataDiff, - alphaParamDiff, betaParamDiff, - xDesc, x, - yDesc, y, - dyDesc, dy, - dzDesc, dz, - dxDesc, dx, - dBnScaleBiasDesc, - bnScaleData, bnBiasData, - dBnScaleData, dBnBiasData, - epsilon, - savedMean, savedInvVariance, - activationDesc, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef size_t getBatchNormalizationBackwardExWorkspaceSize( - intptr_t handle, int mode, int bnOps, - size_t xDesc, - size_t yDesc, - size_t dyDesc, - size_t dzDesc, - size_t dxDesc, - size_t dBnScaleBiasDesc, - size_t activationDesc) except? 
0: - cdef size_t sizeInBytes - status = cudnnGetBatchNormalizationBackwardExWorkspaceSize( - handle, - mode, - bnOps, - xDesc, - yDesc, - dyDesc, - dzDesc, - dxDesc, - dBnScaleBiasDesc, - activationDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( - intptr_t handle, int mode, int bnOps, - size_t activationDesc, - size_t xDesc) except? 0: - cdef size_t sizeInBytes - status = cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - handle, - mode, - bnOps, - activationDesc, - xDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -############################################################################### -# Activation -############################################################################### - -cpdef size_t createActivationDescriptor() except? 0: - cdef ActivationDescriptor activationDesc - status = cudnnCreateActivationDescriptor(&activationDesc) - check_status(status) - return activationDesc - - -cpdef setActivationDescriptor( - size_t activationDesc, int mode, int reluNanOpt, double reluCeiling): - status = cudnnSetActivationDescriptor( - activationDesc, mode, - reluNanOpt, reluCeiling) - check_status(status) - - -cpdef destroyActivationDescriptor(size_t activationDesc): - status = cudnnDestroyActivationDescriptor( - activationDesc) - check_status(status) - - -cpdef softmaxForward( - intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData): - _setStream(handle) - with nogil: - status = cudnnSoftmaxForward( - handle, algorithm, mode, - alpha, srcDesc, srcData, - beta, dstDesc, dstData) - check_status(status) - - -cpdef softmaxBackward( - intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, size_t beta, - size_t destDiffDesc, size_t destDiffData): - _setStream(handle) - with nogil: - status = 
cudnnSoftmaxBackward( - handle, algorithm, mode, - alpha, srcDesc, srcData, - srcDiffDesc, srcDiffData, beta, - destDiffDesc, destDiffData) - check_status(status) - - -cpdef activationForward_v4( - intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData): - _setStream(handle) - with nogil: - status = cudnnActivationForward_v4( - handle, activationDesc, alpha, - srcDesc, srcData, beta, - dstDesc, dstData) - check_status(status) - - -cpdef activationBackward_v4( - intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, - size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, - size_t destDiffData): - _setStream(handle) - with nogil: - status = cudnnActivationBackward_v4( - handle, activationDesc, alpha, - srcDesc, srcData, - srcDiffDesc, srcDiffData, - destDesc, destData, beta, - destDiffDesc, destDiffData) - check_status(status) - - -############################################################################### -# Dropout -############################################################################### - -cpdef size_t createDropoutDescriptor() except? 0: - cdef DropoutDescriptor desc - status = cudnnCreateDropoutDescriptor(&desc) - check_status(status) - return desc - - -cpdef destroyDropoutDescriptor(size_t dropoutDesc): - status = cudnnDestroyDropoutDescriptor(dropoutDesc) - check_status(status) - - -cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? 
-1: - cdef size_t sizeInBytes - status = cudnnDropoutGetStatesSize( - handle, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef setDropoutDescriptor( - size_t dropoutDesc, intptr_t handle, float dropout, - size_t states, size_t stateSizeInBytes, unsigned long long seed): - status = cudnnSetDropoutDescriptor( - dropoutDesc, handle, dropout, - states, stateSizeInBytes, seed) - check_status(status) - - -cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 0: - cdef size_t sizeInBytes - status = cudnnDropoutGetReserveSpaceSize( - xDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef dropoutForward( - intptr_t handle, size_t dropoutDesc, - size_t srcDesc, size_t srcData, - size_t dstDesc, size_t dstData, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnDropoutForward( - handle, dropoutDesc, - srcDesc, srcData, - dstDesc, dstData, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef dropoutBackward( - intptr_t handle, size_t dropoutDesc, - size_t dyDesc, size_t dyData, - size_t dxDesc, size_t dxData, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnDropoutBackward( - handle, dropoutDesc, - dyDesc, dyData, - dxDesc, dxData, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -############################################################################### -# CTC -############################################################################### -cpdef size_t createCTCLossDescriptor() except? 
0: - cdef CTCLossDescriptor desc - status = cudnnCreateCTCLossDescriptor(&desc) - check_status(status) - return desc - -cpdef destroyCTCLossDescriptor(size_t ctcLossDesc): - status = cudnnDestroyCTCLossDescriptor(ctcLossDesc) - check_status(status) - -cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType): - status = cudnnSetCTCLossDescriptor( - ctcLossDesc, dataType) - check_status(status) - -cpdef getCTCLossDescriptor(size_t ctcLossDesc): - cdef DataType compType - status = cudnnGetCTCLossDescriptor( - ctcLossDesc, &compType) - check_status(status) - return compType - -cpdef size_t getCTCLossWorkspaceSize( - intptr_t handle, size_t probsDesc, size_t gradientsDesc, - size_t labels, size_t labelLengths, size_t inputLengths, - int algo, size_t ctcLossDesc) except? 0: - cdef size_t sizeInBytes - status = cudnnGetCTCLossWorkspaceSize( - handle, probsDesc, - gradientsDesc, - labels, labelLengths, inputLengths, - algo, ctcLossDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - -cpdef CTCLoss( - intptr_t handle, size_t probsDesc, - size_t probs, size_t labels, size_t labelLengths, size_t inputLengths, - size_t costs, size_t gradientsDesc, size_t gradients, - int algo, size_t ctcLossDesc, - size_t workspace, size_t workSpaceSizeInBytes): - status = cudnnCTCLoss( - handle, probsDesc, probs, - labels, labelLengths, inputLengths, - costs, gradientsDesc, gradients, - algo, ctcLossDesc, - workspace, workSpaceSizeInBytes) - check_status(status) - - -############################################################################### -# RNN -############################################################################### - -cpdef size_t createRNNDescriptor() except? 
0: - cdef RNNDescriptor desc - status = cudnnCreateRNNDescriptor(&desc) - check_status(status) - return desc - - -cpdef destroyRNNDescriptor(size_t rnnDesc): - status = cudnnDestroyRNNDescriptor(rnnDesc) - check_status(status) - - -cpdef size_t createPersistentRNNPlan(size_t rnnDesc, int minibatch, - int dataType) except? 0: - cdef PersistentRNNPlan plan - status = cudnnCreatePersistentRNNPlan( - rnnDesc, - minibatch, dataType, &plan) - check_status(status) - return plan - - -cpdef setPersistentRNNPlan(size_t rnnDesc, size_t plan): - status = cudnnSetPersistentRNNPlan( - rnnDesc, plan) - check_status(status) - - -cpdef destroyPersistentRNNPlan(size_t plan): - status = cudnnDestroyPersistentRNNPlan(plan) - check_status(status) - - -cpdef setRNNDescriptor_v5( - size_t rnnDesc, int hiddenSize, int numLayers, - size_t dropoutDesc, int inputMode, int direction, int mode, - int dataType): - status = cudnnSetRNNDescriptor_v5( - rnnDesc, hiddenSize, numLayers, - dropoutDesc, inputMode, - direction, mode, dataType) - check_status(status) - - -cpdef setRNNDescriptor_v6( - intptr_t handle, size_t rnnDesc, int hiddenSize, int numLayers, - size_t dropoutDesc, int inputMode, int direction, int mode, - int algo, int dataType): - status = cudnnSetRNNDescriptor_v6( - handle, rnnDesc, hiddenSize, numLayers, - dropoutDesc, inputMode, - direction, mode, algo, - dataType) - check_status(status) - - -cpdef setRNNPaddingMode( - size_t rnnDesc, int paddingMode): - status = cudnnSetRNNPaddingMode( - rnnDesc, paddingMode) - check_status(status) - - -cpdef getRNNPaddingMode(size_t rnnDesc): - cdef RNNPaddingMode paddingMode - status = cudnnGetRNNPaddingMode( - rnnDesc, &paddingMode) - check_status(status) - return paddingMode - - -cpdef size_t createRNNDataDescriptor() except? 
0: - cdef RNNDataDescriptor desc - status = cudnnCreateRNNDataDescriptor(&desc) - check_status(status) - return desc - - -cpdef destroyRNNDataDescriptor(size_t RNNDataDesc): - status = cudnnDestroyRNNDataDescriptor(RNNDataDesc) - check_status(status) - - -cpdef setRNNDataDescriptor( - size_t RNNDataDesc, int dataType, size_t layout, - int maxSeqLength, int batchSize, int vectorSize, - size_t seqLengthArray, size_t paddingFill): - status = cudnnSetRNNDataDescriptor( - RNNDataDesc, dataType, - layout, maxSeqLength, batchSize, vectorSize, - seqLengthArray, paddingFill) - check_status(status) - - -cpdef getRNNDataDescriptor( - size_t RNNDataDesc, size_t dataType, - size_t layout, size_t maxSeqLength, size_t batchSize, - size_t vectorSize, int arrayLengthRequested, size_t seqLengthArray, - size_t paddingFill): - status = cudnnGetRNNDataDescriptor( - RNNDataDesc, dataType, - layout, maxSeqLength, batchSize, - vectorSize, arrayLengthRequested, seqLengthArray, - paddingFill) - check_status(status) - - -cpdef getRNNWorkspaceSize( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): - cdef size_t sizeInBytes - status = cudnnGetRNNWorkspaceSize( - handle, rnnDesc, seqLength, - xDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef getRNNTrainingReserveSize( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): - cdef size_t sizeInBytes - status = cudnnGetRNNTrainingReserveSize( - handle, rnnDesc, seqLength, - xDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef getRNNParamsSize( - intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType): - cdef size_t sizeInBytes - status = cudnnGetRNNParamsSize( - handle, rnnDesc, xDesc, - &sizeInBytes, dataType) - check_status(status) - return sizeInBytes - - -cpdef getRNNLinLayerMatrixParams( - intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, - size_t w, int linLayerID, size_t linLayerMatDesc, size_t linLayerMat): - status = 
cudnnGetRNNLinLayerMatrixParams( - handle, rnnDesc, layer, - xDesc, wDesc, w, - linLayerID, linLayerMatDesc, linLayerMat) - check_status(status) - - -cpdef getRNNLinLayerBiasParams( - intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, - size_t w, int linLayerID, size_t linLayerBiasDesc, - size_t linLayerBias): - status = cudnnGetRNNLinLayerBiasParams( - handle, rnnDesc, layer, - xDesc, wDesc, w, - linLayerID, linLayerBiasDesc, linLayerBias) - check_status(status) - - -cpdef RNNForwardInference( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, - size_t x, size_t hxDesc, size_t hx, size_t cxDesc, - size_t cx, size_t wDesc, size_t w, size_t yDesc, - size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t workspace, size_t workSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNForwardInference( + ############################################################################### + + cpdef size_t createRNNDescriptor() except? 0: + cdef RNNDescriptor desc + status = cudnnCreateRNNDescriptor(&desc) + check_status(status) + return desc + + + cpdef destroyRNNDescriptor(size_t rnnDesc): + status = cudnnDestroyRNNDescriptor(rnnDesc) + check_status(status) + + + cpdef size_t createPersistentRNNPlan(size_t rnnDesc, int minibatch, + int dataType) except? 
0: + cdef PersistentRNNPlan plan + status = cudnnCreatePersistentRNNPlan( + rnnDesc, + minibatch, dataType, &plan) + check_status(status) + return plan + + + cpdef setPersistentRNNPlan(size_t rnnDesc, size_t plan): + status = cudnnSetPersistentRNNPlan( + rnnDesc, plan) + check_status(status) + + + cpdef destroyPersistentRNNPlan(size_t plan): + status = cudnnDestroyPersistentRNNPlan(plan) + check_status(status) + + + cpdef setRNNDescriptor_v5( + size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int dataType): + status = cudnnSetRNNDescriptor_v5( + rnnDesc, hiddenSize, numLayers, + dropoutDesc, inputMode, + direction, mode, dataType) + check_status(status) + + + cpdef setRNNDescriptor_v6( + intptr_t handle, size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int algo, int dataType): + status = cudnnSetRNNDescriptor_v6( + handle, rnnDesc, hiddenSize, numLayers, + dropoutDesc, inputMode, + direction, mode, algo, + dataType) + check_status(status) + + + cpdef setRNNPaddingMode( + size_t rnnDesc, int paddingMode): + status = cudnnSetRNNPaddingMode( + rnnDesc, paddingMode) + check_status(status) + + + cpdef getRNNPaddingMode(size_t rnnDesc): + cdef RNNPaddingMode paddingMode + status = cudnnGetRNNPaddingMode( + rnnDesc, &paddingMode) + check_status(status) + return paddingMode + + + cpdef size_t createRNNDataDescriptor() except? 
0: + cdef RNNDataDescriptor desc + status = cudnnCreateRNNDataDescriptor(&desc) + check_status(status) + return desc + + + cpdef destroyRNNDataDescriptor(size_t RNNDataDesc): + status = cudnnDestroyRNNDataDescriptor(RNNDataDesc) + check_status(status) + + + cpdef setRNNDataDescriptor( + size_t RNNDataDesc, int dataType, size_t layout, + int maxSeqLength, int batchSize, int vectorSize, + size_t seqLengthArray, size_t paddingFill): + status = cudnnSetRNNDataDescriptor( + RNNDataDesc, dataType, + layout, maxSeqLength, batchSize, vectorSize, + seqLengthArray, paddingFill) + check_status(status) + + + cpdef getRNNDataDescriptor( + size_t RNNDataDesc, size_t dataType, + size_t layout, size_t maxSeqLength, size_t batchSize, + size_t vectorSize, int arrayLengthRequested, size_t seqLengthArray, + size_t paddingFill): + status = cudnnGetRNNDataDescriptor( + RNNDataDesc, dataType, + layout, maxSeqLength, batchSize, + vectorSize, arrayLengthRequested, seqLengthArray, + paddingFill) + check_status(status) + + + cpdef getRNNWorkspaceSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): + cdef size_t sizeInBytes + status = cudnnGetRNNWorkspaceSize( handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - workspace, workSpaceSizeInBytes) - check_status(status) - - -cpdef RNNForwardTraining( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, - size_t wDesc, size_t w, size_t yDesc, size_t y, - size_t hyDesc, size_t hy, size_t cyDesc, size_t cy, - size_t workspace, size_t workSpaceSizeInBytes, size_t reserveSpace, - size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNForwardTraining( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - workspace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - 
check_status(status) - - -cpdef RNNBackwardData( - intptr_t handle, size_t rnnDesc, int seqLength, size_t yDesc, size_t y, - size_t dyDesc, size_t dy, size_t dhyDesc, size_t dhy, - size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, - size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, - size_t dxDesc, size_t dx, size_t dhxDesc, size_t dhx, - size_t dcxDesc, size_t dcx, size_t workspace, - size_t workSpaceSizeInBytes, size_t reserveSpace, - size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardData( + xDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef getRNNTrainingReserveSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): + cdef size_t sizeInBytes + status = cudnnGetRNNTrainingReserveSize( handle, rnnDesc, seqLength, - yDesc, y, - dyDesc, dy, - dhyDesc, dhy, - dcyDesc, dcy, - wDesc, w, - hxDesc, hx, - cxDesc, cx, - dxDesc, dx, - dhxDesc, dhx, - dcxDesc, dcx, - workspace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNBackwardWeights( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t yDesc, size_t y, - size_t workspace, size_t workSpaceSizeInBytes, size_t dwDesc, - size_t dw, size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardWeights( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - yDesc, y, - workspace, workSpaceSizeInBytes, - dwDesc, dw, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNForwardInferenceEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, - size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, - size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, - size_t workSpace, size_t 
workSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNForwardInferenceEx( - handle, rnnDesc, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - kDesc, keys, - cDesc, cAttn, - iDesc, iAttn, - qDesc, queries, - workSpace, workSpaceSizeInBytes) - check_status(status) - - -cpdef RNNForwardTrainingEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, - size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, - size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNForwardTrainingEx( - handle, rnnDesc, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - kDesc, keys, - cDesc, cAttn, - iDesc, iAttn, - qDesc, queries, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNBackwardDataEx( - intptr_t handle, size_t rnnDesc, size_t yDesc, size_t y, size_t dyDesc, - size_t dy, size_t dcDesc, size_t dcAttn, size_t dhyDesc, size_t dhy, - size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t dxDesc, size_t dx, - size_t dhxDesc, size_t dhx, size_t dcxDesc, size_t dcx, - size_t dkDesc, size_t dkeys, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardDataEx( - handle, rnnDesc, - yDesc, y, - dyDesc, dy, - dcDesc, dcAttn, - dhyDesc, dhy, - dcyDesc, dcy, - wDesc, w, - hxDesc, hx, - cxDesc, cx, - dxDesc, dx, - dhxDesc, dhx, - dcxDesc, dcx, - dkDesc, dkeys, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - 
check_status(status) - - -cpdef RNNBackwardWeightsEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t yDesc, size_t y, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t dwDesc, size_t dw, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardWeightsEx( - handle, rnnDesc, - xDesc, x, - hxDesc, hx, - yDesc, y, - workSpace, workSpaceSizeInBytes, - dwDesc, dw, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -############################################################################### -# Spatial Transformer -############################################################################### - -cpdef size_t createSpatialTransformerDescriptor() except? 0: - cdef SpatialTransformerDescriptor stDesc - status = cudnnCreateSpatialTransformerDescriptor(&stDesc) - check_status(status) - return stDesc - - -cpdef destroySpatialTransformerDescriptor(size_t stDesc): - status = cudnnDestroySpatialTransformerDescriptor( - stDesc) - check_status(status) - - -cpdef setSpatialTransformerDescriptor( - size_t stDesc, size_t samplerType, int dataType, - int nbDims, size_t dimA): - status = cudnnSetSpatialTransformerNdDescriptor( - stDesc, samplerType, - dataType, nbDims, dimA) - check_status(status) - - -cpdef spatialTfGridGeneratorForward( - intptr_t handle, size_t stDesc, size_t theta, size_t grid): - _setStream(handle) - with nogil: - status = cudnnSpatialTfGridGeneratorForward( - handle, stDesc, - theta, grid) - check_status(status) - - -cpdef spatialTfGridGeneratorBackward( - intptr_t handle, size_t stDesc, size_t dgrid, size_t dtheta): - _setStream(handle) - with nogil: - status = cudnnSpatialTfGridGeneratorBackward( - handle, stDesc, - dgrid, dtheta) - check_status(status) - - -cpdef spatialTfSamplerForward( - intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, - size_t x, size_t grid, size_t beta, size_t yDesc, size_t y): - 
_setStream(handle) - with nogil: - status = cudnnSpatialTfSamplerForward( - handle, stDesc, - alpha, xDesc, x, grid, - beta, yDesc, y) - check_status(status) - - -cpdef spatialTfSamplerBackward( - intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, - size_t x, size_t beta, size_t dxDesc, size_t dx, size_t alphaDgrid, - size_t dyDesc, size_t dy, size_t grid, size_t betaDgrid, size_t dgrid): - _setStream(handle) - with nogil: - status = cudnnSpatialTfSamplerBackward( - handle, stDesc, - alpha, xDesc, x, beta, - dxDesc, dx, alphaDgrid, - dyDesc, dy, grid, - betaDgrid, dgrid) - check_status(status) - -############################################################################### -# Fused Ops -############################################################################### - -cpdef createFusedOpsConstParamPack(int ops): - cdef FusedOpsConstParamPack constPack - with nogil: - status = cudnnCreateFusedOpsConstParamPack(&constPack, ops) - check_status(status) - return constPack - -cpdef destroyFusedOpsConstParamPack(size_t constPack): - with nogil: - status = cudnnDestroyFusedOpsConstParamPack( - constPack) - check_status(status) - -cpdef setFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, - size_t param): - with nogil: - status = cudnnSetFusedOpsConstParamPackAttribute( - constPack, - paramLabel, param) - check_status(status) - -cpdef getFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, - size_t param): - cdef int isNULL = 0 - with nogil: - status = cudnnGetFusedOpsConstParamPackAttribute( - constPack, - paramLabel, param, &isNULL) - check_status(status) - return isNULL - -cpdef createFusedOpsVariantParamPack(int ops): - cdef FusedOpsVariantParamPack varPack - with nogil: - status = cudnnCreateFusedOpsVariantParamPack(&varPack, ops) - check_status(status) - return varPack - -cpdef destroyFusedOpsVariantParamPack(size_t varPack): - with nogil: - status = cudnnDestroyFusedOpsVariantParamPack( - varPack) - check_status(status) - 
-cpdef setFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, - size_t ptr): - with nogil: - status = cudnnSetFusedOpsVariantParamPackAttribute( - varPack, - paramLabel, ptr) - check_status(status) - -cpdef getFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, - size_t ptr): - with nogil: - status = cudnnGetFusedOpsVariantParamPackAttribute( - varPack, - paramLabel, ptr) - check_status(status) - -cpdef createFusedOpsPlan(int ops): - cdef FusedOpsPlan plan - with nogil: - status = cudnnCreateFusedOpsPlan(&plan, ops) - check_status(status) - return plan - -cpdef destroyFusedOpsPlan(size_t plan): - with nogil: - status = cudnnDestroyFusedOpsPlan(plan) - check_status(status) - -cpdef makeFusedOpsPlan(intptr_t handle, size_t plan, size_t constPack): - cdef size_t workspaceSizeInBytes - _setStream(handle) - with nogil: - status = cudnnMakeFusedOpsPlan(handle, plan, - constPack, - &workspaceSizeInBytes) - check_status(status) - return workspaceSizeInBytes - -cpdef fusedOpsExecute(intptr_t handle, size_t plan, size_t varPack): - _setStream(handle) - with nogil: - status = cudnnFusedOpsExecute(handle, plan, - varPack) - check_status(status) + xDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef getRNNParamsSize( + intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType): + cdef size_t sizeInBytes + status = cudnnGetRNNParamsSize( + handle, rnnDesc, xDesc, + &sizeInBytes, dataType) + check_status(status) + return sizeInBytes + + + cpdef getRNNLinLayerMatrixParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t linLayerMatDesc, size_t linLayerMat): + status = cudnnGetRNNLinLayerMatrixParams( + handle, rnnDesc, layer, + xDesc, wDesc, w, + linLayerID, linLayerMatDesc, linLayerMat) + check_status(status) + + + cpdef getRNNLinLayerBiasParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t 
linLayerBiasDesc, + size_t linLayerBias): + status = cudnnGetRNNLinLayerBiasParams( + handle, rnnDesc, layer, + xDesc, wDesc, w, + linLayerID, linLayerBiasDesc, linLayerBias) + check_status(status) + + + cpdef RNNForwardInference( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, + size_t x, size_t hxDesc, size_t hx, size_t cxDesc, + size_t cx, size_t wDesc, size_t w, size_t yDesc, + size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t workspace, size_t workSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNForwardInference( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + workspace, workSpaceSizeInBytes) + check_status(status) + + + cpdef RNNForwardTraining( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t wDesc, size_t w, size_t yDesc, size_t y, + size_t hyDesc, size_t hy, size_t cyDesc, size_t cy, + size_t workspace, size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNForwardTraining( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + workspace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef RNNBackwardData( + intptr_t handle, size_t rnnDesc, int seqLength, size_t yDesc, size_t y, + size_t dyDesc, size_t dy, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t dxDesc, size_t dx, size_t dhxDesc, size_t dhx, + size_t dcxDesc, size_t dcx, size_t workspace, + size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardData( + handle, rnnDesc, seqLength, + yDesc, y, + 
dyDesc, dy, + dhyDesc, dhy, + dcyDesc, dcy, + wDesc, w, + hxDesc, hx, + cxDesc, cx, + dxDesc, dx, + dhxDesc, dhx, + dcxDesc, dcx, + workspace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef RNNBackwardWeights( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workspace, size_t workSpaceSizeInBytes, size_t dwDesc, + size_t dw, size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardWeights( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + yDesc, y, + workspace, workSpaceSizeInBytes, + dwDesc, dw, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef RNNForwardInferenceEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNForwardInferenceEx( + handle, rnnDesc, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + kDesc, keys, + cDesc, cAttn, + iDesc, iAttn, + qDesc, queries, + workSpace, workSpaceSizeInBytes) + check_status(status) + + + cpdef RNNForwardTrainingEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with 
nogil: + status = cudnnRNNForwardTrainingEx( + handle, rnnDesc, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + kDesc, keys, + cDesc, cAttn, + iDesc, iAttn, + qDesc, queries, + workSpace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef RNNBackwardDataEx( + intptr_t handle, size_t rnnDesc, size_t yDesc, size_t y, size_t dyDesc, + size_t dy, size_t dcDesc, size_t dcAttn, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t dxDesc, size_t dx, + size_t dhxDesc, size_t dhx, size_t dcxDesc, size_t dcx, + size_t dkDesc, size_t dkeys, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardDataEx( + handle, rnnDesc, + yDesc, y, + dyDesc, dy, + dcDesc, dcAttn, + dhyDesc, dhy, + dcyDesc, dcy, + wDesc, w, + hxDesc, hx, + cxDesc, cx, + dxDesc, dx, + dhxDesc, dhx, + dcxDesc, dcx, + dkDesc, dkeys, + workSpace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef RNNBackwardWeightsEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t dwDesc, size_t dw, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardWeightsEx( + handle, rnnDesc, + xDesc, x, + hxDesc, hx, + yDesc, y, + workSpace, workSpaceSizeInBytes, + dwDesc, dw, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + ############################################################################### + # Spatial Transformer + ############################################################################### + + cpdef size_t createSpatialTransformerDescriptor() except? 
0: + cdef SpatialTransformerDescriptor stDesc + status = cudnnCreateSpatialTransformerDescriptor(&stDesc) + check_status(status) + return stDesc + + + cpdef destroySpatialTransformerDescriptor(size_t stDesc): + status = cudnnDestroySpatialTransformerDescriptor( + stDesc) + check_status(status) + + + cpdef setSpatialTransformerDescriptor( + size_t stDesc, size_t samplerType, int dataType, + int nbDims, size_t dimA): + status = cudnnSetSpatialTransformerNdDescriptor( + stDesc, samplerType, + dataType, nbDims, dimA) + check_status(status) + + + cpdef spatialTfGridGeneratorForward( + intptr_t handle, size_t stDesc, size_t theta, size_t grid): + _setStream(handle) + with nogil: + status = cudnnSpatialTfGridGeneratorForward( + handle, stDesc, + theta, grid) + check_status(status) + + + cpdef spatialTfGridGeneratorBackward( + intptr_t handle, size_t stDesc, size_t dgrid, size_t dtheta): + _setStream(handle) + with nogil: + status = cudnnSpatialTfGridGeneratorBackward( + handle, stDesc, + dgrid, dtheta) + check_status(status) + + + cpdef spatialTfSamplerForward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t grid, size_t beta, size_t yDesc, size_t y): + _setStream(handle) + with nogil: + status = cudnnSpatialTfSamplerForward( + handle, stDesc, + alpha, xDesc, x, grid, + beta, yDesc, y) + check_status(status) + + + cpdef spatialTfSamplerBackward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t beta, size_t dxDesc, size_t dx, size_t alphaDgrid, + size_t dyDesc, size_t dy, size_t grid, size_t betaDgrid, size_t dgrid): + _setStream(handle) + with nogil: + status = cudnnSpatialTfSamplerBackward( + handle, stDesc, + alpha, xDesc, x, beta, + dxDesc, dx, alphaDgrid, + dyDesc, dy, grid, + betaDgrid, dgrid) + check_status(status) + + ############################################################################### + # Fused Ops + ############################################################################### + + cpdef 
createFusedOpsConstParamPack(int ops): + cdef FusedOpsConstParamPack constPack + with nogil: + status = cudnnCreateFusedOpsConstParamPack(&constPack, ops) + check_status(status) + return constPack + + cpdef destroyFusedOpsConstParamPack(size_t constPack): + with nogil: + status = cudnnDestroyFusedOpsConstParamPack( + constPack) + check_status(status) + + cpdef setFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param): + with nogil: + status = cudnnSetFusedOpsConstParamPackAttribute( + constPack, + paramLabel, param) + check_status(status) + + cpdef getFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param): + cdef int isNULL = 0 + with nogil: + status = cudnnGetFusedOpsConstParamPackAttribute( + constPack, + paramLabel, param, &isNULL) + check_status(status) + return isNULL + + cpdef createFusedOpsVariantParamPack(int ops): + cdef FusedOpsVariantParamPack varPack + with nogil: + status = cudnnCreateFusedOpsVariantParamPack(&varPack, ops) + check_status(status) + return varPack + + cpdef destroyFusedOpsVariantParamPack(size_t varPack): + with nogil: + status = cudnnDestroyFusedOpsVariantParamPack( + varPack) + check_status(status) + + cpdef setFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr): + with nogil: + status = cudnnSetFusedOpsVariantParamPackAttribute( + varPack, + paramLabel, ptr) + check_status(status) + + cpdef getFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr): + with nogil: + status = cudnnGetFusedOpsVariantParamPackAttribute( + varPack, + paramLabel, ptr) + check_status(status) + + cpdef createFusedOpsPlan(int ops): + cdef FusedOpsPlan plan + with nogil: + status = cudnnCreateFusedOpsPlan(&plan, ops) + check_status(status) + return plan + + cpdef destroyFusedOpsPlan(size_t plan): + with nogil: + status = cudnnDestroyFusedOpsPlan(plan) + check_status(status) + + cpdef makeFusedOpsPlan(intptr_t handle, size_t plan, size_t constPack): + cdef 
size_t workspaceSizeInBytes + _setStream(handle) + with nogil: + status = cudnnMakeFusedOpsPlan(handle, plan, + constPack, + &workspaceSizeInBytes) + check_status(status) + return workspaceSizeInBytes + + cpdef fusedOpsExecute(intptr_t handle, size_t plan, size_t varPack): + _setStream(handle) + with nogil: + status = cudnnFusedOpsExecute(handle, plan, + varPack) + check_status(status) diff --git a/cupy_backends/cuda/libs/miopen.pxd b/cupy_backends/cuda/libs/miopen.pxd new file mode 100644 index 00000000000..5d1baf02526 --- /dev/null +++ b/cupy_backends/cuda/libs/miopen.pxd @@ -0,0 +1,624 @@ +from libc.stdint cimport intptr_t + + +############################################################################### +# Enum +############################################################################### +IF CUPY_HIP_VERSION != 0: + cpdef enum: + miopenFloat = 1 + miopenDouble = 6 + miopenHalf = 0 + + miopenConvolutionFwdAlgoGEMM = 0 + miopenConvolutionFwdAlgoDirect = 1 + miopenConvolutionFwdAlgoFFT = 2 + miopenConvolutionFwdAlgoWinograd = 3 + miopenConvolutionFwdAlgoImplicitGEMM = 5 + + miopenPoolingMax = 0 + miopenPoolingAverage = 1 + miopenPoolingAverageInclusive = 2 + + miopenActivationPASTHRU = 0 + miopenActivationTANH = 2 + miopenActivationRELU = 3 + miopenActivationCLIPPEDRELU = 7 + miopenActivationELU = 9 + + miopenRNNDataSeqMajorNotPadded = 1 + miopenRNNDataSeqMajorPadded = 2 + miopenRNNDataBatchMajorPadded = 3 + + MIOPEN_NOT_PROPAGATE_NAN = 0 + MIOPEN_PROPAGATE_NAN = 1 + + miopenTensorNCHW = 0 + miopenTensorNHWC = 1 + + miopenTensorOpAdd = 0 + miopenTensorOpMul = 1 + miopenTensorOpMin = 2 + miopenTensorOpMax = 3 + + MIOPEN_REDUCE_TENSOR_ADD = 0 + MIOPEN_REDUCE_TENSOR_MUL = 1 + MIOPEN_REDUCE_TENSOR_MIN = 2 + MIOPEN_REDUCE_TENSOR_MAX = 3 + MIOPEN_REDUCE_TENSOR_AMAX = 4 + MIOPEN_REDUCE_TENSOR_AVG = 5 + MIOPEN_REDUCE_TENSOR_NORM1 = 6 + MIOPEN_REDUCE_TENSOR_NORM2 = 7 + + MIOPEN_REDUCE_TENSOR_NO_INDICES = 0 + MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES = 1 + + 
MIOPEN_32BIT_INDICES = 0 + MIOPEN_64BIT_INDICES = 1 + MIOPEN_16BIT_INDICES = 2 + MIOPEN_8BIT_INDICES = 3 + + miopenConvolution = 0 + miopenTranspose = 1 + + MIOPEN_SOFTMAX_FAST = 0 + MIOPEN_SOFTMAX_ACCURATE = 1 + MIOPEN_SOFTMAX_LOG = 2 + + MIOPEN_SOFTMAX_MODE_INSTANCE = 0 + MIOPEN_SOFTMAX_MODE_CHANNEL = 1 + + miopenBNPerActivation = 0 + miopenBNSpatial = 1 + + MIOPEN_CTC_LOSS_ALGO_DETERMINISTIC = 0 + + miopenRNNRELU = 0 + miopenRNNTANH = 1 + miopenLSTM = 2 + miopenGRU = 3 + + miopenRNNunidirection = 0 + miopenRNNbidirection = 1 + + miopenRNNIONotPadded = 0 + miopenRNNIOWithPadding = 1 + + miopenRNNlinear = 0 + miopenRNNskip = 1 + + miopenStatusSuccess = 0 + + MIOPEN_RNG_PSEUDO_XORWOW = 0 + +IF CUPY_HIP_VERSION == 0: + ############################################################################### + # Class + ############################################################################### + + cdef class CuDNNAlgoPerf: + cdef: + int algo + int status + float time + size_t memory + int determinism + int mathType + ############################################################################### + # Version + ############################################################################### + + cpdef size_t getVersion() except? 0 + + ############################################################################### + # Runtime error checking + ############################################################################### + cpdef queryRuntimeError(intptr_t handle, int mode) + + ############################################################################### + # Initialization and CUDA cooperation + ############################################################################### + + cpdef intptr_t create() except? 0 + cpdef destroy(intptr_t handle) + cpdef setStream(intptr_t handle, size_t stream) + cpdef size_t getStream(intptr_t handle) except? 
0 + + + ############################################################################### + # Tensor manipulation + ############################################################################### + + cpdef size_t createTensorDescriptor() except? 0 + cpdef setTensor4dDescriptor(size_t tensorDesc, int format, int dataType, + int n, int c, int h, int w) + cpdef setTensor4dDescriptorEx(size_t tensorDesc, int dataType, + int n, int c, int h, int w, int nStride, + int cStride, int hStride, int wStride) + cpdef tuple getTensor4dDescriptor(size_t tensorDesc) + cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, + size_t dimA, size_t strideA) + cpdef destroyTensorDescriptor(size_t tensorDesc) + cpdef addTensor_v3(intptr_t handle, size_t alpha, size_t bDesc, + size_t b, size_t beta, size_t yDesc, size_t y) + + + ############################################################################### + # Tensor operations + ############################################################################### + + cpdef size_t createOpTensorDescriptor() except? 0 + cpdef setOpTensorDescriptor(size_t opTensorDesc, int opTensorOp, + int opTensorCompType, int opTensorNanOpt) + cpdef getOpTensorDescriptor(size_t opTensorDesc) + cpdef destroyOpTensorDescriptor(size_t opTensorDesc) + cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, + size_t aDesc, size_t A, size_t alpha2, size_t bDesc, + size_t B, size_t beta, size_t cDesc, size_t C) + + + ############################################################################### + # Tensor reductions + ############################################################################### + + cpdef size_t createReduceTensorDescriptor() except? 
0 + cpdef setReduceTensorDescriptor( + size_t reduceTensorDesc, int reduceTensorOp, + int reduceTensorCompType, int reduceTensorNanOpt, + int reduceTensorIndices, int reduceTensorIndicesType) + cpdef getReduceTensorDescriptor(size_t reduceTensorDesc) + cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc) + cpdef size_t getReductionIndicesSize( + intptr_t handle, size_t reduceTensorDesc, size_t aDesc, + size_t cDesc) except? 0 + cpdef size_t getReductionWorkspaceSize( + intptr_t handle, size_t reduceTensorDesc, size_t aDesc, + size_t cDesc) except? 0 + cpdef reduceTensor( + intptr_t handle, size_t reduceTensorDesc, size_t indices, + size_t indicesSizeInBytes, size_t workspace, + size_t workspaceSizeInBytes, size_t alpha, size_t aDesc, + size_t A, size_t beta, size_t cDesc, size_t C) + cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr) + cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha) + + + ############################################################################### + # Filter manipulation + ############################################################################### + + cpdef size_t createFilterDescriptor() except? 0 + cpdef setFilter4dDescriptor_v4( + size_t filterDesc, int dataType, int format, int k, int c, int h, int w) + cpdef setFilterNdDescriptor_v4( + size_t filterDesc, int dataType, int format, int nbDims, size_t filterDimA) + cpdef getFilterNdDescriptor(size_t wDesc, int nbDimsRequested) + cpdef destroyFilterDescriptor(size_t filterDesc) + + + ############################################################################### + # Convolution + ############################################################################### + + cpdef size_t createConvolutionDescriptor() except? 0 + cpdef setConvolutionMathType( + size_t convDesc, size_t mathType) + cpdef size_t getConvolutionMathType(size_t convDesc) except? 
0 + cpdef setConvolutionGroupCount( + size_t convDesc, int groupCount) + cpdef int getConvolutionGroupCount(size_t convDesc) except? -1 + cpdef setConvolution2dDescriptor_v4( + size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, + int dilation_w, int mode) + cpdef setConvolution2dDescriptor_v5( + size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, + int dilation_w, int mode, size_t computeType) + cpdef setConvolutionNdDescriptor_v3( + size_t convDesc, int arrayLength, size_t padA, size_t filterStrideA, + size_t dilationA, int mode, int dataType) + cpdef destroyConvolutionDescriptor(size_t convDesc) + cpdef findConvolutionForwardAlgorithm( + intptr_t handle, size_t xDesc, size_t wDesc, size_t convDesc, size_t yDesc, + int requestedAlgoCount) + cpdef list findConvolutionForwardAlgorithmEx( + intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, + size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef list findConvolutionForwardAlgorithmEx_v7( + intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, + size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef int getConvolutionForwardAlgorithm_v6( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int preference, size_t memoryLimitInbytes) except? -1 + cpdef list getConvolutionForwardAlgorithm_v7( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int requestedAlgoCount) + cpdef Py_ssize_t getConvolutionForwardWorkspaceSize( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int algo) except? 
-1 + cpdef convolutionForward( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t filterDesc, size_t filterData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t destDesc, size_t destData) + cpdef convolutionBackwardBias( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t beta, size_t destDesc, size_t destData) + cpdef findConvolutionBackwardFilterAlgorithm( + intptr_t handle, size_t xDesc, size_t dyDesc, size_t convDesc, + size_t dwDesc, int requestedAlgoCount) + cpdef list findConvolutionBackwardFilterAlgorithmEx( + intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, + size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef list findConvolutionBackwardFilterAlgorithmEx_v7( + intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, + size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef int getConvolutionBackwardFilterAlgorithm_v6( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t filterDesc, int preference, size_t memoryLimitInbytes) except? -1 + cpdef list getConvolutionBackwardFilterAlgorithm_v7( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int requestedAlgoCount) + cpdef Py_ssize_t getConvolutionBackwardFilterWorkspaceSize( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t filterDesc, int algo) except? 
-1 + cpdef convolutionBackwardFilter_v3( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t diffDesc, size_t diffData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t gradDesc, size_t gradData) + cpdef findConvolutionBackwardDataAlgorithm( + intptr_t handle, size_t wDesc, size_t dyDesc, size_t convDesc, + size_t dxDesc, int requestedAlgoCount) + cpdef list findConvolutionBackwardDataAlgorithmEx( + intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, + size_t convDesc, size_t dxDesc, size_t dx, + int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes) + cpdef list findConvolutionBackwardDataAlgorithmEx_v7( + intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, + size_t convDesc, size_t dxDesc, size_t dx, + int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes) + cpdef int getConvolutionBackwardDataAlgorithm_v6( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, size_t preference, + size_t memoryLimitInbytes) except? -1 + cpdef list getConvolutionBackwardDataAlgorithm_v7( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int requestedAlgoCount) + cpdef Py_ssize_t getConvolutionBackwardDataWorkspaceSize( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int algo) except? -1 + cpdef convolutionBackwardData_v3( + intptr_t handle, size_t alpha, size_t filterDesc, size_t filterData, + size_t diffDesc, size_t diffData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t gradDesc, size_t gradData) + + + ############################################################################### + # Pooling + ############################################################################### + + cpdef size_t createPoolingDescriptor() except? 
0 + cpdef setPooling2dDescriptor_v4( + size_t poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, + int windowWidth, int verticalPadding, int horizontalPadding, + int verticalStride, int horizontalStride) + cpdef setPoolingNdDescriptor_v4( + size_t poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, + size_t windowDimA, size_t paddingA, size_t strideA) + cpdef destroyPoolingDescriptor(size_t poolingDesc) + cpdef poolingForward( + intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData) + cpdef poolingBackward( + intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, + size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, + size_t destDiffData) + + ############################################################################### + # Batch Normalization + ############################################################################### + + cpdef deriveBNTensorDescriptor( + size_t derivedBnDesc, size_t xDesc, int mode) + + cpdef batchNormalizationForwardTraining( + intptr_t handle, int mode, + size_t alpha, size_t beta, size_t xDesc, + size_t x, size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, size_t bnScale, + size_t bnBias, double exponentialAverageFactor, + size_t resultRunningMean, size_t resultRunningVariance, + double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance) + + cpdef batchNormalizationForwardInference( + intptr_t handle, int mode, + size_t alpha, size_t beta, size_t xDesc, + size_t x, size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, size_t bnScale, + size_t bnBias, size_t estimatedMean, size_t estimatedVariance, + double epsilon) + + cpdef batchNormalizationBackward( + intptr_t handle, int mode, + size_t alphaDataDiff, size_t betaDataDiff, + size_t alphaParamDiff, size_t betaParamDiff, + size_t xDesc, size_t x, size_t dyDesc, + size_t dy, size_t dxDesc, 
size_t dx, + size_t dBnScaleBiasDesc, size_t bnScale, + size_t dBnScaleResult, size_t dBnBiasResult, + double epsilon, size_t savedMean, size_t savedInvVariance) + + cpdef batchNormalizationForwardTrainingEx( + intptr_t handle, int mode, int bnOps, + size_t alpha, size_t beta, + size_t xDesc, size_t x, + size_t zDesc, size_t z, + size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, + size_t bnScale, size_t bnBias, + double exponentialAverageFactor, + size_t resultRunningMean, size_t resultRunningVariance, + double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance, + size_t activationDesc, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + cpdef size_t getBatchNormalizationForwardTrainingExWorkspaceSize( + intptr_t handle, int mode, int bnOps, + size_t xDesc, + size_t zDesc, + size_t yDesc, + size_t bnScaleBiasMeanVarDesc, + size_t activationDesc) except? 0 + + cpdef batchNormalizationBackwardEx( + intptr_t handle, int mode, int bnops, + size_t alphaDataDiff, size_t betaDataDiff, + size_t alphaParamDiff, size_t betaParamDiff, + size_t xDesc, size_t x, + size_t yDesc, size_t y, + size_t dyDesc, size_t dy, + size_t dzDesc, size_t dz, + size_t dxDesc, size_t dx, + size_t dBnScaleBiasDesc, + size_t bnScaleData, size_t bnBiasData, + size_t dBnScaleData, size_t dBnBiasData, + double epsilon, + size_t savedMean, size_t savedInvVariance, + size_t activationDesc, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + cpdef size_t getBatchNormalizationBackwardExWorkspaceSize( + intptr_t handle, int mode, int bnOps, + size_t xDesc, + size_t yDesc, + size_t dyDesc, + size_t dzDesc, + size_t dxDesc, + size_t dBnScaleBiasDesc, + size_t activationDesc) except? 0 + + cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( + intptr_t handle, int mode, int bnOps, + size_t activationDesc, + size_t xDesc) except? 
0 + + + ############################################################################### + # Activation + ############################################################################### + + cpdef size_t createActivationDescriptor() except? 0 + cpdef setActivationDescriptor( + size_t activationDesc, int mode, int reluNanOpt, double reluCeiling) + cpdef destroyActivationDescriptor(size_t activationDesc) + cpdef softmaxForward( + intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData) + cpdef softmaxBackward( + intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, size_t beta, + size_t destDiffDesc, size_t destDiffData) + cpdef activationForward_v4( + intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData) + cpdef activationBackward_v4( + intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, + size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, + size_t destDiffData) + + + ############################################################################### + # Dropout + ############################################################################### + cpdef size_t createDropoutDescriptor() except? 0 + cpdef destroyDropoutDescriptor(size_t dropoutDesc) + cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? -1 + cpdef setDropoutDescriptor( + size_t dropoutDesc, intptr_t handle, float dropout, + size_t states, size_t stateSizeInBytes, unsigned long long seed) + cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 
0 + cpdef dropoutForward( + intptr_t handle, size_t dropoutDesc, + size_t srcDesc, size_t srcData, + size_t dstDesc, size_t dstData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + cpdef dropoutBackward( + intptr_t handle, size_t dropoutDesc, + size_t dyDesc, size_t dyData, + size_t dxtDesc, size_t dxData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + + ############################################################################### + # CTC + ############################################################################### + + cpdef size_t createCTCLossDescriptor() except? 0 + cpdef destroyCTCLossDescriptor(size_t ctcLossDesc) + cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType) + cpdef getCTCLossDescriptor(size_t ctcLossDesc) + cpdef size_t getCTCLossWorkspaceSize( + intptr_t handle, size_t probsDesc, size_t gradientsDesc, + size_t labels, size_t labelLengths, size_t inputLengths, + int algo, size_t ctcLossDesc) except? 0 + cpdef CTCLoss( + intptr_t handle, size_t probsDesc, + size_t probs, size_t labels, size_t labelLengths, size_t inputLengths, + size_t costs, size_t gradientsDesc, size_t gradients, int algo, + size_t ctcLossDesc, size_t workspace, size_t workSpaceSizeInBytes) + + + ############################################################################### + # RNN + ############################################################################### + + cpdef size_t createRNNDescriptor() except? 0 + cpdef destroyRNNDescriptor(size_t rnnDesc) + cpdef size_t createPersistentRNNPlan( + size_t rnnDesc, int minibatch, int dataType) except? 
0 + cpdef setPersistentRNNPlan(size_t rnnDesc, size_t plan) + cpdef destroyPersistentRNNPlan(size_t plan) + cpdef setRNNDescriptor_v5( + size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int dataType) + cpdef setRNNDescriptor_v6( + intptr_t handle, size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int algo, int dataType) + cpdef setRNNPaddingMode(size_t rnnDesc, int paddingMode) + cpdef getRNNPaddingMode(size_t rnnDesc) + cpdef size_t createRNNDataDescriptor() except? 0 + cpdef destroyRNNDataDescriptor(size_t RNNDataDesc) + cpdef setRNNDataDescriptor( + size_t RNNDataDesc, int dataType, size_t layout, + int maxSeqLength, int batchSize, int vectorSize, + size_t seqLengthArray, size_t paddingFill) + cpdef getRNNDataDescriptor( + size_t RNNDataDesc, size_t dataType, + size_t layout, size_t maxSeqLength, size_t batchSize, + size_t vectorSize, int arrayLengthRequested, size_t seqLengthArray, + size_t paddingFill) + cpdef getRNNWorkspaceSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc) + cpdef getRNNTrainingReserveSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc) + cpdef getRNNParamsSize( + intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType) + cpdef getRNNLinLayerMatrixParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t linLayerMatDesc, size_t linLayerMat) + cpdef getRNNLinLayerBiasParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t linLayerBiasDesc, + size_t linLayerBias) + cpdef RNNForwardInference( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, + size_t x, size_t hxDesc, size_t hx, size_t cxDesc, + size_t cx, size_t wDesc, size_t w, size_t yDesc, + size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t workspace, size_t 
workSpaceSizeInBytes) + cpdef RNNForwardTraining( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t wDesc, size_t w, size_t yDesc, size_t y, + size_t hyDesc, size_t hy, size_t cyDesc, size_t cy, + size_t workspace, size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes) + cpdef RNNBackwardData( + intptr_t handle, size_t rnnDesc, int seqLength, size_t yDesc, size_t y, + size_t dyDesc, size_t dy, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t dxDesc, size_t dx, size_t dhxDesc, size_t dhx, + size_t dcxDesc, size_t dcx, size_t workspace, + size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes) + cpdef RNNBackwardWeights( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workspace, size_t workSpaceSizeInBytes, size_t dwDesc, + size_t dw, size_t reserveSpace, size_t reserveSpaceSizeInBytes) + cpdef RNNForwardInferenceEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef RNNForwardTrainingEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t 
reserveSpaceSizeInBytes) + cpdef RNNBackwardDataEx( + intptr_t handle, size_t rnnDesc, size_t yDesc, size_t y, size_t dyDesc, + size_t dy, size_t dcDesc, size_t dcAttn, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t dxDesc, size_t dx, + size_t dhxDesc, size_t dhx, size_t dcxDesc, size_t dcx, + size_t dkDesc, size_t dkeys, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + cpdef RNNBackwardWeightsEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t dwDesc, size_t dw, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + + ############################################################################### + # Spatial Transformer + ############################################################################### + + cpdef size_t createSpatialTransformerDescriptor() except? 
0 + cpdef destroySpatialTransformerDescriptor(size_t stDesc) + cpdef setSpatialTransformerDescriptor( + size_t stDesc, size_t samplerType, int dataType, + int nbDims, size_t dimA) + cpdef spatialTfGridGeneratorForward( + intptr_t handle, size_t stDesc, size_t theta, size_t grid) + cpdef spatialTfGridGeneratorBackward( + intptr_t handle, size_t stDesc, size_t dgrid, size_t dtheta) + cpdef spatialTfSamplerForward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t grid, size_t beta, size_t yDesc, size_t y) + cpdef spatialTfSamplerBackward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t beta, size_t dxDesc, size_t dx, size_t alphaDgrid, + size_t dyDesc, size_t dy, size_t grid, size_t betaDgrid, size_t dgrid) + + ############################################################################### + # Fused Ops + ############################################################################### + + cpdef createFusedOpsConstParamPack(int ops) + cpdef destroyFusedOpsConstParamPack(size_t constPack) + cpdef setFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param) + cpdef getFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param) + cpdef createFusedOpsVariantParamPack(int ops) + cpdef destroyFusedOpsVariantParamPack(size_t varPack) + cpdef setFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr) + cpdef getFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr) + cpdef createFusedOpsPlan(int ops) + cpdef destroyFusedOpsPlan(size_t plan) + cpdef makeFusedOpsPlan(intptr_t handle, size_t plan, size_t constPack) + cpdef fusedOpsExecute(intptr_t handle, size_t plan, size_t varPack) + diff --git a/cupy_backends/cuda/libs/miopen.pyx b/cupy_backends/cuda/libs/miopen.pyx new file mode 100644 index 00000000000..912f2884001 --- /dev/null +++ b/cupy_backends/cuda/libs/miopen.pyx @@ -0,0 +1,516 @@ +# distutils: language = c++ + 
+"""Thin wrapper of cuDNN.""" +# NOTE: This wrapper does not cover all APIs of cuDNN v4. +cimport cython # NOQA +from libcpp cimport vector +from libcpp cimport bool +from cupy_backends.cuda.api cimport driver +from cupy_backends.cuda.api cimport runtime +from cupy_backends.cuda cimport stream as stream_module +from cupy_backends.cuda.libs cimport miopen +############################################################################### +# Extern +############################################################################### + +cdef extern from '../../cupy_miopen.h' nogil: + # Types + ctypedef int ActivationMode 'miopenActivationMode_t' + ctypedef int BatchNormMode 'miopenBatchNormMode_t' + ctypedef int ConvolutionBwdDataAlgo 'miopenConvBwdDataAlgorithm_t' + ctypedef int ConvolutionBwdFilterAlgo 'miopenConvBwdWeightsAlgorithm_t' + ctypedef int ConvolutionFwdAlgo 'miopenConvFwdAlgorithm_t' + ctypedef int ConvolutionMode 'miopenConvolutionMode_t' + ctypedef int DataType 'miopenDataType_t' + ctypedef int DirectionMode 'miopenRNNDirectionMode_t' + ctypedef int NanPropagation 'miopenNanPropagation_t' + ctypedef int PoolingMode 'miopenPoolingMode_t' + ctypedef int RNNInputMode 'miopenRNNInputMode_t' + ctypedef int CTCLossAlgo 'miopenCTCLossAlgo_t' + ctypedef int RNNMode 'miopenRNNMode_t' + ctypedef int RNNAlgo 'miopenRNNAlgo_t' + ctypedef int RNNDataLayout 'miopenRNNBaseLayout_t' + ctypedef int RNNPaddingMode 'miopenRNNPaddingMode_t' + ctypedef int SoftmaxAlgorithm 'miopenSoftmaxAlgorithm_t' + ctypedef int SoftmaxMode 'miopenSoftmaxMode_t' + ctypedef int Status 'miopenStatus_t' + ctypedef int TensorFormat 'miopenTensorLayout_t' + ctypedef int OpTensorOp 'miopenTensorOp_t' + ctypedef int RNGType_t 'miopenRNGType_t' + ctypedef int ReduceTensorOp 'miopenReduceTensorOp_t' + ctypedef int ReduceTensorIndices 'miopenReduceTensorIndices_t' + ctypedef int IndicesType 'miopenIndicesType_t' + ctypedef void* ActivationDescriptor 'miopenActivationDescriptor_t' + ctypedef void* 
ConvolutionDescriptor 'miopenConvolutionDescriptor_t' + ctypedef void* DropoutDescriptor 'miopenDropoutDescriptor_t' + ctypedef void* Handle 'miopenHandle_t' + ctypedef void* PoolingDescriptor 'miopenPoolingDescriptor_t' + ctypedef void* CTCLossDescriptor 'miopenCTCLossDescriptor_t' + ctypedef void* RNNDescriptor 'miopenRNNDescriptor_t' + ctypedef void* RNNDataDescriptor 'miopenRNNDataDescriptor_t' + ctypedef void* TensorDescriptor 'miopenTensorDescriptor_t' + ctypedef void* FilterDescriptor 'miopenTensorDescriptor_t' + ctypedef void* OpTensorDescriptor 'miopenTensorDescriptor_t' + ctypedef void* ReduceTensorDescriptor 'miopenReduceTensorDescriptor_t' + ctypedef void* Stream 'miopenAcceleratorQueue_t' + # Error handling + const char* miopenGetErrorString(Status status) + + # Version + #size_t miopenGetVersion() + + # Runtime error checking + #int cudnnQueryRuntimeError(Handle handle, Status *rstatus, + # ErrQueryMode mode, RuntimeTag *tag) + + # Initialization and CUDA cooperation + int miopenCreate(Handle* handle) + int miopenDestroy(Handle handle) + int miopenSetStream(Handle handle, driver.Stream stream) + int miopenGetStream(Handle handle, driver.Stream* stream) + + # Tensor manipulation + int miopenCreateTensorDescriptor(TensorDescriptor* descriptor) + int miopenSet4dTensorDescriptor( + TensorDescriptor tensorDesc, + DataType dataType, int n, int c, int h, int w) + int miopenSet4dTensorDescriptorEx( + TensorDescriptor tensorDesc, DataType dataType, + int n, int c, int h, int w, + int nStride, int cStride, int hStride, int wStride) + int miopenGet4dTensorDescriptor( + TensorDescriptor tensorDesc, DataType* dataType, + int* n, int* c, int* h, int* w, + int* nStride, int* cStride, int* hStride, int* wStride) + int miopenDestroyTensorDescriptor(TensorDescriptor tensorDesc) + + # Tensor operations + int miopenOpTensor( + Handle handle, OpTensorDescriptor opTensorDesc, void* alpha1, + TensorDescriptor aDesc, void* A, void* alpha2, + TensorDescriptor bDesc, void* B, 
void* beta, + TensorDescriptor cDesc, void* C) + + # Tensor reductions + int miopenCreateReduceTensorDescriptor( + ReduceTensorDescriptor* reduceTensorDesc) + int miopenSetReduceTensorDescriptor( + ReduceTensorDescriptor reduceTensorDesc, ReduceTensorOp reduceTensorOp, + DataType reduceTensorCompType, NanPropagation reduceTensorNanOpt, + ReduceTensorIndices reduceTensorIndices, + IndicesType reduceTensorIndicesType) + int miopenGetReduceTensorDescriptor( + ReduceTensorDescriptor reduceTensorDesc, + ReduceTensorOp* reduceTensorOp, DataType* reduceTensorCompType, + NanPropagation* reduceTensorNanOpt, + ReduceTensorIndices* reduceTensorIndices, + IndicesType* reduceTensorIndicesType) + int miopenDestroyReduceTensorDescriptor( + ReduceTensorDescriptor reduceTensorDesc) + int miopenGetReductionIndicesSize( + Handle handle, ReduceTensorDescriptor reduceTensorDesc, + TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* sizeInBytes) + int miopenGetReductionWorkspaceSize( + Handle handle, ReduceTensorDescriptor reduceTensorDesc, + TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* sizeInBytes) + int miopenReduceTensor( + Handle handle, ReduceTensorDescriptor reduceTensorDesc, void* indices, + size_t indicesSizeInBytes, void* workspace, + size_t workspaceSizeInBytes, void* alpha, TensorDescriptor aDesc, + void* A, void* beta, TensorDescriptor cDesc, void* c) + int miopenSetTensor( + Handle handle, TensorDescriptor yDesc, void* y, void* valuePtr) + int miopenScaleTensor( + Handle handle, TensorDescriptor yDesc, void* y, void* alpha) + + # Filter manipulation + + # Convolution + int miopenCreateConvolutionDescriptor(ConvolutionDescriptor* convDesc) + int miopenSetConvolutionGroupCount( + ConvolutionDescriptor convDesc, int groupCount) + int miopenGetConvolutionGroupCount( + ConvolutionDescriptor convDesc, int *groupCount) + int miopenDestroyConvolutionDescriptor(ConvolutionDescriptor conDesc) + int miopenConvolutionForwardGetWorkSpaceSize( + Handle handle, 
TensorDescriptor srcDesc, + FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, + TensorDescriptor destDesc, + size_t* sizeInBytes) + int miopenConvolutionBackwardDataGetWorkSpaceSize( + Handle handle, FilterDescriptor filterDesc, + TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, TensorDescriptor gradDesc, + size_t* sizeInBytes) + + # Pooling + int miopenCreatePoolingDescriptor(PoolingDescriptor* desc) + int miopenDestroyPoolingDescriptor(PoolingDescriptor poolingDesc) + # Batch Normalization + int miopenDeriveBNTensorDescriptor( + TensorDescriptor derivedBnDesc, TensorDescriptor xDesc, + BatchNormMode mode) + int miopenBatchNormalizationForwardTraining( + Handle handle, BatchNormMode mode, + void* alpha, void* beta, TensorDescriptor xDesc, + void* x, TensorDescriptor yDesc, void* y, + TensorDescriptor bnScaleBiasMeanVarDesc, void* bnScale, + void* bnBias, double exponentialAverageFactor, + void* resultRunningMean, void* resultRunningVariance, + double epsilon, void* resultSaveMean, + void* resultSaveInvVariance) + int miopenBatchNormalizationForwardInference( + Handle handle, BatchNormMode mode, + void* alpha, void* beta, TensorDescriptor xDesc, + void* x, TensorDescriptor yDesc, void* y, + TensorDescriptor bnScaleBiasMeanVarDesc, void* bnScale, + void* bnBias, void* estimatedMean, void* estimatedVariance, + double epsilon) + int miopenBatchNormalizationBackward( + Handle handle, BatchNormMode mode, + void* alphaDataDiff, void* betaDataDiff, + void* alphaParamDiff, void* betaParamDiff, + TensorDescriptor xDesc, void* x, + TensorDescriptor dyDesc, void* dy, + TensorDescriptor dxDesc, void* dx, + TensorDescriptor dBnScaleBiasDesc, void* bnScale, + void* dBnScaleResult, void* dBnBiasResult, + double epsilon, void* savedMean, void* savedInvVariance) + + + # Activation + int miopenCreateActivationDescriptor( + ActivationDescriptor* activationDesc) + int miopenSetActivationDescriptor( + ActivationDescriptor activationDesc, ActivationMode mode, 
double activAlpha, + double activBeta, + double activGamma) + int miopenDestroyActivationDescriptor( + ActivationDescriptor activationDesc) + int miopenSoftmaxForward( + Handle handle, + void* alpha, TensorDescriptor srcDesc, void* srcData, + void* beta, TensorDescriptor dstDesc, void* dstData) + int miopenSoftmaxBackward( + Handle handle, + void* alpha, TensorDescriptor srcDesc, void* srcData, + TensorDescriptor srcDiffDesc, void* srcDiffData, void* beta, + TensorDescriptor destDiffDesc, void* destDiffData) + int miopenActivationForward( + Handle handle, ActivationDescriptor activationDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, void* beta, + TensorDescriptor dstDesc, void* dstData) + int miopenActivationBackward( + Handle handle, ActivationDescriptor activationDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor srcDiffDesc, void* srcDiffData, + TensorDescriptor destDesc, void* destData, void* beta, + TensorDescriptor destDiffDesc, void* destDiffData) + + + # Dropout + int miopenCreateDropoutDescriptor(DropoutDescriptor* desc) + int miopenDestroyDropoutDescriptor(DropoutDescriptor dropoutDesc) + int miopenDropoutGetStatesSize(Handle handle, size_t* sizeInBytes) + int miopenDropoutGetReserveSpaceSize( + TensorDescriptor xDesc, size_t* sizeInBytes) + int miopenSetDropoutDescriptor( + DropoutDescriptor dropoutDesc, Handle handle, float dropout, + void* states, size_t stateSizeInBytes, unsigned long long seed, + bool use_mask, bool state_evo, RNGType_t rng_mode) + int miopenDropoutForward( + Handle handle, DropoutDescriptor dropoutDesc, TensorDescriptor noise_shape, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor dstDesc, void* dstData, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + # CTC + int miopenCreateCTCLossDescriptor(CTCLossDescriptor* ctcLossDesc) + int miopenDestroyCTCLossDescriptor(CTCLossDescriptor ctcLossDesc) + int miopenGetCTCLossWorkspaceSize( + Handle handle, TensorDescriptor 
probsDesc, + TensorDescriptor gradientsDesc, int* labels, + int* labelLengths, int* inputLengths, CTCLossAlgo algo, + CTCLossDescriptor ctcLossDesc, size_t* sizeInBytes) + int miopenCTCLoss( + Handle handle, TensorDescriptor probsDesc, + void* probs, int* labels, int* labelLengths, int* inputLengths, + void* costs, TensorDescriptor gradientsDesc, void* gradients, + CTCLossAlgo algo, CTCLossDescriptor ctcLossDesc, + void* workspace, size_t workSpaceSizeInBytes) + # RNN + int miopenCreateRNNDescriptor(RNNDescriptor* rnnDesc) + int miopenDestroyRNNDescriptor(RNNDescriptor rnnDesc) + int miopenGetRNNWorkspaceSize( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, size_t* sizeInBytes) + int miopenGetRNNTrainingReserveSize( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, size_t* sizeInBytes) + int miopenGetRNNParamsSize( + Handle handle, RNNDescriptor rnnDesc, TensorDescriptor xDesc, + size_t* sizeInBytes, DataType dataType) + int miopenRNNForwardInference( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, + void* x, TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc, + void* cx, FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc, + void* y, TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, + void* cy, void* workspace, size_t workSpaceSizeInBytes) + int miopenRNNForwardTraining( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, void* x, + TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc, void* cx, + FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc, void* y, + TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, void* cy, + void* workspace, size_t workSpaceSizeInBytes, void* reserveSpace, + size_t reserveSpaceSizeInBytes) + + # Build-time version + int HIP_VERSION + + # Constants + double _EPSILON 'EPSILON' +""" +cdef class CuDNNAlgoPerf: + + def __init__(self, algo, status, time, memory, 
determinism, mathType): + self.algo = algo + self.status = status + self.time = time + self.memory = memory + self.determinism = determinism + self.mathType = mathType +""" + +############################################################################### +# Error handling +############################################################################### + +class CuDNNError(RuntimeError): + + def __init__(self, int status): + self.status = status + msg = miopenGetErrorString(status) + super(CuDNNError, self).__init__( + 'cuDNN Error: {}'.format(msg.decode())) + self._infos = [] + + def add_info(self, info): + assert isinstance(info, str) + self._infos.append(info) + + def add_infos(self, infos): + assert isinstance(infos, list) + self._infos.extend(infos) + + def __str__(self): + base = super(CuDNNError, self).__str__() + return base + ''.join( + '\n ' + info for info in self._infos) + + def __reduce__(self): + return (type(self), (self.status,)) + + +@cython.profile(False) +cpdef inline check_status(int status): + if status != 0: + raise CuDNNError(status) + + +############################################################################### +# Build-time version +############################################################################### + +def get_build_version(): + return CUPY_HIP_VERSION + + +############################################################################### +# Version +############################################################################### + +cpdef size_t getVersion() except? 
0: + return CUPY_HIP_VERSION + + +############################################################################### +# Runtime error checking +############################################################################### + +#cpdef queryRuntimeError(intptr_t handle, int mode): +# cdef Status rstatus +# with nogil: +# status = cudnnQueryRuntimeError(handle, &rstatus, +# mode, 0) +# check_status(status) +# return rstatus + + +############################################################################### +# Initialization and CUDA cooperation +############################################################################### + +cpdef intptr_t create() except? 0: + cdef Handle handle + with nogil: + status = miopenCreate(&handle) + check_status(status) + return handle + + +cpdef destroy(intptr_t handle): + with nogil: + status = miopenDestroy(handle) + check_status(status) + + +cpdef setStream(intptr_t handle, size_t stream): + # TODO(leofang): The support of stream capture is not mentioned at all in + # the cuDNN docs (as of CUDA 11.5), so we disable this functionality. + if not runtime._is_hip_environment and runtime.streamIsCapturing(stream): + raise NotImplementedError( + 'calling cuDNN API during stream capture is currently ' + 'unsupported') + status = miopenSetStream(handle, stream) + check_status(status) + + +cpdef size_t getStream(intptr_t handle) except? 0: + cdef Stream stream + status = miopenGetStream(handle, &stream) + check_status(status) + return stream + + +cdef _setStream(intptr_t handle): + """Set current stream""" + setStream(handle, stream_module.get_current_stream_ptr()) + +############################################################################### +# Tensor manipulation +############################################################################### + +cpdef size_t createTensorDescriptor() except? 
0: + cdef TensorDescriptor descriptor + status = miopenCreateTensorDescriptor(&descriptor) + check_status(status) + return descriptor + +cpdef setTensor4dDescriptor(size_t tensorDesc, int format, int dataType, + int n, int c, int h, int w): + status = miopenSet4dTensorDescriptor( + tensorDesc, + dataType, n, c, h, w) + check_status(status) + +cpdef destroyTensorDescriptor(size_t tensorDesc): + status = miopenDestroyTensorDescriptor(tensorDesc) + check_status(status) + + +cpdef size_t createDropoutDescriptor() except? 0: + cdef DropoutDescriptor desc + status = miopenCreateDropoutDescriptor(&desc) + check_status(status) + return desc + +cpdef destroyDropoutDescriptor(size_t dropoutDesc): + status = miopenDestroyDropoutDescriptor(dropoutDesc) + check_status(status) + + +cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? -1: + cdef size_t sizeInBytes + status = miopenDropoutGetStatesSize( + handle, &sizeInBytes) + check_status(status) + return sizeInBytes + +cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 
0: + cdef size_t sizeInBytes + status = miopenDropoutGetReserveSpaceSize( + xDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + +cpdef setDropoutDescriptor( + size_t dropoutDesc, intptr_t handle, float dropout, + size_t states, size_t stateSizeInBytes, unsigned long long seed, bool use_mask, bool state_evo, int rngtype): + status = miopenSetDropoutDescriptor( + dropoutDesc, handle, dropout, + states, stateSizeInBytes, seed, use_mask, state_evo, rngtype) + check_status(status) + +cpdef dropoutForward( + intptr_t handle, size_t dropoutDesc, + size_t srcDesc, size_t srcData, + size_t dstDesc, size_t dstData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = miopenDropoutForward( + handle, dropoutDesc, srcDesc, + srcDesc, srcData, + dstDesc, dstData, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + +############################################################################### +# Activation +############################################################################### + +cpdef size_t createActivationDescriptor() except? 
0: + cdef ActivationDescriptor activationDesc + status = miopenCreateActivationDescriptor(&activationDesc) + check_status(status) + return activationDesc + + +cpdef setActivationDescriptor( + size_t activationDesc, int mode, int reluNanOpt, double reluCeiling): + status = miopenSetActivationDescriptor( + activationDesc, mode, 1.0, 0.0, 0.0) + check_status(status) + + +cpdef destroyActivationDescriptor(size_t activationDesc): + status = miopenDestroyActivationDescriptor( + activationDesc) + check_status(status) + + +cpdef softmaxForward( + intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData): + _setStream(handle) + with nogil: + status = miopenSoftmaxForward( + handle, alpha, srcDesc, srcData, + beta, dstDesc, dstData) + check_status(status) + + +cpdef softmaxBackward( + intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, size_t beta, + size_t destDiffDesc, size_t destDiffData): + _setStream(handle) + with nogil: + status = miopenSoftmaxBackward( + handle, alpha, srcDesc, srcData, + srcDiffDesc, srcDiffData, beta, + destDiffDesc, destDiffData) + check_status(status) + + +cpdef activationForward_v4( + intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData): + _setStream(handle) + with nogil: + status = miopenActivationForward( + handle, activationDesc, alpha, + srcDesc, srcData, beta, + dstDesc, dstData) + check_status(status) diff --git a/cupy_backends/cupy_cudnn.h b/cupy_backends/cupy_cudnn.h index a514f63d200..4e32789fe45 100644 --- a/cupy_backends/cupy_cudnn.h +++ b/cupy_backends/cupy_cudnn.h @@ -1,9 +1,13 @@ // This file is a stub header file of cudnn for Read the Docs. 
+ #ifndef INCLUDE_GUARD_CUPY_CUDNN_H #define INCLUDE_GUARD_CUPY_CUDNN_H +#if CUPY_USE_HIP + +#include "miopen/miopen.h" -#ifndef CUPY_NO_CUDA +#elif !defined(CUPY_NO_CUDA) #include <cudnn.h> @@ -12,21 +16,15 @@ #include "stub/cupy_cuda_common.h" #include "stub/cupy_cudnn.h" -#else - -#include "hip/cupy_hip_common.h" -#include "stub/cupy_cudnn.h" #endif // #ifdef CUPY_NO_CUDA - - /////////////////////////////////////////////////////////////////////////////// // Definitions are for compatibility with cuDNN v5 and v6. /////////////////////////////////////////////////////////////////////////////// extern "C" { -#if defined(CUPY_NO_CUDA) || (CUDNN_VERSION < 6000) +#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 6000) typedef enum {} cudnnRNNAlgo_t; typedef enum {} cudnnReduceTensorOp_t; diff --git a/cupy_backends/cupy_miopen.h b/cupy_backends/cupy_miopen.h new file mode 100644 index 00000000000..15247b50530 --- /dev/null +++ b/cupy_backends/cupy_miopen.h @@ -0,0 +1,21 @@ +// This file is a stub header file of miopen for Read the Docs. 
+ + +#ifndef INCLUDE_GUARD_CUPY_MIOPEN_H +#define INCLUDE_GUARD_CUPY_MIOPEN_H +#if CUPY_USE_HIP + +#include <miopen/miopen.h> + +#elif !defined(CUPY_NO_CUDA) + +#include <cudnn.h> + +#elif defined(CUPY_NO_CUDA) + +#include "stub/cupy_cuda_common.h" +#include "stub/cupy_cudnn.h" + + +#endif // #if CUPY_USE_HIP +#endif // #ifndef INCLUDE_GUARD_CUPY_MIOPEN_H diff --git a/cupyx/cudnn.pyx b/cupyx/cudnn.pyx index fcfb9e98c10..5038a7d8192 100644 --- a/cupyx/cudnn.pyx +++ b/cupyx/cudnn.pyx @@ -15,12 +15,17 @@ from cupy._core.core cimport _ndarray_base from cupy._core cimport internal from cupy.cuda cimport device from cupy.cuda cimport memory as _memory -from cupy_backends.cuda.libs cimport cudnn +IF CUPY_HIP_VERSION != 0: + from cupy_backends.cuda.libs import miopen as cudnn + from cupy_backends.cuda.libs.cudnn import * +ELSE: + from cupy_backends.cuda.libs cimport cudnn from cupy._core._ufuncs import elementwise_copy as _elementwise_copy from cupy import _util from cupy.cuda import cudnn as _py_cudnn +from cupy_backends.cuda.libs import cudnn as _cudnn cdef int _cudnn_version = -1 @@ -109,11 +114,11 @@ cdef class Descriptor: cpdef int get_data_type(dtype) except? 
-1: cdef char t = ord(dtype.char) if t == b'f': - return cudnn.CUDNN_DATA_FLOAT + return _cudnn.CUDNN_DATA_FLOAT elif t == b'd': - return cudnn.CUDNN_DATA_DOUBLE + return _cudnn.CUDNN_DATA_DOUBLE elif t == b'e': - return cudnn.CUDNN_DATA_HALF + return _cudnn.CUDNN_DATA_HALF else: raise TypeError('Dtype {} is not supported in cuDNN'.format(dtype)) @@ -153,9 +158,7 @@ cpdef _create_tensor_nd_descriptor( desc, data_type, arr._shape.size(), c_shape.data(), c_strides.data()) - -cpdef _create_tensor_descriptor(size_t desc, _ndarray_base arr, - int format=cudnn.CUDNN_TENSOR_NCHW): +cpdef _create_tensor_descriptor(size_t desc, _ndarray_base arr,int format=cudnn.miopenTensorNCHW): if not arr._c_contiguous: raise ValueError('cupyx.cudnn supports c-contiguous arrays only') if arr._shape.size() == 4: @@ -180,12 +183,12 @@ cpdef _create_tensor_descriptor_as4darray(size_t desc, if arr._shape.size() > 0: dim1 = arr._shape[0] dim2 = arr.size // dim1 - cudnn.setTensor4dDescriptor(desc, cudnn.CUDNN_TENSOR_NCHW, data_type, + cudnn.setTensor4dDescriptor(desc, _cudnn.CUDNN_TENSOR_NCHW, data_type, dim1, dim2, 1, 1) cpdef _create_filter_descriptor( - size_t desc, _ndarray_base arr, int format=cudnn.CUDNN_TENSOR_NCHW): + size_t desc, _ndarray_base arr, int format=_cudnn.CUDNN_TENSOR_NCHW): cdef vector.vector[int] c_shape cdef Py_ssize_t s, ndim = arr._shape.size() data_type = get_data_type(arr.dtype) @@ -268,7 +271,7 @@ cpdef _ndarray_base _ascontiguousarray_normalized_strides(_ndarray_base a): return newarray -def create_tensor_descriptor(arr, format=cudnn.CUDNN_TENSOR_NCHW): +def create_tensor_descriptor(arr, format=_cudnn.CUDNN_TENSOR_NCHW): desc = Descriptor(cudnn.createTensorDescriptor(), _py_cudnn.destroyTensorDescriptor) _create_tensor_descriptor(desc.value, arr, format) @@ -306,7 +309,7 @@ def create_tensor_nd_descriptor(_ndarray_base arr): return desc -def create_filter_descriptor(arr, format=cudnn.CUDNN_TENSOR_NCHW): +def create_filter_descriptor(arr, 
format=_cudnn.CUDNN_TENSOR_NCHW): desc = Descriptor(cudnn.createFilterDescriptor(), _py_cudnn.destroyFilterDescriptor) _create_filter_descriptor(desc.value, arr, format) @@ -314,7 +317,7 @@ def create_filter_descriptor(arr, format=cudnn.CUDNN_TENSOR_NCHW): def create_convolution_descriptor(pad, stride, dtype, - mode=cudnn.CUDNN_CROSS_CORRELATION, + mode=_cudnn.CUDNN_CROSS_CORRELATION, dilation=None, use_tensor_core=False, groups=1): @@ -616,7 +619,7 @@ def rnn_backward_weights_ex( return dw -def create_activation_descriptor(mode, nan_prop_mode=cudnn.CUDNN_PROPAGATE_NAN, +def create_activation_descriptor(mode, nan_prop_mode=_cudnn.CUDNN_PROPAGATE_NAN, coef=0.0): desc = Descriptor(cudnn.createActivationDescriptor(), _py_cudnn.destroyActivationDescriptor) @@ -645,7 +648,7 @@ def activation_forward(_ndarray_base x, int mode, double coef=0.0): try: _create_tensor_descriptor_as4darray(desc, x) cudnn.setActivationDescriptor( - act_desc, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, coef) + act_desc, mode, _cudnn.CUDNN_NOT_PROPAGATE_NAN, coef) cudnn.activationForward_v4( handle, act_desc, one, desc, x.data.ptr, zero, desc, y.data.ptr) @@ -773,13 +776,13 @@ def create_dropout_descriptor( desc = Descriptor(cudnn.createDropoutDescriptor(), _py_cudnn.destroyDropoutDescriptor) cudnn.setDropoutDescriptor(desc.value, handle, dropout, - states, state_size_in_bytes, seed) + states, state_size_in_bytes, seed, False, False, 0) return desc def set_dropout_descriptor(desc, handle, dropout): # When the fourth argument is NULL, random state is not updated. 
- cudnn.setDropoutDescriptor(desc.value, handle, dropout, 0, 0, 0) + cudnn.setDropoutDescriptor(desc.value, handle, dropout, 0, 0, 0, False, False, 0) def _create_ctc_loss_descriptor(data_type): @@ -1356,7 +1359,7 @@ cpdef _warn_algorithm_fwd( .format(x.shape, W.shape, y.shape, conv_param[0], conv_param[1]), _util.PerformanceWarning) - +""" cpdef _Algorithm _find_algorithm_fwd( _ndarray_base x, _ndarray_base W, _ndarray_base y, tuple conv_param, size_t handle, size_t x_desc, size_t filter_desc, size_t conv_desc, @@ -1639,7 +1642,6 @@ cpdef _Algorithm _get_algorithm_bwd_data( _get_algorithm_bwd_data_cache[key] = algo return algo - cpdef bint _should_use_tensor_core( tensor_core_mode, object dtype) except *: if tensor_core_mode == 'auto': @@ -1970,7 +1972,7 @@ def convolution_backward_data( cudnn.destroyFilterDescriptor(filter_desc) cudnn.destroyConvolutionDescriptor(conv_desc) - +""" def pooling_forward( _ndarray_base x, _ndarray_base y, tuple ksize, tuple stride, tuple pad, int mode): @@ -2044,7 +2046,7 @@ def pooling_backward( cdef _create_tensor_descriptor_for_bn( size_t desc, _ndarray_base arr, bint is_for_conv2d, - int format=cudnn.CUDNN_TENSOR_NCHW): + int format=_cudnn.CUDNN_TENSOR_NCHW): assert arr._c_contiguous if is_for_conv2d: _create_tensor_descriptor(desc, arr, format) @@ -2077,7 +2079,7 @@ def batch_normalization_forward_training( _ndarray_base running_mean, _ndarray_base running_var, mean, inv_std, double eps, double decay, bint is_for_conv2d, int cudnn_mode, bint debug, - int d_layout=cudnn.CUDNN_TENSOR_NCHW): + int d_layout=_cudnn.CUDNN_TENSOR_NCHW): reserve_space, y, save_mean, save_inv_std = ( _batch_normalization_forward_training( @@ -2106,7 +2108,7 @@ def batch_normalization_forward_training_ex( _ndarray_base running_mean, _ndarray_base running_var, mean, inv_std, double eps, double decay, bint is_for_conv2d, int cudnn_mode, bint debug, - int d_layout=cudnn.CUDNN_TENSOR_NCHW): + int d_layout=_cudnn.CUDNN_TENSOR_NCHW): reserve_space, y, 
save_mean, save_inv_std = ( _batch_normalization_forward_training( @@ -2129,7 +2131,7 @@ cdef _batch_normalization_forward_training( _ndarray_base running_mean, _ndarray_base running_var, mean, inv_std, double eps, double decay, bint is_for_conv2d, int cudnn_mode, bint debug, - int d_layout=cudnn.CUDNN_TENSOR_NCHW): + int d_layout=_cudnn.CUDNN_TENSOR_NCHW): cdef _memory.MemoryPointer workspace = None cdef _memory.MemoryPointer reserve_space = None @@ -2282,7 +2284,7 @@ def batch_normalization_forward_inference( _ndarray_base x, _ndarray_base gamma, _ndarray_base beta, _ndarray_base mean, _ndarray_base var, double eps, bint is_for_conv2d, int cudnn_mode, - int d_layout=cudnn.CUDNN_TENSOR_NCHW): + int d_layout=_cudnn.CUDNN_TENSOR_NCHW): x = core._internal_ascontiguousarray(x) dtype = x.dtype y = _core.ndarray(x._shape, dtype) @@ -2327,7 +2329,7 @@ def batch_normalization_backward( _ndarray_base x, _ndarray_base gamma, _ndarray_base gy, _ndarray_base mean, _ndarray_base inv_std, double eps, bint is_for_conv2d, int cudnn_mode, bint debug, - int d_layout=cudnn.CUDNN_TENSOR_NCHW, + int d_layout=_cudnn.CUDNN_TENSOR_NCHW, *, _memory.MemoryPointer reserve_space=None, ): @@ -2440,7 +2442,7 @@ def batch_normalization_backward( return gx, ggamma, gbeta -def create_activation_descriptor(mode, relu_nan_opt=cudnn.CUDNN_PROPAGATE_NAN, +def create_activation_descriptor(mode, relu_nan_opt=_cudnn.CUDNN_PROPAGATE_NAN, coef=0.0): desc = Descriptor(cudnn.createActivationDescriptor(), _py_cudnn.destroyActivationDescriptor) diff --git a/install/cupy_builder/_features.py b/install/cupy_builder/_features.py index 4ad926b4249..25a9d1b5eb4 100644 --- a/install/cupy_builder/_features.py +++ b/install/cupy_builder/_features.py @@ -164,6 +164,9 @@ def get_features(ctx: Context) -> Dict[str, Feature]: 'cupyx.cusolver', 'cupy_backends.cuda.libs.curand_hip', 'cupy_backends.cuda.libs.nvrtc_hip', + 'cupy_backends.cuda.libs.miopen', + 'cupy_backends.cuda.libs.cudnn', + 'cupyx.cudnn', ], 'include': [ 
'hip/hip_runtime_api.h', @@ -175,6 +178,7 @@ def get_features(ctx: Context) -> Dict[str, Feature]: 'roctx.h', 'rocsolver/rocsolver.h' if rocm_version >= 560 else 'rocsolver.h', 'hipsolver/hipsolver.h' if rocm_version >= 560 else 'hipsolver.h', + 'miopen/miopen.h', ], 'libraries': [ 'amdhip64', # was hiprtc and hip_hcc before ROCm 3.8.0 @@ -188,6 +192,7 @@ def get_features(ctx: Context) -> Dict[str, Feature]: 'rocsolver', 'rocsparse', 'hipsolver', + 'MIOpen', ], 'check_method': build.check_hip_version, 'version_method': build.get_hip_version, diff --git a/tests/cupyx_tests/test_cudnn.py b/tests/cupyx_tests/test_cudnn.py index 84ef7b02071..940c6375f97 100644 --- a/tests/cupyx_tests/test_cudnn.py +++ b/tests/cupyx_tests/test_cudnn.py @@ -13,7 +13,6 @@ if cudnn_enabled: modes = [ - libcudnn.CUDNN_ACTIVATION_SIGMOID, libcudnn.CUDNN_ACTIVATION_RELU, libcudnn.CUDNN_ACTIVATION_TANH, ] @@ -40,7 +39,6 @@ 'dtype': [numpy.float32, numpy.float64], 'mode': modes, })) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestCudnnActivation: @pytest.fixture(autouse=True) @@ -60,7 +58,6 @@ def test_activation_backward(self): 'dtype': [numpy.float32, numpy.float64], 'mode': coef_modes, })) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestCudnnActivationCoef: @pytest.fixture(autouse=True) @@ -83,7 +80,6 @@ def test_activation_backward(self): 'ratio': [0.0, 0.1, 0.2, 0.5], 'seed': [0, 100] })) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestCudnnDropout: @pytest.fixture(autouse=True) @@ -136,7 +132,6 @@ def test_dropout_seed(self): 'bias': [True, False], 'layout': layouts, }))) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestConvolutionForward: @pytest.fixture(autouse=True) @@ -224,7 +219,6 @@ def test_call(self): 'auto_tune': [True, False], 'deterministic': [True, False], }))) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class 
TestConvolutionBackwardFilter: @pytest.fixture(autouse=True) @@ -303,7 +297,6 @@ def test_call(self): 'deterministic': [True, False], 'bias': [True, False], }))) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestConvolutionBackwardData: @pytest.fixture(autouse=True)