From 8952ad251bc3e40c24fdcbfd120cc36b51da16d4 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 27 Oct 2023 18:14:10 +0000 Subject: [PATCH 01/26] cudnn , miopen changes on 6.1 branch --- cupy_backends/cuda/libs/cudnn.pyx | 563 ++++-- cupy_backends/cuda/libs/miopen.pyx | 2543 ++++++++++++++++++++++++++++ 2 files changed, 2949 insertions(+), 157 deletions(-) create mode 100644 cupy_backends/cuda/libs/miopen.pyx diff --git a/cupy_backends/cuda/libs/cudnn.pyx b/cupy_backends/cuda/libs/cudnn.pyx index 464c59d8a00..bd4c50f3d41 100644 --- a/cupy_backends/cuda/libs/cudnn.pyx +++ b/cupy_backends/cuda/libs/cudnn.pyx @@ -9,6 +9,7 @@ from cupy_backends.cuda.api cimport driver from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module +from cupy_backends.cuda.libs.miopen import * ############################################################################### # Extern ############################################################################### @@ -758,7 +759,10 @@ class CuDNNError(RuntimeError): def __init__(self, int status): self.status = status - msg = cudnnGetErrorString(status) + if runtime._is_hip_environment: + msg = miopenGetErrorString(status) + else: + msg = cudnnGetErrorString(status) super(CuDNNError, self).__init__( 'cuDNN Error: {}'.format(msg.decode())) self._infos = [] @@ -799,7 +803,10 @@ def get_build_version(): ############################################################################### cpdef size_t getVersion() except? 0: - return cudnnGetVersion() + if runtime._is_hip_environment: + return miopenGetVersion() + else: + return cudnnGetVersion() ############################################################################### @@ -822,14 +829,20 @@ cpdef queryRuntimeError(intptr_t handle, int mode): cpdef intptr_t create() except? 
0: cdef Handle handle with nogil: - status = cudnnCreate(&handle) + if runtime._is_hip_environment: + status = miopenCreate(&handle) + else: + status = cudnnCreate(&handle) check_status(status) return handle cpdef destroy(intptr_t handle): with nogil: - status = cudnnDestroy(handle) + if runtime._is_hip_environment: + status = miopenDestroy(handle) + else: + status = cudnnDestroy(handle) check_status(status) @@ -840,14 +853,19 @@ cpdef setStream(intptr_t handle, size_t stream): raise NotImplementedError( 'calling cuDNN API during stream capture is currently ' 'unsupported') - - status = cudnnSetStream(handle, stream) + if runtime._is_hip_environment: + status = miopenSetStream(handle, stream) + else: + status = cudnnSetStream(handle, stream) check_status(status) cpdef size_t getStream(intptr_t handle) except? 0: cdef driver.Stream stream - status = cudnnGetStream(handle, &stream) + if runtime._is_hip_environment: + status = miopenGetStream(handle, &stream) + else: + status = cudnnGetStream(handle, &stream) check_status(status) return stream @@ -862,7 +880,10 @@ cdef _setStream(intptr_t handle): cpdef size_t createTensorDescriptor() except? 
0: cdef TensorDescriptor descriptor - status = cudnnCreateTensorDescriptor(&descriptor) + if runtime._is_hip_environment: + status = miopenCreateTensorDescriptor(&descriptor) + else: + status = cudnnCreateTensorDescriptor(&descriptor) check_status(status) return descriptor @@ -903,7 +924,10 @@ cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, cpdef destroyTensorDescriptor(size_t tensorDesc): - status = cudnnDestroyTensorDescriptor(tensorDesc) + if runtime._is_hip_environment: + status = miopenDestroyTensorDescriptor(tensorDesc) + else: + status = cudnnDestroyTensorDescriptor(tensorDesc) check_status(status) @@ -957,11 +981,18 @@ cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, size_t B, size_t beta, size_t cDesc, size_t C): _setStream(handle) with nogil: - status = cudnnOpTensor( - handle, opTensorDesc, alpha1, - aDesc, A, alpha2, - bDesc, B, beta, - cDesc, C) + if runtime._is_hip_environment: + status = miopenOpTensor( + handle, opTensorDesc, alpha1, + aDesc, A, alpha2, + bDesc, B, beta, + cDesc, C) + else: + status = cudnnOpTensor( + handle, opTensorDesc, alpha1, + aDesc, A, alpha2, + bDesc, B, beta, + cDesc, C) check_status(status) @@ -971,7 +1002,10 @@ cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, cpdef size_t createReduceTensorDescriptor() except? 
0: cdef ReduceTensorDescriptor reduceTensorDesc - status = cudnnCreateReduceTensorDescriptor(&reduceTensorDesc) + if runtime._is_hip_environment: + status = miopenCreateReduceTensorDescriptor(&reduceTensorDesc) + else: + status = cudnnCreateReduceTensorDescriptor(&reduceTensorDesc) check_status(status) return reduceTensorDesc @@ -979,12 +1013,20 @@ cpdef setReduceTensorDescriptor( size_t reduceTensorDesc, int reduceTensorOp, int reduceTensorCompType, int reduceTensorNanOpt, int reduceTensorIndices, int reduceTensorIndicesType): - status = cudnnSetReduceTensorDescriptor( - reduceTensorDesc, - reduceTensorOp, - reduceTensorCompType, reduceTensorNanOpt, - reduceTensorIndices, - reduceTensorIndicesType) + if runtime._is_hip_environment: + status = miopenSetReduceTensorDescriptor( + reduceTensorDesc, + reduceTensorOp, + reduceTensorCompType, reduceTensorNanOpt, + reduceTensorIndices, + reduceTensorIndicesType) + else: + status = cudnnSetReduceTensorDescriptor( + reduceTensorDesc, + reduceTensorOp, + reduceTensorCompType, reduceTensorNanOpt, + reduceTensorIndices, + reduceTensorIndicesType) check_status(status) @@ -994,25 +1036,39 @@ cpdef getReduceTensorDescriptor(size_t reduceTensorDesc): cdef NanPropagation redNanOpt cdef ReduceTensorIndices redIndices cdef IndicesType redIndicesType - status = cudnnGetReduceTensorDescriptor( - reduceTensorDesc, &redOp, - &redCompType, &redNanOpt, &redIndices, &redIndicesType) + if runtime._is_hip_environment: + status = miopenGetReduceTensorDescriptor( + reduceTensorDesc, &redOp, + &redCompType, &redNanOpt, &redIndices, &redIndicesType) + else: + status = cudnnGetReduceTensorDescriptor( + reduceTensorDesc, &redOp, + &redCompType, &redNanOpt, &redIndices, &redIndicesType) check_status(status) return redOp, redCompType, redNanOpt, redIndices, redIndicesType cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc): - status = cudnnDestroyReduceTensorDescriptor( - reduceTensorDesc) + if runtime._is_hip_environment: + status = 
miopenDestroyReduceTensorDescriptor( + reduceTensorDesc) + else: + status = cudnnDestroyReduceTensorDescriptor( + reduceTensorDesc) check_status(status) cpdef size_t getReductionIndicesSize(intptr_t handle, size_t reduceTensorDesc, size_t aDesc, size_t cDesc) except? 0: cdef size_t sizeInBytes - status = cudnnGetReductionIndicesSize( - handle, reduceTensorDesc, - aDesc, cDesc, &sizeInBytes) + if runtime._is_hip_environment: + status = miopenGetReductionIndicesSize( + handle, reduceTensorDesc, + aDesc, cDesc, &sizeInBytes) + else: + status = cudnnGetReductionIndicesSize( + handle, reduceTensorDesc, + aDesc, cDesc, &sizeInBytes) check_status(status) return sizeInBytes @@ -1021,10 +1077,16 @@ cpdef size_t getReductionWorkspaceSize(intptr_t handle, size_t reduceTensorDesc, size_t aDesc, size_t cDesc) except? 0: cdef size_t sizeInBytes - status = cudnnGetReductionWorkspaceSize( - handle, reduceTensorDesc, - aDesc, cDesc, - &sizeInBytes) + if runtime._is_hip_environment: + status = miopenGetReductionWorkspaceSize( + handle, reduceTensorDesc, + aDesc, cDesc, + &sizeInBytes) + else: + status = cudnnGetReductionWorkspaceSize( + handle, reduceTensorDesc, + aDesc, cDesc, + &sizeInBytes) check_status(status) return sizeInBytes @@ -1035,29 +1097,46 @@ cpdef reduceTensor(intptr_t handle, size_t reduceTensorDesc, size_t indices, size_t A, size_t beta, size_t cDesc, size_t C): _setStream(handle) with nogil: - status = cudnnReduceTensor( - handle, reduceTensorDesc, - indices, indicesSizeInBytes, workspace, - workspaceSizeInBytes, alpha, aDesc, - A, beta, cDesc, C) + if runtime._is_hip_environment: + status = miopenReduceTensor( + handle, reduceTensorDesc, + indices, indicesSizeInBytes, workspace, + workspaceSizeInBytes, alpha, aDesc, + A, beta, cDesc, C) + else: + status = cudnnReduceTensor( + handle, reduceTensorDesc, + indices, indicesSizeInBytes, workspace, + workspaceSizeInBytes, alpha, aDesc, + A, beta, cDesc, C) check_status(status) cpdef setTensor(intptr_t handle, size_t 
yDesc, size_t y, size_t valuePtr): _setStream(handle) with nogil: - status = cudnnSetTensor( - handle, yDesc, y, - valuePtr) + if runtime._is_hip_environment: + status = miopenSetTensor( + handle, yDesc, y, + valuePtr) + else: + status = cudnnSetTensor( + handle, yDesc, y, + valuePtr) check_status(status) cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha): _setStream(handle) with nogil: - status = cudnnScaleTensor( - handle, yDesc, y, - alpha) + if runtime._is_hip_environment: + status = miopenScaleTensor( + handle, yDesc, y, + alpha) + else: + status = cudnnScaleTensor( + handle, yDesc, y, + alpha) check_status(status) @@ -1115,7 +1194,10 @@ cpdef destroyFilterDescriptor(size_t filterDesc): cpdef size_t createConvolutionDescriptor() except? 0: cdef ConvolutionDescriptor desc - status = cudnnCreateConvolutionDescriptor(&desc) + if runtime._is_hip_environment: + status = miopenCreateConvolutionDescriptor(&desc) + else: + status = cudnnCreateConvolutionDescriptor(&desc) check_status(status) return desc @@ -1130,21 +1212,27 @@ cpdef size_t getConvolutionMathType(size_t convDesc) except? 0: cdef MathType mathType status = cudnnGetConvolutionMathType( convDesc, &mathType) - check_status(status) return mathType cpdef setConvolutionGroupCount(size_t convDesc, int groupCount): - status = cudnnSetConvolutionGroupCount( - convDesc, groupCount) + if runtime._is_hip_environment: + status = miopenSetConvolutionGroupCount( + convDesc, groupCount) + else: + status = cudnnSetConvolutionGroupCount( + convDesc, groupCount) check_status(status) cpdef int getConvolutionGroupCount(size_t convDesc) except? 
-1: cdef int groupCount - status = cudnnGetConvolutionGroupCount( - convDesc, &groupCount) - check_status(status) + if runtime._is_hip_environment: + status = miopenGetConvolutionGroupCount( + convDesc, &groupCount) + else: + status = cudnnGetConvolutionGroupCount( + convDesc, &groupCount) return groupCount @@ -1177,8 +1265,12 @@ cpdef setConvolutionNdDescriptor_v3( cpdef destroyConvolutionDescriptor(size_t convDesc): - status = cudnnDestroyConvolutionDescriptor( - convDesc) + if runtime._is_hip_environment: + status = miopenDestroyConvolutionDescriptor( + convDesc) + else: + status = cudnnDestroyConvolutionDescriptor( + convDesc) check_status(status) @@ -1286,13 +1378,21 @@ cpdef convolutionForward( size_t destDesc, size_t destData): _setStream(handle) with nogil: - status = cudnnConvolutionForward( - handle, alpha, - srcDesc, srcData, - filterDesc, filterData, - convDesc, algo, - workSpace, workSpaceSizeInBytes, beta, - destDesc, destData) + if runtime._is_hip_environment: + status = miopenConvolutionForward(handle, alpha, + srcDesc, srcData, + filterDesc, filterData, + convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, + destDesc, destData) + else: + status = cudnnConvolutionForward( + handle, alpha, + srcDesc, srcData, + filterDesc, filterData, + convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, + destDesc, destData) check_status(status) @@ -1301,10 +1401,16 @@ cpdef convolutionBackwardBias( size_t beta, size_t destDesc, size_t destData): _setStream(handle) with nogil: - status = cudnnConvolutionBackwardBias( - handle, alpha, - srcDesc, srcData, beta, - destDesc, destData) + if runtime._is_hip_environment: + status = miopenConvolutionBackwardBias( + handle, alpha, + srcDesc, srcData, beta, + destDesc, destData) + else: + status = cudnnConvolutionBackwardBias( + handle, alpha, + srcDesc, srcData, beta, + destDesc, destData) check_status(status) @@ -1545,7 +1651,10 @@ cpdef convolutionBackwardData_v3( cpdef size_t createPoolingDescriptor() except? 
0: cdef PoolingDescriptor desc - status = cudnnCreatePoolingDescriptor(&desc) + if runtime._is_hip_environment: + status = miopenCreatePoolingDescriptor(&desc) + else: + status = cudnnCreatePoolingDescriptor(&desc) check_status(status) return desc @@ -1572,7 +1681,10 @@ cpdef setPoolingNdDescriptor_v4( cpdef destroyPoolingDescriptor(size_t poolingDesc): - status = cudnnDestroyPoolingDescriptor(poolingDesc) + if runtime._is_hip_environment: + status = miopenDestroyPoolingDescriptor(poolingDesc) + else: + status = cudnnDestroyPoolingDescriptor(poolingDesc) check_status(status) @@ -1611,9 +1723,14 @@ CUDNN_BN_MIN_EPSILON = _CUDNN_BN_MIN_EPSILON cpdef deriveBNTensorDescriptor( size_t derivedBnDesc, size_t xDesc, int mode): - status = cudnnDeriveBNTensorDescriptor( - derivedBnDesc, xDesc, - mode) + if runtime._is_hip_environment: + status = miopenDeriveBNTensorDescriptor( + derivedBnDesc, xDesc, + mode) + else: + status = cudnnDeriveBNTensorDescriptor( + derivedBnDesc, xDesc, + mode) check_status(status) @@ -1627,14 +1744,24 @@ cpdef batchNormalizationForwardTraining( double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance): _setStream(handle) with nogil: - status = cudnnBatchNormalizationForwardTraining( - handle, mode, - alpha, beta, xDesc, - x, yDesc, y, - bnScaleBiasMeanVarDesc, bnScale, - bnBias, exponentialAverageFactor, - resultRunningMean, resultRunningVariance, - epsilon, resultSaveMean, resultSaveInvVariance) + if runtime._is_hip_environment: + status = miopenBatchNormalizationForwardTraining( + handle, mode, + alpha, beta, xDesc, + x, yDesc, y, + bnScaleBiasMeanVarDesc, bnScale, + bnBias, exponentialAverageFactor, + resultRunningMean, resultRunningVariance, + epsilon, resultSaveMean, resultSaveInvVariance) + else: + status = cudnnBatchNormalizationForwardTraining( + handle, mode, + alpha, beta, xDesc, + x, yDesc, y, + bnScaleBiasMeanVarDesc, bnScale, + bnBias, exponentialAverageFactor, + resultRunningMean, resultRunningVariance, + epsilon, 
resultSaveMean, resultSaveInvVariance) check_status(status) @@ -1647,13 +1774,22 @@ cpdef batchNormalizationForwardInference( double epsilon): _setStream(handle) with nogil: - status = cudnnBatchNormalizationForwardInference( - handle, mode, - alpha, beta, xDesc, - x, yDesc, y, - bnScaleBiasMeanVarDesc, bnScale, - bnBias, estimatedMean, estimatedVariance, - epsilon) + if runtime._is_hip_environment: + status = miopenBatchNormalizationForwardInference( + handle, mode, + alpha, beta, xDesc, + x, yDesc, y, + bnScaleBiasMeanVarDesc, bnScale, + bnBias, estimatedMean, estimatedVariance, + epsilon) + else: + status = cudnnBatchNormalizationForwardInference( + handle, mode, + alpha, beta, xDesc, + x, yDesc, y, + bnScaleBiasMeanVarDesc, bnScale, + bnBias, estimatedMean, estimatedVariance, + epsilon) check_status(status) @@ -1668,16 +1804,28 @@ cpdef batchNormalizationBackward( double epsilon, size_t savedMean, size_t savedInvVariance): _setStream(handle) with nogil: - status = cudnnBatchNormalizationBackward( - handle, mode, - alphaDataDiff, betaDataDiff, - alphaParamDiff, betaParamDiff, - xDesc, x, - dyDesc, dy, - dxDesc, dx, - dBnScaleBiasDesc, bnScale, - dBnScaleResult, dBnBiasResult, - epsilon, savedMean, savedInvVariance) + if runtime._is_hip_environment: + status = miopenBatchNormalizationBackward( + handle, mode, + alphaDataDiff, betaDataDiff, + alphaParamDiff, betaParamDiff, + xDesc, x, + dyDesc, dy, + dxDesc, dx, + dBnScaleBiasDesc, bnScale, + dBnScaleResult, dBnBiasResult, + epsilon, savedMean, savedInvVariance) + else: + status = cudnnBatchNormalizationBackward( + handle, mode, + alphaDataDiff, betaDataDiff, + alphaParamDiff, betaParamDiff, + xDesc, x, + dyDesc, dy, + dxDesc, dx, + dBnScaleBiasDesc, bnScale, + dBnScaleResult, dBnBiasResult, + epsilon, savedMean, savedInvVariance) check_status(status) @@ -1823,7 +1971,10 @@ cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( cpdef size_t createActivationDescriptor() except? 
0: cdef ActivationDescriptor activationDesc - status = cudnnCreateActivationDescriptor(&activationDesc) + if runtime._is_hip_environment: + status = miopenCreateActivationDescriptor(&activationDesc) + else: + status = cudnnCreateActivationDescriptor(&activationDesc) check_status(status) return activationDesc @@ -1837,8 +1988,12 @@ cpdef setActivationDescriptor( cpdef destroyActivationDescriptor(size_t activationDesc): - status = cudnnDestroyActivationDescriptor( - activationDesc) + if runtime._is_hip_environment: + status = miopenDestroyActivationDescriptor( + activationDesc) + else: + status = cudnnDestroyActivationDescriptor( + activationDesc) check_status(status) @@ -1847,10 +2002,16 @@ cpdef softmaxForward( size_t srcData, size_t beta, size_t dstDesc, size_t dstData): _setStream(handle) with nogil: - status = cudnnSoftmaxForward( - handle, algorithm, mode, - alpha, srcDesc, srcData, - beta, dstDesc, dstData) + if runtime._is_hip_environment: + status = miopenSoftmaxForward( + handle, algorithm, mode, + alpha, srcDesc, srcData, + beta, dstDesc, dstData) + else: + status = cudnnSoftmaxForward( + handle, algorithm, mode, + alpha, srcDesc, srcData, + beta, dstDesc, dstData) check_status(status) @@ -1860,11 +2021,18 @@ cpdef softmaxBackward( size_t destDiffDesc, size_t destDiffData): _setStream(handle) with nogil: - status = cudnnSoftmaxBackward( - handle, algorithm, mode, - alpha, srcDesc, srcData, - srcDiffDesc, srcDiffData, beta, - destDiffDesc, destDiffData) + if runtime._is_hip_environment: + status = miopenSoftmaxBackward( + handle, algorithm, mode, + alpha, srcDesc, srcData, + srcDiffDesc, srcDiffData, beta, + destDiffDesc, destDiffData) + else: + status = cudnnSoftmaxBackward( + handle, algorithm, mode, + alpha, srcDesc, srcData, + srcDiffDesc, srcDiffData, beta, + destDiffDesc, destDiffData) check_status(status) @@ -1902,20 +2070,30 @@ cpdef activationBackward_v4( cpdef size_t createDropoutDescriptor() except? 
0: cdef DropoutDescriptor desc - status = cudnnCreateDropoutDescriptor(&desc) + if runtime._is_hip_environment: + status = miopenCreateDropoutDescriptor(&desc) + else: + status = cudnnCreateDropoutDescriptor(&desc) check_status(status) return desc cpdef destroyDropoutDescriptor(size_t dropoutDesc): - status = cudnnDestroyDropoutDescriptor(dropoutDesc) + if runtime._is_hip_environment: + status = miopenDestroyDropoutDescriptor(dropoutDesc) + else: + status = cudnnDestroyDropoutDescriptor(dropoutDesc) check_status(status) cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? -1: cdef size_t sizeInBytes - status = cudnnDropoutGetStatesSize( - handle, &sizeInBytes) + if runtime._is_hip_environment: + status = miopenDropoutGetStatesSize( + handle, &sizeInBytes) + else: + status = cudnnDropoutGetStatesSize( + handle, &sizeInBytes) check_status(status) return sizeInBytes @@ -1931,8 +2109,12 @@ cpdef setDropoutDescriptor( cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 0: cdef size_t sizeInBytes - status = cudnnDropoutGetReserveSpaceSize( - xDesc, &sizeInBytes) + if runtime._is_hip_environment: + status = miopenDropoutGetReserveSpaceSize( + xDesc, &sizeInBytes) + else: + status = cudnnDropoutGetReserveSpaceSize( + xDesc, &sizeInBytes) check_status(status) return sizeInBytes @@ -1972,12 +2154,18 @@ cpdef dropoutBackward( ############################################################################### cpdef size_t createCTCLossDescriptor() except? 
0: cdef CTCLossDescriptor desc - status = cudnnCreateCTCLossDescriptor(&desc) + if runtime._is_hip_environment: + status = miopenCreateCTCLossDescriptor(&desc) + else: + status = cudnnCreateCTCLossDescriptor(&desc) check_status(status) return desc cpdef destroyCTCLossDescriptor(size_t ctcLossDesc): - status = cudnnDestroyCTCLossDescriptor(ctcLossDesc) + if runtime._is_hip_environment: + status = miopenDestroyCTCLossDescriptor(ctcLossDesc) + else: + status = cudnnDestroyCTCLossDescriptor(ctcLossDesc) check_status(status) cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType): @@ -1997,11 +2185,18 @@ cpdef size_t getCTCLossWorkspaceSize( size_t labels, size_t labelLengths, size_t inputLengths, int algo, size_t ctcLossDesc) except? 0: cdef size_t sizeInBytes - status = cudnnGetCTCLossWorkspaceSize( - handle, probsDesc, - gradientsDesc, - labels, labelLengths, inputLengths, - algo, ctcLossDesc, &sizeInBytes) + if runtime._is_hip_environment: + status = miopenGetCTCLossWorkspaceSize( + handle, probsDesc, + gradientsDesc, + labels, labelLengths, inputLengths, + algo, ctcLossDesc, &sizeInBytes) + else: + status = cudnnGetCTCLossWorkspaceSize( + handle, probsDesc, + gradientsDesc, + labels, labelLengths, inputLengths, + algo, ctcLossDesc, &sizeInBytes) check_status(status) return sizeInBytes @@ -2011,12 +2206,20 @@ cpdef CTCLoss( size_t costs, size_t gradientsDesc, size_t gradients, int algo, size_t ctcLossDesc, size_t workspace, size_t workSpaceSizeInBytes): - status = cudnnCTCLoss( - handle, probsDesc, probs, - labels, labelLengths, inputLengths, - costs, gradientsDesc, gradients, - algo, ctcLossDesc, - workspace, workSpaceSizeInBytes) + if runtime._is_hip_environment: + status = miopenCTCLoss( + handle, probsDesc, probs, + labels, labelLengths, inputLengths, + costs, gradientsDesc, gradients, + algo, ctcLossDesc, + workspace, workSpaceSizeInBytes) + else: + status = cudnnCTCLoss( + handle, probsDesc, probs, + labels, labelLengths, inputLengths, + costs, 
gradientsDesc, gradients, + algo, ctcLossDesc, + workspace, workSpaceSizeInBytes) check_status(status) @@ -2026,13 +2229,19 @@ cpdef CTCLoss( cpdef size_t createRNNDescriptor() except? 0: cdef RNNDescriptor desc - status = cudnnCreateRNNDescriptor(&desc) + if runtime._is_hip_environment: + status = miopenCreateRNNDescriptor(&desc) + else: + status = cudnnCreateRNNDescriptor(&desc) check_status(status) return desc cpdef destroyRNNDescriptor(size_t rnnDesc): - status = cudnnDestroyRNNDescriptor(rnnDesc) + if runtime._is_hip_environment: + status = miopenDestroyRNNDescriptor(rnnDesc) + else: + status = cudnnDestroyRNNDescriptor(rnnDesc) check_status(status) @@ -2134,9 +2343,14 @@ cpdef getRNNDataDescriptor( cpdef getRNNWorkspaceSize( intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): cdef size_t sizeInBytes - status = cudnnGetRNNWorkspaceSize( - handle, rnnDesc, seqLength, - xDesc, &sizeInBytes) + if runtime._is_hip_environment: + status = miopenGetRNNWorkspaceSize( + handle, rnnDesc, seqLength, + xDesc, &sizeInBytes) + else: + status = cudnnGetRNNWorkspaceSize( + handle, rnnDesc, seqLength, + xDesc, &sizeInBytes) check_status(status) return sizeInBytes @@ -2144,9 +2358,14 @@ cpdef getRNNWorkspaceSize( cpdef getRNNTrainingReserveSize( intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): cdef size_t sizeInBytes - status = cudnnGetRNNTrainingReserveSize( - handle, rnnDesc, seqLength, - xDesc, &sizeInBytes) + if runtime._is_hip_environment: + status = miopenGetRNNTrainingReserveSize( + handle, rnnDesc, seqLength, + xDesc, &sizeInBytes) + else: + status = cudnnGetRNNTrainingReserveSize( + handle, rnnDesc, seqLength, + xDesc, &sizeInBytes) check_status(status) return sizeInBytes @@ -2154,9 +2373,14 @@ cpdef getRNNTrainingReserveSize( cpdef getRNNParamsSize( intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType): cdef size_t sizeInBytes - status = cudnnGetRNNParamsSize( - handle, rnnDesc, xDesc, - &sizeInBytes, dataType) + if 
runtime._is_hip_environment: + status = miopenGetRNNParamsSize( + handle, rnnDesc, xDesc, + &sizeInBytes, dataType) + else: + status = cudnnGetRNNParamsSize( + handle, rnnDesc, xDesc, + &sizeInBytes, dataType) check_status(status) return sizeInBytes @@ -2190,16 +2414,28 @@ cpdef RNNForwardInference( size_t cy, size_t workspace, size_t workSpaceSizeInBytes): _setStream(handle) with nogil: - status = cudnnRNNForwardInference( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - workspace, workSpaceSizeInBytes) + if runtime._is_hip_environment: + status = miopenRNNForwardInference( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + workspace, workSpaceSizeInBytes) + else: + status = cudnnRNNForwardInference( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + workspace, workSpaceSizeInBytes) check_status(status) @@ -2212,17 +2448,30 @@ cpdef RNNForwardTraining( size_t reserveSpaceSizeInBytes): _setStream(handle) with nogil: - status = cudnnRNNForwardTraining( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - workspace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) + if runtime._is_hip_environment: + status = miopenRNNForwardTraining( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + workspace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + else: + status = cudnnRNNForwardTraining( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + workspace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) check_status(status) diff --git a/cupy_backends/cuda/libs/miopen.pyx 
b/cupy_backends/cuda/libs/miopen.pyx
new file mode 100644
index 00000000000..c7c3811c885
--- /dev/null
+++ b/cupy_backends/cuda/libs/miopen.pyx
@@ -0,0 +1,2543 @@
+# distutils: language = c++
+
+"""Thin wrapper of MIOpen (modelled on the cuDNN wrapper)."""
+# NOTE: Not all MIOpen APIs are covered; some names below are still cuDNN's.
+cimport cython  # NOQA
+from libcpp cimport vector
+
+from cupy_backends.cuda.api cimport driver
+from cupy_backends.cuda.api cimport runtime
+from cupy_backends.cuda cimport stream as stream_module
+
+###############################################################################
+# Extern
+###############################################################################
+
+cdef extern from '../../cupy_cudnn.h' nogil:
+    # Types (quoted C names are a mix of miopen* and cudnn* identifiers)
+    ctypedef int ActivationMode 'miopenActivationMode_t'
+    ctypedef int AddMode 'cudnnAddMode_t'
+    ctypedef int BatchNormMode 'miopenBatchNormMode_t'
+    ctypedef int BatchNormOps 'cudnnBatchNormOps_t'
+    ctypedef int ConvolutionBwdDataAlgo 'miopenBwdDataAlgorithm_t'
+    ctypedef int ConvolutionBwdDataPreference \
+        'cudnnConvolutionBwdDataPreference_t'
+    ctypedef struct ConvolutionBwdDataAlgoPerf \
+        'cudnnConvolutionBwdDataAlgoPerf_t':  # NOQA: E125
+        int algo
+        int status
+        float time
+        size_t memory
+    ctypedef struct ConvolutionBwdDataAlgoPerf_v7 \
+        'cudnnConvolutionBwdDataAlgoPerf_v7_t':  # NOQA: E125
+        int algo
+        int status
+        float time
+        size_t memory
+        int determinism
+        int mathType
+    ctypedef int ConvolutionBwdFilterAlgo 'miopenConvBwdWeightsAlgorithm_t'
+    ctypedef int ConvolutionBwdFilterPreference \
+        'cudnnConvolutionBwdFilterPreference_t'
+    ctypedef struct ConvolutionBwdFilterAlgoPerf \
+        'cudnnConvolutionBwdFilterAlgoPerf_t':  # NOQA: E125
+        int algo
+        int status
+        float time
+        size_t memory
+    ctypedef struct ConvolutionBwdFilterAlgoPerf_v7 \
+        'cudnnConvolutionBwdFilterAlgoPerf_v7_t':  # NOQA: E125
+        int algo
+        int status
+        float time
+        size_t memory
+        int determinism
+        int mathType
+    ctypedef int ConvolutionFwdAlgo 'miopenConvolutionFwdAlgorithm_t'
+    ctypedef int ConvolutionFwdPreference 'cudnnConvolutionFwdPreference_t'
+    ctypedef struct ConvolutionFwdAlgoPerf 'cudnnConvolutionFwdAlgoPerf_t':
+        int algo
+        int status
+        float time
+        size_t memory
+    ctypedef struct ConvolutionFwdAlgoPerf_v7 \
+        'cudnnConvolutionFwdAlgoPerf_v7_t':  # NOQA: E125
+        int algo
+        int status
+        float time
+        size_t memory
+        int determinism
+        int mathType
+    ctypedef int ConvolutionMode 'miopenConvolutionMode_t'
+    ctypedef int DataType 'miopenDataType_t'
+    ctypedef int MathType 'cudnnMathType_t'
+    ctypedef int DirectionMode 'miopenRNNDirectionMode_t'
+    ctypedef int NanPropagation 'miopenNanPropagation_t'
+    ctypedef int PoolingMode 'miopenPoolingMode_t'
+    ctypedef int RNNInputMode 'miopenRNNInputMode_t'
+    ctypedef int CTCLossAlgo 'miopenCTCLossAlgo_t'
+    ctypedef int RNNMode 'miopenRNNMode_t'
+    ctypedef int RNNAlgo 'miopenRNNAlgo_t'
+    ctypedef int RNNDataLayout 'cudnnRNNDataLayout_t'
+    ctypedef int RNNPaddingMode 'cudnnRNNPaddingMode_t'
+    ctypedef int SoftmaxAlgorithm 'miopenSoftmaxAlgorithm_t'
+    ctypedef int SoftmaxMode 'miopenSoftmaxMode_t'
+    ctypedef int Status 'miopenStatus_t'
+    ctypedef int TensorFormat 'cudnnTensorFormat_t'
+    ctypedef int OpTensorOp 'miopenTensorOp_t'
+
+    ctypedef int ReduceTensorOp 'miopenReduceTensorOp_t'
+    ctypedef int ReduceTensorIndices 'miopenReduceTensorIndices_t'
+    ctypedef int IndicesType 'miopenIndicesType_t'
+    ctypedef int ErrQueryMode 'cudnnErrQueryMode_t'
+    ctypedef int FusedOps 'cudnnFusedOps_t'
+    ctypedef int FusedOpsConstParamLabel 'cudnnFusedOpsConstParamLabel_t'
+    ctypedef int FusedOpsPointerPlaceHolder 'cudnnFusedOpsPointerPlaceHolder_t'
+    ctypedef int FusedOpsVariantParamLabel 'cudnnFusedOpsVariantParamLabel_t'
+    ctypedef struct RuntimeTag 'cudnnRuntimeTag_t'
+
+    ctypedef void* ActivationDescriptor 'miopenActivationDescriptor_t'
+    ctypedef void* ConvolutionDescriptor 'miopenConvolutionDescriptor_t'
+    ctypedef void* DropoutDescriptor 'miopenDropoutDescriptor_t'
+    ctypedef void* FilterDescriptor 'cudnnFilterDescriptor_t'
+    ctypedef void* Handle 'miopenHandle_t'
+    ctypedef void* PoolingDescriptor 'miopenPoolingDescriptor_t'
+    ctypedef void* CTCLossDescriptor 'miopenCTCLossDescriptor_t'
+    ctypedef void* RNNDescriptor 'miopenRNNDescriptor_t'
+    ctypedef void* RNNDataDescriptor 'miopenRNNDataDescriptor_t'
+    ctypedef void* PersistentRNNPlan 'cudnnPersistentRNNPlan_t'
+    ctypedef void* TensorDescriptor 'miopenTensorDescriptor_t'
+    ctypedef void* OpTensorDescriptor 'miopenTensorDescriptor_t'
+    ctypedef void* ReduceTensorDescriptor 'miopenReduceTensorDescriptor_t'
+    ctypedef void* SpatialTransformerDescriptor \
+        'cudnnSpatialTransformerDescriptor_t'
+    ctypedef void* SamplerType 'cudnnSamplerType_t'
+    ctypedef void* FusedOpsConstParamPack 'cudnnFusedOpsConstParamPack_t'
+    ctypedef void* FusedOpsVariantParamPack 'cudnnFusedOpsVariantParamPack_t'
+    ctypedef void* FusedOpsPlan 'cudnnFusedOpsPlan_t'
+
+    # Error handling (only the miopen* symbol is declared in this block)
+    const char* miopenGetErrorString(Status status)
+
+    # Version (only the miopen* symbol is declared in this block)
+    size_t miopenGetVersion()
+
+    # Runtime error checking
+    int cudnnQueryRuntimeError(Handle handle, Status *rstatus,
+                               ErrQueryMode mode, RuntimeTag *tag)
+
+    # Initialization and CUDA cooperation
+    int miopenCreate(Handle* handle)
+    int miopenDestroy(Handle handle)
+    int miopenSetStream(Handle handle, driver.Stream stream)
+    int miopenGetStream(Handle handle, driver.Stream* stream)
+
+    # Tensor manipulation
+    int miopenCreateTensorDescriptor(TensorDescriptor* descriptor)
+    int miopenSet4dTensorDescriptor(
+        TensorDescriptor tensorDesc,
+        DataType dataType, int n, int c, int h, int w)
+    int miopenSet4dTensorDescriptorEx(
+        TensorDescriptor tensorDesc, DataType dataType,
+        int n, int c, int h, int w,
+        int nStride, int cStride, int hStride, int wStride)
+    int miopenGet4dTensorDescriptor(
+        TensorDescriptor tensorDesc, DataType* dataType,
+        int* n, int* c, int* h, int* w,
+        int* nStride, int* cStride, int* hStride, int* wStride)
+    int cudnnSetTensorNdDescriptor(
+        TensorDescriptor tensorDesc, DataType dataType, int nbDims,
+        int* dimA, int* strideA)
+    int miopenDestroyTensorDescriptor(TensorDescriptor tensorDesc)
+    int cudnnAddTensor_v3(
+        Handle handle, void* alpha, TensorDescriptor bDesc,
+        void* b, void* beta, TensorDescriptor yDesc, void* y)
+
+    # Tensor operations (descriptor calls are cudnn*, the op is miopenOpTensor)
+    int cudnnCreateOpTensorDescriptor(OpTensorDescriptor* opTensorDesc)
+    int cudnnSetOpTensorDescriptor(
+        OpTensorDescriptor opTensorDesc, OpTensorOp opTensorOp,
+        DataType opTensorCompType, NanPropagation opTensorNanOpt)
+    int cudnnGetOpTensorDescriptor(
+        OpTensorDescriptor opTensorDesc, OpTensorOp* opTensorOp,
+        DataType* opTensorCompType, NanPropagation* opTensorNanOpt)
+    int cudnnDestroyOpTensorDescriptor(OpTensorDescriptor opTensorDesc)
+    int miopenOpTensor(
+        Handle handle, OpTensorDescriptor opTensorDesc, void* alpha1,
+        TensorDescriptor aDesc, void* A, void* alpha2,
+        TensorDescriptor bDesc, void* B, void* beta,
+        TensorDescriptor cDesc, void* C)
+
+    # Tensor reductions
+    int miopenCreateReduceTensorDescriptor(
+        ReduceTensorDescriptor* reduceTensorDesc)
+    int miopenSetReduceTensorDescriptor(
+        ReduceTensorDescriptor reduceTensorDesc, ReduceTensorOp reduceTensorOp,
+        DataType reduceTensorCompType, NanPropagation reduceTensorNanOpt,
+        ReduceTensorIndices reduceTensorIndices,
+        IndicesType reduceTensorIndicesType)
+    int miopenGetReduceTensorDescriptor(
+        ReduceTensorDescriptor reduceTensorDesc,
+        ReduceTensorOp* reduceTensorOp, DataType* reduceTensorCompType,
+        NanPropagation* reduceTensorNanOpt,
+        ReduceTensorIndices* reduceTensorIndices,
+        IndicesType* reduceTensorIndicesType)
+    int miopenDestroyReduceTensorDescriptor(
+        ReduceTensorDescriptor reduceTensorDesc)
+    int miopenGetReductionIndicesSize(
+        Handle handle, ReduceTensorDescriptor reduceTensorDesc,
+        TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* sizeInBytes)
+    int miopenGetReductionWorkspaceSize(
+        Handle handle, ReduceTensorDescriptor reduceTensorDesc,
+        TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* sizeInBytes)
+    int miopenReduceTensor(
+        Handle handle, ReduceTensorDescriptor reduceTensorDesc, void* indices,
+        size_t indicesSizeInBytes, void* workspace,
+        size_t workspaceSizeInBytes, void* alpha, TensorDescriptor aDesc,
+        void* A, void* beta, TensorDescriptor cDesc, void* c)
+    int miopenSetTensor(
+        Handle handle, TensorDescriptor yDesc, void* y, void* valuePtr)
+    int miopenScaleTensor(
+        Handle handle, TensorDescriptor yDesc, void* y, void* alpha)
+
+    # Filter manipulation
+    int cudnnCreateFilterDescriptor(FilterDescriptor* filterDesc)
+    int cudnnSetFilter4dDescriptor_v4(
+        FilterDescriptor filterDesc, DataType dataType,
+        TensorFormat format, int k, int c, int h, int w)
+    int cudnnSetFilterNdDescriptor_v4(
+        FilterDescriptor filterDesc, DataType dataType,
+        TensorFormat format, int nbDims, const int filterDimA[])
+    int cudnnGetFilterNdDescriptor_v4(
+        FilterDescriptor wDesc, int nbDimsRequested, DataType* dataType,
+        TensorFormat* format, int* nbDims, int filterDimA[])
+    int cudnnDestroyFilterDescriptor(FilterDescriptor filterDesc)
+
+    # Convolution
+    int miopenCreateConvolutionDescriptor(ConvolutionDescriptor* convDesc)
+    int cudnnSetConvolutionMathType(
+        ConvolutionDescriptor convDesc, MathType mathType)
+    int cudnnGetConvolutionMathType(
+        ConvolutionDescriptor convDesc, MathType *mathType)
+    int miopenSetConvolutionGroupCount(
+        ConvolutionDescriptor convDesc, int groupCount)
+    int miopenGetConvolutionGroupCount(
+        ConvolutionDescriptor convDesc, int *groupCount)
+    int cudnnSetConvolution2dDescriptor_v4(
+        ConvolutionDescriptor convDesc, int pad_h, int pad_w, int u,
+        int v, int dilation_h, int dilation_w, ConvolutionMode mode)
+    int cudnnSetConvolution2dDescriptor_v5(
+        ConvolutionDescriptor convDesc, int pad_h, int pad_w, int u,
+        int v, int dilation_h, int dilation_w, ConvolutionMode mode,
+        DataType computeType)
+    int cudnnSetConvolutionNdDescriptor_v3(
+        ConvolutionDescriptor convDesc, int arrayLength, int* padA,
+        int* filterStrideA, int* dilationA, ConvolutionMode mode,
+        DataType dataType)
+    int miopenDestroyConvolutionDescriptor(ConvolutionDescriptor conDesc)
+    int cudnnFindConvolutionForwardAlgorithm(
+        Handle handle, TensorDescriptor xDesc, FilterDescriptor wDesc,
+        ConvolutionDescriptor convDesc, TensorDescriptor yDesc,
+        int requestedAlgoCount, int* returnedAlgoCount,
+        ConvolutionFwdAlgoPerf* perfResults)
+    int cudnnFindConvolutionForwardAlgorithmEx(
+        Handle handle, TensorDescriptor xDesc, void* x,
+        FilterDescriptor wDesc, void* w, ConvolutionDescriptor convDesc,
+        TensorDescriptor yDesc, void* y, int requestedAlgoCount,
+        int* returnedAlgoCount, ConvolutionFwdAlgoPerf* perfResults,
+        void* workSpace, size_t workSpaceSizeInBytes)
+    int cudnnFindConvolutionForwardAlgorithmEx_v7(
+        Handle handle, TensorDescriptor xDesc, void* x,
+        FilterDescriptor wDesc, void* w, ConvolutionDescriptor convDesc,
+        TensorDescriptor yDesc, void* y, int requestedAlgoCount,
+        int* returnedAlgoCount, ConvolutionFwdAlgoPerf_v7* perfResults,
+        void* workSpace, size_t workSpaceSizeInBytes)
+    int cudnnGetConvolutionForwardAlgorithm_v6(
+        Handle handle, TensorDescriptor srcDesc,
+        FilterDescriptor filterDesc, ConvolutionDescriptor convDesc,
+        TensorDescriptor destDesc, ConvolutionFwdPreference preference,
+        size_t memoryLimitInbytes, ConvolutionFwdAlgo* algo)
+    int cudnnGetConvolutionForwardAlgorithm_v7(
+        Handle handle, TensorDescriptor srcDesc,
+        FilterDescriptor filterDesc, ConvolutionDescriptor convDesc,
+        TensorDescriptor destDesc, int requestedAlgoCount,
+        int* returnedAlgoCount, ConvolutionFwdAlgoPerf_v7* perfResults)
+    int miopenConvolutionForwardGetWorkSpaceSize(
+        Handle handle, TensorDescriptor srcDesc,
+        FilterDescriptor filterDesc, ConvolutionDescriptor convDesc,
+        TensorDescriptor destDesc,
+        size_t* sizeInBytes)
+    int cudnnConvolutionForward(
+        Handle handle, void* alpha, TensorDescriptor srcDesc,
+        void* srcData, FilterDescriptor filterDesc, void* filterData,
+        ConvolutionDescriptor convDesc, ConvolutionFwdAlgo algo,
+        void* workSpace, size_t workSpaceSizeInBytes, void* beta,
+        TensorDescriptor destDesc, void* destData)
+    int cudnnConvolutionBackwardBias(
+        Handle handle, void* alpha,
+        TensorDescriptor srcDesc, void* srcData, void* beta,
+        TensorDescriptor destDesc, void* destData)
+    int cudnnFindConvolutionBackwardFilterAlgorithm(
+        Handle handle, TensorDescriptor xDesc, TensorDescriptor dyDesc,
+        ConvolutionDescriptor convDesc, FilterDescriptor dwDesc,
+        int requestedAlgoCount, int* returnedAlgoCount,
+        ConvolutionBwdFilterAlgoPerf* perfResults)
+    int cudnnFindConvolutionBackwardFilterAlgorithmEx(
+        Handle handle, TensorDescriptor xDesc, void* x,
+        TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc,
+        FilterDescriptor dwDesc, void* dw, int requestedAlgoCount,
+        int* returnedAlgoCount, ConvolutionBwdFilterAlgoPerf* perfResults,
+        void* workSpace, size_t workSpaceSizeInBytes)
+    int cudnnFindConvolutionBackwardFilterAlgorithmEx_v7(
+        Handle handle, TensorDescriptor xDesc, void* x,
+        TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc,
+        FilterDescriptor dwDesc, void* dw, int requestedAlgoCount,
+        int* returnedAlgoCount, ConvolutionBwdFilterAlgoPerf_v7* perfResults,
+        void* workSpace, size_t workSpaceSizeInBytes)
+    int cudnnGetConvolutionBackwardFilterAlgorithm_v6(
+        Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc,
+        ConvolutionDescriptor convDesc, FilterDescriptor filterDesc,
+        ConvolutionBwdFilterPreference preference,
+        size_t memoryLimitInbytes, ConvolutionBwdFilterAlgo* algo)
+    int cudnnGetConvolutionBackwardFilterAlgorithm_v7(
+        Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc,
+        ConvolutionDescriptor convDesc, FilterDescriptor gradDesc,
+        int requestedAlgoCount, int* returnedAlgoCount,
+        ConvolutionBwdFilterAlgoPerf_v7* perfResults)
+    int cudnnGetConvolutionBackwardFilterWorkspaceSize(
+        Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc,
+        ConvolutionDescriptor convDesc, FilterDescriptor filterDesc,
+        ConvolutionBwdFilterAlgo algo, size_t* sizeInBytes)
+    int cudnnConvolutionBackwardFilter_v3(
+        Handle handle, void* alpha,
+        TensorDescriptor srcDesc, void* srcData,
+        TensorDescriptor diffDesc, void* diffData,
+        ConvolutionDescriptor convDesc, ConvolutionBwdFilterAlgo algo,
+        void* workSpace, size_t workSpaceSizeInBytes, void* beta,
+        FilterDescriptor gradDesc, void* gradData)
+    int cudnnGetConvolutionBackwardDataAlgorithm_v6(
+        Handle handle, FilterDescriptor filterDesc,
+        TensorDescriptor diffDesc,
+        ConvolutionDescriptor convDesc, TensorDescriptor gradDesc,
+        ConvolutionBwdDataPreference preference,
+        size_t memoryLimitInbytes, ConvolutionBwdDataAlgo* algo)
+    int cudnnGetConvolutionBackwardDataAlgorithm_v7(
+        Handle handle, TensorDescriptor filterDesc, TensorDescriptor diffDesc,
+        ConvolutionDescriptor convDesc, FilterDescriptor gradDesc,
+        int requestedAlgoCount, int* returnedAlgoCount,
+        ConvolutionBwdDataAlgoPerf_v7* perfResults)
+    int cudnnFindConvolutionBackwardDataAlgorithm(
+        Handle handle, TensorDescriptor wDesc, TensorDescriptor dyDesc,
+        ConvolutionDescriptor convDesc, FilterDescriptor dxDesc,
+        int requestedAlgoCount, int* returnedAlgoCount,
+        ConvolutionBwdDataAlgoPerf* perfResults)
+    int cudnnFindConvolutionBackwardDataAlgorithmEx(
+        Handle handle, FilterDescriptor wDesc, void* w,
+        TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc,
+        TensorDescriptor dxDesc, void* dx, int requestedAlgoCount,
+        int* returnedAlgoCount, ConvolutionBwdDataAlgoPerf* perfResults,
+        void* workSpace, size_t workSpaceSizeInBytes)
+    int cudnnFindConvolutionBackwardDataAlgorithmEx_v7(
+        Handle handle, FilterDescriptor wDesc, void* w,
+        TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc,
+        TensorDescriptor dxDesc, void* dx, int requestedAlgoCount,
+        int* returnedAlgoCount, ConvolutionBwdDataAlgoPerf_v7* perfResults,
+        void* workSpace, size_t workSpaceSizeInBytes)
+    int miopenConvolutionBackwardDataGetWorkSpaceSize(
+        Handle handle, FilterDescriptor filterDesc,
+        TensorDescriptor diffDesc,
+        ConvolutionDescriptor convDesc, TensorDescriptor gradDesc,
+        size_t* sizeInBytes)
+    int cudnnConvolutionBackwardData_v3(
+        Handle handle, void* alpha,
+        FilterDescriptor filterDesc, void* filterData,
+        TensorDescriptor diffDesc, void* diffData,
+        ConvolutionDescriptor convDesc, ConvolutionBwdDataAlgo algo,
+        void* workSpace, size_t workSpaceSizeInBytes, void* beta,
+        TensorDescriptor gradDesc, void* gradData)
+
+    # Pooling
+    int miopenCreatePoolingDescriptor(PoolingDescriptor* desc)
+    int cudnnSetPooling2dDescriptor_v4(
+        PoolingDescriptor poolingDesc, PoolingMode mode,
+        NanPropagation maxpoolingNanOpt, int windowHeight, int windowWidth,
+        int verticalPadding, int horizontalPadding, int verticalStride,
+        int horizontalStride)
+    int cudnnSetPoolingNdDescriptor_v4(
+        PoolingDescriptor poolingDesc, PoolingMode mode,
+        NanPropagation maxpoolingNanOpt, int nbDims,
+        int* windowDimA, int* paddingA, int* strideA)
+    int miopenDestroyPoolingDescriptor(PoolingDescriptor poolingDesc)
+    int cudnnPoolingForward(
+        Handle handle, PoolingDescriptor poolingDesc, void* alpha,
+        TensorDescriptor srcDesc, void* srcData, void* beta,
+        TensorDescriptor dstDesc, void* dstData)
+    int cudnnPoolingBackward(
+        Handle handle, PoolingDescriptor poolingDesc, void* alpha,
+        TensorDescriptor srcDesc, void* srcData,
+        TensorDescriptor srcDiffDesc, void* srcDiffData,
+        TensorDescriptor destDesc, void* destData, void* beta,
+        TensorDescriptor destDiffDesc, void* destDiffData)
+
+    # Batch Normalization
+    int miopenDeriveBNTensorDescriptor(
+        TensorDescriptor derivedBnDesc, TensorDescriptor xDesc,
+        BatchNormMode mode)
+    int miopenBatchNormalizationForwardTraining(
+        Handle handle, BatchNormMode mode,
+        void* alpha, void* beta, TensorDescriptor xDesc,
+        void* x, TensorDescriptor yDesc, void* y,
+        TensorDescriptor bnScaleBiasMeanVarDesc, void* bnScale,
+        void* bnBias, double exponentialAverageFactor,
+        void* resultRunningMean, void* resultRunningVariance,
+        double epsilon, void* resultSaveMean,
+        void* resultSaveInvVariance)
+    int miopenBatchNormalizationForwardInference(
+        Handle handle, BatchNormMode mode,
+        void* alpha, void* beta, TensorDescriptor xDesc,
+        void* x, TensorDescriptor yDesc, void* y,
+        TensorDescriptor bnScaleBiasMeanVarDesc, void* bnScale,
+        void* bnBias, void* estimatedMean, void* estimatedVariance,
+        double epsilon)
+    int miopenBatchNormalizationBackward(
+        Handle handle, BatchNormMode mode,
+        void* alphaDataDiff, void* betaDataDiff,
+        void* alphaParamDiff, void* betaParamDiff,
+        TensorDescriptor xDesc, void* x,
+        TensorDescriptor dyDesc, void* dy,
+        TensorDescriptor dxDesc, void* dx,
+        TensorDescriptor dBnScaleBiasDesc, void* bnScale,
+        void* dBnScaleResult, void* dBnBiasResult,
+        double epsilon, void* savedMean, void* savedInvVariance)
+
+    int cudnnBatchNormalizationForwardTrainingEx(
+        Handle handle,
+        BatchNormMode mode, BatchNormOps bnOps,
+        void* alpha, void* beta,
+        TensorDescriptor xDesc, void* x,
+        TensorDescriptor zDesc, void* z,
+        TensorDescriptor yDesc, void* y,
+        TensorDescriptor bnScaleBiasMeanVarDesc,
+        void* bnScale, void* bnBias,
+        double exponentialAverageFactor,
+        void* resultRunningMean, void* resultRunningVariance,
+        double epsilon,
+        void* resultSaveMean, void* resultSaveInvVariance,
+        ActivationDescriptor activationDesc,
+        void* workspace, size_t workSpaceSizeInBytes,
+        void* reserveSpace, size_t reserveSpaceSizeInBytes)
+    int cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
+        Handle handle,
+        BatchNormMode mode, BatchNormOps bnOps,
+        TensorDescriptor xDesc,
+        TensorDescriptor zDesc,
+        TensorDescriptor yDesc,
+        TensorDescriptor bnScaleBiasMeanVarDesc,
+        ActivationDescriptor activationDesc,
+        size_t* sizeInBytes)
+    int cudnnBatchNormalizationBackwardEx(
+        Handle handle,
+        BatchNormMode mode, BatchNormOps bnops,
+        void* alphaDataDiff, void* betaDataDiff,
+        void* alphaParamDiff, void* betaParamDiff,
+        TensorDescriptor xDesc, void* x,
+        TensorDescriptor yDesc, void* y,
+        TensorDescriptor dyDesc, void* dy,
+        TensorDescriptor dzDesc, void* dz,
+        TensorDescriptor dxDesc, void* dx,
+        TensorDescriptor dBnScaleBiasDesc,
+        void* bnScaleData, void* bnBiasData,
+        void* dBnScaleData, void* dBnBiasData,
+        double epsilon,
+        void* savedMean, void* savedInvVariance,
+        ActivationDescriptor activationDesc,
+        void* workspace, size_t workSpaceSizeInBytes,
+        void* reserveSpace, size_t reserveSpaceSizeInBytes)
+    int cudnnGetBatchNormalizationBackwardExWorkspaceSize(
+        Handle handle,
+        BatchNormMode mode,
+        BatchNormOps bnOps,
+        TensorDescriptor xDesc,
+        TensorDescriptor yDesc,
+        TensorDescriptor dyDesc,
+        TensorDescriptor dzDesc,
+        TensorDescriptor dxDesc,
+        TensorDescriptor dBnScaleBiasDesc,
+        ActivationDescriptor activationDesc,
+        size_t* sizeInBytes)
+    int cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
+        Handle handle,
+        BatchNormMode mode,
+        BatchNormOps bnOps,
+        ActivationDescriptor activationDesc,
+        TensorDescriptor xDesc,
+        size_t* sizeInBytes)
+
+    # Activation
+    int miopenCreateActivationDescriptor(
+        ActivationDescriptor* activationDesc)
+    int cudnnSetActivationDescriptor(
+        ActivationDescriptor activationDesc, ActivationMode mode,
+        NanPropagation reluNanOpt, double reluCeiling)
+    int miopenDestroyActivationDescriptor(
+        ActivationDescriptor activationDesc)
+    int miopenSoftmaxForward(
+        Handle handle,
+        void* alpha, TensorDescriptor srcDesc, void* srcData,
+        void* beta, TensorDescriptor dstDesc, void* dstData)
+    int miopenSoftmaxBackward(
+        Handle handle,
+        void* alpha, TensorDescriptor srcDesc, void* srcData,
+        TensorDescriptor srcDiffDesc, void* srcDiffData, void* beta,
+        TensorDescriptor destDiffDesc, void* destDiffData)
+    int cudnnActivationForward_v4(
+        Handle handle, ActivationDescriptor activationDesc, void* alpha,
+        TensorDescriptor srcDesc, void* srcData, void* beta,
+        TensorDescriptor dstDesc, void* dstData)
+    int cudnnActivationBackward_v4(
+        Handle handle, ActivationDescriptor activationDesc, void* alpha,
+        TensorDescriptor srcDesc, void* srcData,
+        TensorDescriptor srcDiffDesc, void* srcDiffData,
+        TensorDescriptor destDesc, void* destData, void* beta,
+        TensorDescriptor destDiffDesc, void* destDiffData)
+
+    # Dropout
+    int miopenCreateDropoutDescriptor(DropoutDescriptor* desc)
+    int miopenDestroyDropoutDescriptor(DropoutDescriptor dropoutDesc)
+    int miopenDropoutGetStatesSize(Handle handle, size_t* sizeInBytes)
+    int miopenDropoutGetReserveSpaceSize(
+        TensorDescriptor xDesc, size_t* sizeInBytes)
+    int cudnnSetDropoutDescriptor(
+        DropoutDescriptor dropoutDesc, Handle handle, float dropout,
+        void* states, size_t stateSizeInBytes, unsigned long long seed)
+    int cudnnDropoutForward(
+        Handle handle, DropoutDescriptor dropoutDesc,
+        TensorDescriptor srcDesc, void* srcData,
+        TensorDescriptor dstDesc, void* dstData,
+        void* reserveSpace, size_t reserveSpaceSizeInBytes)
+    int cudnnDropoutBackward(
+        Handle handle, DropoutDescriptor dropoutDesc,
+        TensorDescriptor dydesc, void* dy, TensorDescriptor dxdesc,
+        void* dx, void* reserveSpace, size_t reserveSpaceSizeInBytes)
+
+    # CTC
+    int miopenCreateCTCLossDescriptor(CTCLossDescriptor* ctcLossDesc)
+    int miopenDestroyCTCLossDescriptor(CTCLossDescriptor ctcLossDesc)
+    int cudnnSetCTCLossDescriptor(
+        CTCLossDescriptor ctcLossDesc, DataType dataType)
+    int cudnnGetCTCLossDescriptor(
+        CTCLossDescriptor ctcLossDesc, DataType* dataType)
+    int miopenGetCTCLossWorkspaceSize(
+        Handle handle, TensorDescriptor probsDesc,
+        TensorDescriptor gradientsDesc, int* labels,
+        int* labelLengths, int* inputLengths, CTCLossAlgo algo,
+        CTCLossDescriptor ctcLossDesc, size_t* sizeInBytes)
+    int miopenCTCLoss(
+        Handle handle, TensorDescriptor probsDesc,
+        void* probs, int* labels, int* labelLengths, int* inputLengths,
+        void* costs, TensorDescriptor gradientsDesc, void* gradients,
+        CTCLossAlgo algo, CTCLossDescriptor ctcLossDesc,
+        void* workspace, size_t workSpaceSizeInBytes)
+    # RNN
+    int miopenCreateRNNDescriptor(RNNDescriptor* rnnDesc)
+    int miopenDestroyRNNDescriptor(RNNDescriptor rnnDesc)
+    int cudnnCreatePersistentRNNPlan(
+        RNNDescriptor rnnDesc,
+        const int minibatch, DataType dataType,
+        PersistentRNNPlan* plan)
+    int cudnnSetPersistentRNNPlan(
+        RNNDescriptor rnnDesc, PersistentRNNPlan plan)
+    int cudnnDestroyPersistentRNNPlan(PersistentRNNPlan plan)
+    int cudnnSetRNNDescriptor_v5(
+        RNNDescriptor rnnDesc, int hiddenSize,
+        int numLayers, DropoutDescriptor dropoutDesc, RNNInputMode inputMode,
+        DirectionMode direction, RNNMode mode, DataType dataType)
+    int cudnnSetRNNDescriptor_v6(
+        Handle handle, RNNDescriptor rnnDesc, int hiddenSize,
+        int numLayers, DropoutDescriptor dropoutDesc, RNNInputMode inputMode,
+        DirectionMode direction, RNNMode mode, RNNAlgo algo, DataType dataType)
+    int cudnnSetRNNPaddingMode(
+        RNNDescriptor rnnDesc, RNNPaddingMode paddingMode)
+    int cudnnGetRNNPaddingMode(
+        RNNDescriptor rnnDesc, RNNPaddingMode* paddingMode)
+    int cudnnCreateRNNDataDescriptor(RNNDataDescriptor* RNNDataDesc)
+    int cudnnDestroyRNNDataDescriptor(RNNDataDescriptor RNNDataDesc)
+    int cudnnSetRNNDataDescriptor(
+        RNNDataDescriptor RNNDataDesc, DataType dataType, RNNDataLayout layout,
+        int maxSeqLength, int batchSize, int vectorSize,
+        const int seqLengthArray[], void *paddingFill)
+    int cudnnGetRNNDataDescriptor(
+        RNNDataDescriptor RNNDataDesc, DataType* dataType,
+        RNNDataLayout* layout, int* maxSeqLength, int* batchSize,
+        int* vectorSize, int arrayLengthRequested, int seqLengthArray[],
+        void* paddingFill)
+    int miopenGetRNNWorkspaceSize(
+        Handle handle, RNNDescriptor rnnDesc, int seqLength,
+        TensorDescriptor* xDesc, size_t* sizeInBytes)
+    int miopenGetRNNTrainingReserveSize(
+        Handle handle, RNNDescriptor rnnDesc, int seqLength,
+        TensorDescriptor* xDesc, size_t* sizeInBytes)
+    int miopenGetRNNParamsSize(
+        Handle handle, RNNDescriptor rnnDesc, TensorDescriptor xDesc,
+        size_t* sizeInBytes, DataType dataType)
+    int cudnnGetRNNLinLayerMatrixParams(
+        Handle handle, RNNDescriptor rnnDesc, int layer,
+        TensorDescriptor xDesc, FilterDescriptor wDesc, void* w,
+        int linLayerID, FilterDescriptor linLayerMatDesc,
+        void** linLayerMat)
+    int cudnnGetRNNLinLayerBiasParams(
+        Handle handle, RNNDescriptor rnnDesc, int layer,
+        TensorDescriptor xDesc, FilterDescriptor wDesc, void* w,
+        int linLayerID, FilterDescriptor linLayerBiasDesc,
+        void** linLayerBias)
+    int miopenRNNForwardInference(
+        Handle handle, RNNDescriptor rnnDesc, int seqLength,
+        TensorDescriptor* xDesc,
+        void* x, TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc,
+        void* cx, FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc,
+        void* y, TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc,
+        void* cy, void* workspace, size_t workSpaceSizeInBytes)
+    int miopenRNNForwardTraining(
+        Handle handle, RNNDescriptor rnnDesc, int seqLength,
+        TensorDescriptor* xDesc, void* x,
+        TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc, void* cx,
+        FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc, void* y,
+        TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, void* cy,
+        void* workspace, size_t workSpaceSizeInBytes, void* reserveSpace,
+        size_t reserveSpaceSizeInBytes)
+    int cudnnRNNBackwardData(
+        Handle handle, RNNDescriptor rnnDesc, int seqLength,
+        TensorDescriptor* yDesc, void* y,
+        TensorDescriptor* dyDesc, void* dy,
+        TensorDescriptor dhyDesc, void* dhy,
+        TensorDescriptor dcyDesc, void* dcy,
+        FilterDescriptor wDesc, void* w,
+        TensorDescriptor hxDesc, void* hx,
+        TensorDescriptor cxDesc, void* cx,
+        TensorDescriptor* dxDesc, void* dx,
+        TensorDescriptor dhxDesc, void* dhx,
+        TensorDescriptor dcxDesc, void* dcx, void* workspace,
+        size_t workSpaceSizeInBytes, void* reserveSpace,
+        size_t reserveSpaceSizeInBytes)
+    int cudnnRNNBackwardWeights(
+        Handle handle, RNNDescriptor rnnDesc, int seqLength,
+        TensorDescriptor* xDesc, void* x, TensorDescriptor hxDesc, void* hx,
+        TensorDescriptor* yDesc, void* y,
+        void* workspace, size_t workSpaceSizeInBytes, FilterDescriptor dwDesc,
+        void* dw, void* reserveSpace, size_t reserveSpaceSizeInBytes)
+
+    int cudnnRNNForwardInferenceEx(
+        Handle handle, RNNDescriptor rnnDesc,
+        RNNDataDescriptor xDesc, const void* x,
+        TensorDescriptor hxDesc, const void* hx,
+        TensorDescriptor cxDesc, const void* cx,
+        FilterDescriptor wDesc, const void* w,
+        RNNDataDescriptor yDesc, void* y,
+        TensorDescriptor hyDesc, void* hy,
+        TensorDescriptor cyDesc, void* cy,
+        RNNDataDescriptor kDesc, const void* keys,
+        RNNDataDescriptor cDesc, void* cAttn,
+        RNNDataDescriptor iDesc, void* iAttn,
+        RNNDataDescriptor qDesc, void* queries,
+        void* workSpace, size_t workSpaceSizeInBytes)
+    int cudnnRNNForwardTrainingEx(
+        Handle handle, RNNDescriptor rnnDesc,
+        RNNDataDescriptor xDesc, const void* x,
+        TensorDescriptor hxDesc, const void* hx,
+        TensorDescriptor cxDesc, const void* cx,
+        FilterDescriptor wDesc, const void* w,
+        RNNDataDescriptor yDesc, void* y,
+        TensorDescriptor hyDesc, void* hy,
+        TensorDescriptor cyDesc, void* cy,
+        RNNDataDescriptor kDesc, const void* keys,
+        RNNDataDescriptor cDesc, void* cAttn,
+        RNNDataDescriptor iDesc, void* iAttn,
+        RNNDataDescriptor qDesc, void* queries,
+        void* workSpace, size_t workSpaceSizeInBytes,
+        void* reserveSpace, size_t reserveSpaceSizeInBytes)
+    int cudnnRNNBackwardDataEx(
+        Handle handle, RNNDescriptor rnnDesc,
+        RNNDataDescriptor yDesc, const void* y,
+        RNNDataDescriptor dyDesc, const void* dy,
+        RNNDataDescriptor dcDesc, const void* dcAttn,
+        TensorDescriptor dhyDesc, const void* dhy,
+        TensorDescriptor dcyDesc, const void* dcy,
+        FilterDescriptor wDesc, const void* w,
+        TensorDescriptor hxDesc, const void* hx,
+        TensorDescriptor cxDesc, const void* cx,
+        RNNDataDescriptor dxDesc, void* dx,
+        TensorDescriptor dhxDesc, void* dhx,
+        TensorDescriptor dcxDesc, void* dcx,
+        RNNDataDescriptor dkDesc, void* dkeys,
+        void* workSpace, size_t workSpaceSizeInBytes,
+        void* reserveSpace, size_t reserveSpaceSizeInBytes)
+    int cudnnRNNBackwardWeightsEx(
+        Handle handle, RNNDescriptor rnnDesc,
+        RNNDataDescriptor xDesc, const void* x,
+        TensorDescriptor hxDesc, const void* hx,
+        RNNDataDescriptor yDesc, const void* y,
+        void* workSpace, size_t workSpaceSizeInBytes,
+        FilterDescriptor dwDesc, void* dw,
+        void* reserveSpace, size_t reserveSpaceSizeInBytes)
+
+    # Spatial Transformer
+    int cudnnCreateSpatialTransformerDescriptor(
+        SpatialTransformerDescriptor* stDesc)
+    int cudnnDestroySpatialTransformerDescriptor(
+        SpatialTransformerDescriptor stDesc)
+    int cudnnSetSpatialTransformerNdDescriptor(
+        SpatialTransformerDescriptor stDesc, SamplerType samplerType,
+        DataType dataType, int nbDims, int dimA[])
+    int cudnnSpatialTfGridGeneratorForward(
+        Handle handle, SpatialTransformerDescriptor stDesc,
+        void* theta, void* grid)
+    int cudnnSpatialTfGridGeneratorBackward(
+        Handle handle, SpatialTransformerDescriptor stDesc,
+        void* dgrid, void* dtheta)
+    int cudnnSpatialTfSamplerForward(
+        Handle handle, SpatialTransformerDescriptor stDesc,
+        void* alpha, TensorDescriptor xDesc, void* x,
+        void* grid, void* beta, TensorDescriptor yDesc, void* y)
+    int cudnnSpatialTfSamplerBackward(
+        Handle handle, SpatialTransformerDescriptor stDesc,
+        void* alpha, TensorDescriptor xDesc, void* x, void* beta,
+        TensorDescriptor dxDesc, void* dx, void* alphaDgrid,
+        TensorDescriptor dyDesc, void* dy, void* grid,
+        void* betaDgrid, void* dgrid)
+
+    # Fused Ops
+    int cudnnCreateFusedOpsConstParamPack(
+        FusedOpsConstParamPack* constPack, int ops)
+    int cudnnDestroyFusedOpsConstParamPack(FusedOpsConstParamPack constPack)
+    int cudnnSetFusedOpsConstParamPackAttribute(
+        FusedOpsConstParamPack constPack, FusedOpsConstParamLabel paramLabel,
+        const void *param)
+    int cudnnGetFusedOpsConstParamPackAttribute(
+        const FusedOpsConstParamPack constPack,
+        FusedOpsConstParamLabel paramLabel, void *param, int *isNULL)
+    int
cudnnCreateFusedOpsVariantParamPack(
+        FusedOpsVariantParamPack *varPack, FusedOps ops)
+    int cudnnDestroyFusedOpsVariantParamPack(FusedOpsVariantParamPack varPack)
+    int cudnnSetFusedOpsVariantParamPackAttribute(
+        FusedOpsVariantParamPack varPack, FusedOpsVariantParamLabel paramLabel,
+        void *ptr)
+    int cudnnGetFusedOpsVariantParamPackAttribute(
+        const FusedOpsVariantParamPack varPack,
+        FusedOpsVariantParamLabel paramLabel, void *ptr)
+    int cudnnCreateFusedOpsPlan(FusedOpsPlan *plan, FusedOps ops)
+    int cudnnDestroyFusedOpsPlan(FusedOpsPlan plan)
+    int cudnnMakeFusedOpsPlan(
+        Handle handle, FusedOpsPlan plan,
+        const FusedOpsConstParamPack constPack, size_t *workspaceSizeInBytes)
+    int cudnnFusedOpsExecute(
+        Handle handle, const FusedOpsPlan plan,
+        FusedOpsVariantParamPack varPack)
+
+    # Build-time version
+    int CUDNN_VERSION
+
+    # Constants
+    double _CUDNN_BN_MIN_EPSILON 'CUDNN_BN_MIN_EPSILON'
+
+
+cdef class CuDNNAlgoPerf:
+    """Plain record holding one algorithm-performance result.
+
+    The six constructor arguments are stored unchanged on the instance
+    under the same names.
+    """
+    # NOTE(review): as a ``cdef class`` these attributes presumably need
+    # matching declarations in a companion .pxd -- confirm one exists.
+
+    def __init__(self, algo, status, time, memory, determinism, mathType):
+        self.algo = algo
+        self.status = status
+        self.time = time
+        self.memory = memory
+        self.determinism = determinism
+        self.mathType = mathType
+
+
+###############################################################################
+# Error handling
+###############################################################################
+
+class CuDNNError(RuntimeError):
+    """Exception raised by ``check_status`` for a non-zero status code.
+
+    Attributes:
+        status (int): The status code passed to the constructor.
+    """
+
+    def __init__(self, int status):
+        self.status = status
+        # This module's extern block declares only miopenGetErrorString;
+        # the cuDNN-named function is not declared in this file.
+        msg = miopenGetErrorString(<Status>status)
+        super(CuDNNError, self).__init__(
+            'cuDNN Error: {}'.format(msg.decode()))
+        # Extra context lines appended to the message by __str__.
+        self._infos = []
+
+    def add_info(self, info):
+        """Append one ``str`` line of additional context."""
+        assert isinstance(info, str)
+        self._infos.append(info)
+
+    def add_infos(self, infos):
+        """Append every entry of the ``list`` ``infos`` as context."""
+        assert isinstance(infos, list)
+        self._infos.extend(infos)
+
+    def __str__(self):
+        """Return the base message followed by one line per info entry."""
+        base = super(CuDNNError, self).__str__()
+        return base + ''.join(
+            '\n  ' + info for info in self._infos)
+
+    def __reduce__(self):
+        """Pickle as ``type(self)(status)``; info entries are not kept."""
+        return (type(self), (self.status,))
+
+
+@cython.profile(False)
+cpdef inline
check_status(int status):
+    """Raise ``CuDNNError`` when ``status`` is non-zero."""
+    if status != 0:
+        raise CuDNNError(status)
+
+
+###############################################################################
+# Build-time version
+###############################################################################
+
+def get_build_version():
+    """Return the ``CUDNN_VERSION`` constant taken from the C header."""
+    return CUDNN_VERSION
+
+
+###############################################################################
+# Version
+###############################################################################
+
+cpdef size_t getVersion() except? 0:
+    """Return the value reported by ``miopenGetVersion()``."""
+    # Only the miopen* symbol is declared in this module's extern block;
+    # the cuDNN-named function is not declared in this file.
+    # NOTE(review): MIOpen's public header documents miopenGetVersion with
+    # three size_t* out-parameters -- confirm cupy_cudnn.h provides the
+    # zero-argument form declared in this file.
+    return miopenGetVersion()
+
+
+###############################################################################
+# Runtime error checking
+###############################################################################
+
+cpdef queryRuntimeError(intptr_t handle, int mode):
+    """Query the runtime status for ``handle`` and return it.
+
+    Raises:
+        CuDNNError: If the query call itself returns a non-zero status.
+    """
+    cdef Status rstatus
+    with nogil:
+        # Integer arguments must be cast explicitly to the pointer/enum
+        # types of the extern signature; the tag pointer is passed as NULL.
+        status = cudnnQueryRuntimeError(<Handle>handle, &rstatus,
+                                        <ErrQueryMode>mode, <RuntimeTag*>0)
+    check_status(status)
+    return rstatus
+
+
+###############################################################################
+# Initialization and CUDA cooperation
+###############################################################################
+
+cpdef intptr_t create() except? 0:
+    """Create a library handle and return it as an integer.
+
+    Raises:
+        CuDNNError: If ``miopenCreate`` returns a non-zero status.
+    """
+    cdef Handle handle
+    with nogil:
+        status = miopenCreate(&handle)
+    check_status(status)
+    # The opaque pointer is handed to Python code as an integer.
+    return <intptr_t>handle
+
+
+cpdef destroy(intptr_t handle):
+    """Destroy a handle previously returned by ``create``.
+
+    Raises:
+        CuDNNError: If ``miopenDestroy`` returns a non-zero status.
+    """
+    with nogil:
+        status = miopenDestroy(<Handle>handle)
+    check_status(status)
+
+
+cpdef setStream(intptr_t handle, size_t stream):
+    """Bind ``stream`` (a stream pointer as an integer) to ``handle``.
+
+    Raises:
+        NotImplementedError: Outside a HIP environment, if ``stream`` is
+            currently capturing.
+        CuDNNError: If ``miopenSetStream`` returns a non-zero status.
+    """
+    # TODO(leofang): The support of stream capture is not mentioned at all in
+    # the cuDNN docs (as of CUDA 11.5), so we disable this functionality.
+    if not runtime._is_hip_environment and runtime.streamIsCapturing(stream):
+        raise NotImplementedError(
+            'calling cuDNN API during stream capture is currently '
+            'unsupported')
+
+    status = miopenSetStream(<Handle>handle, <driver.Stream>stream)
+    check_status(status)
+
+
+cpdef size_t getStream(intptr_t handle) except?
0:
+    cdef driver.Stream stream
+    status = miopenGetStream(handle, &stream)
+    check_status(status)
+    return stream
+
+
+# Internal helper used by the compute wrappers below: points `handle` at the
+# stream returned by stream_module.get_current_stream_ptr().
+cdef _setStream(intptr_t handle):
+    """Set current stream"""
+    setStream(handle, stream_module.get_current_stream_ptr())
+
+###############################################################################
+# Tensor manipulation
+###############################################################################
+
+# Create a tensor descriptor and return it.
+cpdef size_t createTensorDescriptor() except? 0:
+    cdef TensorDescriptor descriptor
+    status = miopenCreateTensorDescriptor(&descriptor)
+    check_status(status)
+    return descriptor
+
+
+# Set data type and n/c/h/w extents on a 4-d tensor descriptor.
+# NOTE(review): `format` is accepted but is not forwarded to
+# miopenSet4dTensorDescriptor; confirm callers do not rely on it.
+cpdef setTensor4dDescriptor(size_t tensorDesc, int format, int dataType,
+                            int n, int c, int h, int w):
+    status = miopenSet4dTensorDescriptor(
+        tensorDesc,
+        dataType, n, c, h, w)
+    check_status(status)
+
+
+# Same as above but with explicit per-dimension strides.
+cpdef setTensor4dDescriptorEx(size_t tensorDesc, int dataType,
+                              int n, int c, int h, int w, int nStride,
+                              int cStride, int hStride, int wStride):
+    status = miopenSet4dTensorDescriptorEx(
+        tensorDesc, dataType, n, c, h, w,
+        nStride, cStride, hStride, wStride)
+    check_status(status)
+
+
+# Read a 4-d tensor descriptor back as the tuple
+# (dataType, n, c, h, w, nStride, cStride, hStride, wStride).
+cpdef tuple getTensor4dDescriptor(size_t tensorDesc):
+    cdef DataType dataType
+    cdef int n, c, h, w, nStride, cStride, hStride, wStride
+    status = miopenGet4dTensorDescriptor(
+        tensorDesc, &dataType,
+        &n, &c, &h, &w, &nStride, &cStride, &hStride, &wStride)
+    check_status(status)
+    return dataType, n, c, h, w, nStride, cStride, hStride, wStride
+
+
+# Configure an n-d tensor descriptor; `dimA` and `strideA` are passed through
+# as integers (presumably addresses of int arrays -- confirm against callers).
+# NOTE(review): calls a cudnn* symbol while the neighbouring wrappers use
+# miopen*; confirm this is intended.
+cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims,
+                            size_t dimA, size_t strideA):
+    status = cudnnSetTensorNdDescriptor(
+        tensorDesc, dataType, nbDims,
+        dimA, strideA)
+    check_status(status)
+
+
+# Destroy a tensor descriptor.
+cpdef destroyTensorDescriptor(size_t tensorDesc):
+    status = miopenDestroyTensorDescriptor(tensorDesc)
+    check_status(status)
+
+
+# Bind `handle` to the current stream, then run cudnnAddTensor_v3 with the GIL
+# released; a non-zero status raises CuDNNError.
+cpdef addTensor_v3(intptr_t handle, size_t alpha, size_t bDesc,
+                   size_t b, size_t beta, size_t yDesc, size_t y):
+    _setStream(handle)
+    with nogil:
+        status =
cudnnAddTensor_v3(
+            handle, alpha, bDesc,
+            b, beta, yDesc, y)
+    check_status(status)
+
+
+###############################################################################
+# Tensor operations
+###############################################################################
+
+# Create an op-tensor descriptor and return it.
+cpdef size_t createOpTensorDescriptor() except? 0:
+    cdef OpTensorDescriptor opTensorDesc
+    status = cudnnCreateOpTensorDescriptor(&opTensorDesc)
+    check_status(status)
+    return opTensorDesc
+
+
+# Set operation, compute type and NaN option on an op-tensor descriptor.
+cpdef setOpTensorDescriptor(size_t opTensorDesc, int opTensorOp,
+                            int opTensorCompType, int opTensorNanOpt):
+    status = cudnnSetOpTensorDescriptor(
+        opTensorDesc, opTensorOp,
+        opTensorCompType, opTensorNanOpt)
+    check_status(status)
+
+
+# Read an op-tensor descriptor back as (op, compute type, NaN option).
+cpdef getOpTensorDescriptor(size_t opTensorDesc):
+    cdef OpTensorOp opTensorOp
+    cdef DataType opTensorCompType
+    cdef NanPropagation opTensorNanOpt
+    status = cudnnGetOpTensorDescriptor(
+        opTensorDesc, &opTensorOp, &opTensorCompType,
+        &opTensorNanOpt)
+    check_status(status)
+    return opTensorOp, opTensorCompType, opTensorNanOpt
+
+
+# Destroy an op-tensor descriptor.
+cpdef destroyOpTensorDescriptor(size_t opTensorDesc):
+    status = cudnnDestroyOpTensorDescriptor(opTensorDesc)
+    check_status(status)
+
+
+# Bind `handle` to the current stream, then run cudnnOpTensor on A and B into
+# C with the GIL released; a non-zero status raises CuDNNError.
+cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1,
+               size_t aDesc, size_t A, size_t alpha2, size_t bDesc,
+               size_t B, size_t beta, size_t cDesc, size_t C):
+    _setStream(handle)
+    with nogil:
+        status = cudnnOpTensor(
+            handle, opTensorDesc, alpha1,
+            aDesc, A, alpha2,
+            bDesc, B, beta,
+            cDesc, C)
+    check_status(status)
+
+
+###############################################################################
+# Tensor reductions
+###############################################################################
+
+# Create a reduce-tensor descriptor and return it.
+cpdef size_t createReduceTensorDescriptor() except?
0:
+    cdef ReduceTensorDescriptor reduceTensorDesc
+    status = cudnnCreateReduceTensorDescriptor(&reduceTensorDesc)
+    check_status(status)
+    return reduceTensorDesc
+
+# Set op, compute type, NaN option, indices mode and indices type on a
+# reduce-tensor descriptor.
+cpdef setReduceTensorDescriptor(
+        size_t reduceTensorDesc, int reduceTensorOp, int reduceTensorCompType,
+        int reduceTensorNanOpt, int reduceTensorIndices,
+        int reduceTensorIndicesType):
+    status = cudnnSetReduceTensorDescriptor(
+        reduceTensorDesc,
+        reduceTensorOp,
+        reduceTensorCompType, reduceTensorNanOpt,
+        reduceTensorIndices,
+        reduceTensorIndicesType)
+    check_status(status)
+
+
+# Read a reduce-tensor descriptor back as
+# (op, compute type, NaN option, indices mode, indices type).
+cpdef getReduceTensorDescriptor(size_t reduceTensorDesc):
+    cdef ReduceTensorOp redOp
+    cdef DataType redCompType
+    cdef NanPropagation redNanOpt
+    cdef ReduceTensorIndices redIndices
+    cdef IndicesType redIndicesType
+    status = cudnnGetReduceTensorDescriptor(
+        reduceTensorDesc, &redOp,
+        &redCompType, &redNanOpt, &redIndices, &redIndicesType)
+    check_status(status)
+    return redOp, redCompType, redNanOpt, redIndices, redIndicesType
+
+
+# Destroy a reduce-tensor descriptor.
+cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc):
+    status = cudnnDestroyReduceTensorDescriptor(
+        reduceTensorDesc)
+    check_status(status)
+
+
+# Return the size in bytes reported by cudnnGetReductionIndicesSize for the
+# given reduction and tensor descriptors.
+cpdef size_t getReductionIndicesSize(intptr_t handle, size_t reduceTensorDesc,
+                                     size_t aDesc, size_t cDesc) except? 0:
+    cdef size_t sizeInBytes
+    status = cudnnGetReductionIndicesSize(
+        handle, reduceTensorDesc,
+        aDesc, cDesc, &sizeInBytes)
+    check_status(status)
+    return sizeInBytes
+
+
+# Return the size in bytes reported by cudnnGetReductionWorkspaceSize for the
+# given reduction and tensor descriptors.
+cpdef size_t getReductionWorkspaceSize(intptr_t handle,
+                                       size_t reduceTensorDesc,
+                                       size_t aDesc, size_t cDesc) except?
0:
+    cdef size_t sizeInBytes
+    status = cudnnGetReductionWorkspaceSize(
+        handle, reduceTensorDesc,
+        aDesc, cDesc,
+        &sizeInBytes)
+    check_status(status)
+    return sizeInBytes
+
+
+# Bind `handle` to the current stream, then run cudnnReduceTensor with the GIL
+# released; a non-zero status raises CuDNNError.
+cpdef reduceTensor(intptr_t handle, size_t reduceTensorDesc, size_t indices,
+                   size_t indicesSizeInBytes, size_t workspace,
+                   size_t workspaceSizeInBytes, size_t alpha, size_t aDesc,
+                   size_t A, size_t beta, size_t cDesc, size_t C):
+    _setStream(handle)
+    with nogil:
+        status = cudnnReduceTensor(
+            handle, reduceTensorDesc,
+            indices, indicesSizeInBytes, workspace,
+            workspaceSizeInBytes, alpha, aDesc,
+            A, beta, cDesc, C)
+    check_status(status)
+
+
+# Bind `handle` to the current stream, then run cudnnSetTensor on `y` with the
+# value passed through `valuePtr`.
+cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr):
+    _setStream(handle)
+    with nogil:
+        status = cudnnSetTensor(
+            handle, yDesc, y,
+            valuePtr)
+    check_status(status)
+
+
+# Bind `handle` to the current stream, then run cudnnScaleTensor on `y` with
+# the factor passed through `alpha`.
+cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha):
+    _setStream(handle)
+    with nogil:
+        status = cudnnScaleTensor(
+            handle, yDesc, y,
+            alpha)
+    check_status(status)
+
+
+###############################################################################
+# Filter manipulation
+###############################################################################
+
+# Create a filter descriptor and return it.
+cpdef size_t createFilterDescriptor() except?
0:
+    cdef FilterDescriptor desc
+    status = cudnnCreateFilterDescriptor(&desc)
+    check_status(status)
+    return desc
+
+
+# Set data type, format and k/c/h/w extents on a 4-d filter descriptor.
+cpdef setFilter4dDescriptor_v4(
+        size_t filterDesc, int dataType,
+        int format, int k, int c, int h, int w):
+    status = cudnnSetFilter4dDescriptor_v4(
+        filterDesc, dataType,
+        format, k, c, h, w)
+    check_status(status)
+
+
+# Configure an n-d filter descriptor; `filterDimA` is passed through as an
+# integer (presumably the address of an int array -- confirm against callers).
+cpdef setFilterNdDescriptor_v4(
+        size_t filterDesc, int dataType,
+        int format, int nbDims, size_t filterDimA):
+    status = cudnnSetFilterNdDescriptor_v4(
+        filterDesc, dataType,
+        format, nbDims, filterDimA)
+    check_status(status)
+
+
+# Read a filter descriptor back as (dataType, format, nbDims, dims tuple).
+cpdef getFilterNdDescriptor(size_t wDesc, int nbDimsRequested):
+    cdef DataType dataType
+    cdef TensorFormat format
+    cdef int nbDims
+    cdef vector.vector[int] filterDimA
+    # The output buffer is sized to the requested dimension count; the returned
+    # tuple therefore has nbDimsRequested entries regardless of nbDims.
+    filterDimA.resize(nbDimsRequested)
+
+    status = cudnnGetFilterNdDescriptor_v4(
+        wDesc, nbDimsRequested, &dataType,
+        &format, &nbDims, filterDimA.data())
+    check_status(status)
+    return dataType, format, nbDims, tuple(filterDimA)
+
+
+# Destroy a filter descriptor.
+cpdef destroyFilterDescriptor(size_t filterDesc):
+    status = cudnnDestroyFilterDescriptor(filterDesc)
+    check_status(status)
+
+
+###############################################################################
+# Convolution
+###############################################################################
+
+# Create a convolution descriptor and return it.
+cpdef size_t createConvolutionDescriptor() except? 0:
+    cdef ConvolutionDescriptor desc
+    status = miopenCreateConvolutionDescriptor(&desc)
+    check_status(status)
+    return desc
+
+
+# Set the math type on a convolution descriptor.
+# NOTE(review): calls a cudnn* symbol on a descriptor that is created through
+# miopenCreateConvolutionDescriptor above; confirm this is intended.
+cpdef setConvolutionMathType(size_t convDesc, size_t mathType):
+    status = cudnnSetConvolutionMathType(
+        convDesc, mathType)
+    check_status(status)
+
+
+# Return the math type stored in a convolution descriptor.
+cpdef size_t getConvolutionMathType(size_t convDesc) except?
0:
+    cdef MathType mathType
+    status = cudnnGetConvolutionMathType(
+        convDesc, &mathType)
+    check_status(status)
+    return mathType
+
+
+# Set the group count on a convolution descriptor.
+cpdef setConvolutionGroupCount(size_t convDesc, int groupCount):
+    status = miopenSetConvolutionGroupCount(
+        convDesc, groupCount)
+    check_status(status)
+
+
+# Return the group count stored in a convolution descriptor.
+cpdef int getConvolutionGroupCount(size_t convDesc) except? -1:
+    cdef int groupCount
+    status = cudnnGetConvolutionGroupCount(
+        convDesc, &groupCount)
+    check_status(status)
+    return groupCount
+
+
+# Configure a 2-d convolution descriptor (padding, strides u/v, dilation,
+# mode) through the _v4 entry point, which takes no compute type.
+cpdef setConvolution2dDescriptor_v4(
+        size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h,
+        int dilation_w, int mode):
+    status = cudnnSetConvolution2dDescriptor_v4(
+        convDesc, pad_h, pad_w, u, v, dilation_h,
+        dilation_w, mode)
+    check_status(status)
+
+
+# Same as the _v4 wrapper with an additional `computeType` argument.
+cpdef setConvolution2dDescriptor_v5(
+        size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h,
+        int dilation_w, int mode, size_t computeType):
+    status = cudnnSetConvolution2dDescriptor_v5(
+        convDesc, pad_h, pad_w, u, v, dilation_h,
+        dilation_w, mode, computeType)
+    check_status(status)
+
+
+# Configure an n-d convolution descriptor; padA/filterStrideA/dilationA are
+# passed through as integers (presumably addresses of int arrays of length
+# arrayLength -- confirm against callers).
+cpdef setConvolutionNdDescriptor_v3(
+        size_t convDesc, int arrayLength, size_t padA, size_t filterStrideA,
+        size_t dilationA, int mode, int dataType):
+    status = cudnnSetConvolutionNdDescriptor_v3(
+        convDesc, arrayLength, padA,
+        filterStrideA, dilationA, mode,
+        dataType)
+    check_status(status)
+
+
+# Destroy a convolution descriptor.
+cpdef destroyConvolutionDescriptor(size_t convDesc):
+    status = miopenDestroyConvolutionDescriptor(
+        convDesc)
+    check_status(status)
+
+
+# Run the forward-algorithm search and return the raw perf-result vector,
+# trimmed to the number of entries the library reported.
+cpdef findConvolutionForwardAlgorithm(
+        intptr_t handle, size_t xDesc, size_t wDesc, size_t convDesc,
+        size_t yDesc, int requestedAlgoCount):
+    cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnFindConvolutionForwardAlgorithm(
+        handle, xDesc, wDesc,
+        convDesc, yDesc,
+        requestedAlgoCount, &returnedAlgoCount, perfResults.data())
+    check_status(status)
+
perfResults.resize(returnedAlgoCount)
+    return perfResults
+
+
+# Forward-algorithm search on caller-supplied buffers and workspace.  Returns
+# one CuDNNAlgoPerf per reported result; determinism and mathType are filled
+# with -1 because this result struct is not read for them.
+cpdef list findConvolutionForwardAlgorithmEx(
+        intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w,
+        size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount,
+        size_t workSpace, size_t workSpaceSizeInBytes):
+    cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnFindConvolutionForwardAlgorithmEx(
+        handle, xDesc, x,
+        wDesc, w, convDesc,
+        yDesc, y, requestedAlgoCount,
+        &returnedAlgoCount, perfResults.data(), workSpace,
+        workSpaceSizeInBytes)
+    check_status(status)
+    # Keep only the entries the library actually filled in.
+    perfResults.resize(returnedAlgoCount)
+    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1)
+            for p in perfResults]
+
+
+# Same search through the _v7 entry point, whose result struct also provides
+# determinism and mathType.
+cpdef list findConvolutionForwardAlgorithmEx_v7(
+        intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w,
+        size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount,
+        size_t workSpace, size_t workSpaceSizeInBytes):
+    cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnFindConvolutionForwardAlgorithmEx_v7(
+        handle, xDesc, x,
+        wDesc, w, convDesc,
+        yDesc, y, requestedAlgoCount,
+        &returnedAlgoCount, perfResults.data(), workSpace,
+        workSpaceSizeInBytes)
+    check_status(status)
+    # Keep only the entries the library actually filled in.
+    perfResults.resize(returnedAlgoCount)
+    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory,
+                          p.determinism, p.mathType)
+            for p in perfResults]
+
+
+# Return the single forward algorithm chosen by the _v6 entry point for the
+# given preference and memory limit.
+cpdef int getConvolutionForwardAlgorithm_v6(
+        intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc,
+        size_t destDesc, int preference, size_t memoryLimitInbytes) except?
-1:
+    cdef ConvolutionFwdAlgo algo
+    status = cudnnGetConvolutionForwardAlgorithm_v6(
+        handle, srcDesc,
+        filterDesc, convDesc,
+        destDesc, preference,
+        memoryLimitInbytes, &algo)
+    check_status(status)
+    return algo
+
+
+# Return up to `requestedAlgoCount` forward-algorithm candidates from the _v7
+# entry point, one CuDNNAlgoPerf per reported result.
+cpdef list getConvolutionForwardAlgorithm_v7(
+        intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc,
+        size_t destDesc, int requestedAlgoCount):
+    cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnGetConvolutionForwardAlgorithm_v7(
+        handle, srcDesc,
+        filterDesc, convDesc,
+        destDesc, requestedAlgoCount,
+        &returnedAlgoCount, perfResults.data())
+    check_status(status)
+    # Keep only the entries the library actually filled in.
+    perfResults.resize(returnedAlgoCount)
+    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory,
+                          p.determinism, p.mathType)
+            for p in perfResults]
+
+
+# Return the workspace size in bytes for a forward convolution.
+cpdef Py_ssize_t getConvolutionForwardWorkspaceSize(
+        intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc,
+        size_t destDesc, int algo) except?
-1:
+    cdef size_t sizeInBytes
+    # MIOpen's argument order is (handle, wDesc, xDesc, convDesc, yDesc, size):
+    # the filter descriptor comes BEFORE the input descriptor, the reverse of
+    # the cuDNN-style order of this wrapper's parameters.
+    # NOTE(review): any pointer casts applied to these two arguments in the
+    # real file must follow the swap.  `algo` is accepted but not forwarded.
+    status = miopenConvolutionForwardGetWorkSpaceSize(
+        handle, filterDesc,
+        srcDesc, convDesc,
+        destDesc, &sizeInBytes)
+    check_status(status)
+    return sizeInBytes
+
+
+# Bind `handle` to the current stream, then run cudnnConvolutionForward with
+# the GIL released; a non-zero status raises CuDNNError.
+cpdef convolutionForward(
+        intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData,
+        size_t filterDesc, size_t filterData, size_t convDesc, int algo,
+        size_t workSpace, size_t workSpaceSizeInBytes, size_t beta,
+        size_t destDesc, size_t destData):
+    _setStream(handle)
+    with nogil:
+        status = cudnnConvolutionForward(
+            handle, alpha,
+            srcDesc, srcData,
+            filterDesc, filterData,
+            convDesc, algo,
+            workSpace, workSpaceSizeInBytes, beta,
+            destDesc, destData)
+    check_status(status)
+
+
+# Bind `handle` to the current stream, then run cudnnConvolutionBackwardBias
+# with the GIL released.
+cpdef convolutionBackwardBias(
+        intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData,
+        size_t beta, size_t destDesc, size_t destData):
+    _setStream(handle)
+    with nogil:
+        status = cudnnConvolutionBackwardBias(
+            handle, alpha,
+            srcDesc, srcData, beta,
+            destDesc, destData)
+    check_status(status)
+
+
+# Run the backward-filter algorithm search and return the raw perf-result
+# vector, trimmed to the number of entries the library reported.
+cpdef findConvolutionBackwardFilterAlgorithm(
+        intptr_t handle, size_t xDesc, size_t dyDesc, size_t convDesc,
+        size_t dwDesc, int requestedAlgoCount):
+    cdef vector.vector[ConvolutionBwdFilterAlgoPerf] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnFindConvolutionBackwardFilterAlgorithm(
+        handle, xDesc, dyDesc,
+        convDesc, dwDesc,
+        requestedAlgoCount, &returnedAlgoCount, perfResults.data())
+    check_status(status)
+    perfResults.resize(returnedAlgoCount)
+    return perfResults
+
+
+# Backward-filter algorithm search on caller-supplied buffers and workspace;
+# returns one CuDNNAlgoPerf per reported result.
+cpdef list findConvolutionBackwardFilterAlgorithmEx(
+        intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy,
+        size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount,
+        size_t workSpace, size_t workSpaceSizeInBytes):
+    cdef vector.vector[ConvolutionBwdFilterAlgoPerf] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnFindConvolutionBackwardFilterAlgorithmEx(
+        handle,
xDesc, x,
+        dyDesc, dy, convDesc,
+        dwDesc, dw,
+        requestedAlgoCount, &returnedAlgoCount, perfResults.data(),
+        workSpace, workSpaceSizeInBytes)
+    check_status(status)
+    # Keep only the entries the library actually filled in.
+    perfResults.resize(returnedAlgoCount)
+    # determinism and mathType are filled with -1: this result struct is not
+    # read for them.
+    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1)
+            for p in perfResults]
+
+
+# Same search through the _v7 entry point, whose result struct also provides
+# determinism and mathType.
+cpdef list findConvolutionBackwardFilterAlgorithmEx_v7(
+        intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy,
+        size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount,
+        size_t workSpace, size_t workSpaceSizeInBytes):
+    cdef vector.vector[ConvolutionBwdFilterAlgoPerf_v7] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnFindConvolutionBackwardFilterAlgorithmEx_v7(
+        handle, xDesc, x,
+        dyDesc, dy, convDesc,
+        dwDesc, dw,
+        requestedAlgoCount, &returnedAlgoCount, perfResults.data(),
+        workSpace, workSpaceSizeInBytes)
+    check_status(status)
+    # Keep only the entries the library actually filled in.
+    perfResults.resize(returnedAlgoCount)
+    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory,
+                          p.determinism, p.mathType)
+            for p in perfResults]
+
+
+# Return the single backward-filter algorithm chosen by the _v6 entry point
+# for the given preference and memory limit.
+cpdef int getConvolutionBackwardFilterAlgorithm_v6(
+        intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc,
+        size_t filterDesc, int preference,
+        size_t memoryLimitInbytes) except?
-1:
+    cdef ConvolutionBwdFilterAlgo algo
+    status = cudnnGetConvolutionBackwardFilterAlgorithm_v6(
+        handle, srcDesc,
+        diffDesc, convDesc,
+        filterDesc,
+        preference,
+        memoryLimitInbytes, &algo)
+    check_status(status)
+    return algo
+
+
+# Return up to `requestedAlgoCount` backward-filter candidates from the _v7
+# entry point, one CuDNNAlgoPerf per reported result.
+cpdef list getConvolutionBackwardFilterAlgorithm_v7(
+        intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc,
+        size_t gradDesc, int requestedAlgoCount):
+    cdef vector.vector[ConvolutionBwdFilterAlgoPerf_v7] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnGetConvolutionBackwardFilterAlgorithm_v7(
+        handle, srcDesc, diffDesc,
+        convDesc, gradDesc,
+        requestedAlgoCount, &returnedAlgoCount, perfResults.data())
+    check_status(status)
+    # Keep only the entries the library actually filled in.
+    perfResults.resize(returnedAlgoCount)
+    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory,
+                          p.determinism, p.mathType)
+            for p in perfResults]
+
+
+# Return the workspace size in bytes for the given backward-filter algorithm.
+cpdef Py_ssize_t getConvolutionBackwardFilterWorkspaceSize(
+        intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc,
+        size_t filterDesc, int algo) except?
-1:
+    cdef size_t sizeInBytes
+    status = cudnnGetConvolutionBackwardFilterWorkspaceSize(
+        handle, srcDesc,
+        diffDesc, convDesc,
+        filterDesc, algo,
+        &sizeInBytes)
+    check_status(status)
+    return sizeInBytes
+
+
+# Bind `handle` to the current stream, then run
+# cudnnConvolutionBackwardFilter_v3 with the GIL released; a non-zero status
+# raises CuDNNError.
+cpdef convolutionBackwardFilter_v3(
+        intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData,
+        size_t diffDesc, size_t diffData, size_t convDesc, int algo,
+        size_t workSpace, size_t workSpaceSizeInBytes, size_t beta,
+        size_t gradDesc, size_t gradData):
+    _setStream(handle)
+    with nogil:
+        status = cudnnConvolutionBackwardFilter_v3(
+            handle, alpha,
+            srcDesc, srcData,
+            diffDesc, diffData,
+            convDesc, algo,
+            workSpace, workSpaceSizeInBytes, beta,
+            gradDesc, gradData)
+    check_status(status)
+
+
+# Run the backward-data algorithm search and return the raw perf-result
+# vector, trimmed to the number of entries the library reported.
+cpdef findConvolutionBackwardDataAlgorithm(
+        intptr_t handle, size_t wDesc, size_t dyDesc, size_t convDesc,
+        size_t dxDesc, int requestedAlgoCount):
+    cdef vector.vector[ConvolutionBwdDataAlgoPerf] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnFindConvolutionBackwardDataAlgorithm(
+        handle, wDesc, dyDesc,
+        convDesc, dxDesc,
+        requestedAlgoCount, &returnedAlgoCount, perfResults.data())
+    check_status(status)
+    perfResults.resize(returnedAlgoCount)
+    return perfResults
+
+
+# Backward-data algorithm search on caller-supplied buffers and workspace.
+# Returns one CuDNNAlgoPerf per reported result; determinism and mathType are
+# filled with -1 because this result struct is not read for them.
+cpdef list findConvolutionBackwardDataAlgorithmEx(
+        intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy,
+        size_t convDesc, size_t dxDesc, size_t dx,
+        int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes):
+    cdef vector.vector[ConvolutionBwdDataAlgoPerf] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnFindConvolutionBackwardDataAlgorithmEx(
+        handle, wDesc, w,
+        dyDesc, dy, convDesc,
+        dxDesc, dx,
+        requestedAlgoCount, &returnedAlgoCount, perfResults.data(),
+        workSpace, workSpaceSizeInBytes)
+    check_status(status)
+    # Keep only the entries the library actually filled in.
+    perfResults.resize(returnedAlgoCount)
+    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1)
+            for p in perfResults]
+
+
+# Backward-data algorithm search through the _v7 entry point on
+# caller-supplied buffers and workspace; the result struct also provides
+# determinism and mathType.
+cpdef list findConvolutionBackwardDataAlgorithmEx_v7(
+        intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy,
+        size_t convDesc, size_t dxDesc, size_t dx,
+        int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes):
+    cdef vector.vector[ConvolutionBwdDataAlgoPerf_v7] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnFindConvolutionBackwardDataAlgorithmEx_v7(
+        handle, wDesc, w,
+        dyDesc, dy, convDesc,
+        dxDesc, dx,
+        requestedAlgoCount, &returnedAlgoCount, perfResults.data(),
+        workSpace, workSpaceSizeInBytes)
+    check_status(status)
+    # Keep only the entries the library actually filled in.
+    perfResults.resize(returnedAlgoCount)
+    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory,
+                          p.determinism, p.mathType)
+            for p in perfResults]
+
+
+# Return the single backward-data algorithm chosen by the _v6 entry point for
+# the given preference and memory limit.
+cpdef int getConvolutionBackwardDataAlgorithm_v6(
+        intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc,
+        size_t gradDesc, size_t preference,
+        size_t memoryLimitInbytes) except? -1:
+    cdef ConvolutionBwdDataAlgo algo
+    status = cudnnGetConvolutionBackwardDataAlgorithm_v6(
+        handle, filterDesc,
+        diffDesc, convDesc,
+        gradDesc, preference,
+        memoryLimitInbytes, &algo)
+    check_status(status)
+    return algo
+
+
+# Return up to `requestedAlgoCount` backward-data candidates from the _v7
+# entry point, one CuDNNAlgoPerf per reported result.
+cpdef list getConvolutionBackwardDataAlgorithm_v7(
+        intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc,
+        size_t gradDesc, int requestedAlgoCount):
+    cdef vector.vector[ConvolutionBwdDataAlgoPerf_v7] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnGetConvolutionBackwardDataAlgorithm_v7(
+        handle, filterDesc,
+        diffDesc, convDesc,
+        gradDesc, requestedAlgoCount,
+        &returnedAlgoCount, perfResults.data())
+    check_status(status)
+    # Keep only the entries the library actually filled in.
+    perfResults.resize(returnedAlgoCount)
+    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory,
+                          p.determinism, p.mathType)
+            for p in perfResults]
+
+
+# Return the workspace size in bytes for a backward-data convolution.
+cpdef Py_ssize_t getConvolutionBackwardDataWorkspaceSize(
+        intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc,
+        size_t
gradDesc, int algo) except? -1:
+    cdef size_t sizeInBytes
+    # MIOpen's argument order is (handle, dyDesc, wDesc, convDesc, dxDesc,
+    # size): the output-gradient descriptor comes BEFORE the filter
+    # descriptor, the reverse of the cuDNN-style order of this wrapper's
+    # parameters.
+    # NOTE(review): any pointer casts applied to these two arguments in the
+    # real file must follow the swap.  `algo` is accepted but not forwarded.
+    status = miopenConvolutionBackwardDataGetWorkSpaceSize(
+        handle, diffDesc,
+        filterDesc,
+        convDesc, gradDesc,
+        &sizeInBytes)
+    check_status(status)
+    return sizeInBytes
+
+
+# Bind `handle` to the current stream, then run
+# cudnnConvolutionBackwardData_v3 with the GIL released; a non-zero status
+# raises CuDNNError.
+cpdef convolutionBackwardData_v3(
+        intptr_t handle, size_t alpha, size_t filterDesc, size_t filterData,
+        size_t diffDesc, size_t diffData, size_t convDesc, int algo,
+        size_t workSpace, size_t workSpaceSizeInBytes, size_t beta,
+        size_t gradDesc, size_t gradData):
+    _setStream(handle)
+    with nogil:
+        status = cudnnConvolutionBackwardData_v3(
+            handle, alpha,
+            filterDesc, filterData,
+            diffDesc, diffData,
+            convDesc, algo,
+            workSpace, workSpaceSizeInBytes, beta,
+            gradDesc, gradData)
+    check_status(status)
+
+###############################################################################
+# Pooling
+###############################################################################
+
+# Create a pooling descriptor and return it.
+cpdef size_t createPoolingDescriptor() except? 0:
+    cdef PoolingDescriptor desc
+    status = miopenCreatePoolingDescriptor(&desc)
+    check_status(status)
+    return desc
+
+
+# Configure a 2-d pooling descriptor (mode, NaN option, window, padding,
+# stride).
+# NOTE(review): calls a cudnn* symbol on a descriptor that is created through
+# miopenCreatePoolingDescriptor above; confirm this is intended.
+cpdef setPooling2dDescriptor_v4(
+        size_t poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight,
+        int windowWidth, int verticalPadding, int horizontalPadding,
+        int verticalStride, int horizontalStride):
+    status = cudnnSetPooling2dDescriptor_v4(
+        poolingDesc, mode,
+        maxpoolingNanOpt, windowHeight, windowWidth,
+        verticalPadding, horizontalPadding, verticalStride, horizontalStride)
+    check_status(status)
+
+
+# Configure an n-d pooling descriptor; windowDimA/paddingA/strideA are passed
+# through as integers (presumably addresses of int arrays -- confirm against
+# callers).
+cpdef setPoolingNdDescriptor_v4(
+        size_t poolingDesc, int mode, int maxpoolingNanOpt, int nbDims,
+        size_t windowDimA, size_t paddingA, size_t strideA):
+    status = cudnnSetPoolingNdDescriptor_v4(
+        poolingDesc, mode,
+        maxpoolingNanOpt, nbDims,
+        windowDimA, paddingA, strideA)
+    check_status(status)
+
+
+# Destroy a pooling descriptor.
+cpdef destroyPoolingDescriptor(size_t poolingDesc):
+    status = miopenDestroyPoolingDescriptor(poolingDesc)
+    check_status(status)
+
+
+# Bind `handle` to the current stream, then run cudnnPoolingForward with the
+# GIL released.
+cpdef poolingForward(
+        intptr_t
handle, size_t poolingDesc, size_t alpha, size_t srcDesc,
+        size_t srcData, size_t beta, size_t dstDesc, size_t dstData):
+    _setStream(handle)
+    with nogil:
+        status = cudnnPoolingForward(
+            handle, poolingDesc, alpha,
+            srcDesc, srcData, beta,
+            dstDesc, dstData)
+    check_status(status)
+
+
+# Bind `handle` to the current stream, then run cudnnPoolingBackward with the
+# GIL released; a non-zero status raises CuDNNError.
+cpdef poolingBackward(
+        intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc,
+        size_t srcData, size_t srcDiffDesc, size_t srcDiffData,
+        size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc,
+        size_t destDiffData):
+    _setStream(handle)
+    with nogil:
+        status = cudnnPoolingBackward(
+            handle, poolingDesc, alpha,
+            srcDesc, srcData,
+            srcDiffDesc, srcDiffData,
+            destDesc, destData, beta,
+            destDiffDesc, destDiffData)
+    check_status(status)
+
+###############################################################################
+# Batch Normalization
+###############################################################################
+
+# Re-export the extern constant under its public, un-prefixed name.
+CUDNN_BN_MIN_EPSILON = _CUDNN_BN_MIN_EPSILON
+
+# Fill `derivedBnDesc` from `xDesc` and the batch-norm `mode` via
+# miopenDeriveBNTensorDescriptor.
+cpdef deriveBNTensorDescriptor(
+        size_t derivedBnDesc, size_t xDesc, int mode):
+    status = miopenDeriveBNTensorDescriptor(
+        derivedBnDesc, xDesc,
+        mode)
+    check_status(status)
+
+
+# Bind `handle` to the current stream, then run
+# miopenBatchNormalizationForwardTraining with the GIL released; arguments are
+# forwarded in the order they are declared.
+cpdef batchNormalizationForwardTraining(
+        intptr_t handle, int mode,
+        size_t alpha, size_t beta, size_t xDesc,
+        size_t x, size_t yDesc, size_t y,
+        size_t bnScaleBiasMeanVarDesc, size_t bnScale,
+        size_t bnBias, double exponentialAverageFactor,
+        size_t resultRunningMean, size_t resultRunningVariance,
+        double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance):
+    _setStream(handle)
+    with nogil:
+        status = miopenBatchNormalizationForwardTraining(
+            handle, mode,
+            alpha, beta, xDesc,
+            x, yDesc, y,
+            bnScaleBiasMeanVarDesc, bnScale,
+            bnBias, exponentialAverageFactor,
+            resultRunningMean, resultRunningVariance,
+            epsilon, resultSaveMean, resultSaveInvVariance)
+    check_status(status)
+
+
+# Bind `handle` to the current stream, then run
+# miopenBatchNormalizationForwardInference with the GIL released.
+cpdef batchNormalizationForwardInference(
+        intptr_t handle, int mode,
+        size_t alpha, size_t beta,
size_t xDesc,
+        size_t x, size_t yDesc, size_t y,
+        size_t bnScaleBiasMeanVarDesc, size_t bnScale,
+        size_t bnBias, size_t estimatedMean, size_t estimatedVariance,
+        double epsilon):
+    _setStream(handle)
+    with nogil:
+        status = miopenBatchNormalizationForwardInference(
+            handle, mode,
+            alpha, beta, xDesc,
+            x, yDesc, y,
+            bnScaleBiasMeanVarDesc, bnScale,
+            bnBias, estimatedMean, estimatedVariance,
+            epsilon)
+    check_status(status)
+
+
+# Bind `handle` to the current stream, then run
+# miopenBatchNormalizationBackward with the GIL released; arguments are
+# forwarded in the order they are declared.
+cpdef batchNormalizationBackward(
+        intptr_t handle, int mode,
+        size_t alphaDataDiff, size_t betaDataDiff,
+        size_t alphaParamDiff, size_t betaParamDiff,
+        size_t xDesc, size_t x, size_t dyDesc,
+        size_t dy, size_t dxDesc, size_t dx,
+        size_t dBnScaleBiasDesc, size_t bnScale,
+        size_t dBnScaleResult, size_t dBnBiasResult,
+        double epsilon, size_t savedMean, size_t savedInvVariance):
+    _setStream(handle)
+    with nogil:
+        status = miopenBatchNormalizationBackward(
+            handle, mode,
+            alphaDataDiff, betaDataDiff,
+            alphaParamDiff, betaParamDiff,
+            xDesc, x,
+            dyDesc, dy,
+            dxDesc, dx,
+            dBnScaleBiasDesc, bnScale,
+            dBnScaleResult, dBnBiasResult,
+            epsilon, savedMean, savedInvVariance)
+    check_status(status)
+
+
+# Bind `handle` to the current stream, then run
+# cudnnBatchNormalizationForwardTrainingEx with the GIL released.  Compared
+# with the non-Ex wrapper it additionally takes bnOps, a z tensor, an
+# activation descriptor, and workspace / reserve-space buffers.
+# NOTE(review): calls a cudnn* symbol while the non-Ex batch-norm wrappers
+# use miopen*; confirm this is intended.
+cpdef batchNormalizationForwardTrainingEx(
+        intptr_t handle, int mode, int bnOps,
+        size_t alpha, size_t beta,
+        size_t xDesc, size_t x,
+        size_t zDesc, size_t z,
+        size_t yDesc, size_t y,
+        size_t bnScaleBiasMeanVarDesc,
+        size_t bnScale, size_t bnBias,
+        double exponentialAverageFactor,
+        size_t resultRunningMean, size_t resultRunningVariance,
+        double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance,
+        size_t activationDesc,
+        size_t workSpace, size_t workSpaceSizeInBytes,
+        size_t reserveSpace, size_t reserveSpaceSizeInBytes):
+    _setStream(handle)
+    with nogil:
+        status = cudnnBatchNormalizationForwardTrainingEx(
+            handle, mode, bnOps,
+            alpha, beta,
+            xDesc, x,
+            zDesc, z,
+            yDesc, y,
+            bnScaleBiasMeanVarDesc,
+            bnScale, bnBias,
+            exponentialAverageFactor,
+            resultRunningMean, resultRunningVariance,
+            epsilon,
resultSaveMean, resultSaveInvVariance,
+            activationDesc,
+            workSpace, workSpaceSizeInBytes,
+            reserveSpace, reserveSpaceSizeInBytes)
+    check_status(status)
+
+
+# Return the workspace size in bytes reported for
+# batchNormalizationForwardTrainingEx with the given descriptors.
+cpdef size_t getBatchNormalizationForwardTrainingExWorkspaceSize(
+        intptr_t handle, int mode, int bnOps,
+        size_t xDesc,
+        size_t zDesc,
+        size_t yDesc,
+        size_t bnScaleBiasMeanVarDesc,
+        size_t activationDesc) except? 0:
+    cdef size_t sizeInBytes
+    status = cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
+        handle,
+        mode, bnOps,
+        xDesc,
+        zDesc,
+        yDesc,
+        bnScaleBiasMeanVarDesc,
+        activationDesc,
+        &sizeInBytes)
+    check_status(status)
+    return sizeInBytes
+
+
+# Bind `handle` to the current stream, then run
+# cudnnBatchNormalizationBackwardEx with the GIL released; arguments are
+# forwarded in the order they are declared.
+cpdef batchNormalizationBackwardEx(
+        intptr_t handle, int mode, int bnops,
+        size_t alphaDataDiff, size_t betaDataDiff,
+        size_t alphaParamDiff, size_t betaParamDiff,
+        size_t xDesc, size_t x,
+        size_t yDesc, size_t y,
+        size_t dyDesc, size_t dy,
+        size_t dzDesc, size_t dz,
+        size_t dxDesc, size_t dx,
+        size_t dBnScaleBiasDesc,
+        size_t bnScaleData, size_t bnBiasData,
+        size_t dBnScaleData, size_t dBnBiasData,
+        double epsilon,
+        size_t savedMean, size_t savedInvVariance,
+        size_t activationDesc,
+        size_t workSpace, size_t workSpaceSizeInBytes,
+        size_t reserveSpace, size_t reserveSpaceSizeInBytes):
+    _setStream(handle)
+    with nogil:
+        status = cudnnBatchNormalizationBackwardEx(
+            handle,
+            mode, bnops,
+            alphaDataDiff, betaDataDiff,
+            alphaParamDiff, betaParamDiff,
+            xDesc, x,
+            yDesc, y,
+            dyDesc, dy,
+            dzDesc, dz,
+            dxDesc, dx,
+            dBnScaleBiasDesc,
+            bnScaleData, bnBiasData,
+            dBnScaleData, dBnBiasData,
+            epsilon,
+            savedMean, savedInvVariance,
+            activationDesc,
+            workSpace, workSpaceSizeInBytes,
+            reserveSpace, reserveSpaceSizeInBytes)
+    check_status(status)
+
+
+# Return the workspace size in bytes reported for
+# batchNormalizationBackwardEx with the given descriptors.
+cpdef size_t getBatchNormalizationBackwardExWorkspaceSize(
+        intptr_t handle, int mode, int bnOps,
+        size_t xDesc,
+        size_t yDesc,
+        size_t dyDesc,
+        size_t dzDesc,
+        size_t dxDesc,
+        size_t dBnScaleBiasDesc,
+        size_t activationDesc) except?
0:
+    cdef size_t sizeInBytes
+    status = cudnnGetBatchNormalizationBackwardExWorkspaceSize(
+        handle,
+        mode,
+        bnOps,
+        xDesc,
+        yDesc,
+        dyDesc,
+        dzDesc,
+        dxDesc,
+        dBnScaleBiasDesc,
+        activationDesc,
+        &sizeInBytes)
+    check_status(status)
+    return sizeInBytes
+
+
+# Return the reserve-space size in bytes reported for the Ex batch-norm
+# training calls with the given activation and x descriptors.
+cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize(
+        intptr_t handle, int mode, int bnOps,
+        size_t activationDesc,
+        size_t xDesc) except? 0:
+    cdef size_t sizeInBytes
+    status = cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
+        handle,
+        mode,
+        bnOps,
+        activationDesc,
+        xDesc,
+        &sizeInBytes)
+    check_status(status)
+    return sizeInBytes
+
+
+###############################################################################
+# Activation
+###############################################################################
+
+# Create an activation descriptor and return it.
+cpdef size_t createActivationDescriptor() except? 0:
+    cdef ActivationDescriptor activationDesc
+    status = miopenCreateActivationDescriptor(&activationDesc)
+    check_status(status)
+    return activationDesc
+
+
+# Set mode, NaN option and ceiling on an activation descriptor.
+# NOTE(review): calls a cudnn* symbol on a descriptor that is created through
+# miopenCreateActivationDescriptor above; confirm this is intended.
+cpdef setActivationDescriptor(
+        size_t activationDesc, int mode, int reluNanOpt, double reluCeiling):
+    status = cudnnSetActivationDescriptor(
+        activationDesc, mode,
+        reluNanOpt, reluCeiling)
+    check_status(status)
+
+
+# Destroy an activation descriptor.
+cpdef destroyActivationDescriptor(size_t activationDesc):
+    status = miopenDestroyActivationDescriptor(
+        activationDesc)
+    check_status(status)
+
+
+# Bind `handle` to the current stream, then run miopenSoftmaxForward with the
+# GIL released.
+# NOTE(review): `algorithm` and `mode` are accepted but are not forwarded to
+# the library call.
+cpdef softmaxForward(
+        intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc,
+        size_t srcData, size_t beta, size_t dstDesc, size_t dstData):
+    _setStream(handle)
+    with nogil:
+        status = miopenSoftmaxForward(
+            handle,
+            alpha, srcDesc, srcData,
+            beta, dstDesc, dstData)
+    check_status(status)
+
+
+# Bind `handle` to the current stream, then run miopenSoftmaxBackward with the
+# GIL released.
+# NOTE(review): `algorithm` and `mode` are accepted but are not forwarded to
+# the library call.
+cpdef softmaxBackward(
+        intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc,
+        size_t srcData, size_t srcDiffDesc, size_t srcDiffData, size_t beta,
+        size_t destDiffDesc, size_t destDiffData):
+    _setStream(handle)
+    with nogil:
+        status = miopenSoftmaxBackward(
+
handle,
+            alpha, srcDesc, srcData,
+            srcDiffDesc, srcDiffData, beta,
+            destDiffDesc, destDiffData)
+    check_status(status)
+
+
+# Bind `handle` to the current stream, then run cudnnActivationForward_v4 with
+# the GIL released; a non-zero status raises CuDNNError.
+cpdef activationForward_v4(
+        intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc,
+        size_t srcData, size_t beta, size_t dstDesc, size_t dstData):
+    _setStream(handle)
+    with nogil:
+        status = cudnnActivationForward_v4(
+            handle, activationDesc, alpha,
+            srcDesc, srcData, beta,
+            dstDesc, dstData)
+    check_status(status)
+
+
+# Bind `handle` to the current stream, then run cudnnActivationBackward_v4
+# with the GIL released.
+cpdef activationBackward_v4(
+        intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc,
+        size_t srcData, size_t srcDiffDesc, size_t srcDiffData,
+        size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc,
+        size_t destDiffData):
+    _setStream(handle)
+    with nogil:
+        status = cudnnActivationBackward_v4(
+            handle, activationDesc, alpha,
+            srcDesc, srcData,
+            srcDiffDesc, srcDiffData,
+            destDesc, destData, beta,
+            destDiffDesc, destDiffData)
+    check_status(status)
+
+
+###############################################################################
+# Dropout
+###############################################################################
+
+# Create a dropout descriptor and return it.
+cpdef size_t createDropoutDescriptor() except? 0:
+    cdef DropoutDescriptor desc
+    status = miopenCreateDropoutDescriptor(&desc)
+    check_status(status)
+    return desc
+
+
+# Destroy a dropout descriptor.
+cpdef destroyDropoutDescriptor(size_t dropoutDesc):
+    status = miopenDestroyDropoutDescriptor(dropoutDesc)
+    check_status(status)
+
+
+# Return the size in bytes reported by miopenDropoutGetStatesSize for
+# `handle`.
+cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? -1:
+    cdef size_t sizeInBytes
+    status = miopenDropoutGetStatesSize(
+        handle, &sizeInBytes)
+    check_status(status)
+    return sizeInBytes
+
+
+# Configure a dropout descriptor with the dropout value, the caller-provided
+# state buffer (`states`, `stateSizeInBytes`) and `seed`.
+# NOTE(review): calls a cudnn* symbol on a descriptor that is created through
+# miopenCreateDropoutDescriptor above; confirm this is intended.
+cpdef setDropoutDescriptor(
+        size_t dropoutDesc, intptr_t handle, float dropout,
+        size_t states, size_t stateSizeInBytes, unsigned long long seed):
+    status = cudnnSetDropoutDescriptor(
+        dropoutDesc, handle, dropout,
+        states, stateSizeInBytes, seed)
+    check_status(status)
+
+
+# Return the reserve-space size in bytes reported for tensor descriptor
+# `xDesc`.
+cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except?
0: + cdef size_t sizeInBytes + status = miopenDropoutGetReserveSpaceSize( + xDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + + +cpdef dropoutForward( + intptr_t handle, size_t dropoutDesc, + size_t srcDesc, size_t srcData, + size_t dstDesc, size_t dstData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnDropoutForward( + handle, dropoutDesc, + srcDesc, srcData, + dstDesc, dstData, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + +cpdef dropoutBackward( + intptr_t handle, size_t dropoutDesc, + size_t dyDesc, size_t dyData, + size_t dxDesc, size_t dxData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnDropoutBackward( + handle, dropoutDesc, + dyDesc, dyData, + dxDesc, dxData, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + +############################################################################### +# CTC +############################################################################### +cpdef size_t createCTCLossDescriptor() except? 0: + cdef CTCLossDescriptor desc + status = miopenCreateCTCLossDescriptor(&desc) + check_status(status) + return desc + +cpdef destroyCTCLossDescriptor(size_t ctcLossDesc): + status = miopenDestroyCTCLossDescriptor(ctcLossDesc) + check_status(status) + +cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType): + status = cudnnSetCTCLossDescriptor( + ctcLossDesc, dataType) + check_status(status) + +cpdef getCTCLossDescriptor(size_t ctcLossDesc): + cdef DataType compType + status = cudnnGetCTCLossDescriptor( + ctcLossDesc, &compType) + check_status(status) + return compType + +cpdef size_t getCTCLossWorkspaceSize( + intptr_t handle, size_t probsDesc, size_t gradientsDesc, + size_t labels, size_t labelLengths, size_t inputLengths, + int algo, size_t ctcLossDesc) except? 
0: + cdef size_t sizeInBytes + status = miopenGetCTCLossWorkspaceSize( + handle, probsDesc, + gradientsDesc, + labels, labelLengths, inputLengths, + algo, ctcLossDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + +cpdef CTCLoss( + intptr_t handle, size_t probsDesc, + size_t probs, size_t labels, size_t labelLengths, size_t inputLengths, + size_t costs, size_t gradientsDesc, size_t gradients, + int algo, size_t ctcLossDesc, + size_t workspace, size_t workSpaceSizeInBytes): + status = miopenCTCLoss( + handle, probsDesc, probs, + labels, labelLengths, inputLengths, + costs, gradientsDesc, gradients, + algo, ctcLossDesc, + workspace, workSpaceSizeInBytes) + check_status(status) + + +############################################################################### +# RNN +############################################################################### + +cpdef size_t createRNNDescriptor() except? 0: + cdef RNNDescriptor desc + status = miopenCreateRNNDescriptor(&desc) + check_status(status) + return desc + + +cpdef destroyRNNDescriptor(size_t rnnDesc): + status = miopenDestroyRNNDescriptor(rnnDesc) + check_status(status) + + +cpdef size_t createPersistentRNNPlan(size_t rnnDesc, int minibatch, + int dataType) except? 
0: + cdef PersistentRNNPlan plan + status = cudnnCreatePersistentRNNPlan( + rnnDesc, + minibatch, dataType, &plan) + check_status(status) + return plan + + +cpdef setPersistentRNNPlan(size_t rnnDesc, size_t plan): + status = cudnnSetPersistentRNNPlan( + rnnDesc, plan) + check_status(status) + + +cpdef destroyPersistentRNNPlan(size_t plan): + status = cudnnDestroyPersistentRNNPlan(plan) + check_status(status) + + +cpdef setRNNDescriptor_v5( + size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int dataType): + status = cudnnSetRNNDescriptor_v5( + rnnDesc, hiddenSize, numLayers, + dropoutDesc, inputMode, + direction, mode, dataType) + check_status(status) + + +cpdef setRNNDescriptor_v6( + intptr_t handle, size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int algo, int dataType): + status = cudnnSetRNNDescriptor_v6( + handle, rnnDesc, hiddenSize, numLayers, + dropoutDesc, inputMode, + direction, mode, algo, + dataType) + check_status(status) + + +cpdef setRNNPaddingMode( + size_t rnnDesc, int paddingMode): + status = cudnnSetRNNPaddingMode( + rnnDesc, paddingMode) + check_status(status) + + +cpdef getRNNPaddingMode(size_t rnnDesc): + cdef RNNPaddingMode paddingMode + status = cudnnGetRNNPaddingMode( + rnnDesc, &paddingMode) + check_status(status) + return paddingMode + + +cpdef size_t createRNNDataDescriptor() except? 
0: + cdef RNNDataDescriptor desc + status = cudnnCreateRNNDataDescriptor(&desc) + check_status(status) + return desc + + +cpdef destroyRNNDataDescriptor(size_t RNNDataDesc): + status = cudnnDestroyRNNDataDescriptor(RNNDataDesc) + check_status(status) + + +cpdef setRNNDataDescriptor( + size_t RNNDataDesc, int dataType, size_t layout, + int maxSeqLength, int batchSize, int vectorSize, + size_t seqLengthArray, size_t paddingFill): + status = cudnnSetRNNDataDescriptor( + RNNDataDesc, dataType, + layout, maxSeqLength, batchSize, vectorSize, + seqLengthArray, paddingFill) + check_status(status) + + +cpdef getRNNDataDescriptor( + size_t RNNDataDesc, size_t dataType, + size_t layout, size_t maxSeqLength, size_t batchSize, + size_t vectorSize, int arrayLengthRequested, size_t seqLengthArray, + size_t paddingFill): + status = cudnnGetRNNDataDescriptor( + RNNDataDesc, dataType, + layout, maxSeqLength, batchSize, + vectorSize, arrayLengthRequested, seqLengthArray, + paddingFill) + check_status(status) + + +cpdef getRNNWorkspaceSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): + cdef size_t sizeInBytes + status = miopenGetRNNWorkspaceSize( + handle, rnnDesc, seqLength, + xDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + + +cpdef getRNNTrainingReserveSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): + cdef size_t sizeInBytes + status = miopenGetRNNTrainingReserveSize( + handle, rnnDesc, seqLength, + xDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + + +cpdef getRNNParamsSize( + intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType): + cdef size_t sizeInBytes + status = miopenGetRNNParamsSize( + handle, rnnDesc, xDesc, + &sizeInBytes, dataType) + check_status(status) + return sizeInBytes + + +cpdef getRNNLinLayerMatrixParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t linLayerMatDesc, size_t linLayerMat): + status = 
cudnnGetRNNLinLayerMatrixParams( + handle, rnnDesc, layer, + xDesc, wDesc, w, + linLayerID, linLayerMatDesc, linLayerMat) + check_status(status) + + +cpdef getRNNLinLayerBiasParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t linLayerBiasDesc, + size_t linLayerBias): + status = cudnnGetRNNLinLayerBiasParams( + handle, rnnDesc, layer, + xDesc, wDesc, w, + linLayerID, linLayerBiasDesc, linLayerBias) + check_status(status) + + +cpdef RNNForwardInference( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, + size_t x, size_t hxDesc, size_t hx, size_t cxDesc, + size_t cx, size_t wDesc, size_t w, size_t yDesc, + size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t workspace, size_t workSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = miopenRNNForwardInference( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + workspace, workSpaceSizeInBytes) + check_status(status) + + +cpdef RNNForwardTraining( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t wDesc, size_t w, size_t yDesc, size_t y, + size_t hyDesc, size_t hy, size_t cyDesc, size_t cy, + size_t workspace, size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = miopenRNNForwardTraining( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + workspace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + +cpdef RNNBackwardData( + intptr_t handle, size_t rnnDesc, int seqLength, size_t yDesc, size_t y, + size_t dyDesc, size_t dy, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t dxDesc, 
size_t dx, size_t dhxDesc, size_t dhx, + size_t dcxDesc, size_t dcx, size_t workspace, + size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardData( + handle, rnnDesc, seqLength, + yDesc, y, + dyDesc, dy, + dhyDesc, dhy, + dcyDesc, dcy, + wDesc, w, + hxDesc, hx, + cxDesc, cx, + dxDesc, dx, + dhxDesc, dhx, + dcxDesc, dcx, + workspace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + +cpdef RNNBackwardWeights( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workspace, size_t workSpaceSizeInBytes, size_t dwDesc, + size_t dw, size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardWeights( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + yDesc, y, + workspace, workSpaceSizeInBytes, + dwDesc, dw, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + +cpdef RNNForwardInferenceEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNForwardInferenceEx( + handle, rnnDesc, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + kDesc, keys, + cDesc, cAttn, + iDesc, iAttn, + qDesc, queries, + workSpace, workSpaceSizeInBytes) + check_status(status) + + +cpdef RNNForwardTrainingEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t hyDesc, size_t 
hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNForwardTrainingEx( + handle, rnnDesc, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + kDesc, keys, + cDesc, cAttn, + iDesc, iAttn, + qDesc, queries, + workSpace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + +cpdef RNNBackwardDataEx( + intptr_t handle, size_t rnnDesc, size_t yDesc, size_t y, size_t dyDesc, + size_t dy, size_t dcDesc, size_t dcAttn, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t dxDesc, size_t dx, + size_t dhxDesc, size_t dhx, size_t dcxDesc, size_t dcx, + size_t dkDesc, size_t dkeys, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardDataEx( + handle, rnnDesc, + yDesc, y, + dyDesc, dy, + dcDesc, dcAttn, + dhyDesc, dhy, + dcyDesc, dcy, + wDesc, w, + hxDesc, hx, + cxDesc, cx, + dxDesc, dx, + dhxDesc, dhx, + dcxDesc, dcx, + dkDesc, dkeys, + workSpace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + +cpdef RNNBackwardWeightsEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t dwDesc, size_t dw, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardWeightsEx( + handle, rnnDesc, + xDesc, x, + hxDesc, hx, + yDesc, y, + workSpace, workSpaceSizeInBytes, + dwDesc, dw, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + 
+############################################################################### +# Spatial Transformer +############################################################################### + +cpdef size_t createSpatialTransformerDescriptor() except? 0: + cdef SpatialTransformerDescriptor stDesc + status = cudnnCreateSpatialTransformerDescriptor(&stDesc) + check_status(status) + return stDesc + + +cpdef destroySpatialTransformerDescriptor(size_t stDesc): + status = cudnnDestroySpatialTransformerDescriptor( + stDesc) + check_status(status) + + +cpdef setSpatialTransformerDescriptor( + size_t stDesc, size_t samplerType, int dataType, + int nbDims, size_t dimA): + status = cudnnSetSpatialTransformerNdDescriptor( + stDesc, samplerType, + dataType, nbDims, dimA) + check_status(status) + + +cpdef spatialTfGridGeneratorForward( + intptr_t handle, size_t stDesc, size_t theta, size_t grid): + _setStream(handle) + with nogil: + status = cudnnSpatialTfGridGeneratorForward( + handle, stDesc, + theta, grid) + check_status(status) + + +cpdef spatialTfGridGeneratorBackward( + intptr_t handle, size_t stDesc, size_t dgrid, size_t dtheta): + _setStream(handle) + with nogil: + status = cudnnSpatialTfGridGeneratorBackward( + handle, stDesc, + dgrid, dtheta) + check_status(status) + + +cpdef spatialTfSamplerForward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t grid, size_t beta, size_t yDesc, size_t y): + _setStream(handle) + with nogil: + status = cudnnSpatialTfSamplerForward( + handle, stDesc, + alpha, xDesc, x, grid, + beta, yDesc, y) + check_status(status) + + +cpdef spatialTfSamplerBackward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t beta, size_t dxDesc, size_t dx, size_t alphaDgrid, + size_t dyDesc, size_t dy, size_t grid, size_t betaDgrid, size_t dgrid): + _setStream(handle) + with nogil: + status = cudnnSpatialTfSamplerBackward( + handle, stDesc, + alpha, xDesc, x, beta, + dxDesc, dx, alphaDgrid, + dyDesc, dy, 
grid, + betaDgrid, dgrid) + check_status(status) + +############################################################################### +# Fused Ops +############################################################################### + +cpdef createFusedOpsConstParamPack(int ops): + cdef FusedOpsConstParamPack constPack + with nogil: + status = cudnnCreateFusedOpsConstParamPack(&constPack, ops) + check_status(status) + return constPack + +cpdef destroyFusedOpsConstParamPack(size_t constPack): + with nogil: + status = cudnnDestroyFusedOpsConstParamPack( + constPack) + check_status(status) + +cpdef setFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param): + with nogil: + status = cudnnSetFusedOpsConstParamPackAttribute( + constPack, + paramLabel, param) + check_status(status) + +cpdef getFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param): + cdef int isNULL = 0 + with nogil: + status = cudnnGetFusedOpsConstParamPackAttribute( + constPack, + paramLabel, param, &isNULL) + check_status(status) + return isNULL + +cpdef createFusedOpsVariantParamPack(int ops): + cdef FusedOpsVariantParamPack varPack + with nogil: + status = cudnnCreateFusedOpsVariantParamPack(&varPack, ops) + check_status(status) + return varPack + +cpdef destroyFusedOpsVariantParamPack(size_t varPack): + with nogil: + status = cudnnDestroyFusedOpsVariantParamPack( + varPack) + check_status(status) + +cpdef setFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr): + with nogil: + status = cudnnSetFusedOpsVariantParamPackAttribute( + varPack, + paramLabel, ptr) + check_status(status) + +cpdef getFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr): + with nogil: + status = cudnnGetFusedOpsVariantParamPackAttribute( + varPack, + paramLabel, ptr) + check_status(status) + +cpdef createFusedOpsPlan(int ops): + cdef FusedOpsPlan plan + with nogil: + status = cudnnCreateFusedOpsPlan(&plan, ops) + 
check_status(status) + return plan + +cpdef destroyFusedOpsPlan(size_t plan): + with nogil: + status = cudnnDestroyFusedOpsPlan(plan) + check_status(status) + +cpdef makeFusedOpsPlan(intptr_t handle, size_t plan, size_t constPack): + cdef size_t workspaceSizeInBytes + _setStream(handle) + with nogil: + status = cudnnMakeFusedOpsPlan(handle, plan, + constPack, + &workspaceSizeInBytes) + check_status(status) + return workspaceSizeInBytes + +cpdef fusedOpsExecute(intptr_t handle, size_t plan, size_t varPack): + _setStream(handle) + with nogil: + status = cudnnFusedOpsExecute(handle, plan, + varPack) + check_status(status) + From cef800e8d7ae851e608418f65c059a21e973141b Mon Sep 17 00:00:00 2001 From: bmedishe Date: Mon, 27 Nov 2023 21:47:19 +0000 Subject: [PATCH 02/26] update miopen.pyx --- cupy_backends/cuda/libs/miopen.pyx | 1802 ---------------------------- 1 file changed, 1802 deletions(-) diff --git a/cupy_backends/cuda/libs/miopen.pyx b/cupy_backends/cuda/libs/miopen.pyx index c7c3811c885..cd68ca9f693 100644 --- a/cupy_backends/cuda/libs/miopen.pyx +++ b/cupy_backends/cuda/libs/miopen.pyx @@ -739,1805 +739,3 @@ cdef extern from '../../cupy_cudnn.h' nogil: # Constants double _CUDNN_BN_MIN_EPSILON 'CUDNN_BN_MIN_EPSILON' - -cdef class CuDNNAlgoPerf: - - def __init__(self, algo, status, time, memory, determinism, mathType): - self.algo = algo - self.status = status - self.time = time - self.memory = memory - self.determinism = determinism - self.mathType = mathType - - -############################################################################### -# Error handling -############################################################################### - -class CuDNNError(RuntimeError): - - def __init__(self, int status): - self.status = status - msg = cudnnGetErrorString(status) - super(CuDNNError, self).__init__( - 'cuDNN Error: {}'.format(msg.decode())) - self._infos = [] - - def add_info(self, info): - assert isinstance(info, str) - self._infos.append(info) - - def 
add_infos(self, infos): - assert isinstance(infos, list) - self._infos.extend(infos) - - def __str__(self): - base = super(CuDNNError, self).__str__() - return base + ''.join( - '\n ' + info for info in self._infos) - - def __reduce__(self): - return (type(self), (self.status,)) - - -@cython.profile(False) -cpdef inline check_status(int status): - if status != 0: - raise CuDNNError(status) - - -############################################################################### -# Build-time version -############################################################################### - -def get_build_version(): - return CUDNN_VERSION - - -############################################################################### -# Version -############################################################################### - -cpdef size_t getVersion() except? 0: - return cudnnGetVersion() - - -############################################################################### -# Runtime error checking -############################################################################### - -cpdef queryRuntimeError(intptr_t handle, int mode): - cdef Status rstatus - with nogil: - status = cudnnQueryRuntimeError(handle, &rstatus, - mode, 0) - check_status(status) - return rstatus - - -############################################################################### -# Initialization and CUDA cooperation -############################################################################### - -cpdef intptr_t create() except? 0: - cdef Handle handle - with nogil: - status = miopenCreate(&handle) - check_status(status) - return handle - - -cpdef destroy(intptr_t handle): - with nogil: - status = miopenDestroy(handle) - check_status(status) - - -cpdef setStream(intptr_t handle, size_t stream): - # TODO(leofang): The support of stream capture is not mentioned at all in - # the cuDNN docs (as of CUDA 11.5), so we disable this functionality. 
- if not runtime._is_hip_environment and runtime.streamIsCapturing(stream): - raise NotImplementedError( - 'calling cuDNN API during stream capture is currently ' - 'unsupported') - - status = miopenSetStream(handle, stream) - check_status(status) - - -cpdef size_t getStream(intptr_t handle) except? 0: - cdef driver.Stream stream - status = miopenGetStream(handle, &stream) - check_status(status) - return stream - - -cdef _setStream(intptr_t handle): - """Set current stream""" - setStream(handle, stream_module.get_current_stream_ptr()) - -############################################################################### -# Tensor manipulation -############################################################################### - -cpdef size_t createTensorDescriptor() except? 0: - cdef TensorDescriptor descriptor - status = miopenCreateTensorDescriptor(&descriptor) - check_status(status) - return descriptor - - -cpdef setTensor4dDescriptor(size_t tensorDesc, int format, int dataType, - int n, int c, int h, int w): - status = miopenSet4dTensorDescriptor( - tensorDesc, - dataType, n, c, h, w) - check_status(status) - - -cpdef setTensor4dDescriptorEx(size_t tensorDesc, int dataType, - int n, int c, int h, int w, int nStride, - int cStride, int hStride, int wStride): - status = miopenSet4dTensorDescriptorEx( - tensorDesc, dataType, n, c, h, w, - nStride, cStride, hStride, wStride) - check_status(status) - - -cpdef tuple getTensor4dDescriptor(size_t tensorDesc): - cdef DataType dataType - cdef int n, c, h, w, nStride, cStride, hStride, wStride - status = miopenGet4dTensorDescriptor( - tensorDesc, &dataType, - &n, &c, &h, &w, &nStride, &cStride, &hStride, &wStride) - check_status(status) - return dataType, n, c, h, w, nStride, cStride, hStride, wStride - - -cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, - size_t dimA, size_t strideA): - status = cudnnSetTensorNdDescriptor( - tensorDesc, dataType, nbDims, - dimA, strideA) - check_status(status) - - -cpdef 
destroyTensorDescriptor(size_t tensorDesc): - status = miopenDestroyTensorDescriptor(tensorDesc) - check_status(status) - - -cpdef addTensor_v3(intptr_t handle, size_t alpha, size_t bDesc, - size_t b, size_t beta, size_t yDesc, size_t y): - _setStream(handle) - with nogil: - status = cudnnAddTensor_v3( - handle, alpha, bDesc, - b, beta, yDesc, y) - check_status(status) - - -############################################################################### -# Tensor operations -############################################################################### - -cpdef size_t createOpTensorDescriptor() except? 0: - cdef OpTensorDescriptor opTensorDesc - status = cudnnCreateOpTensorDescriptor(&opTensorDesc) - check_status(status) - return opTensorDesc - - -cpdef setOpTensorDescriptor(size_t opTensorDesc, int opTensorOp, - int opTensorCompType, int opTensorNanOpt): - status = cudnnSetOpTensorDescriptor( - opTensorDesc, opTensorOp, - opTensorCompType, opTensorNanOpt) - check_status(status) - - -cpdef getOpTensorDescriptor(size_t opTensorDesc): - cdef OpTensorOp opTensorOp - cdef DataType opTensorCompType - cdef NanPropagation opTensorNanOpt - status = cudnnGetOpTensorDescriptor( - opTensorDesc, &opTensorOp, &opTensorCompType, - &opTensorNanOpt) - check_status(status) - return opTensorOp, opTensorCompType, opTensorNanOpt - - -cpdef destroyOpTensorDescriptor(size_t opTensorDesc): - status = cudnnDestroyOpTensorDescriptor(opTensorDesc) - check_status(status) - - -cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, - size_t aDesc, size_t A, size_t alpha2, size_t bDesc, - size_t B, size_t beta, size_t cDesc, size_t C): - _setStream(handle) - with nogil: - status = cudnnOpTensor( - handle, opTensorDesc, alpha1, - aDesc, A, alpha2, - bDesc, B, beta, - cDesc, C) - check_status(status) - - -############################################################################### -# Tensor reductions -############################################################################### 
- -cpdef size_t createReduceTensorDescriptor() except? 0: - cdef ReduceTensorDescriptor reduceTensorDesc - status = cudnnCreateReduceTensorDescriptor(&reduceTensorDesc) - check_status(status) - return reduceTensorDesc - -cpdef setReduceTensorDescriptor( - size_t reduceTensorDesc, int reduceTensorOp, int reduceTensorCompType, - int reduceTensorNanOpt, int reduceTensorIndices, - int reduceTensorIndicesType): - status = cudnnSetReduceTensorDescriptor( - reduceTensorDesc, - reduceTensorOp, - reduceTensorCompType, reduceTensorNanOpt, - reduceTensorIndices, - reduceTensorIndicesType) - check_status(status) - - -cpdef getReduceTensorDescriptor(size_t reduceTensorDesc): - cdef ReduceTensorOp redOp - cdef DataType redCompType - cdef NanPropagation redNanOpt - cdef ReduceTensorIndices redIndices - cdef IndicesType redIndicesType - status = cudnnGetReduceTensorDescriptor( - reduceTensorDesc, &redOp, - &redCompType, &redNanOpt, &redIndices, &redIndicesType) - check_status(status) - return redOp, redCompType, redNanOpt, redIndices, redIndicesType - - -cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc): - status = cudnnDestroyReduceTensorDescriptor( - reduceTensorDesc) - check_status(status) - - -cpdef size_t getReductionIndicesSize(intptr_t handle, size_t reduceTensorDesc, - size_t aDesc, size_t cDesc) except? 0: - cdef size_t sizeInBytes - status = cudnnGetReductionIndicesSize( - handle, reduceTensorDesc, - aDesc, cDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef size_t getReductionWorkspaceSize(intptr_t handle, - size_t reduceTensorDesc, - size_t aDesc, size_t cDesc) except? 
0: - cdef size_t sizeInBytes - status = cudnnGetReductionWorkspaceSize( - handle, reduceTensorDesc, - aDesc, cDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef reduceTensor(intptr_t handle, size_t reduceTensorDesc, size_t indices, - size_t indicesSizeInBytes, size_t workspace, - size_t workspaceSizeInBytes, size_t alpha, size_t aDesc, - size_t A, size_t beta, size_t cDesc, size_t C): - _setStream(handle) - with nogil: - status = cudnnReduceTensor( - handle, reduceTensorDesc, - indices, indicesSizeInBytes, workspace, - workspaceSizeInBytes, alpha, aDesc, - A, beta, cDesc, C) - check_status(status) - - -cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr): - _setStream(handle) - with nogil: - status = cudnnSetTensor( - handle, yDesc, y, - valuePtr) - check_status(status) - - -cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha): - _setStream(handle) - with nogil: - status = cudnnScaleTensor( - handle, yDesc, y, - alpha) - check_status(status) - - -############################################################################### -# Filter manipulation -############################################################################### - -cpdef size_t createFilterDescriptor() except? 
0: - cdef FilterDescriptor desc - status = cudnnCreateFilterDescriptor(&desc) - check_status(status) - return desc - - -cpdef setFilter4dDescriptor_v4( - size_t filterDesc, int dataType, - int format, int k, int c, int h, int w): - status = cudnnSetFilter4dDescriptor_v4( - filterDesc, dataType, - format, k, c, h, w) - check_status(status) - - -cpdef setFilterNdDescriptor_v4( - size_t filterDesc, int dataType, - int format, int nbDims, size_t filterDimA): - status = cudnnSetFilterNdDescriptor_v4( - filterDesc, dataType, - format, nbDims, filterDimA) - check_status(status) - - -cpdef getFilterNdDescriptor(size_t wDesc, int nbDimsRequested): - cdef DataType dataType - cdef TensorFormat format - cdef int nbDims - cdef vector.vector[int] filterDimA - filterDimA.resize(nbDimsRequested) - - status = cudnnGetFilterNdDescriptor_v4( - wDesc, nbDimsRequested, &dataType, - &format, &nbDims, filterDimA.data()) - check_status(status) - return dataType, format, nbDims, tuple(filterDimA) - - -cpdef destroyFilterDescriptor(size_t filterDesc): - status = cudnnDestroyFilterDescriptor(filterDesc) - check_status(status) - - -############################################################################### -# Convolution -############################################################################### - -cpdef size_t createConvolutionDescriptor() except? 0: - cdef ConvolutionDescriptor desc - status = miopenCreateConvolutionDescriptor(&desc) - check_status(status) - return desc - - -cpdef setConvolutionMathType(size_t convDesc, size_t mathType): - status = cudnnSetConvolutionMathType( - convDesc, mathType) - check_status(status) - - -cpdef size_t getConvolutionMathType(size_t convDesc) except? 
0: - cdef MathType mathType - status = cudnnGetConvolutionMathType( - convDesc, &mathType) - check_status(status) - return mathType - - -cpdef setConvolutionGroupCount(size_t convDesc, int groupCount): - status = miopenSetConvolutionGroupCount( - convDesc, groupCount) - check_status(status) - - -cpdef int getConvolutionGroupCount(size_t convDesc) except? -1: - cdef int groupCount - status = cudnnGetConvolutionGroupCount( - convDesc, &groupCount) - check_status(status) - return groupCount - - -cpdef setConvolution2dDescriptor_v4( - size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, - int dilation_w, int mode): - status = cudnnSetConvolution2dDescriptor_v4( - convDesc, pad_h, pad_w, u, v, dilation_h, - dilation_w, mode) - check_status(status) - - -cpdef setConvolution2dDescriptor_v5( - size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, - int dilation_w, int mode, size_t computeType): - status = cudnnSetConvolution2dDescriptor_v5( - convDesc, pad_h, pad_w, u, v, dilation_h, - dilation_w, mode, computeType) - check_status(status) - - -cpdef setConvolutionNdDescriptor_v3( - size_t convDesc, int arrayLength, size_t padA, size_t filterStrideA, - size_t dilationA, int mode, int dataType): - status = cudnnSetConvolutionNdDescriptor_v3( - convDesc, arrayLength, padA, - filterStrideA, dilationA, mode, - dataType) - check_status(status) - - -cpdef destroyConvolutionDescriptor(size_t convDesc): - status = miopenDestroyConvolutionDescriptor( - convDesc) - check_status(status) - - -cpdef findConvolutionForwardAlgorithm( - intptr_t handle, size_t xDesc, size_t wDesc, size_t convDesc, - size_t yDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionForwardAlgorithm( - handle, xDesc, wDesc, - convDesc, yDesc, - requestedAlgoCount, &returnedAlgoCount, perfResults.data()) - check_status(status) - 
perfResults.resize(returnedAlgoCount) - return perfResults - - -cpdef list findConvolutionForwardAlgorithmEx( - intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, - size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionForwardAlgorithmEx( - handle, xDesc, x, - wDesc, w, convDesc, - yDesc, y, requestedAlgoCount, - &returnedAlgoCount, perfResults.data(), workSpace, - workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) - for p in perfResults] - - -cpdef list findConvolutionForwardAlgorithmEx_v7( - intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, - size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionForwardAlgorithmEx_v7( - handle, xDesc, x, - wDesc, w, convDesc, - yDesc, y, requestedAlgoCount, - &returnedAlgoCount, perfResults.data(), workSpace, - workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef int getConvolutionForwardAlgorithm_v6( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int preference, size_t memoryLimitInbytes) except? 
-1: - cdef ConvolutionFwdAlgo algo - status = cudnnGetConvolutionForwardAlgorithm_v6( - handle, srcDesc, - filterDesc, convDesc, - destDesc, preference, - memoryLimitInbytes, &algo) - check_status(status) - return algo - - -cpdef list getConvolutionForwardAlgorithm_v7( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnGetConvolutionForwardAlgorithm_v7( - handle, srcDesc, - filterDesc, convDesc, - destDesc, requestedAlgoCount, - &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef Py_ssize_t getConvolutionForwardWorkspaceSize( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int algo) except? 
-1: - cdef size_t sizeInBytes - status = miopenConvolutionForwardGetWorkSpaceSize( - handle, srcDesc, - filterDesc, convDesc, - destDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef convolutionForward( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t filterDesc, size_t filterData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t destDesc, size_t destData): - _setStream(handle) - with nogil: - status = cudnnConvolutionForward( - handle, alpha, - srcDesc, srcData, - filterDesc, filterData, - convDesc, algo, - workSpace, workSpaceSizeInBytes, beta, - destDesc, destData) - check_status(status) - - -cpdef convolutionBackwardBias( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t beta, size_t destDesc, size_t destData): - _setStream(handle) - with nogil: - status = cudnnConvolutionBackwardBias( - handle, alpha, - srcDesc, srcData, beta, - destDesc, destData) - check_status(status) - - -cpdef findConvolutionBackwardFilterAlgorithm( - intptr_t handle, size_t xDesc, size_t dyDesc, size_t convDesc, - size_t dwDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionBwdFilterAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardFilterAlgorithm( - handle, xDesc, dyDesc, - convDesc, dwDesc, - requestedAlgoCount, &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return perfResults - - -cpdef list findConvolutionBackwardFilterAlgorithmEx( - intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, - size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionBwdFilterAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardFilterAlgorithmEx( - handle, 
xDesc, x, - dyDesc, dy, convDesc, - dwDesc, dw, - requestedAlgoCount, &returnedAlgoCount, perfResults.data(), - workSpace, workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) - for p in perfResults] - - -cpdef list findConvolutionBackwardFilterAlgorithmEx_v7( - intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, - size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionBwdFilterAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardFilterAlgorithmEx_v7( - handle, xDesc, x, - dyDesc, dy, convDesc, - dwDesc, dw, - requestedAlgoCount, &returnedAlgoCount, perfResults.data(), - workSpace, workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef int getConvolutionBackwardFilterAlgorithm_v6( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t filterDesc, int preference, - size_t memoryLimitInbytes) except? 
-1: - cdef ConvolutionBwdFilterAlgo algo - status = cudnnGetConvolutionBackwardFilterAlgorithm_v6( - handle, srcDesc, - diffDesc, convDesc, - filterDesc, - preference, - memoryLimitInbytes, &algo) - check_status(status) - return algo - - -cpdef list getConvolutionBackwardFilterAlgorithm_v7( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionBwdFilterAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnGetConvolutionBackwardFilterAlgorithm_v7( - handle, srcDesc, diffDesc, - convDesc, gradDesc, - requestedAlgoCount, &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef Py_ssize_t getConvolutionBackwardFilterWorkspaceSize( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t filterDesc, int algo) except? 
-1: - cdef size_t sizeInBytes - status = cudnnGetConvolutionBackwardFilterWorkspaceSize( - handle, srcDesc, - diffDesc, convDesc, - filterDesc, algo, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef convolutionBackwardFilter_v3( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t diffDesc, size_t diffData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t gradDesc, size_t gradData): - _setStream(handle) - with nogil: - status = cudnnConvolutionBackwardFilter_v3( - handle, alpha, - srcDesc, srcData, - diffDesc, diffData, - convDesc, algo, - workSpace, workSpaceSizeInBytes, beta, - gradDesc, gradData) - check_status(status) - - -cpdef findConvolutionBackwardDataAlgorithm( - intptr_t handle, size_t wDesc, size_t dyDesc, size_t convDesc, - size_t dxDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionBwdDataAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardDataAlgorithm( - handle, wDesc, dyDesc, - convDesc, dxDesc, - requestedAlgoCount, &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return perfResults - - -cpdef list findConvolutionBackwardDataAlgorithmEx( - intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, - size_t convDesc, size_t dxDesc, size_t dx, - int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionBwdDataAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardDataAlgorithmEx( - handle, wDesc, w, - dyDesc, dy, convDesc, - dxDesc, dx, - requestedAlgoCount, &returnedAlgoCount, perfResults.data(), - workSpace, workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) - for p in perfResults] - - 
-cpdef list findConvolutionBackwardDataAlgorithmEx_v7( - intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, - size_t convDesc, size_t dxDesc, size_t dx, - int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionBwdDataAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardDataAlgorithmEx_v7( - handle, wDesc, w, - dyDesc, dy, convDesc, - dxDesc, dx, - requestedAlgoCount, &returnedAlgoCount, perfResults.data(), - workSpace, workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef int getConvolutionBackwardDataAlgorithm_v6( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, size_t preference, - size_t memoryLimitInbytes) except? -1: - cdef ConvolutionBwdDataAlgo algo - status = cudnnGetConvolutionBackwardDataAlgorithm_v6( - handle, filterDesc, - diffDesc, convDesc, - gradDesc, preference, - memoryLimitInbytes, &algo) - check_status(status) - return algo - - -cpdef list getConvolutionBackwardDataAlgorithm_v7( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionBwdDataAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnGetConvolutionBackwardDataAlgorithm_v7( - handle, filterDesc, - diffDesc, convDesc, - gradDesc, requestedAlgoCount, - &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef Py_ssize_t getConvolutionBackwardDataWorkspaceSize( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t 
gradDesc, int algo) except? -1: - cdef size_t sizeInBytes - status = miopenConvolutionBackwardDataGetWorkSpaceSize( - handle, filterDesc, - diffDesc, - convDesc, gradDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef convolutionBackwardData_v3( - intptr_t handle, size_t alpha, size_t filterDesc, size_t filterData, - size_t diffDesc, size_t diffData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t gradDesc, size_t gradData): - _setStream(handle) - with nogil: - status = cudnnConvolutionBackwardData_v3( - handle, alpha, - filterDesc, filterData, - diffDesc, diffData, - convDesc, algo, - workSpace, workSpaceSizeInBytes, beta, - gradDesc, gradData) - check_status(status) - -############################################################################### -# Pooling -############################################################################### - -cpdef size_t createPoolingDescriptor() except? 0: - cdef PoolingDescriptor desc - status = miopenCreatePoolingDescriptor(&desc) - check_status(status) - return desc - - -cpdef setPooling2dDescriptor_v4( - size_t poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, - int windowWidth, int verticalPadding, int horizontalPadding, - int verticalStride, int horizontalStride): - status = cudnnSetPooling2dDescriptor_v4( - poolingDesc, mode, - maxpoolingNanOpt, windowHeight, windowWidth, - verticalPadding, horizontalPadding, verticalStride, horizontalStride) - check_status(status) - - -cpdef setPoolingNdDescriptor_v4( - size_t poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, - size_t windowDimA, size_t paddingA, size_t strideA): - status = cudnnSetPoolingNdDescriptor_v4( - poolingDesc, mode, - maxpoolingNanOpt, nbDims, - windowDimA, paddingA, strideA) - check_status(status) - - -cpdef destroyPoolingDescriptor(size_t poolingDesc): - status = miopenDestroyPoolingDescriptor(poolingDesc) - check_status(status) - - -cpdef poolingForward( - intptr_t 
handle, size_t poolingDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData): - _setStream(handle) - with nogil: - status = cudnnPoolingForward( - handle, poolingDesc, alpha, - srcDesc, srcData, beta, - dstDesc, dstData) - check_status(status) - - -cpdef poolingBackward( - intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, - size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, - size_t destDiffData): - _setStream(handle) - with nogil: - status = cudnnPoolingBackward( - handle, poolingDesc, alpha, - srcDesc, srcData, - srcDiffDesc, srcDiffData, - destDesc, destData, beta, - destDiffDesc, destDiffData) - check_status(status) - -############################################################################### -# Batch Normalization -############################################################################### - -CUDNN_BN_MIN_EPSILON = _CUDNN_BN_MIN_EPSILON - -cpdef deriveBNTensorDescriptor( - size_t derivedBnDesc, size_t xDesc, int mode): - status = miopenDeriveBNTensorDescriptor( - derivedBnDesc, xDesc, - mode) - check_status(status) - - -cpdef batchNormalizationForwardTraining( - intptr_t handle, int mode, - size_t alpha, size_t beta, size_t xDesc, - size_t x, size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, size_t bnScale, - size_t bnBias, double exponentialAverageFactor, - size_t resultRunningMean, size_t resultRunningVariance, - double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance): - _setStream(handle) - with nogil: - status = miopenBatchNormalizationForwardTraining( - handle, mode, - alpha, beta, xDesc, - x, yDesc, y, - bnScaleBiasMeanVarDesc, bnScale, - bnBias, exponentialAverageFactor, - resultRunningMean, resultRunningVariance, - epsilon, resultSaveMean, resultSaveInvVariance) - check_status(status) - - -cpdef batchNormalizationForwardInference( - intptr_t handle, int mode, - size_t alpha, size_t beta, 
size_t xDesc, - size_t x, size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, size_t bnScale, - size_t bnBias, size_t estimatedMean, size_t estimatedVariance, - double epsilon): - _setStream(handle) - with nogil: - status = miopenBatchNormalizationForwardInference( - handle, mode, - alpha, beta, xDesc, - x, yDesc, y, - bnScaleBiasMeanVarDesc, bnScale, - bnBias, estimatedMean, estimatedVariance, - epsilon) - check_status(status) - - -cpdef batchNormalizationBackward( - intptr_t handle, int mode, - size_t alphaDataDiff, size_t betaDataDiff, - size_t alphaParamDiff, size_t betaParamDiff, - size_t xDesc, size_t x, size_t dyDesc, - size_t dy, size_t dxDesc, size_t dx, - size_t dBnScaleBiasDesc, size_t bnScale, - size_t dBnScaleResult, size_t dBnBiasResult, - double epsilon, size_t savedMean, size_t savedInvVariance): - _setStream(handle) - with nogil: - status = miopenBatchNormalizationBackward( - handle, mode, - alphaDataDiff, betaDataDiff, - alphaParamDiff, betaParamDiff, - xDesc, x, - dyDesc, dy, - dxDesc, dx, - dBnScaleBiasDesc, bnScale, - dBnScaleResult, dBnBiasResult, - epsilon, savedMean, savedInvVariance) - check_status(status) - - -cpdef batchNormalizationForwardTrainingEx( - intptr_t handle, int mode, int bnOps, - size_t alpha, size_t beta, - size_t xDesc, size_t x, - size_t zDesc, size_t z, - size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, - size_t bnScale, size_t bnBias, - double exponentialAverageFactor, - size_t resultRunningMean, size_t resultRunningVariance, - double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance, - size_t activationDesc, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnBatchNormalizationForwardTrainingEx( - handle, mode, bnOps, - alpha, beta, - xDesc, x, - zDesc, z, - yDesc, y, - bnScaleBiasMeanVarDesc, - bnScale, bnBias, - exponentialAverageFactor, - resultRunningMean, resultRunningVariance, - epsilon, 
resultSaveMean, resultSaveInvVariance, - activationDesc, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef size_t getBatchNormalizationForwardTrainingExWorkspaceSize( - intptr_t handle, int mode, int bnOps, - size_t xDesc, - size_t zDesc, - size_t yDesc, - size_t bnScaleBiasMeanVarDesc, - size_t activationDesc) except? 0: - cdef size_t sizeInBytes - status = cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - handle, - mode, bnOps, - xDesc, - zDesc, - yDesc, - bnScaleBiasMeanVarDesc, - activationDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef batchNormalizationBackwardEx( - intptr_t handle, int mode, int bnops, - size_t alphaDataDiff, size_t betaDataDiff, - size_t alphaParamDiff, size_t betaParamDiff, - size_t xDesc, size_t x, - size_t yDesc, size_t y, - size_t dyDesc, size_t dy, - size_t dzDesc, size_t dz, - size_t dxDesc, size_t dx, - size_t dBnScaleBiasDesc, - size_t bnScaleData, size_t bnBiasData, - size_t dBnScaleData, size_t dBnBiasData, - double epsilon, - size_t savedMean, size_t savedInvVariance, - size_t activationDesc, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnBatchNormalizationBackwardEx( - handle, - mode, bnops, - alphaDataDiff, betaDataDiff, - alphaParamDiff, betaParamDiff, - xDesc, x, - yDesc, y, - dyDesc, dy, - dzDesc, dz, - dxDesc, dx, - dBnScaleBiasDesc, - bnScaleData, bnBiasData, - dBnScaleData, dBnBiasData, - epsilon, - savedMean, savedInvVariance, - activationDesc, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef size_t getBatchNormalizationBackwardExWorkspaceSize( - intptr_t handle, int mode, int bnOps, - size_t xDesc, - size_t yDesc, - size_t dyDesc, - size_t dzDesc, - size_t dxDesc, - size_t dBnScaleBiasDesc, - size_t activationDesc) except? 
0: - cdef size_t sizeInBytes - status = cudnnGetBatchNormalizationBackwardExWorkspaceSize( - handle, - mode, - bnOps, - xDesc, - yDesc, - dyDesc, - dzDesc, - dxDesc, - dBnScaleBiasDesc, - activationDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( - intptr_t handle, int mode, int bnOps, - size_t activationDesc, - size_t xDesc) except? 0: - cdef size_t sizeInBytes - status = cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - handle, - mode, - bnOps, - activationDesc, - xDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -############################################################################### -# Activation -############################################################################### - -cpdef size_t createActivationDescriptor() except? 0: - cdef ActivationDescriptor activationDesc - status = miopenCreateActivationDescriptor(&activationDesc) - check_status(status) - return activationDesc - - -cpdef setActivationDescriptor( - size_t activationDesc, int mode, int reluNanOpt, double reluCeiling): - status = cudnnSetActivationDescriptor( - activationDesc, mode, - reluNanOpt, reluCeiling) - check_status(status) - - -cpdef destroyActivationDescriptor(size_t activationDesc): - status = miopenDestroyActivationDescriptor( - activationDesc) - check_status(status) - - -cpdef softmaxForward( - intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData): - _setStream(handle) - with nogil: - status = miopenSoftmaxForward( - handle, - alpha, srcDesc, srcData, - beta, dstDesc, dstData) - check_status(status) - - -cpdef softmaxBackward( - intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, size_t beta, - size_t destDiffDesc, size_t destDiffData): - _setStream(handle) - with nogil: - status = miopenSoftmaxBackward( - 
handle, - alpha, srcDesc, srcData, - srcDiffDesc, srcDiffData, beta, - destDiffDesc, destDiffData) - check_status(status) - - -cpdef activationForward_v4( - intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData): - _setStream(handle) - with nogil: - status = cudnnActivationForward_v4( - handle, activationDesc, alpha, - srcDesc, srcData, beta, - dstDesc, dstData) - check_status(status) - - -cpdef activationBackward_v4( - intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, - size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, - size_t destDiffData): - _setStream(handle) - with nogil: - status = cudnnActivationBackward_v4( - handle, activationDesc, alpha, - srcDesc, srcData, - srcDiffDesc, srcDiffData, - destDesc, destData, beta, - destDiffDesc, destDiffData) - check_status(status) - - -############################################################################### -# Dropout -############################################################################### - -cpdef size_t createDropoutDescriptor() except? 0: - cdef DropoutDescriptor desc - status = miopenCreateDropoutDescriptor(&desc) - check_status(status) - return desc - - -cpdef destroyDropoutDescriptor(size_t dropoutDesc): - status = miopenDestroyDropoutDescriptor(dropoutDesc) - check_status(status) - - -cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? -1: - cdef size_t sizeInBytes - status = miopenDropoutGetStatesSize( - handle, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef setDropoutDescriptor( - size_t dropoutDesc, intptr_t handle, float dropout, - size_t states, size_t stateSizeInBytes, unsigned long long seed): - status = cudnnSetDropoutDescriptor( - dropoutDesc, handle, dropout, - states, stateSizeInBytes, seed) - check_status(status) - - -cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 
0: - cdef size_t sizeInBytes - status = miopenDropoutGetReserveSpaceSize( - xDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef dropoutForward( - intptr_t handle, size_t dropoutDesc, - size_t srcDesc, size_t srcData, - size_t dstDesc, size_t dstData, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnDropoutForward( - handle, dropoutDesc, - srcDesc, srcData, - dstDesc, dstData, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef dropoutBackward( - intptr_t handle, size_t dropoutDesc, - size_t dyDesc, size_t dyData, - size_t dxDesc, size_t dxData, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnDropoutBackward( - handle, dropoutDesc, - dyDesc, dyData, - dxDesc, dxData, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -############################################################################### -# CTC -############################################################################### -cpdef size_t createCTCLossDescriptor() except? 0: - cdef CTCLossDescriptor desc - status = miopenCreateCTCLossDescriptor(&desc) - check_status(status) - return desc - -cpdef destroyCTCLossDescriptor(size_t ctcLossDesc): - status = miopenDestroyCTCLossDescriptor(ctcLossDesc) - check_status(status) - -cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType): - status = cudnnSetCTCLossDescriptor( - ctcLossDesc, dataType) - check_status(status) - -cpdef getCTCLossDescriptor(size_t ctcLossDesc): - cdef DataType compType - status = cudnnGetCTCLossDescriptor( - ctcLossDesc, &compType) - check_status(status) - return compType - -cpdef size_t getCTCLossWorkspaceSize( - intptr_t handle, size_t probsDesc, size_t gradientsDesc, - size_t labels, size_t labelLengths, size_t inputLengths, - int algo, size_t ctcLossDesc) except? 
0: - cdef size_t sizeInBytes - status = miopenGetCTCLossWorkspaceSize( - handle, probsDesc, - gradientsDesc, - labels, labelLengths, inputLengths, - algo, ctcLossDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - -cpdef CTCLoss( - intptr_t handle, size_t probsDesc, - size_t probs, size_t labels, size_t labelLengths, size_t inputLengths, - size_t costs, size_t gradientsDesc, size_t gradients, - int algo, size_t ctcLossDesc, - size_t workspace, size_t workSpaceSizeInBytes): - status = miopenCTCLoss( - handle, probsDesc, probs, - labels, labelLengths, inputLengths, - costs, gradientsDesc, gradients, - algo, ctcLossDesc, - workspace, workSpaceSizeInBytes) - check_status(status) - - -############################################################################### -# RNN -############################################################################### - -cpdef size_t createRNNDescriptor() except? 0: - cdef RNNDescriptor desc - status = miopenCreateRNNDescriptor(&desc) - check_status(status) - return desc - - -cpdef destroyRNNDescriptor(size_t rnnDesc): - status = miopenDestroyRNNDescriptor(rnnDesc) - check_status(status) - - -cpdef size_t createPersistentRNNPlan(size_t rnnDesc, int minibatch, - int dataType) except? 
0: - cdef PersistentRNNPlan plan - status = cudnnCreatePersistentRNNPlan( - rnnDesc, - minibatch, dataType, &plan) - check_status(status) - return plan - - -cpdef setPersistentRNNPlan(size_t rnnDesc, size_t plan): - status = cudnnSetPersistentRNNPlan( - rnnDesc, plan) - check_status(status) - - -cpdef destroyPersistentRNNPlan(size_t plan): - status = cudnnDestroyPersistentRNNPlan(plan) - check_status(status) - - -cpdef setRNNDescriptor_v5( - size_t rnnDesc, int hiddenSize, int numLayers, - size_t dropoutDesc, int inputMode, int direction, int mode, - int dataType): - status = cudnnSetRNNDescriptor_v5( - rnnDesc, hiddenSize, numLayers, - dropoutDesc, inputMode, - direction, mode, dataType) - check_status(status) - - -cpdef setRNNDescriptor_v6( - intptr_t handle, size_t rnnDesc, int hiddenSize, int numLayers, - size_t dropoutDesc, int inputMode, int direction, int mode, - int algo, int dataType): - status = cudnnSetRNNDescriptor_v6( - handle, rnnDesc, hiddenSize, numLayers, - dropoutDesc, inputMode, - direction, mode, algo, - dataType) - check_status(status) - - -cpdef setRNNPaddingMode( - size_t rnnDesc, int paddingMode): - status = cudnnSetRNNPaddingMode( - rnnDesc, paddingMode) - check_status(status) - - -cpdef getRNNPaddingMode(size_t rnnDesc): - cdef RNNPaddingMode paddingMode - status = cudnnGetRNNPaddingMode( - rnnDesc, &paddingMode) - check_status(status) - return paddingMode - - -cpdef size_t createRNNDataDescriptor() except? 
0: - cdef RNNDataDescriptor desc - status = cudnnCreateRNNDataDescriptor(&desc) - check_status(status) - return desc - - -cpdef destroyRNNDataDescriptor(size_t RNNDataDesc): - status = cudnnDestroyRNNDataDescriptor(RNNDataDesc) - check_status(status) - - -cpdef setRNNDataDescriptor( - size_t RNNDataDesc, int dataType, size_t layout, - int maxSeqLength, int batchSize, int vectorSize, - size_t seqLengthArray, size_t paddingFill): - status = cudnnSetRNNDataDescriptor( - RNNDataDesc, dataType, - layout, maxSeqLength, batchSize, vectorSize, - seqLengthArray, paddingFill) - check_status(status) - - -cpdef getRNNDataDescriptor( - size_t RNNDataDesc, size_t dataType, - size_t layout, size_t maxSeqLength, size_t batchSize, - size_t vectorSize, int arrayLengthRequested, size_t seqLengthArray, - size_t paddingFill): - status = cudnnGetRNNDataDescriptor( - RNNDataDesc, dataType, - layout, maxSeqLength, batchSize, - vectorSize, arrayLengthRequested, seqLengthArray, - paddingFill) - check_status(status) - - -cpdef getRNNWorkspaceSize( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): - cdef size_t sizeInBytes - status = miopenGetRNNWorkspaceSize( - handle, rnnDesc, seqLength, - xDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef getRNNTrainingReserveSize( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): - cdef size_t sizeInBytes - status = miopenGetRNNTrainingReserveSize( - handle, rnnDesc, seqLength, - xDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef getRNNParamsSize( - intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType): - cdef size_t sizeInBytes - status = miopenGetRNNParamsSize( - handle, rnnDesc, xDesc, - &sizeInBytes, dataType) - check_status(status) - return sizeInBytes - - -cpdef getRNNLinLayerMatrixParams( - intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, - size_t w, int linLayerID, size_t linLayerMatDesc, size_t linLayerMat): - status = 
cudnnGetRNNLinLayerMatrixParams( - handle, rnnDesc, layer, - xDesc, wDesc, w, - linLayerID, linLayerMatDesc, linLayerMat) - check_status(status) - - -cpdef getRNNLinLayerBiasParams( - intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, - size_t w, int linLayerID, size_t linLayerBiasDesc, - size_t linLayerBias): - status = cudnnGetRNNLinLayerBiasParams( - handle, rnnDesc, layer, - xDesc, wDesc, w, - linLayerID, linLayerBiasDesc, linLayerBias) - check_status(status) - - -cpdef RNNForwardInference( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, - size_t x, size_t hxDesc, size_t hx, size_t cxDesc, - size_t cx, size_t wDesc, size_t w, size_t yDesc, - size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t workspace, size_t workSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = miopenRNNForwardInference( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - workspace, workSpaceSizeInBytes) - check_status(status) - - -cpdef RNNForwardTraining( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, - size_t wDesc, size_t w, size_t yDesc, size_t y, - size_t hyDesc, size_t hy, size_t cyDesc, size_t cy, - size_t workspace, size_t workSpaceSizeInBytes, size_t reserveSpace, - size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = miopenRNNForwardTraining( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - workspace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNBackwardData( - intptr_t handle, size_t rnnDesc, int seqLength, size_t yDesc, size_t y, - size_t dyDesc, size_t dy, size_t dhyDesc, size_t dhy, - size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, - size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, - size_t dxDesc, 
size_t dx, size_t dhxDesc, size_t dhx, - size_t dcxDesc, size_t dcx, size_t workspace, - size_t workSpaceSizeInBytes, size_t reserveSpace, - size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardData( - handle, rnnDesc, seqLength, - yDesc, y, - dyDesc, dy, - dhyDesc, dhy, - dcyDesc, dcy, - wDesc, w, - hxDesc, hx, - cxDesc, cx, - dxDesc, dx, - dhxDesc, dhx, - dcxDesc, dcx, - workspace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNBackwardWeights( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t yDesc, size_t y, - size_t workspace, size_t workSpaceSizeInBytes, size_t dwDesc, - size_t dw, size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardWeights( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - yDesc, y, - workspace, workSpaceSizeInBytes, - dwDesc, dw, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNForwardInferenceEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, - size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, - size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, - size_t workSpace, size_t workSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNForwardInferenceEx( - handle, rnnDesc, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - kDesc, keys, - cDesc, cAttn, - iDesc, iAttn, - qDesc, queries, - workSpace, workSpaceSizeInBytes) - check_status(status) - - -cpdef RNNForwardTrainingEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, - size_t yDesc, size_t y, size_t hyDesc, size_t 
hy, size_t cyDesc, - size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, - size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNForwardTrainingEx( - handle, rnnDesc, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - kDesc, keys, - cDesc, cAttn, - iDesc, iAttn, - qDesc, queries, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNBackwardDataEx( - intptr_t handle, size_t rnnDesc, size_t yDesc, size_t y, size_t dyDesc, - size_t dy, size_t dcDesc, size_t dcAttn, size_t dhyDesc, size_t dhy, - size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t dxDesc, size_t dx, - size_t dhxDesc, size_t dhx, size_t dcxDesc, size_t dcx, - size_t dkDesc, size_t dkeys, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardDataEx( - handle, rnnDesc, - yDesc, y, - dyDesc, dy, - dcDesc, dcAttn, - dhyDesc, dhy, - dcyDesc, dcy, - wDesc, w, - hxDesc, hx, - cxDesc, cx, - dxDesc, dx, - dhxDesc, dhx, - dcxDesc, dcx, - dkDesc, dkeys, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNBackwardWeightsEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t yDesc, size_t y, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t dwDesc, size_t dw, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardWeightsEx( - handle, rnnDesc, - xDesc, x, - hxDesc, hx, - yDesc, y, - workSpace, workSpaceSizeInBytes, - dwDesc, dw, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - 
-############################################################################### -# Spatial Transformer -############################################################################### - -cpdef size_t createSpatialTransformerDescriptor() except? 0: - cdef SpatialTransformerDescriptor stDesc - status = cudnnCreateSpatialTransformerDescriptor(&stDesc) - check_status(status) - return stDesc - - -cpdef destroySpatialTransformerDescriptor(size_t stDesc): - status = cudnnDestroySpatialTransformerDescriptor( - stDesc) - check_status(status) - - -cpdef setSpatialTransformerDescriptor( - size_t stDesc, size_t samplerType, int dataType, - int nbDims, size_t dimA): - status = cudnnSetSpatialTransformerNdDescriptor( - stDesc, samplerType, - dataType, nbDims, dimA) - check_status(status) - - -cpdef spatialTfGridGeneratorForward( - intptr_t handle, size_t stDesc, size_t theta, size_t grid): - _setStream(handle) - with nogil: - status = cudnnSpatialTfGridGeneratorForward( - handle, stDesc, - theta, grid) - check_status(status) - - -cpdef spatialTfGridGeneratorBackward( - intptr_t handle, size_t stDesc, size_t dgrid, size_t dtheta): - _setStream(handle) - with nogil: - status = cudnnSpatialTfGridGeneratorBackward( - handle, stDesc, - dgrid, dtheta) - check_status(status) - - -cpdef spatialTfSamplerForward( - intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, - size_t x, size_t grid, size_t beta, size_t yDesc, size_t y): - _setStream(handle) - with nogil: - status = cudnnSpatialTfSamplerForward( - handle, stDesc, - alpha, xDesc, x, grid, - beta, yDesc, y) - check_status(status) - - -cpdef spatialTfSamplerBackward( - intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, - size_t x, size_t beta, size_t dxDesc, size_t dx, size_t alphaDgrid, - size_t dyDesc, size_t dy, size_t grid, size_t betaDgrid, size_t dgrid): - _setStream(handle) - with nogil: - status = cudnnSpatialTfSamplerBackward( - handle, stDesc, - alpha, xDesc, x, beta, - dxDesc, dx, alphaDgrid, - dyDesc, dy, 
grid, - betaDgrid, dgrid) - check_status(status) - -############################################################################### -# Fused Ops -############################################################################### - -cpdef createFusedOpsConstParamPack(int ops): - cdef FusedOpsConstParamPack constPack - with nogil: - status = cudnnCreateFusedOpsConstParamPack(&constPack, ops) - check_status(status) - return constPack - -cpdef destroyFusedOpsConstParamPack(size_t constPack): - with nogil: - status = cudnnDestroyFusedOpsConstParamPack( - constPack) - check_status(status) - -cpdef setFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, - size_t param): - with nogil: - status = cudnnSetFusedOpsConstParamPackAttribute( - constPack, - paramLabel, param) - check_status(status) - -cpdef getFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, - size_t param): - cdef int isNULL = 0 - with nogil: - status = cudnnGetFusedOpsConstParamPackAttribute( - constPack, - paramLabel, param, &isNULL) - check_status(status) - return isNULL - -cpdef createFusedOpsVariantParamPack(int ops): - cdef FusedOpsVariantParamPack varPack - with nogil: - status = cudnnCreateFusedOpsVariantParamPack(&varPack, ops) - check_status(status) - return varPack - -cpdef destroyFusedOpsVariantParamPack(size_t varPack): - with nogil: - status = cudnnDestroyFusedOpsVariantParamPack( - varPack) - check_status(status) - -cpdef setFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, - size_t ptr): - with nogil: - status = cudnnSetFusedOpsVariantParamPackAttribute( - varPack, - paramLabel, ptr) - check_status(status) - -cpdef getFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, - size_t ptr): - with nogil: - status = cudnnGetFusedOpsVariantParamPackAttribute( - varPack, - paramLabel, ptr) - check_status(status) - -cpdef createFusedOpsPlan(int ops): - cdef FusedOpsPlan plan - with nogil: - status = cudnnCreateFusedOpsPlan(&plan, ops) - 
check_status(status) - return plan - -cpdef destroyFusedOpsPlan(size_t plan): - with nogil: - status = cudnnDestroyFusedOpsPlan(plan) - check_status(status) - -cpdef makeFusedOpsPlan(intptr_t handle, size_t plan, size_t constPack): - cdef size_t workspaceSizeInBytes - _setStream(handle) - with nogil: - status = cudnnMakeFusedOpsPlan(handle, plan, - constPack, - &workspaceSizeInBytes) - check_status(status) - return workspaceSizeInBytes - -cpdef fusedOpsExecute(intptr_t handle, size_t plan, size_t varPack): - _setStream(handle) - with nogil: - status = cudnnFusedOpsExecute(handle, plan, - varPack) - check_status(status) - From 842331a89cdd1a760cb9bebf8c705617265b129d Mon Sep 17 00:00:00 2001 From: bmedishe Date: Mon, 27 Nov 2023 21:49:34 +0000 Subject: [PATCH 03/26] do not skip tests --- tests/cupyx_tests/test_cudnn.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/cupyx_tests/test_cudnn.py b/tests/cupyx_tests/test_cudnn.py index 84ef7b02071..0087a1c661b 100644 --- a/tests/cupyx_tests/test_cudnn.py +++ b/tests/cupyx_tests/test_cudnn.py @@ -40,7 +40,6 @@ 'dtype': [numpy.float32, numpy.float64], 'mode': modes, })) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestCudnnActivation: @pytest.fixture(autouse=True) @@ -60,7 +59,6 @@ def test_activation_backward(self): 'dtype': [numpy.float32, numpy.float64], 'mode': coef_modes, })) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestCudnnActivationCoef: @pytest.fixture(autouse=True) @@ -83,7 +81,6 @@ def test_activation_backward(self): 'ratio': [0.0, 0.1, 0.2, 0.5], 'seed': [0, 100] })) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestCudnnDropout: @pytest.fixture(autouse=True) @@ -136,7 +133,6 @@ def test_dropout_seed(self): 'bias': [True, False], 'layout': layouts, }))) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestConvolutionForward: @pytest.fixture(autouse=True) @@ -224,7 
+220,6 @@ def test_call(self): 'auto_tune': [True, False], 'deterministic': [True, False], }))) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestConvolutionBackwardFilter: @pytest.fixture(autouse=True) @@ -303,7 +298,6 @@ def test_call(self): 'deterministic': [True, False], 'bias': [True, False], }))) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestConvolutionBackwardData: @pytest.fixture(autouse=True) From 685bc5ac61af3004f2c767be27aad5455e726f1c Mon Sep 17 00:00:00 2001 From: root Date: Fri, 27 Oct 2023 18:14:10 +0000 Subject: [PATCH 04/26] cudnn , miopen changes on 6.1 branch --- cupy_backends/cuda/libs/cudnn.pyx | 563 ++++-- cupy_backends/cuda/libs/miopen.pyx | 2543 ++++++++++++++++++++++++++++ 2 files changed, 2949 insertions(+), 157 deletions(-) create mode 100644 cupy_backends/cuda/libs/miopen.pyx diff --git a/cupy_backends/cuda/libs/cudnn.pyx b/cupy_backends/cuda/libs/cudnn.pyx index 464c59d8a00..bd4c50f3d41 100644 --- a/cupy_backends/cuda/libs/cudnn.pyx +++ b/cupy_backends/cuda/libs/cudnn.pyx @@ -9,6 +9,7 @@ from cupy_backends.cuda.api cimport driver from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module +from cupy_backends.cuda.libs.miopen import * ############################################################################### # Extern ############################################################################### @@ -758,7 +759,10 @@ class CuDNNError(RuntimeError): def __init__(self, int status): self.status = status - msg = cudnnGetErrorString(status) + if runtime._is_hip_environment: + msg = miopenGetErrorString(status) + else: + msg = cudnnGetErrorString(status) super(CuDNNError, self).__init__( 'cuDNN Error: {}'.format(msg.decode())) self._infos = [] @@ -799,7 +803,10 @@ def get_build_version(): ############################################################################### cpdef size_t getVersion() except? 
0: - return cudnnGetVersion() + if runtime._is_hip_environment: + return miopenGetVersion() + else: + return cudnnGetVersion() ############################################################################### @@ -822,14 +829,20 @@ cpdef queryRuntimeError(intptr_t handle, int mode): cpdef intptr_t create() except? 0: cdef Handle handle with nogil: - status = cudnnCreate(&handle) + if runtime._is_hip_environment: + status = miopenCreate(&handle) + else: + status = cudnnCreate(&handle) check_status(status) return handle cpdef destroy(intptr_t handle): with nogil: - status = cudnnDestroy(handle) + if runtime._is_hip_environment: + status = miopenDestroy(handle) + else: + status = cudnnDestroy(handle) check_status(status) @@ -840,14 +853,19 @@ cpdef setStream(intptr_t handle, size_t stream): raise NotImplementedError( 'calling cuDNN API during stream capture is currently ' 'unsupported') - - status = cudnnSetStream(handle, stream) + if runtime._is_hip_environment: + status = miopenSetStream(handle, stream) + else: + status = cudnnSetStream(handle, stream) check_status(status) cpdef size_t getStream(intptr_t handle) except? 0: cdef driver.Stream stream - status = cudnnGetStream(handle, &stream) + if runtime._is_hip_environment: + status = cudnnGetStream(handle, &stream) + else: + status = miopenGetStream(handle, &stream) check_status(status) return stream @@ -862,7 +880,10 @@ cdef _setStream(intptr_t handle): cpdef size_t createTensorDescriptor() except? 
0: cdef TensorDescriptor descriptor - status = cudnnCreateTensorDescriptor(&descriptor) + if runtime._is_hip_environment: + status = miopenCreateTensorDescriptor(&descriptor) + else: + status = cudnnCreateTensorDescriptor(&descriptor) check_status(status) return descriptor @@ -903,7 +924,10 @@ cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, cpdef destroyTensorDescriptor(size_t tensorDesc): - status = cudnnDestroyTensorDescriptor(tensorDesc) + if runtime._is_hip_environment: + status = miopenDestroyTensorDescriptor(tensorDesc) + else: + status = cudnnDestroyTensorDescriptor(tensorDesc) check_status(status) @@ -957,11 +981,18 @@ cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, size_t B, size_t beta, size_t cDesc, size_t C): _setStream(handle) with nogil: - status = cudnnOpTensor( - handle, opTensorDesc, alpha1, - aDesc, A, alpha2, - bDesc, B, beta, - cDesc, C) + if runtime._is_hip_environment: + status = miopenOpTensor( + handle, opTensorDesc, alpha1, + aDesc, A, alpha2, + bDesc, B, beta, + cDesc, C) + else: + status = cudnnOpTensor( + handle, opTensorDesc, alpha1, + aDesc, A, alpha2, + bDesc, B, beta, + cDesc, C) check_status(status) @@ -971,7 +1002,10 @@ cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, cpdef size_t createReduceTensorDescriptor() except? 
0: cdef ReduceTensorDescriptor reduceTensorDesc - status = cudnnCreateReduceTensorDescriptor(&reduceTensorDesc) + if runtime._is_hip_environment: + status = miopenCreateReduceTensorDescriptor(&reduceTensorDesc) + else: + status = cudnnCreateReduceTensorDescriptor(&reduceTensorDesc) check_status(status) return reduceTensorDesc @@ -979,12 +1013,20 @@ cpdef setReduceTensorDescriptor( size_t reduceTensorDesc, int reduceTensorOp, int reduceTensorCompType, int reduceTensorNanOpt, int reduceTensorIndices, int reduceTensorIndicesType): - status = cudnnSetReduceTensorDescriptor( - reduceTensorDesc, - reduceTensorOp, - reduceTensorCompType, reduceTensorNanOpt, - reduceTensorIndices, - reduceTensorIndicesType) + if runtime._is_hip_environment: + status = miopenSetReduceTensorDescriptor( + reduceTensorDesc, + reduceTensorOp, + reduceTensorCompType, reduceTensorNanOpt, + reduceTensorIndices, + reduceTensorIndicesType) + else: + status = cudnnSetReduceTensorDescriptor( + reduceTensorDesc, + reduceTensorOp, + reduceTensorCompType, reduceTensorNanOpt, + reduceTensorIndices, + reduceTensorIndicesType) check_status(status) @@ -994,25 +1036,39 @@ cpdef getReduceTensorDescriptor(size_t reduceTensorDesc): cdef NanPropagation redNanOpt cdef ReduceTensorIndices redIndices cdef IndicesType redIndicesType - status = cudnnGetReduceTensorDescriptor( - reduceTensorDesc, &redOp, - &redCompType, &redNanOpt, &redIndices, &redIndicesType) + if runtime._is_hip_environment: + status = miopenGetReduceTensorDescriptor( + reduceTensorDesc, &redOp, + &redCompType, &redNanOpt, &redIndices, &redIndicesType) + else: + status = cudnnGetReduceTensorDescriptor( + reduceTensorDesc, &redOp, + &redCompType, &redNanOpt, &redIndices, &redIndicesType) check_status(status) return redOp, redCompType, redNanOpt, redIndices, redIndicesType cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc): - status = cudnnDestroyReduceTensorDescriptor( - reduceTensorDesc) + if runtime._is_hip_environment: + status = 
miopenDestroyReduceTensorDescriptor( + reduceTensorDesc) + else: + status = cudnnDestroyReduceTensorDescriptor( + reduceTensorDesc) check_status(status) cpdef size_t getReductionIndicesSize(intptr_t handle, size_t reduceTensorDesc, size_t aDesc, size_t cDesc) except? 0: cdef size_t sizeInBytes - status = cudnnGetReductionIndicesSize( - handle, reduceTensorDesc, - aDesc, cDesc, &sizeInBytes) + if runtime._is_hip_environment: + status = miopenGetReductionIndicesSize( + handle, reduceTensorDesc, + aDesc, cDesc, &sizeInBytes) + else: + status = cudnnGetReductionIndicesSize( + handle, reduceTensorDesc, + aDesc, cDesc, &sizeInBytes) check_status(status) return sizeInBytes @@ -1021,10 +1077,16 @@ cpdef size_t getReductionWorkspaceSize(intptr_t handle, size_t reduceTensorDesc, size_t aDesc, size_t cDesc) except? 0: cdef size_t sizeInBytes - status = cudnnGetReductionWorkspaceSize( - handle, reduceTensorDesc, - aDesc, cDesc, - &sizeInBytes) + if runtime._is_hip_environment: + status = miopenGetReductionWorkspaceSize( + handle, reduceTensorDesc, + aDesc, cDesc, + &sizeInBytes) + else: + status = cudnnGetReductionWorkspaceSize( + handle, reduceTensorDesc, + aDesc, cDesc, + &sizeInBytes) check_status(status) return sizeInBytes @@ -1035,29 +1097,46 @@ cpdef reduceTensor(intptr_t handle, size_t reduceTensorDesc, size_t indices, size_t A, size_t beta, size_t cDesc, size_t C): _setStream(handle) with nogil: - status = cudnnReduceTensor( - handle, reduceTensorDesc, - indices, indicesSizeInBytes, workspace, - workspaceSizeInBytes, alpha, aDesc, - A, beta, cDesc, C) + if runtime._is_hip_environment: + status = miopenReduceTensor( + handle, reduceTensorDesc, + indices, indicesSizeInBytes, workspace, + workspaceSizeInBytes, alpha, aDesc, + A, beta, cDesc, C) + else: + status = cudnnReduceTensor( + handle, reduceTensorDesc, + indices, indicesSizeInBytes, workspace, + workspaceSizeInBytes, alpha, aDesc, + A, beta, cDesc, C) check_status(status) cpdef setTensor(intptr_t handle, size_t 
yDesc, size_t y, size_t valuePtr): _setStream(handle) with nogil: - status = cudnnSetTensor( - handle, yDesc, y, - valuePtr) + if runtime._is_hip_environment: + status = miopenSetTensor( + handle, yDesc, y, + valuePtr) + else: + status = cudnnSetTensor( + handle, yDesc, y, + valuePtr) check_status(status) cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha): _setStream(handle) with nogil: - status = cudnnScaleTensor( - handle, yDesc, y, - alpha) + if runtime._is_hip_environment: + status = miopenScaleTensor( + handle, yDesc, y, + alpha) + else: + status = cudnnScaleTensor( + handle, yDesc, y, + alpha) check_status(status) @@ -1115,7 +1194,10 @@ cpdef destroyFilterDescriptor(size_t filterDesc): cpdef size_t createConvolutionDescriptor() except? 0: cdef ConvolutionDescriptor desc - status = cudnnCreateConvolutionDescriptor(&desc) + if runtime._is_hip_environment: + status = miopenCreateConvolutionDescriptor(&desc) + else: + status = cudnnCreateConvolutionDescriptor(&desc) check_status(status) return desc @@ -1130,21 +1212,27 @@ cpdef size_t getConvolutionMathType(size_t convDesc) except? 0: cdef MathType mathType status = cudnnGetConvolutionMathType( convDesc, &mathType) - check_status(status) return mathType cpdef setConvolutionGroupCount(size_t convDesc, int groupCount): - status = cudnnSetConvolutionGroupCount( - convDesc, groupCount) + if runtime._is_hip_environment: + status = miopenSetConvolutionGroupCount( + convDesc, groupCount) + else: + status = cudnnSetConvolutionGroupCount( + convDesc, groupCount) check_status(status) cpdef int getConvolutionGroupCount(size_t convDesc) except? 
-1: cdef int groupCount - status = cudnnGetConvolutionGroupCount( - convDesc, &groupCount) - check_status(status) + if runtime._is_hip_environment: + status = miopenGetConvolutionGroupCount( + convDesc, &groupCount) + else: + status = cudnnGetConvolutionGroupCount( + convDesc, &groupCount) return groupCount @@ -1177,8 +1265,12 @@ cpdef setConvolutionNdDescriptor_v3( cpdef destroyConvolutionDescriptor(size_t convDesc): - status = cudnnDestroyConvolutionDescriptor( - convDesc) + if runtime._is_hip_environment: + status = miopenDestroyConvolutionDescriptor( + convDesc) + else: + status = cudnnDestroyConvolutionDescriptor( + convDesc) check_status(status) @@ -1286,13 +1378,21 @@ cpdef convolutionForward( size_t destDesc, size_t destData): _setStream(handle) with nogil: - status = cudnnConvolutionForward( - handle, alpha, - srcDesc, srcData, - filterDesc, filterData, - convDesc, algo, - workSpace, workSpaceSizeInBytes, beta, - destDesc, destData) + if runtime._is_hip_environment: + status = miopenConvolutionForward(handle, alpha, + srcDesc, srcData, + filterDesc, filterData, + convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, + destDesc, destData) + else: + status = cudnnConvolutionForward( + handle, alpha, + srcDesc, srcData, + filterDesc, filterData, + convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, + destDesc, destData) check_status(status) @@ -1301,10 +1401,16 @@ cpdef convolutionBackwardBias( size_t beta, size_t destDesc, size_t destData): _setStream(handle) with nogil: - status = cudnnConvolutionBackwardBias( - handle, alpha, - srcDesc, srcData, beta, - destDesc, destData) + if runtime._is_hip_environment: + status = miopenConvolutionBackwardBias( + handle, alpha, + srcDesc, srcData, beta, + destDesc, destData) + else: + status = cudnnConvolutionBackwardBias( + handle, alpha, + srcDesc, srcData, beta, + destDesc, destData) check_status(status) @@ -1545,7 +1651,10 @@ cpdef convolutionBackwardData_v3( cpdef size_t createPoolingDescriptor() except? 
0: cdef PoolingDescriptor desc - status = cudnnCreatePoolingDescriptor(&desc) + if runtime._is_hip_environment: + status = miopenCreatePoolingDescriptor(&desc) + else: + status = cudnnCreatePoolingDescriptor(&desc) check_status(status) return desc @@ -1572,7 +1681,10 @@ cpdef setPoolingNdDescriptor_v4( cpdef destroyPoolingDescriptor(size_t poolingDesc): - status = cudnnDestroyPoolingDescriptor(poolingDesc) + if runtime._is_hip_environment: + status = miopenDestroyPoolingDescriptor(poolingDesc) + else: + status = cudnnDestroyPoolingDescriptor(poolingDesc) check_status(status) @@ -1611,9 +1723,14 @@ CUDNN_BN_MIN_EPSILON = _CUDNN_BN_MIN_EPSILON cpdef deriveBNTensorDescriptor( size_t derivedBnDesc, size_t xDesc, int mode): - status = cudnnDeriveBNTensorDescriptor( - derivedBnDesc, xDesc, - mode) + if runtime._is_hip_environment: + status = miopenDeriveBNTensorDescriptor( + derivedBnDesc, xDesc, + mode) + else: + status = cudnnDeriveBNTensorDescriptor( + derivedBnDesc, xDesc, + mode) check_status(status) @@ -1627,14 +1744,24 @@ cpdef batchNormalizationForwardTraining( double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance): _setStream(handle) with nogil: - status = cudnnBatchNormalizationForwardTraining( - handle, mode, - alpha, beta, xDesc, - x, yDesc, y, - bnScaleBiasMeanVarDesc, bnScale, - bnBias, exponentialAverageFactor, - resultRunningMean, resultRunningVariance, - epsilon, resultSaveMean, resultSaveInvVariance) + if runtime._is_hip_environment: + status = miopenBatchNormalizationForwardTraining( + handle, mode, + alpha, beta, xDesc, + x, yDesc, y, + bnScaleBiasMeanVarDesc, bnScale, + bnBias, exponentialAverageFactor, + resultRunningMean, resultRunningVariance, + epsilon, resultSaveMean, resultSaveInvVariance) + else: + status = cudnnBatchNormalizationForwardTraining( + handle, mode, + alpha, beta, xDesc, + x, yDesc, y, + bnScaleBiasMeanVarDesc, bnScale, + bnBias, exponentialAverageFactor, + resultRunningMean, resultRunningVariance, + epsilon, 
resultSaveMean, resultSaveInvVariance) check_status(status) @@ -1647,13 +1774,22 @@ cpdef batchNormalizationForwardInference( double epsilon): _setStream(handle) with nogil: - status = cudnnBatchNormalizationForwardInference( - handle, mode, - alpha, beta, xDesc, - x, yDesc, y, - bnScaleBiasMeanVarDesc, bnScale, - bnBias, estimatedMean, estimatedVariance, - epsilon) + if runtime._is_hip_environment: + status = miopenBatchNormalizationForwardInference( + handle, mode, + alpha, beta, xDesc, + x, yDesc, y, + bnScaleBiasMeanVarDesc, bnScale, + bnBias, estimatedMean, estimatedVariance, + epsilon) + else: + status = cudnnBatchNormalizationForwardInference( + handle, mode, + alpha, beta, xDesc, + x, yDesc, y, + bnScaleBiasMeanVarDesc, bnScale, + bnBias, estimatedMean, estimatedVariance, + epsilon) check_status(status) @@ -1668,16 +1804,28 @@ cpdef batchNormalizationBackward( double epsilon, size_t savedMean, size_t savedInvVariance): _setStream(handle) with nogil: - status = cudnnBatchNormalizationBackward( - handle, mode, - alphaDataDiff, betaDataDiff, - alphaParamDiff, betaParamDiff, - xDesc, x, - dyDesc, dy, - dxDesc, dx, - dBnScaleBiasDesc, bnScale, - dBnScaleResult, dBnBiasResult, - epsilon, savedMean, savedInvVariance) + if runtime._is_hip_environment: + status = miopenBatchNormalizationBackward( + handle, mode, + alphaDataDiff, betaDataDiff, + alphaParamDiff, betaParamDiff, + xDesc, x, + dyDesc, dy, + dxDesc, dx, + dBnScaleBiasDesc, bnScale, + dBnScaleResult, dBnBiasResult, + epsilon, savedMean, savedInvVariance) + else: + status = cudnnBatchNormalizationBackward( + handle, mode, + alphaDataDiff, betaDataDiff, + alphaParamDiff, betaParamDiff, + xDesc, x, + dyDesc, dy, + dxDesc, dx, + dBnScaleBiasDesc, bnScale, + dBnScaleResult, dBnBiasResult, + epsilon, savedMean, savedInvVariance) check_status(status) @@ -1823,7 +1971,10 @@ cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( cpdef size_t createActivationDescriptor() except? 
0: cdef ActivationDescriptor activationDesc - status = cudnnCreateActivationDescriptor(&activationDesc) + if runtime._is_hip_environment: + status = miopenCreateActivationDescriptor(&activationDesc) + else: + status = cudnnCreateActivationDescriptor(&activationDesc) check_status(status) return activationDesc @@ -1837,8 +1988,12 @@ cpdef setActivationDescriptor( cpdef destroyActivationDescriptor(size_t activationDesc): - status = cudnnDestroyActivationDescriptor( - activationDesc) + if runtime._is_hip_environment: + status = miopenDestroyActivationDescriptor( + activationDesc) + else: + status = cudnnDestroyActivationDescriptor( + activationDesc) check_status(status) @@ -1847,10 +2002,16 @@ cpdef softmaxForward( size_t srcData, size_t beta, size_t dstDesc, size_t dstData): _setStream(handle) with nogil: - status = cudnnSoftmaxForward( - handle, algorithm, mode, - alpha, srcDesc, srcData, - beta, dstDesc, dstData) + if runtime._is_hip_environment: + status = miopenSoftmaxForward( + handle, algorithm, mode, + alpha, srcDesc, srcData, + beta, dstDesc, dstData) + else: + status = cudnnSoftmaxForward( + handle, algorithm, mode, + alpha, srcDesc, srcData, + beta, dstDesc, dstData) check_status(status) @@ -1860,11 +2021,18 @@ cpdef softmaxBackward( size_t destDiffDesc, size_t destDiffData): _setStream(handle) with nogil: - status = cudnnSoftmaxBackward( - handle, algorithm, mode, - alpha, srcDesc, srcData, - srcDiffDesc, srcDiffData, beta, - destDiffDesc, destDiffData) + if runtime._is_hip_environment: + status = miopenSoftmaxBackward( + handle, algorithm, mode, + alpha, srcDesc, srcData, + srcDiffDesc, srcDiffData, beta, + destDiffDesc, destDiffData) + else: + status = cudnnSoftmaxBackward( + handle, algorithm, mode, + alpha, srcDesc, srcData, + srcDiffDesc, srcDiffData, beta, + destDiffDesc, destDiffData) check_status(status) @@ -1902,20 +2070,30 @@ cpdef activationBackward_v4( cpdef size_t createDropoutDescriptor() except? 
0: cdef DropoutDescriptor desc - status = cudnnCreateDropoutDescriptor(&desc) + if runtime._is_hip_environment: + status = miopenCreateDropoutDescriptor(&desc) + else: + status = cudnnCreateDropoutDescriptor(&desc) check_status(status) return desc cpdef destroyDropoutDescriptor(size_t dropoutDesc): - status = cudnnDestroyDropoutDescriptor(dropoutDesc) + if runtime._is_hip_environment: + status = miopenDestroyDropoutDescriptor(dropoutDesc) + else: + status = cudnnDestroyDropoutDescriptor(dropoutDesc) check_status(status) cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? -1: cdef size_t sizeInBytes - status = cudnnDropoutGetStatesSize( - handle, &sizeInBytes) + if runtime._is_hip_environment: + status = miopenDropoutGetStatesSize( + handle, &sizeInBytes) + else: + status = cudnnDropoutGetStatesSize( + handle, &sizeInBytes) check_status(status) return sizeInBytes @@ -1931,8 +2109,12 @@ cpdef setDropoutDescriptor( cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 0: cdef size_t sizeInBytes - status = cudnnDropoutGetReserveSpaceSize( - xDesc, &sizeInBytes) + if runtime._is_hip_environment: + status = miopenDropoutGetReserveSpaceSize( + xDesc, &sizeInBytes) + else: + status = cudnnDropoutGetReserveSpaceSize( + xDesc, &sizeInBytes) check_status(status) return sizeInBytes @@ -1972,12 +2154,18 @@ cpdef dropoutBackward( ############################################################################### cpdef size_t createCTCLossDescriptor() except? 
0: cdef CTCLossDescriptor desc - status = cudnnCreateCTCLossDescriptor(&desc) + if runtime._is_hip_environment: + status = miopenCreateCTCLossDescriptor(&desc) + else: + status = cudnnCreateCTCLossDescriptor(&desc) check_status(status) return desc cpdef destroyCTCLossDescriptor(size_t ctcLossDesc): - status = cudnnDestroyCTCLossDescriptor(ctcLossDesc) + if runtime._is_hip_environment: + status = miopenDestroyCTCLossDescriptor(ctcLossDesc) + else: + status = cudnnDestroyCTCLossDescriptor(ctcLossDesc) check_status(status) cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType): @@ -1997,11 +2185,18 @@ cpdef size_t getCTCLossWorkspaceSize( size_t labels, size_t labelLengths, size_t inputLengths, int algo, size_t ctcLossDesc) except? 0: cdef size_t sizeInBytes - status = cudnnGetCTCLossWorkspaceSize( - handle, probsDesc, - gradientsDesc, - labels, labelLengths, inputLengths, - algo, ctcLossDesc, &sizeInBytes) + if runtime._is_hip_environment: + status = miopenGetCTCLossWorkspaceSize( + handle, probsDesc, + gradientsDesc, + labels, labelLengths, inputLengths, + algo, ctcLossDesc, &sizeInBytes) + else: + status = cudnnGetCTCLossWorkspaceSize( + handle, probsDesc, + gradientsDesc, + labels, labelLengths, inputLengths, + algo, ctcLossDesc, &sizeInBytes) check_status(status) return sizeInBytes @@ -2011,12 +2206,20 @@ cpdef CTCLoss( size_t costs, size_t gradientsDesc, size_t gradients, int algo, size_t ctcLossDesc, size_t workspace, size_t workSpaceSizeInBytes): - status = cudnnCTCLoss( - handle, probsDesc, probs, - labels, labelLengths, inputLengths, - costs, gradientsDesc, gradients, - algo, ctcLossDesc, - workspace, workSpaceSizeInBytes) + if runtime._is_hip_environment: + status = miopenCTCLoss( + handle, probsDesc, probs, + labels, labelLengths, inputLengths, + costs, gradientsDesc, gradients, + algo, ctcLossDesc, + workspace, workSpaceSizeInBytes) + else: + status = cudnnCTCLoss( + handle, probsDesc, probs, + labels, labelLengths, inputLengths, + costs, 
gradientsDesc, gradients, + algo, ctcLossDesc, + workspace, workSpaceSizeInBytes) check_status(status) @@ -2026,13 +2229,19 @@ cpdef CTCLoss( cpdef size_t createRNNDescriptor() except? 0: cdef RNNDescriptor desc - status = cudnnCreateRNNDescriptor(&desc) + if runtime._is_hip_environment: + status = miopenCreateRNNDescriptor(&desc) + else: + status = cudnnCreateRNNDescriptor(&desc) check_status(status) return desc cpdef destroyRNNDescriptor(size_t rnnDesc): - status = cudnnDestroyRNNDescriptor(rnnDesc) + if runtime._is_hip_environment: + status = miopenDestroyRNNDescriptor(rnnDesc) + else: + status = cudnnDestroyRNNDescriptor(rnnDesc) check_status(status) @@ -2134,9 +2343,14 @@ cpdef getRNNDataDescriptor( cpdef getRNNWorkspaceSize( intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): cdef size_t sizeInBytes - status = cudnnGetRNNWorkspaceSize( - handle, rnnDesc, seqLength, - xDesc, &sizeInBytes) + if runtime._is_hip_environment: + status = miopenGetRNNWorkspaceSize( + handle, rnnDesc, seqLength, + xDesc, &sizeInBytes) + else: + status = cudnnGetRNNWorkspaceSize( + handle, rnnDesc, seqLength, + xDesc, &sizeInBytes) check_status(status) return sizeInBytes @@ -2144,9 +2358,14 @@ cpdef getRNNWorkspaceSize( cpdef getRNNTrainingReserveSize( intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): cdef size_t sizeInBytes - status = cudnnGetRNNTrainingReserveSize( - handle, rnnDesc, seqLength, - xDesc, &sizeInBytes) + if runtime._is_hip_environment: + status = miopenGetRNNTrainingReserveSize( + handle, rnnDesc, seqLength, + xDesc, &sizeInBytes) + else: + status = cudnnGetRNNTrainingReserveSize( + handle, rnnDesc, seqLength, + xDesc, &sizeInBytes) check_status(status) return sizeInBytes @@ -2154,9 +2373,14 @@ cpdef getRNNTrainingReserveSize( cpdef getRNNParamsSize( intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType): cdef size_t sizeInBytes - status = cudnnGetRNNParamsSize( - handle, rnnDesc, xDesc, - &sizeInBytes, dataType) + if 
runtime._is_hip_environment: + status = miopenGetRNNParamsSize( + handle, rnnDesc, xDesc, + &sizeInBytes, dataType) + else: + status = cudnnGetRNNParamsSize( + handle, rnnDesc, xDesc, + &sizeInBytes, dataType) check_status(status) return sizeInBytes @@ -2190,16 +2414,28 @@ cpdef RNNForwardInference( size_t cy, size_t workspace, size_t workSpaceSizeInBytes): _setStream(handle) with nogil: - status = cudnnRNNForwardInference( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - workspace, workSpaceSizeInBytes) + if runtime._is_hip_environment: + status = miopenRNNForwardInference( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + workspace, workSpaceSizeInBytes) + else: + status = cudnnRNNForwardInference( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + workspace, workSpaceSizeInBytes) check_status(status) @@ -2212,17 +2448,30 @@ cpdef RNNForwardTraining( size_t reserveSpaceSizeInBytes): _setStream(handle) with nogil: - status = cudnnRNNForwardTraining( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - workspace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) + if runtime._is_hip_environment: + status = miopenRNNForwardTraining( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + workspace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + else: + status = cudnnRNNForwardTraining( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + workspace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) check_status(status) diff --git a/cupy_backends/cuda/libs/miopen.pyx 
b/cupy_backends/cuda/libs/miopen.pyx new file mode 100644 index 00000000000..c7c3811c885 --- /dev/null +++ b/cupy_backends/cuda/libs/miopen.pyx @@ -0,0 +1,2543 @@ +# distutils: language = c++ + +"""Thin wrapper of cuDNN.""" +# NOTE: This wrapper does not cover all APIs of cuDNN v4. +cimport cython # NOQA +from libcpp cimport vector + +from cupy_backends.cuda.api cimport driver +from cupy_backends.cuda.api cimport runtime +from cupy_backends.cuda cimport stream as stream_module + +############################################################################### +# Extern +############################################################################### + +cdef extern from '../../cupy_cudnn.h' nogil: + # Types + ctypedef int ActivationMode 'miopenActivationMode_t' + ctypedef int AddMode 'cudnnAddMode_t' + ctypedef int BatchNormMode 'miopenBatchNormMode_t' + ctypedef int BatchNormOps 'cudnnBatchNormOps_t' + ctypedef int ConvolutionBwdDataAlgo 'miopenBwdDataAlgorithm_t' + ctypedef int ConvolutionBwdDataPreference \ + 'cudnnConvolutionBwdDataPreference_t' + ctypedef struct ConvolutionBwdDataAlgoPerf \ + 'cudnnConvolutionBwdDataAlgoPerf_t': # NOQA: E125 + int algo + int status + float time + size_t memory + ctypedef struct ConvolutionBwdDataAlgoPerf_v7 \ + 'cudnnConvolutionBwdDataAlgoPerf_v7_t': # NOQA: E125 + int algo + int status + float time + size_t memory + int determinism + int mathType + ctypedef int ConvolutionBwdFilterAlgo 'miopenConvBwdWeightsAlgorithm_t' + ctypedef int ConvolutionBwdFilterPreference \ + 'cudnnConvolutionBwdFilterPreference_t' + ctypedef struct ConvolutionBwdFilterAlgoPerf \ + 'cudnnConvolutionBwdFilterAlgoPerf_t': # NOQA: E125 + int algo + int status + float time + size_t memory + ctypedef struct ConvolutionBwdFilterAlgoPerf_v7 \ + 'cudnnConvolutionBwdFilterAlgoPerf_v7_t': # NOQA: E125 + int algo + int status + float time + size_t memory + int determinism + int mathType + ctypedef int ConvolutionFwdAlgo 'miopenConvolutionFwdAlgorithm_t' + ctypedef 
int ConvolutionFwdPreference 'cudnnConvolutionFwdPreference_t' + ctypedef struct ConvolutionFwdAlgoPerf 'cudnnConvolutionFwdAlgoPerf_t': + int algo + int status + float time + size_t memory + ctypedef struct ConvolutionFwdAlgoPerf_v7 \ + 'cudnnConvolutionFwdAlgoPerf_v7_t': # NOQA: E125 + int algo + int status + float time + size_t memory + int determinism + int mathType + ctypedef int ConvolutionMode 'miopenConvolutionMode_t' + ctypedef int DataType 'miopenDataType_t' + ctypedef int MathType 'cudnnMathType_t' + ctypedef int DirectionMode 'miopenRNNDirectionMode_t' + ctypedef int NanPropagation 'miopenNanPropagation_t' + ctypedef int PoolingMode 'miopenPoolingMode_t' + ctypedef int RNNInputMode 'miopenRNNInputMode_t' + ctypedef int CTCLossAlgo 'miopenCTCLossAlgo_t' + ctypedef int RNNMode 'miopenRNNMode_t' + ctypedef int RNNAlgo 'miopenRNNAlgo_t' + ctypedef int RNNDataLayout 'cudnnRNNDataLayout_t' + ctypedef int RNNPaddingMode 'cudnnRNNPaddingMode_t' + ctypedef int SoftmaxAlgorithm 'miopenSoftmaxAlgorithm_t' + ctypedef int SoftmaxMode 'miopenSoftmaxMode_t' + ctypedef int Status 'miopenStatus_t' + ctypedef int TensorFormat 'cudnnTensorFormat_t' + ctypedef int OpTensorOp 'miopenTensorOp_t' + + ctypedef int ReduceTensorOp 'miopenReduceTensorOp_t' + ctypedef int ReduceTensorIndices 'miopenReduceTensorIndices_t' + ctypedef int IndicesType 'miopenIndicesType_t' + ctypedef int ErrQueryMode 'cudnnErrQueryMode_t' + ctypedef int FusedOps 'cudnnFusedOps_t' + ctypedef int FusedOpsConstParamLabel 'cudnnFusedOpsConstParamLabel_t' + ctypedef int FusedOpsPointerPlaceHolder 'cudnnFusedOpsPointerPlaceHolder_t' + ctypedef int FusedOpsVariantParamLabel 'cudnnFusedOpsVariantParamLabel_t' + ctypedef struct RuntimeTag 'cudnnRuntimeTag_t' + + ctypedef void* ActivationDescriptor 'miopenActivationDescriptor_t' + ctypedef void* ConvolutionDescriptor 'miopenConvolutionDescriptor_t' + ctypedef void* DropoutDescriptor 'miopenDropoutDescriptor_t' + ctypedef void* FilterDescriptor 
'cudnnFilterDescriptor_t' + ctypedef void* Handle 'miopenHandle_t' + ctypedef void* PoolingDescriptor 'miopenPoolingDescriptor_t' + ctypedef void* CTCLossDescriptor 'miopenCTCLossDescriptor_t' + ctypedef void* RNNDescriptor 'miopenRNNDescriptor_t' + ctypedef void* RNNDataDescriptor 'miopenRNNDataDescriptor_t' + ctypedef void* PersistentRNNPlan 'cudnnPersistentRNNPlan_t' + ctypedef void* TensorDescriptor 'miopenTensorDescriptor_t' + ctypedef void* OpTensorDescriptor 'miopenTensorDescriptor_t' + ctypedef void* ReduceTensorDescriptor 'miopenReduceTensorDescriptor_t' + ctypedef void* SpatialTransformerDescriptor \ + 'cudnnSpatialTransformerDescriptor_t' + ctypedef void* SamplerType 'cudnnSamplerType_t' + ctypedef void* FusedOpsConstParamPack 'cudnnFusedOpsConstParamPack_t' + ctypedef void* FusedOpsVariantParamPack 'cudnnFusedOpsVariantParamPack_t' + ctypedef void* FusedOpsPlan 'cudnnFusedOpsPlan_t' + + # Error handling + const char* miopenGetErrorString(Status status) + + # Version + size_t miopenGetVersion() + + # Runtime error checking + int cudnnQueryRuntimeError(Handle handle, Status *rstatus, + ErrQueryMode mode, RuntimeTag *tag) + + # Initialization and CUDA cooperation + int miopenCreate(Handle* handle) + int miopenDestroy(Handle handle) + int miopenSetStream(Handle handle, driver.Stream stream) + int miopenGetStream(Handle handle, driver.Stream* stream) + + # Tensor manipulation + int miopenCreateTensorDescriptor(TensorDescriptor* descriptor) + int miopenSet4dTensorDescriptor( + TensorDescriptor tensorDesc, + DataType dataType, int n, int c, int h, int w) + int miopenSet4dTensorDescriptorEx( + TensorDescriptor tensorDesc, DataType dataType, + int n, int c, int h, int w, + int nStride, int cStride, int hStride, int wStride) + int miopenGet4dTensorDescriptor( + TensorDescriptor tensorDesc, DataType* dataType, + int* n, int* c, int* h, int* w, + int* nStride, int* cStride, int* hStride, int* wStride) + int cudnnSetTensorNdDescriptor( + TensorDescriptor tensorDesc, 
DataType dataType, int nbDims, + int* dimA, int* strideA) + int miopenDestroyTensorDescriptor(TensorDescriptor tensorDesc) + int cudnnAddTensor_v3( + Handle handle, void* alpha, TensorDescriptor bDesc, + void* b, void* beta, TensorDescriptor yDesc, void* y) + + # Tensor operations + int cudnnCreateOpTensorDescriptor(OpTensorDescriptor* opTensorDesc) + int cudnnSetOpTensorDescriptor( + OpTensorDescriptor opTensorDesc, OpTensorOp opTensorOp, + DataType opTensorCompType, NanPropagation opTensorNanOpt) + int cudnnGetOpTensorDescriptor( + OpTensorDescriptor opTensorDesc, OpTensorOp* opTensorOp, + DataType* opTensorCompType, NanPropagation* opTensorNanOpt) + int cudnnDestroyOpTensorDescriptor(OpTensorDescriptor opTensorDesc) + int miopenOpTensor( + Handle handle, OpTensorDescriptor opTensorDesc, void* alpha1, + TensorDescriptor aDesc, void* A, void* alpha2, + TensorDescriptor bDesc, void* B, void* beta, + TensorDescriptor cDesc, void* C) + + # Tensor reductions + int miopenCreateReduceTensorDescriptor( + ReduceTensorDescriptor* reduceTensorDesc) + int miopenSetReduceTensorDescriptor( + ReduceTensorDescriptor reduceTensorDesc, ReduceTensorOp reduceTensorOp, + DataType reduceTensorCompType, NanPropagation reduceTensorNanOpt, + ReduceTensorIndices reduceTensorIndices, + IndicesType reduceTensorIndicesType) + int miopenGetReduceTensorDescriptor( + ReduceTensorDescriptor reduceTensorDesc, + ReduceTensorOp* reduceTensorOp, DataType* reduceTensorCompType, + NanPropagation* reduceTensorNanOpt, + ReduceTensorIndices* reduceTensorIndices, + IndicesType* reduceTensorIndicesType) + int miopenDestroyReduceTensorDescriptor( + ReduceTensorDescriptor reduceTensorDesc) + int miopenGetReductionIndicesSize( + Handle handle, ReduceTensorDescriptor reduceTensorDesc, + TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* sizeInBytes) + int miopenGetReductionWorkspaceSize( + Handle handle, ReduceTensorDescriptor reduceTensorDesc, + TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* 
sizeInBytes) + int miopenReduceTensor( + Handle handle, ReduceTensorDescriptor reduceTensorDesc, void* indices, + size_t indicesSizeInBytes, void* workspace, + size_t workspaceSizeInBytes, void* alpha, TensorDescriptor aDesc, + void* A, void* beta, TensorDescriptor cDesc, void* c) + int miopenSetTensor( + Handle handle, TensorDescriptor yDesc, void* y, void* valuePtr) + int miopenScaleTensor( + Handle handle, TensorDescriptor yDesc, void* y, void* alpha) + + # Filter manipulation + int cudnnCreateFilterDescriptor(FilterDescriptor* filterDesc) + int cudnnSetFilter4dDescriptor_v4( + FilterDescriptor filterDesc, DataType dataType, + TensorFormat format, int k, int c, int h, int w) + int cudnnSetFilterNdDescriptor_v4( + FilterDescriptor filterDesc, DataType dataType, + TensorFormat format, int nbDims, const int filterDimA[]) + int cudnnGetFilterNdDescriptor_v4( + FilterDescriptor wDesc, int nbDimsRequested, DataType* dataType, + TensorFormat* format, int* nbDims, int filterDimA[]) + int cudnnDestroyFilterDescriptor(FilterDescriptor filterDesc) + + # Convolution + int miopenCreateConvolutionDescriptor(ConvolutionDescriptor* convDesc) + int cudnnSetConvolutionMathType( + ConvolutionDescriptor convDesc, MathType mathType) + int cudnnGetConvolutionMathType( + ConvolutionDescriptor convDesc, MathType *mathType) + int miopenSetConvolutionGroupCount( + ConvolutionDescriptor convDesc, int groupCount) + int miopenGetConvolutionGroupCount( + ConvolutionDescriptor convDesc, int *groupCount) + int cudnnSetConvolution2dDescriptor_v4( + ConvolutionDescriptor convDesc, int pad_h, int pad_w, int u, + int v, int dilation_h, int dilation_w, ConvolutionMode mode) + int cudnnSetConvolution2dDescriptor_v5( + ConvolutionDescriptor convDesc, int pad_h, int pad_w, int u, + int v, int dilation_h, int dilation_w, ConvolutionMode mode, + DataType computeType) + int cudnnSetConvolutionNdDescriptor_v3( + ConvolutionDescriptor convDesc, int arrayLength, int* padA, + int* filterStrideA, int* 
dilationA, ConvolutionMode mode, + DataType dataType) + int miopenDestroyConvolutionDescriptor(ConvolutionDescriptor conDesc) + int cudnnFindConvolutionForwardAlgorithm( + Handle handle, TensorDescriptor xDesc, FilterDescriptor wDesc, + ConvolutionDescriptor convDesc, TensorDescriptor yDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionFwdAlgoPerf* perfResults) + int cudnnFindConvolutionForwardAlgorithmEx( + Handle handle, TensorDescriptor xDesc, void* x, + FilterDescriptor wDesc, void* w, ConvolutionDescriptor convDesc, + TensorDescriptor yDesc, void* y, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionFwdAlgoPerf* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnFindConvolutionForwardAlgorithmEx_v7( + Handle handle, TensorDescriptor xDesc, void* x, + FilterDescriptor wDesc, void* w, ConvolutionDescriptor convDesc, + TensorDescriptor yDesc, void* y, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionFwdAlgoPerf_v7* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnGetConvolutionForwardAlgorithm_v6( + Handle handle, TensorDescriptor srcDesc, + FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, + TensorDescriptor destDesc, ConvolutionFwdPreference preference, + size_t memoryLimitInbytes, ConvolutionFwdAlgo* algo) + int cudnnGetConvolutionForwardAlgorithm_v7( + Handle handle, TensorDescriptor srcDesc, + FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, + TensorDescriptor destDesc, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionFwdAlgoPerf_v7* perfResults) + int miopenConvolutionForwardGetWorkSpaceSize( + Handle handle, TensorDescriptor srcDesc, + FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, + TensorDescriptor destDesc, + size_t* sizeInBytes) + int cudnnConvolutionForward( + Handle handle, void* alpha, TensorDescriptor srcDesc, + void* srcData, FilterDescriptor filterDesc, void* filterData, + ConvolutionDescriptor convDesc, 
ConvolutionFwdAlgo algo, + void* workSpace, size_t workSpaceSizeInBytes, void* beta, + TensorDescriptor destDesc, void* destData) + int cudnnConvolutionBackwardBias( + Handle handle, void* alpha, + TensorDescriptor srcDesc, void* srcData, void* beta, + TensorDescriptor destDesc, void* destData) + int cudnnFindConvolutionBackwardFilterAlgorithm( + Handle handle, TensorDescriptor xDesc, TensorDescriptor dyDesc, + ConvolutionDescriptor convDesc, FilterDescriptor dwDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionBwdFilterAlgoPerf* perfResults) + int cudnnFindConvolutionBackwardFilterAlgorithmEx( + Handle handle, TensorDescriptor xDesc, void* x, + TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, + FilterDescriptor dwDesc, void* dw, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionBwdFilterAlgoPerf* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnFindConvolutionBackwardFilterAlgorithmEx_v7( + Handle handle, TensorDescriptor xDesc, void* x, + TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, + FilterDescriptor dwDesc, void* dw, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionBwdFilterAlgoPerf_v7* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnGetConvolutionBackwardFilterAlgorithm_v6( + Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, FilterDescriptor filterDesc, + ConvolutionBwdFilterPreference preference, + size_t memoryLimitInbytes, ConvolutionBwdFilterAlgo* algo) + int cudnnGetConvolutionBackwardFilterAlgorithm_v7( + Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, FilterDescriptor gradDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionBwdFilterAlgoPerf_v7* perfResults) + int cudnnGetConvolutionBackwardFilterWorkspaceSize( + Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, + ConvolutionDescriptor 
convDesc, FilterDescriptor filterDesc, + ConvolutionBwdFilterAlgo algo, size_t* sizeInBytes) + int cudnnConvolutionBackwardFilter_v3( + Handle handle, void* alpha, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor diffDesc, void* diffData, + ConvolutionDescriptor convDesc, ConvolutionBwdFilterAlgo algo, + void* workSpace, size_t workSpaceSizeInBytes, void* beta, + FilterDescriptor gradDesc, void* gradData) + int cudnnGetConvolutionBackwardDataAlgorithm_v6( + Handle handle, FilterDescriptor filterDesc, + TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, TensorDescriptor gradDesc, + ConvolutionBwdDataPreference preference, + size_t memoryLimitInbytes, ConvolutionBwdDataAlgo* algo) + int cudnnGetConvolutionBackwardDataAlgorithm_v7( + Handle handle, TensorDescriptor filterDesc, TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, FilterDescriptor gradDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionBwdDataAlgoPerf_v7* perfResults) + int cudnnFindConvolutionBackwardDataAlgorithm( + Handle handle, TensorDescriptor wDesc, TensorDescriptor dyDesc, + ConvolutionDescriptor convDesc, FilterDescriptor dxDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionBwdDataAlgoPerf* perfResults) + int cudnnFindConvolutionBackwardDataAlgorithmEx( + Handle handle, FilterDescriptor wDesc, void* w, + TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, + TensorDescriptor dxDesc, void* dx, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionBwdDataAlgoPerf* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnFindConvolutionBackwardDataAlgorithmEx_v7( + Handle handle, FilterDescriptor wDesc, void* w, + TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, + TensorDescriptor dxDesc, void* dx, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionBwdDataAlgoPerf_v7* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int 
miopenConvolutionBackwardDataGetWorkSpaceSize( + Handle handle, FilterDescriptor filterDesc, + TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, TensorDescriptor gradDesc, + size_t* sizeInBytes) + int cudnnConvolutionBackwardData_v3( + Handle handle, void* alpha, + FilterDescriptor filterDesc, void* filterData, + TensorDescriptor diffDesc, void* diffData, + ConvolutionDescriptor convDesc, ConvolutionBwdDataAlgo algo, + void* workSpace, size_t workSpaceSizeInBytes, void* beta, + TensorDescriptor gradDesc, void* gradData) + + # Pooling + int miopenCreatePoolingDescriptor(PoolingDescriptor* desc) + int cudnnSetPooling2dDescriptor_v4( + PoolingDescriptor poolingDesc, PoolingMode mode, + NanPropagation maxpoolingNanOpt, int windowHeight, int windowWidth, + int verticalPadding, int horizontalPadding, int verticalStride, + int horizontalStride) + int cudnnSetPoolingNdDescriptor_v4( + PoolingDescriptor poolingDesc, PoolingMode mode, + NanPropagation maxpoolingNanOpt, int nbDims, + int* windowDimA, int* paddingA, int* strideA) + int miopenDestroyPoolingDescriptor(PoolingDescriptor poolingDesc) + int cudnnPoolingForward( + Handle handle, PoolingDescriptor poolingDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, void* beta, + TensorDescriptor dstDesc, void* dstData) + int cudnnPoolingBackward( + Handle handle, PoolingDescriptor poolingDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor srcDiffDesc, void* srcDiffData, + TensorDescriptor destDesc, void* destData, void* beta, + TensorDescriptor destDiffDesc, void* destDiffData) + + # Batch Normalization + int miopenDeriveBNTensorDescriptor( + TensorDescriptor derivedBnDesc, TensorDescriptor xDesc, + BatchNormMode mode) + int miopenBatchNormalizationForwardTraining( + Handle handle, BatchNormMode mode, + void* alpha, void* beta, TensorDescriptor xDesc, + void* x, TensorDescriptor yDesc, void* y, + TensorDescriptor bnScaleBiasMeanVarDesc, void* bnScale, + void* bnBias, double 
exponentialAverageFactor, + void* resultRunningMean, void* resultRunningVariance, + double epsilon, void* resultSaveMean, + void* resultSaveInvVariance) + int miopenBatchNormalizationForwardInference( + Handle handle, BatchNormMode mode, + void* alpha, void* beta, TensorDescriptor xDesc, + void* x, TensorDescriptor yDesc, void* y, + TensorDescriptor bnScaleBiasMeanVarDesc, void* bnScale, + void* bnBias, void* estimatedMean, void* estimatedVariance, + double epsilon) + int miopenBatchNormalizationBackward( + Handle handle, BatchNormMode mode, + void* alphaDataDiff, void* betaDataDiff, + void* alphaParamDiff, void* betaParamDiff, + TensorDescriptor xDesc, void* x, + TensorDescriptor dyDesc, void* dy, + TensorDescriptor dxDesc, void* dx, + TensorDescriptor dBnScaleBiasDesc, void* bnScale, + void* dBnScaleResult, void* dBnBiasResult, + double epsilon, void* savedMean, void* savedInvVariance) + + int cudnnBatchNormalizationForwardTrainingEx( + Handle handle, + BatchNormMode mode, BatchNormOps bnOps, + void* alpha, void* beta, + TensorDescriptor xDesc, void* x, + TensorDescriptor zDesc, void* z, + TensorDescriptor yDesc, void* y, + TensorDescriptor bnScaleBiasMeanVarDesc, + void* bnScale, void* bnBias, + double exponentialAverageFactor, + void* resultRunningMean, void* resultRunningVariance, + double epsilon, + void* resultSaveMean, void* resultSaveInvVariance, + ActivationDescriptor activationDesc, + void* workspace, size_t workSpaceSizeInBytes, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + Handle handle, + BatchNormMode mode, BatchNormOps bnOps, + TensorDescriptor xDesc, + TensorDescriptor zDesc, + TensorDescriptor yDesc, + TensorDescriptor bnScaleBiasMeanVarDesc, + ActivationDescriptor activationDesc, + size_t* sizeInBytes) + int cudnnBatchNormalizationBackwardEx( + Handle handle, + BatchNormMode mode, BatchNormOps bnops, + void* alphaDataDiff, void* betaDataDiff, + void* alphaParamDiff, void* 
betaParamDiff, + TensorDescriptor xDesc, void* x, + TensorDescriptor yDesc, void* y, + TensorDescriptor dyDesc, void* dy, + TensorDescriptor dzDesc, void* dz, + TensorDescriptor dxDesc, void* dx, + TensorDescriptor dBnScaleBiasDesc, + void* bnScaleData, void* bnBiasData, + void* dBnScaleData, void* dBnBiasData, + double epsilon, + void* savedMean, void* savedInvVariance, + ActivationDescriptor activationDesc, + void* workspace, size_t workSpaceSizeInBytes, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnGetBatchNormalizationBackwardExWorkspaceSize( + Handle handle, + BatchNormMode mode, + BatchNormOps bnOps, + TensorDescriptor xDesc, + TensorDescriptor yDesc, + TensorDescriptor dyDesc, + TensorDescriptor dzDesc, + TensorDescriptor dxDesc, + TensorDescriptor dBnScaleBiasDesc, + ActivationDescriptor activationDesc, + size_t* sizeInBytes) + int cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + Handle handle, + BatchNormMode mode, + BatchNormOps bnOps, + ActivationDescriptor activationDesc, + TensorDescriptor xDesc, + size_t* sizeInBytes) + + # Activation + int miopenCreateActivationDescriptor( + ActivationDescriptor* activationDesc) + int cudnnSetActivationDescriptor( + ActivationDescriptor activationDesc, ActivationMode mode, + NanPropagation reluNanOpt, double reluCeiling) + int miopenDestroyActivationDescriptor( + ActivationDescriptor activationDesc) + int miopenSoftmaxForward( + Handle handle, + void* alpha, TensorDescriptor srcDesc, void* srcData, + void* beta, TensorDescriptor dstDesc, void* dstData) + int miopenSoftmaxBackward( + Handle handle, + void* alpha, TensorDescriptor srcDesc, void* srcData, + TensorDescriptor srcDiffDesc, void* srcDiffData, void* beta, + TensorDescriptor destDiffDesc, void* destDiffData) + int cudnnActivationForward_v4( + Handle handle, ActivationDescriptor activationDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, void* beta, + TensorDescriptor dstDesc, void* dstData) + int 
cudnnActivationBackward_v4( + Handle handle, ActivationDescriptor activationDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor srcDiffDesc, void* srcDiffData, + TensorDescriptor destDesc, void* destData, void* beta, + TensorDescriptor destDiffDesc, void* destDiffData) + + # Dropout + int miopenCreateDropoutDescriptor(DropoutDescriptor* desc) + int miopenDestroyDropoutDescriptor(DropoutDescriptor dropoutDesc) + int miopenDropoutGetStatesSize(Handle handle, size_t* sizeInBytes) + int miopenDropoutGetReserveSpaceSize( + TensorDescriptor xDesc, size_t* sizeInBytes) + int cudnnSetDropoutDescriptor( + DropoutDescriptor dropoutDesc, Handle handle, float dropout, + void* states, size_t stateSizeInBytes, unsigned long long seed) + int cudnnDropoutForward( + Handle handle, DropoutDescriptor dropoutDesc, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor dstDesc, void* dstData, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnDropoutBackward( + Handle handle, DropoutDescriptor dropoutDesc, + TensorDescriptor dydesc, void* dy, TensorDescriptor dxdesc, + void* dx, void* reserveSpace, size_t reserveSpaceSizeInBytes) + + # CTC + int miopenCreateCTCLossDescriptor(CTCLossDescriptor* ctcLossDesc) + int miopenDestroyCTCLossDescriptor(CTCLossDescriptor ctcLossDesc) + int cudnnSetCTCLossDescriptor( + CTCLossDescriptor ctcLossDesc, DataType dataType) + int cudnnGetCTCLossDescriptor( + CTCLossDescriptor ctcLossDesc, DataType* dataType) + int miopenGetCTCLossWorkspaceSize( + Handle handle, TensorDescriptor probsDesc, + TensorDescriptor gradientsDesc, int* labels, + int* labelLengths, int* inputLengths, CTCLossAlgo algo, + CTCLossDescriptor ctcLossDesc, size_t* sizeInBytes) + int miopenCTCLoss( + Handle handle, TensorDescriptor probsDesc, + void* probs, int* labels, int* labelLengths, int* inputLengths, + void* costs, TensorDescriptor gradientsDesc, void* gradients, + CTCLossAlgo algo, CTCLossDescriptor ctcLossDesc, + void* 
workspace, size_t workSpaceSizeInBytes) + # RNN + int miopenCreateRNNDescriptor(RNNDescriptor* rnnDesc) + int miopenDestroyRNNDescriptor(RNNDescriptor rnnDesc) + int cudnnCreatePersistentRNNPlan( + RNNDescriptor rnnDesc, + const int minibatch, DataType dataType, + PersistentRNNPlan* plan) + int cudnnSetPersistentRNNPlan( + RNNDescriptor rnnDesc, PersistentRNNPlan plan) + int cudnnDestroyPersistentRNNPlan(PersistentRNNPlan plan) + int cudnnSetRNNDescriptor_v5( + RNNDescriptor rnnDesc, int hiddenSize, + int numLayers, DropoutDescriptor dropoutDesc, RNNInputMode inputMode, + DirectionMode direction, RNNMode mode, DataType dataType) + int cudnnSetRNNDescriptor_v6( + Handle handle, RNNDescriptor rnnDesc, int hiddenSize, + int numLayers, DropoutDescriptor dropoutDesc, RNNInputMode inputMode, + DirectionMode direction, RNNMode mode, RNNAlgo algo, DataType dataType) + int cudnnSetRNNPaddingMode( + RNNDescriptor rnnDesc, RNNPaddingMode paddingMode) + int cudnnGetRNNPaddingMode( + RNNDescriptor rnnDesc, RNNPaddingMode* paddingMode) + int cudnnCreateRNNDataDescriptor(RNNDataDescriptor* RNNDataDesc) + int cudnnDestroyRNNDataDescriptor(RNNDataDescriptor RNNDataDesc) + int cudnnSetRNNDataDescriptor( + RNNDataDescriptor RNNDataDesc, DataType dataType, RNNDataLayout layout, + int maxSeqLength, int batchSize, int vectorSize, + const int seqLengthArray[], void *paddingFill) + int cudnnGetRNNDataDescriptor( + RNNDataDescriptor RNNDataDesc, DataType* dataType, + RNNDataLayout* layout, int* maxSeqLength, int* batchSize, + int* vectorSize, int arrayLengthRequested, int seqLengthArray[], + void* paddingFill) + int miopenGetRNNWorkspaceSize( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, size_t* sizeInBytes) + int miopenGetRNNTrainingReserveSize( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, size_t* sizeInBytes) + int miopenGetRNNParamsSize( + Handle handle, RNNDescriptor rnnDesc, TensorDescriptor xDesc, + size_t* 
sizeInBytes, DataType dataType) + int cudnnGetRNNLinLayerMatrixParams( + Handle handle, RNNDescriptor rnnDesc, int layer, + TensorDescriptor xDesc, FilterDescriptor wDesc, void* w, + int linLayerID, FilterDescriptor linLayerMatDesc, + void** linLayerMat) + int cudnnGetRNNLinLayerBiasParams( + Handle handle, RNNDescriptor rnnDesc, int layer, + TensorDescriptor xDesc, FilterDescriptor wDesc, void* w, + int linLayerID, FilterDescriptor linLayerBiasDesc, + void** linLayerBias) + int miopenRNNForwardInference( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, + void* x, TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc, + void* cx, FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc, + void* y, TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, + void* cy, void* workspace, size_t workSpaceSizeInBytes) + int miopenRNNForwardTraining( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, void* x, + TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc, void* cx, + FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc, void* y, + TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, void* cy, + void* workspace, size_t workSpaceSizeInBytes, void* reserveSpace, + size_t reserveSpaceSizeInBytes) + int cudnnRNNBackwardData( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* yDesc, void* y, + TensorDescriptor* dyDesc, void* dy, + TensorDescriptor dhyDesc, void* dhy, + TensorDescriptor dcyDesc, void* dcy, + FilterDescriptor wDesc, void* w, + TensorDescriptor hxDesc, void* hx, + TensorDescriptor cxDesc, void* cx, + TensorDescriptor* dxDesc, void* dx, + TensorDescriptor dhxDesc, void* dhx, + TensorDescriptor dcxDesc, void* dcx, void* workspace, + size_t workSpaceSizeInBytes, void* reserveSpace, + size_t reserveSpaceSizeInBytes) + int cudnnRNNBackwardWeights( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, void* x, 
TensorDescriptor hxDesc, void* hx, + TensorDescriptor* yDesc, void* y, + void* workspace, size_t workSpaceSizeInBytes, FilterDescriptor dwDesc, + void* dw, void* reserveSpace, size_t reserveSpaceSizeInBytes) + + int cudnnRNNForwardInferenceEx( + Handle handle, RNNDescriptor rnnDesc, + RNNDataDescriptor xDesc, const void* x, + TensorDescriptor hxDesc, const void* hx, + TensorDescriptor cxDesc, const void* cx, + FilterDescriptor wDesc, const void* w, + RNNDataDescriptor yDesc, void* y, + TensorDescriptor hyDesc, void* hy, + TensorDescriptor cyDesc, void* cy, + RNNDataDescriptor kDesc, const void* keys, + RNNDataDescriptor cDesc, void* cAttn, + RNNDataDescriptor iDesc, void* iAttn, + RNNDataDescriptor qDesc, void* queries, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnRNNForwardTrainingEx( + Handle handle, RNNDescriptor rnnDesc, + RNNDataDescriptor xDesc, const void* x, + TensorDescriptor hxDesc, const void* hx, + TensorDescriptor cxDesc, const void* cx, + FilterDescriptor wDesc, const void* w, + RNNDataDescriptor yDesc, void* y, + TensorDescriptor hyDesc, void* hy, + TensorDescriptor cyDesc, void* cy, + RNNDataDescriptor kDesc, const void* keys, + RNNDataDescriptor cDesc, void* cAttn, + RNNDataDescriptor iDesc, void* iAttn, + RNNDataDescriptor qDesc, void* queries, + void* workSpace, size_t workSpaceSizeInBytes, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnRNNBackwardDataEx( + Handle handle, RNNDescriptor rnnDesc, + RNNDataDescriptor yDesc, const void* y, + RNNDataDescriptor dyDesc, const void* dy, + RNNDataDescriptor dcDesc, const void* dcAttn, + TensorDescriptor dhyDesc, const void* dhy, + TensorDescriptor dcyDesc, const void* dcy, + FilterDescriptor wDesc, const void* w, + TensorDescriptor hxDesc, const void* hx, + TensorDescriptor cxDesc, const void* cx, + RNNDataDescriptor dxDesc, void* dx, + TensorDescriptor dhxDesc, void* dhx, + TensorDescriptor dcxDesc, void* dcx, + RNNDataDescriptor dkDesc, void* dkeys, + void* workSpace, 
size_t workSpaceSizeInBytes, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnRNNBackwardWeightsEx( + Handle handle, RNNDescriptor rnnDesc, + RNNDataDescriptor xDesc, const void* x, + TensorDescriptor hxDesc, const void* hx, + RNNDataDescriptor yDesc, const void* y, + void* workSpace, size_t workSpaceSizeInBytes, + FilterDescriptor dwDesc, void* dw, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + + # Spatial Transformer + int cudnnCreateSpatialTransformerDescriptor( + SpatialTransformerDescriptor* stDesc) + int cudnnDestroySpatialTransformerDescriptor( + SpatialTransformerDescriptor stDesc) + int cudnnSetSpatialTransformerNdDescriptor( + SpatialTransformerDescriptor stDesc, SamplerType samplerType, + DataType dataType, int nbDims, int dimA[]) + int cudnnSpatialTfGridGeneratorForward( + Handle handle, SpatialTransformerDescriptor stDesc, + void* theta, void* grid) + int cudnnSpatialTfGridGeneratorBackward( + Handle handle, SpatialTransformerDescriptor stDesc, + void* dgrid, void* dtheta) + int cudnnSpatialTfSamplerForward( + Handle handle, SpatialTransformerDescriptor stDesc, + void* alpha, TensorDescriptor xDesc, void* x, + void* grid, void* beta, TensorDescriptor yDesc, void* y) + int cudnnSpatialTfSamplerBackward( + Handle handle, SpatialTransformerDescriptor stDesc, + void* alpha, TensorDescriptor xDesc, void* x, void* beta, + TensorDescriptor dxDesc, void* dx, void* alphaDgrid, + TensorDescriptor dyDesc, void* dy, void* grid, + void* betaDgrid, void* dgrid) + + # Fused Ops + int cudnnCreateFusedOpsConstParamPack( + FusedOpsConstParamPack* constPack, int ops) + int cudnnDestroyFusedOpsConstParamPack(FusedOpsConstParamPack constPack) + int cudnnSetFusedOpsConstParamPackAttribute( + FusedOpsConstParamPack constPack, FusedOpsConstParamLabel paramLabel, + const void *param) + int cudnnGetFusedOpsConstParamPackAttribute( + const FusedOpsConstParamPack constPack, + FusedOpsConstParamLabel paramLabel, void *param, int *isNULL) + int 
cudnnCreateFusedOpsVariantParamPack(
+        FusedOpsVariantParamPack *varPack, FusedOps ops)
+    int cudnnDestroyFusedOpsVariantParamPack(FusedOpsVariantParamPack varPack)
+    int cudnnSetFusedOpsVariantParamPackAttribute(
+        FusedOpsVariantParamPack varPack, FusedOpsVariantParamLabel paramLabel,
+        void *ptr)
+    int cudnnGetFusedOpsVariantParamPackAttribute(
+        const FusedOpsVariantParamPack varPack,
+        FusedOpsVariantParamLabel paramLabel, void *ptr)
+    int cudnnCreateFusedOpsPlan(FusedOpsPlan *plan, FusedOps ops)
+    int cudnnDestroyFusedOpsPlan(FusedOpsPlan plan)
+    int cudnnMakeFusedOpsPlan(
+        Handle handle, FusedOpsPlan plan,
+        const FusedOpsConstParamPack constPack, size_t *workspaceSizeInBytes)
+    int cudnnFusedOpsExecute(
+        Handle handle, const FusedOpsPlan plan,
+        FusedOpsVariantParamPack varPack)
+
+    # Build-time version
+    int CUDNN_VERSION
+
+    # Constants
+    double _CUDNN_BN_MIN_EPSILON 'CUDNN_BN_MIN_EPSILON'
+
+
+cdef class CuDNNAlgoPerf:
+    """Plain holder for one algorithm-performance record.
+
+    The six attributes carry the same names as the fields of the
+    ``*AlgoPerf_v7`` structs declared in the extern block of this module;
+    the constructor stores its arguments unchanged.
+    """
+
+    def __init__(self, algo, status, time, memory, determinism, mathType):
+        self.algo = algo
+        self.status = status
+        self.time = time
+        self.memory = memory
+        self.determinism = determinism
+        self.mathType = mathType
+
+
+###############################################################################
+# Error handling
+###############################################################################
+
+class CuDNNError(RuntimeError):
+    """Error raised by ``check_status`` for a non-zero library status.
+
+    Attributes:
+        status (int): The status code passed to the constructor.
+
+    Extra context lines can be attached with ``add_info``/``add_infos``;
+    they are appended to the message produced by ``str()``.
+    """
+
+    def __init__(self, int status):
+        self.status = status
+        # This module only declares the MIOpen entry point; the cuDNN name
+        # (cudnnGetErrorString) is not declared in the extern block above.
+        msg = miopenGetErrorString(<Status>status)
+        super(CuDNNError, self).__init__(
+            'cuDNN Error: {}'.format(msg.decode()))
+        self._infos = []
+
+    def add_info(self, info):
+        """Append one extra message line (must be a ``str``)."""
+        assert isinstance(info, str)
+        self._infos.append(info)
+
+    def add_infos(self, infos):
+        """Append several extra message lines (must be a ``list``)."""
+        assert isinstance(infos, list)
+        self._infos.extend(infos)
+
+    def __str__(self):
+        base = super(CuDNNError, self).__str__()
+        return base + ''.join(
+            '\n ' + info for info in self._infos)
+
+    def __reduce__(self):
+        # Rebuild from the status code alone; attached infos are not part of
+        # the reduced state.
+        return (type(self), (self.status,))
+
+
+@cython.profile(False)
+cpdef inline
check_status(int status):
+    # Any non-zero status is treated as a failure and raised as CuDNNError.
+    if status != 0:
+        raise CuDNNError(status)
+
+
+###############################################################################
+# Build-time version
+###############################################################################
+
+def get_build_version():
+    """Return the ``CUDNN_VERSION`` constant declared in the extern block."""
+    return CUDNN_VERSION
+
+
+###############################################################################
+# Version
+###############################################################################
+
+cpdef size_t getVersion() except? 0:
+    """Return the value of ``miopenGetVersion()``.
+
+    ``except? 0`` makes Cython check for a pending exception whenever the
+    result is 0.
+    """
+    # The extern block of this module declares miopenGetVersion only;
+    # cudnnGetVersion is not declared here.
+    # NOTE(review): the declaration used is ``size_t miopenGetVersion()``;
+    # confirm that cupy_cudnn.h provides an entry point with this shape.
+    return miopenGetVersion()
+
+
+###############################################################################
+# Runtime error checking
+###############################################################################
+
+cpdef queryRuntimeError(intptr_t handle, int mode):
+    """Call ``cudnnQueryRuntimeError`` and return the status it reports.
+
+    Raises:
+        CuDNNError: If the query call itself returns a non-zero status.
+    """
+    cdef Status rstatus
+    with nogil:
+        status = cudnnQueryRuntimeError(handle, &rstatus,
+                                        mode, 0)
+    check_status(status)
+    return rstatus
+
+
+###############################################################################
+# Initialization and CUDA cooperation
+###############################################################################
+
+cpdef intptr_t create() except? 0:
+    """Create a library handle with ``miopenCreate`` and return it.
+
+    Raises:
+        CuDNNError: If ``miopenCreate`` returns a non-zero status.
+    """
+    cdef Handle handle
+    with nogil:
+        status = miopenCreate(&handle)
+    check_status(status)
+    return handle
+
+
+cpdef destroy(intptr_t handle):
+    """Destroy ``handle`` with ``miopenDestroy``.
+
+    Raises:
+        CuDNNError: If ``miopenDestroy`` returns a non-zero status.
+    """
+    with nogil:
+        status = miopenDestroy(handle)
+    check_status(status)
+
+
+cpdef setStream(intptr_t handle, size_t stream):
+    """Bind ``stream`` to ``handle`` with ``miopenSetStream``.
+
+    Raises:
+        NotImplementedError: Outside a HIP environment, if ``stream`` is
+            currently capturing.
+        CuDNNError: If ``miopenSetStream`` returns a non-zero status.
+    """
+    # TODO(leofang): The support of stream capture is not mentioned at all in
+    # the cuDNN docs (as of CUDA 11.5), so we disable this functionality.
+    if not runtime._is_hip_environment and runtime.streamIsCapturing(stream):
+        raise NotImplementedError(
+            'calling cuDNN API during stream capture is currently '
+            'unsupported')
+
+    status = miopenSetStream(handle, stream)
+    check_status(status)
+
+
+cpdef size_t getStream(intptr_t handle) except?
0:
+    cdef driver.Stream stream
+    status = miopenGetStream(handle, &stream)
+    check_status(status)
+    return stream
+
+
+# Helper used by the compute wrappers: points `handle` at the stream returned
+# by stream_module.get_current_stream_ptr().
+cdef _setStream(intptr_t handle):
+    """Set current stream"""
+    setStream(handle, stream_module.get_current_stream_ptr())
+
+###############################################################################
+# Tensor manipulation
+###############################################################################
+
+# Create a tensor descriptor with miopenCreateTensorDescriptor and return it.
+cpdef size_t createTensorDescriptor() except? 0:
+    cdef TensorDescriptor descriptor
+    status = miopenCreateTensorDescriptor(&descriptor)
+    check_status(status)
+    return descriptor
+
+
+# Configure a 4-D tensor descriptor.
+# NOTE(review): `format` is accepted but never forwarded to
+# miopenSet4dTensorDescriptor; only dataType and n, c, h, w are passed.
+cpdef setTensor4dDescriptor(size_t tensorDesc, int format, int dataType,
+                            int n, int c, int h, int w):
+    status = miopenSet4dTensorDescriptor(
+        tensorDesc,
+        dataType, n, c, h, w)
+    check_status(status)
+
+
+# Configure a 4-D tensor descriptor with explicit per-dimension strides.
+cpdef setTensor4dDescriptorEx(size_t tensorDesc, int dataType,
+                              int n, int c, int h, int w, int nStride,
+                              int cStride, int hStride, int wStride):
+    status = miopenSet4dTensorDescriptorEx(
+        tensorDesc, dataType, n, c, h, w,
+        nStride, cStride, hStride, wStride)
+    check_status(status)
+
+
+# Read back a 4-D descriptor as the tuple
+# (dataType, n, c, h, w, nStride, cStride, hStride, wStride).
+cpdef tuple getTensor4dDescriptor(size_t tensorDesc):
+    cdef DataType dataType
+    cdef int n, c, h, w, nStride, cStride, hStride, wStride
+    status = miopenGet4dTensorDescriptor(
+        tensorDesc, &dataType,
+        &n, &c, &h, &w, &nStride, &cStride, &hStride, &wStride)
+    check_status(status)
+    return dataType, n, c, h, w, nStride, cStride, hStride, wStride
+
+
+# Configure an N-D tensor descriptor; `dimA` and `strideA` are passed through
+# as integers (presumably addresses of int arrays -- confirm against caller).
+cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims,
+                            size_t dimA, size_t strideA):
+    status = cudnnSetTensorNdDescriptor(
+        tensorDesc, dataType, nbDims,
+        dimA, strideA)
+    check_status(status)
+
+
+# Release a tensor descriptor with miopenDestroyTensorDescriptor.
+cpdef destroyTensorDescriptor(size_t tensorDesc):
+    status = miopenDestroyTensorDescriptor(tensorDesc)
+    check_status(status)
+
+
+# Run cudnnAddTensor_v3 on the current stream with the GIL released.
+cpdef addTensor_v3(intptr_t handle, size_t alpha, size_t bDesc,
+                   size_t b, size_t beta, size_t yDesc, size_t y):
+    _setStream(handle)
+    with nogil:
+        status =
cudnnAddTensor_v3(
+            handle, alpha, bDesc,
+            b, beta, yDesc, y)
+    check_status(status)
+
+
+###############################################################################
+# Tensor operations
+###############################################################################
+
+# Create an op-tensor descriptor and return it.
+cpdef size_t createOpTensorDescriptor() except? 0:
+    cdef OpTensorDescriptor opTensorDesc
+    status = cudnnCreateOpTensorDescriptor(&opTensorDesc)
+    check_status(status)
+    return opTensorDesc
+
+
+# Set the operation, compute type and NaN option of an op-tensor descriptor.
+cpdef setOpTensorDescriptor(size_t opTensorDesc, int opTensorOp,
+                            int opTensorCompType, int opTensorNanOpt):
+    status = cudnnSetOpTensorDescriptor(
+        opTensorDesc, opTensorOp,
+        opTensorCompType, opTensorNanOpt)
+    check_status(status)
+
+
+# Read back (opTensorOp, opTensorCompType, opTensorNanOpt).
+cpdef getOpTensorDescriptor(size_t opTensorDesc):
+    cdef OpTensorOp opTensorOp
+    cdef DataType opTensorCompType
+    cdef NanPropagation opTensorNanOpt
+    status = cudnnGetOpTensorDescriptor(
+        opTensorDesc, &opTensorOp, &opTensorCompType,
+        &opTensorNanOpt)
+    check_status(status)
+    return opTensorOp, opTensorCompType, opTensorNanOpt
+
+
+# Release an op-tensor descriptor.
+cpdef destroyOpTensorDescriptor(size_t opTensorDesc):
+    status = cudnnDestroyOpTensorDescriptor(opTensorDesc)
+    check_status(status)
+
+
+# Run cudnnOpTensor on the current stream with the GIL released.
+cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1,
+               size_t aDesc, size_t A, size_t alpha2, size_t bDesc,
+               size_t B, size_t beta, size_t cDesc, size_t C):
+    _setStream(handle)
+    with nogil:
+        status = cudnnOpTensor(
+            handle, opTensorDesc, alpha1,
+            aDesc, A, alpha2,
+            bDesc, B, beta,
+            cDesc, C)
+    check_status(status)
+
+
+###############################################################################
+# Tensor reductions
+###############################################################################
+
+cpdef size_t createReduceTensorDescriptor() except?
0:
+    cdef ReduceTensorDescriptor reduceTensorDesc
+    status = cudnnCreateReduceTensorDescriptor(&reduceTensorDesc)
+    check_status(status)
+    return reduceTensorDesc
+
+# Set every field of a reduce-tensor descriptor in one call.
+cpdef setReduceTensorDescriptor(
+        size_t reduceTensorDesc, int reduceTensorOp, int reduceTensorCompType,
+        int reduceTensorNanOpt, int reduceTensorIndices,
+        int reduceTensorIndicesType):
+    status = cudnnSetReduceTensorDescriptor(
+        reduceTensorDesc,
+        reduceTensorOp,
+        reduceTensorCompType, reduceTensorNanOpt,
+        reduceTensorIndices,
+        reduceTensorIndicesType)
+    check_status(status)
+
+
+# Read back (redOp, redCompType, redNanOpt, redIndices, redIndicesType).
+cpdef getReduceTensorDescriptor(size_t reduceTensorDesc):
+    cdef ReduceTensorOp redOp
+    cdef DataType redCompType
+    cdef NanPropagation redNanOpt
+    cdef ReduceTensorIndices redIndices
+    cdef IndicesType redIndicesType
+    status = cudnnGetReduceTensorDescriptor(
+        reduceTensorDesc, &redOp,
+        &redCompType, &redNanOpt, &redIndices, &redIndicesType)
+    check_status(status)
+    return redOp, redCompType, redNanOpt, redIndices, redIndicesType
+
+
+# Release a reduce-tensor descriptor.
+cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc):
+    status = cudnnDestroyReduceTensorDescriptor(
+        reduceTensorDesc)
+    check_status(status)
+
+
+# Return the size in bytes reported by cudnnGetReductionIndicesSize.
+cpdef size_t getReductionIndicesSize(intptr_t handle, size_t reduceTensorDesc,
+                                     size_t aDesc, size_t cDesc) except? 0:
+    cdef size_t sizeInBytes
+    status = cudnnGetReductionIndicesSize(
+        handle, reduceTensorDesc,
+        aDesc, cDesc, &sizeInBytes)
+    check_status(status)
+    return sizeInBytes
+
+
+cpdef size_t getReductionWorkspaceSize(intptr_t handle,
+                                       size_t reduceTensorDesc,
+                                       size_t aDesc, size_t cDesc) except?
0:
+    cdef size_t sizeInBytes
+    status = cudnnGetReductionWorkspaceSize(
+        handle, reduceTensorDesc,
+        aDesc, cDesc,
+        &sizeInBytes)
+    check_status(status)
+    return sizeInBytes
+
+
+# Run cudnnReduceTensor on the current stream with the GIL released.
+cpdef reduceTensor(intptr_t handle, size_t reduceTensorDesc, size_t indices,
+                   size_t indicesSizeInBytes, size_t workspace,
+                   size_t workspaceSizeInBytes, size_t alpha, size_t aDesc,
+                   size_t A, size_t beta, size_t cDesc, size_t C):
+    _setStream(handle)
+    with nogil:
+        status = cudnnReduceTensor(
+            handle, reduceTensorDesc,
+            indices, indicesSizeInBytes, workspace,
+            workspaceSizeInBytes, alpha, aDesc,
+            A, beta, cDesc, C)
+    check_status(status)
+
+
+# Run cudnnSetTensor on the current stream with the GIL released.
+cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr):
+    _setStream(handle)
+    with nogil:
+        status = cudnnSetTensor(
+            handle, yDesc, y,
+            valuePtr)
+    check_status(status)
+
+
+# Run cudnnScaleTensor on the current stream with the GIL released.
+cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha):
+    _setStream(handle)
+    with nogil:
+        status = cudnnScaleTensor(
+            handle, yDesc, y,
+            alpha)
+    check_status(status)
+
+
+###############################################################################
+# Filter manipulation
+###############################################################################
+
+cpdef size_t createFilterDescriptor() except?
0:
+    cdef FilterDescriptor desc
+    status = cudnnCreateFilterDescriptor(&desc)
+    check_status(status)
+    return desc
+
+
+# Configure a 4-D filter descriptor.
+cpdef setFilter4dDescriptor_v4(
+        size_t filterDesc, int dataType,
+        int format, int k, int c, int h, int w):
+    status = cudnnSetFilter4dDescriptor_v4(
+        filterDesc, dataType,
+        format, k, c, h, w)
+    check_status(status)
+
+
+# Configure an N-D filter descriptor; `filterDimA` is passed through as an
+# integer (presumably the address of an int array -- confirm against caller).
+cpdef setFilterNdDescriptor_v4(
+        size_t filterDesc, int dataType,
+        int format, int nbDims, size_t filterDimA):
+    status = cudnnSetFilterNdDescriptor_v4(
+        filterDesc, dataType,
+        format, nbDims, filterDimA)
+    check_status(status)
+
+
+# Read back (dataType, format, nbDims, dims).  The dims tuple always has
+# `nbDimsRequested` entries because the buffer is sized before the call.
+cpdef getFilterNdDescriptor(size_t wDesc, int nbDimsRequested):
+    cdef DataType dataType
+    cdef TensorFormat format
+    cdef int nbDims
+    cdef vector.vector[int] filterDimA
+    filterDimA.resize(nbDimsRequested)
+
+    status = cudnnGetFilterNdDescriptor_v4(
+        wDesc, nbDimsRequested, &dataType,
+        &format, &nbDims, filterDimA.data())
+    check_status(status)
+    return dataType, format, nbDims, tuple(filterDimA)
+
+
+# Release a filter descriptor.
+cpdef destroyFilterDescriptor(size_t filterDesc):
+    status = cudnnDestroyFilterDescriptor(filterDesc)
+    check_status(status)
+
+
+###############################################################################
+# Convolution
+###############################################################################
+
+# Create a convolution descriptor with miopenCreateConvolutionDescriptor.
+cpdef size_t createConvolutionDescriptor() except? 0:
+    cdef ConvolutionDescriptor desc
+    status = miopenCreateConvolutionDescriptor(&desc)
+    check_status(status)
+    return desc
+
+
+# Set the math type of a convolution descriptor.
+cpdef setConvolutionMathType(size_t convDesc, size_t mathType):
+    status = cudnnSetConvolutionMathType(
+        convDesc, mathType)
+    check_status(status)
+
+
+cpdef size_t getConvolutionMathType(size_t convDesc) except?
0:
+    cdef MathType mathType
+    status = cudnnGetConvolutionMathType(
+        convDesc, &mathType)
+    check_status(status)
+    return mathType
+
+
+# Set the group count with miopenSetConvolutionGroupCount.
+cpdef setConvolutionGroupCount(size_t convDesc, int groupCount):
+    status = miopenSetConvolutionGroupCount(
+        convDesc, groupCount)
+    check_status(status)
+
+
+# Return the group count stored in the convolution descriptor.
+cpdef int getConvolutionGroupCount(size_t convDesc) except? -1:
+    cdef int groupCount
+    status = cudnnGetConvolutionGroupCount(
+        convDesc, &groupCount)
+    check_status(status)
+    return groupCount
+
+
+# Configure a 2-D convolution (padding, strides u/v, dilation, mode).
+cpdef setConvolution2dDescriptor_v4(
+        size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h,
+        int dilation_w, int mode):
+    status = cudnnSetConvolution2dDescriptor_v4(
+        convDesc, pad_h, pad_w, u, v, dilation_h,
+        dilation_w, mode)
+    check_status(status)
+
+
+# Same as the _v4 variant with an additional `computeType` argument.
+cpdef setConvolution2dDescriptor_v5(
+        size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h,
+        int dilation_w, int mode, size_t computeType):
+    status = cudnnSetConvolution2dDescriptor_v5(
+        convDesc, pad_h, pad_w, u, v, dilation_h,
+        dilation_w, mode, computeType)
+    check_status(status)
+
+
+# Configure an N-D convolution; padA, filterStrideA and dilationA are passed
+# through as integers (presumably array addresses -- confirm against caller).
+cpdef setConvolutionNdDescriptor_v3(
+        size_t convDesc, int arrayLength, size_t padA, size_t filterStrideA,
+        size_t dilationA, int mode, int dataType):
+    status = cudnnSetConvolutionNdDescriptor_v3(
+        convDesc, arrayLength, padA,
+        filterStrideA, dilationA, mode,
+        dataType)
+    check_status(status)
+
+
+# Release a convolution descriptor with miopenDestroyConvolutionDescriptor.
+cpdef destroyConvolutionDescriptor(size_t convDesc):
+    status = miopenDestroyConvolutionDescriptor(
+        convDesc)
+    check_status(status)
+
+
+# Ask the library for up to `requestedAlgoCount` forward algorithms.  Unlike
+# the *Ex variants this returns the raw perf-result vector rather than a list
+# of CuDNNAlgoPerf objects.
+cpdef findConvolutionForwardAlgorithm(
+        intptr_t handle, size_t xDesc, size_t wDesc, size_t convDesc,
+        size_t yDesc, int requestedAlgoCount):
+    cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnFindConvolutionForwardAlgorithm(
+        handle, xDesc, wDesc,
+        convDesc, yDesc,
+        requestedAlgoCount, &returnedAlgoCount, perfResults.data())
+    check_status(status)
+    
perfResults.resize(returnedAlgoCount)
+    return perfResults
+
+
+# Forward algorithm search on caller-supplied buffers and workspace.  The
+# result list is trimmed to the returned count; determinism and mathType are
+# filled with -1 because this perf struct is not read for them.
+cpdef list findConvolutionForwardAlgorithmEx(
+        intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w,
+        size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount,
+        size_t workSpace, size_t workSpaceSizeInBytes):
+    cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnFindConvolutionForwardAlgorithmEx(
+        handle, xDesc, x,
+        wDesc, w, convDesc,
+        yDesc, y, requestedAlgoCount,
+        &returnedAlgoCount, perfResults.data(), workSpace,
+        workSpaceSizeInBytes)
+    check_status(status)
+    perfResults.resize(returnedAlgoCount)
+    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1)
+            for p in perfResults]
+
+
+# Same search using the _v7 perf struct, whose determinism and mathType
+# fields are copied into each CuDNNAlgoPerf.
+cpdef list findConvolutionForwardAlgorithmEx_v7(
+        intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w,
+        size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount,
+        size_t workSpace, size_t workSpaceSizeInBytes):
+    cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnFindConvolutionForwardAlgorithmEx_v7(
+        handle, xDesc, x,
+        wDesc, w, convDesc,
+        yDesc, y, requestedAlgoCount,
+        &returnedAlgoCount, perfResults.data(), workSpace,
+        workSpaceSizeInBytes)
+    check_status(status)
+    perfResults.resize(returnedAlgoCount)
+    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory,
+                          p.determinism, p.mathType)
+            for p in perfResults]
+
+
+cpdef int getConvolutionForwardAlgorithm_v6(
+        intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc,
+        size_t destDesc, int preference, size_t memoryLimitInbytes) except?
-1:
+    cdef ConvolutionFwdAlgo algo
+    status = cudnnGetConvolutionForwardAlgorithm_v6(
+        handle, srcDesc,
+        filterDesc, convDesc,
+        destDesc, preference,
+        memoryLimitInbytes, &algo)
+    check_status(status)
+    return algo
+
+
+# Return up to `requestedAlgoCount` forward algorithm candidates as
+# CuDNNAlgoPerf objects; the list is trimmed to the returned count.
+cpdef list getConvolutionForwardAlgorithm_v7(
+        intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc,
+        size_t destDesc, int requestedAlgoCount):
+    cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnGetConvolutionForwardAlgorithm_v7(
+        handle, srcDesc,
+        filterDesc, convDesc,
+        destDesc, requestedAlgoCount,
+        &returnedAlgoCount, perfResults.data())
+    check_status(status)
+    perfResults.resize(returnedAlgoCount)
+    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory,
+                          p.determinism, p.mathType)
+            for p in perfResults]
+
+
+# NOTE(review): `algo` is part of the signature but is not forwarded to
+# miopenConvolutionForwardGetWorkSpaceSize in the body that follows.
+cpdef Py_ssize_t getConvolutionForwardWorkspaceSize(
+        intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc,
+        size_t destDesc, int algo) except?
-1:
+    cdef size_t sizeInBytes
+    status = miopenConvolutionForwardGetWorkSpaceSize(
+        handle, srcDesc,
+        filterDesc, convDesc,
+        destDesc, &sizeInBytes)
+    check_status(status)
+    return sizeInBytes
+
+
+# Run cudnnConvolutionForward on the current stream with the GIL released.
+cpdef convolutionForward(
+        intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData,
+        size_t filterDesc, size_t filterData, size_t convDesc, int algo,
+        size_t workSpace, size_t workSpaceSizeInBytes, size_t beta,
+        size_t destDesc, size_t destData):
+    _setStream(handle)
+    with nogil:
+        status = cudnnConvolutionForward(
+            handle, alpha,
+            srcDesc, srcData,
+            filterDesc, filterData,
+            convDesc, algo,
+            workSpace, workSpaceSizeInBytes, beta,
+            destDesc, destData)
+    check_status(status)
+
+
+# Run cudnnConvolutionBackwardBias on the current stream with the GIL
+# released.
+cpdef convolutionBackwardBias(
+        intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData,
+        size_t beta, size_t destDesc, size_t destData):
+    _setStream(handle)
+    with nogil:
+        status = cudnnConvolutionBackwardBias(
+            handle, alpha,
+            srcDesc, srcData, beta,
+            destDesc, destData)
+    check_status(status)
+
+
+# Backward-filter algorithm search; returns the raw perf-result vector,
+# trimmed to the returned count.
+cpdef findConvolutionBackwardFilterAlgorithm(
+        intptr_t handle, size_t xDesc, size_t dyDesc, size_t convDesc,
+        size_t dwDesc, int requestedAlgoCount):
+    cdef vector.vector[ConvolutionBwdFilterAlgoPerf] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnFindConvolutionBackwardFilterAlgorithm(
+        handle, xDesc, dyDesc,
+        convDesc, dwDesc,
+        requestedAlgoCount, &returnedAlgoCount, perfResults.data())
+    check_status(status)
+    perfResults.resize(returnedAlgoCount)
+    return perfResults
+
+
+# Backward-filter search on caller-supplied buffers; results are wrapped in
+# CuDNNAlgoPerf with determinism and mathType set to -1.
+cpdef list findConvolutionBackwardFilterAlgorithmEx(
+        intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy,
+        size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount,
+        size_t workSpace, size_t workSpaceSizeInBytes):
+    cdef vector.vector[ConvolutionBwdFilterAlgoPerf] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnFindConvolutionBackwardFilterAlgorithmEx(
+        handle,
xDesc, x,
+        dyDesc, dy, convDesc,
+        dwDesc, dw,
+        requestedAlgoCount, &returnedAlgoCount, perfResults.data(),
+        workSpace, workSpaceSizeInBytes)
+    check_status(status)
+    perfResults.resize(returnedAlgoCount)
+    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1)
+            for p in perfResults]
+
+
+# Backward-filter search using the _v7 perf struct; determinism and mathType
+# are copied from each result instead of being filled with -1.
+cpdef list findConvolutionBackwardFilterAlgorithmEx_v7(
+        intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy,
+        size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount,
+        size_t workSpace, size_t workSpaceSizeInBytes):
+    cdef vector.vector[ConvolutionBwdFilterAlgoPerf_v7] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnFindConvolutionBackwardFilterAlgorithmEx_v7(
+        handle, xDesc, x,
+        dyDesc, dy, convDesc,
+        dwDesc, dw,
+        requestedAlgoCount, &returnedAlgoCount, perfResults.data(),
+        workSpace, workSpaceSizeInBytes)
+    check_status(status)
+    perfResults.resize(returnedAlgoCount)
+    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory,
+                          p.determinism, p.mathType)
+            for p in perfResults]
+
+
+cpdef int getConvolutionBackwardFilterAlgorithm_v6(
+        intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc,
+        size_t filterDesc, int preference,
+        size_t memoryLimitInbytes) except?
-1:
+    cdef ConvolutionBwdFilterAlgo algo
+    status = cudnnGetConvolutionBackwardFilterAlgorithm_v6(
+        handle, srcDesc,
+        diffDesc, convDesc,
+        filterDesc,
+        preference,
+        memoryLimitInbytes, &algo)
+    check_status(status)
+    return algo
+
+
+# Return up to `requestedAlgoCount` backward-filter algorithm candidates as
+# CuDNNAlgoPerf objects; the list is trimmed to the returned count.
+cpdef list getConvolutionBackwardFilterAlgorithm_v7(
+        intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc,
+        size_t gradDesc, int requestedAlgoCount):
+    cdef vector.vector[ConvolutionBwdFilterAlgoPerf_v7] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnGetConvolutionBackwardFilterAlgorithm_v7(
+        handle, srcDesc, diffDesc,
+        convDesc, gradDesc,
+        requestedAlgoCount, &returnedAlgoCount, perfResults.data())
+    check_status(status)
+    perfResults.resize(returnedAlgoCount)
+    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory,
+                          p.determinism, p.mathType)
+            for p in perfResults]
+
+
+cpdef Py_ssize_t getConvolutionBackwardFilterWorkspaceSize(
+        intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc,
+        size_t filterDesc, int algo) except?
-1:
+    cdef size_t sizeInBytes
+    status = cudnnGetConvolutionBackwardFilterWorkspaceSize(
+        handle, srcDesc,
+        diffDesc, convDesc,
+        filterDesc, algo,
+        &sizeInBytes)
+    check_status(status)
+    return sizeInBytes
+
+
+# Run cudnnConvolutionBackwardFilter_v3 on the current stream with the GIL
+# released.
+cpdef convolutionBackwardFilter_v3(
+        intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData,
+        size_t diffDesc, size_t diffData, size_t convDesc, int algo,
+        size_t workSpace, size_t workSpaceSizeInBytes, size_t beta,
+        size_t gradDesc, size_t gradData):
+    _setStream(handle)
+    with nogil:
+        status = cudnnConvolutionBackwardFilter_v3(
+            handle, alpha,
+            srcDesc, srcData,
+            diffDesc, diffData,
+            convDesc, algo,
+            workSpace, workSpaceSizeInBytes, beta,
+            gradDesc, gradData)
+    check_status(status)
+
+
+# Backward-data algorithm search; returns the raw perf-result vector,
+# trimmed to the returned count.
+cpdef findConvolutionBackwardDataAlgorithm(
+        intptr_t handle, size_t wDesc, size_t dyDesc, size_t convDesc,
+        size_t dxDesc, int requestedAlgoCount):
+    cdef vector.vector[ConvolutionBwdDataAlgoPerf] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnFindConvolutionBackwardDataAlgorithm(
+        handle, wDesc, dyDesc,
+        convDesc, dxDesc,
+        requestedAlgoCount, &returnedAlgoCount, perfResults.data())
+    check_status(status)
+    perfResults.resize(returnedAlgoCount)
+    return perfResults
+
+
+# Backward-data search on caller-supplied buffers; results are wrapped in
+# CuDNNAlgoPerf with determinism and mathType set to -1.
+cpdef list findConvolutionBackwardDataAlgorithmEx(
+        intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy,
+        size_t convDesc, size_t dxDesc, size_t dx,
+        int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes):
+    cdef vector.vector[ConvolutionBwdDataAlgoPerf] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnFindConvolutionBackwardDataAlgorithmEx(
+        handle, wDesc, w,
+        dyDesc, dy, convDesc,
+        dxDesc, dx,
+        requestedAlgoCount, &returnedAlgoCount, perfResults.data(),
+        workSpace, workSpaceSizeInBytes)
+    check_status(status)
+    perfResults.resize(returnedAlgoCount)
+    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1)
+            for p in perfResults]
+
+
+# Backward-data search using the _v7 perf struct; determinism and mathType
+# are copied from each result instead of being filled with -1.
+cpdef list findConvolutionBackwardDataAlgorithmEx_v7(
+        intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy,
+        size_t convDesc, size_t dxDesc, size_t dx,
+        int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes):
+    cdef vector.vector[ConvolutionBwdDataAlgoPerf_v7] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnFindConvolutionBackwardDataAlgorithmEx_v7(
+        handle, wDesc, w,
+        dyDesc, dy, convDesc,
+        dxDesc, dx,
+        requestedAlgoCount, &returnedAlgoCount, perfResults.data(),
+        workSpace, workSpaceSizeInBytes)
+    check_status(status)
+    perfResults.resize(returnedAlgoCount)
+    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory,
+                          p.determinism, p.mathType)
+            for p in perfResults]
+
+
+# Return the single backward-data algorithm chosen for the given preference
+# and memory limit.  Note `preference` is declared size_t here, whereas the
+# forward and backward-filter _v6 wrappers declare it as int.
+cpdef int getConvolutionBackwardDataAlgorithm_v6(
+        intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc,
+        size_t gradDesc, size_t preference,
+        size_t memoryLimitInbytes) except? -1:
+    cdef ConvolutionBwdDataAlgo algo
+    status = cudnnGetConvolutionBackwardDataAlgorithm_v6(
+        handle, filterDesc,
+        diffDesc, convDesc,
+        gradDesc, preference,
+        memoryLimitInbytes, &algo)
+    check_status(status)
+    return algo
+
+
+# Return up to `requestedAlgoCount` backward-data algorithm candidates as
+# CuDNNAlgoPerf objects; the list is trimmed to the returned count.
+cpdef list getConvolutionBackwardDataAlgorithm_v7(
+        intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc,
+        size_t gradDesc, int requestedAlgoCount):
+    cdef vector.vector[ConvolutionBwdDataAlgoPerf_v7] perfResults
+    cdef int returnedAlgoCount
+    perfResults.resize(requestedAlgoCount)
+    status = cudnnGetConvolutionBackwardDataAlgorithm_v7(
+        handle, filterDesc,
+        diffDesc, convDesc,
+        gradDesc, requestedAlgoCount,
+        &returnedAlgoCount, perfResults.data())
+    check_status(status)
+    perfResults.resize(returnedAlgoCount)
+    return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory,
+                          p.determinism, p.mathType)
+            for p in perfResults]
+
+
+# NOTE(review): `algo` is part of the signature but is not forwarded to
+# miopenConvolutionBackwardDataGetWorkSpaceSize in the body that follows.
+cpdef Py_ssize_t getConvolutionBackwardDataWorkspaceSize(
+        intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc,
+        size_t
gradDesc, int algo) except? -1:
+    cdef size_t sizeInBytes
+    status = miopenConvolutionBackwardDataGetWorkSpaceSize(
+        handle, filterDesc,
+        diffDesc,
+        convDesc, gradDesc,
+        &sizeInBytes)
+    check_status(status)
+    return sizeInBytes
+
+
+# Run cudnnConvolutionBackwardData_v3 on the current stream with the GIL
+# released.
+cpdef convolutionBackwardData_v3(
+        intptr_t handle, size_t alpha, size_t filterDesc, size_t filterData,
+        size_t diffDesc, size_t diffData, size_t convDesc, int algo,
+        size_t workSpace, size_t workSpaceSizeInBytes, size_t beta,
+        size_t gradDesc, size_t gradData):
+    _setStream(handle)
+    with nogil:
+        status = cudnnConvolutionBackwardData_v3(
+            handle, alpha,
+            filterDesc, filterData,
+            diffDesc, diffData,
+            convDesc, algo,
+            workSpace, workSpaceSizeInBytes, beta,
+            gradDesc, gradData)
+    check_status(status)
+
+###############################################################################
+# Pooling
+###############################################################################
+
+# Create a pooling descriptor with miopenCreatePoolingDescriptor.
+cpdef size_t createPoolingDescriptor() except? 0:
+    cdef PoolingDescriptor desc
+    status = miopenCreatePoolingDescriptor(&desc)
+    check_status(status)
+    return desc
+
+
+# Configure a 2-D pooling descriptor (mode, NaN option, window, padding,
+# stride).
+cpdef setPooling2dDescriptor_v4(
+        size_t poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight,
+        int windowWidth, int verticalPadding, int horizontalPadding,
+        int verticalStride, int horizontalStride):
+    status = cudnnSetPooling2dDescriptor_v4(
+        poolingDesc, mode,
+        maxpoolingNanOpt, windowHeight, windowWidth,
+        verticalPadding, horizontalPadding, verticalStride, horizontalStride)
+    check_status(status)
+
+
+# Configure an N-D pooling descriptor; windowDimA, paddingA and strideA are
+# passed through as integers (presumably array addresses -- confirm).
+cpdef setPoolingNdDescriptor_v4(
+        size_t poolingDesc, int mode, int maxpoolingNanOpt, int nbDims,
+        size_t windowDimA, size_t paddingA, size_t strideA):
+    status = cudnnSetPoolingNdDescriptor_v4(
+        poolingDesc, mode,
+        maxpoolingNanOpt, nbDims,
+        windowDimA, paddingA, strideA)
+    check_status(status)
+
+
+# Release a pooling descriptor with miopenDestroyPoolingDescriptor.
+cpdef destroyPoolingDescriptor(size_t poolingDesc):
+    status = miopenDestroyPoolingDescriptor(poolingDesc)
+    check_status(status)
+
+
+cpdef poolingForward(
+        intptr_t
handle, size_t poolingDesc, size_t alpha, size_t srcDesc,
+        size_t srcData, size_t beta, size_t dstDesc, size_t dstData):
+    _setStream(handle)
+    with nogil:
+        status = cudnnPoolingForward(
+            handle, poolingDesc, alpha,
+            srcDesc, srcData, beta,
+            dstDesc, dstData)
+    check_status(status)
+
+
+# Run cudnnPoolingBackward on the current stream with the GIL released.
+cpdef poolingBackward(
+        intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc,
+        size_t srcData, size_t srcDiffDesc, size_t srcDiffData,
+        size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc,
+        size_t destDiffData):
+    _setStream(handle)
+    with nogil:
+        status = cudnnPoolingBackward(
+            handle, poolingDesc, alpha,
+            srcDesc, srcData,
+            srcDiffDesc, srcDiffData,
+            destDesc, destData, beta,
+            destDiffDesc, destDiffData)
+    check_status(status)
+
+###############################################################################
+# Batch Normalization
+###############################################################################
+
+# Python-visible copy of the C constant declared as _CUDNN_BN_MIN_EPSILON.
+CUDNN_BN_MIN_EPSILON = _CUDNN_BN_MIN_EPSILON
+
+# Fill `derivedBnDesc` from `xDesc` and `mode` via
+# miopenDeriveBNTensorDescriptor.
+cpdef deriveBNTensorDescriptor(
+        size_t derivedBnDesc, size_t xDesc, int mode):
+    status = miopenDeriveBNTensorDescriptor(
+        derivedBnDesc, xDesc,
+        mode)
+    check_status(status)
+
+
+# Run miopenBatchNormalizationForwardTraining on the current stream with the
+# GIL released.
+cpdef batchNormalizationForwardTraining(
+        intptr_t handle, int mode,
+        size_t alpha, size_t beta, size_t xDesc,
+        size_t x, size_t yDesc, size_t y,
+        size_t bnScaleBiasMeanVarDesc, size_t bnScale,
+        size_t bnBias, double exponentialAverageFactor,
+        size_t resultRunningMean, size_t resultRunningVariance,
+        double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance):
+    _setStream(handle)
+    with nogil:
+        status = miopenBatchNormalizationForwardTraining(
+            handle, mode,
+            alpha, beta, xDesc,
+            x, yDesc, y,
+            bnScaleBiasMeanVarDesc, bnScale,
+            bnBias, exponentialAverageFactor,
+            resultRunningMean, resultRunningVariance,
+            epsilon, resultSaveMean, resultSaveInvVariance)
+    check_status(status)
+
+
+cpdef batchNormalizationForwardInference(
+        intptr_t handle, int mode,
+        size_t alpha, size_t beta,
size_t xDesc,
+        size_t x, size_t yDesc, size_t y,
+        size_t bnScaleBiasMeanVarDesc, size_t bnScale,
+        size_t bnBias, size_t estimatedMean, size_t estimatedVariance,
+        double epsilon):
+    _setStream(handle)
+    with nogil:
+        status = miopenBatchNormalizationForwardInference(
+            handle, mode,
+            alpha, beta, xDesc,
+            x, yDesc, y,
+            bnScaleBiasMeanVarDesc, bnScale,
+            bnBias, estimatedMean, estimatedVariance,
+            epsilon)
+    check_status(status)
+
+
+# Run miopenBatchNormalizationBackward on the current stream with the GIL
+# released.
+cpdef batchNormalizationBackward(
+        intptr_t handle, int mode,
+        size_t alphaDataDiff, size_t betaDataDiff,
+        size_t alphaParamDiff, size_t betaParamDiff,
+        size_t xDesc, size_t x, size_t dyDesc,
+        size_t dy, size_t dxDesc, size_t dx,
+        size_t dBnScaleBiasDesc, size_t bnScale,
+        size_t dBnScaleResult, size_t dBnBiasResult,
+        double epsilon, size_t savedMean, size_t savedInvVariance):
+    _setStream(handle)
+    with nogil:
+        status = miopenBatchNormalizationBackward(
+            handle, mode,
+            alphaDataDiff, betaDataDiff,
+            alphaParamDiff, betaParamDiff,
+            xDesc, x,
+            dyDesc, dy,
+            dxDesc, dx,
+            dBnScaleBiasDesc, bnScale,
+            dBnScaleResult, dBnBiasResult,
+            epsilon, savedMean, savedInvVariance)
+    check_status(status)
+
+
+# Extended forward-training entry point: adds `bnOps`, a `z` tensor, an
+# activation descriptor and explicit workspace / reserve-space buffers, all
+# forwarded to cudnnBatchNormalizationForwardTrainingEx.
+cpdef batchNormalizationForwardTrainingEx(
+        intptr_t handle, int mode, int bnOps,
+        size_t alpha, size_t beta,
+        size_t xDesc, size_t x,
+        size_t zDesc, size_t z,
+        size_t yDesc, size_t y,
+        size_t bnScaleBiasMeanVarDesc,
+        size_t bnScale, size_t bnBias,
+        double exponentialAverageFactor,
+        size_t resultRunningMean, size_t resultRunningVariance,
+        double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance,
+        size_t activationDesc,
+        size_t workSpace, size_t workSpaceSizeInBytes,
+        size_t reserveSpace, size_t reserveSpaceSizeInBytes):
+    _setStream(handle)
+    with nogil:
+        status = cudnnBatchNormalizationForwardTrainingEx(
+            handle, mode, bnOps,
+            alpha, beta,
+            xDesc, x,
+            zDesc, z,
+            yDesc, y,
+            bnScaleBiasMeanVarDesc,
+            bnScale, bnBias,
+            exponentialAverageFactor,
+            resultRunningMean, resultRunningVariance,
+            epsilon,
resultSaveMean, resultSaveInvVariance,
+            activationDesc,
+            workSpace, workSpaceSizeInBytes,
+            reserveSpace, reserveSpaceSizeInBytes)
+    check_status(status)
+
+
+# Return the workspace size in bytes reported for the extended
+# forward-training call with these descriptors.
+cpdef size_t getBatchNormalizationForwardTrainingExWorkspaceSize(
+        intptr_t handle, int mode, int bnOps,
+        size_t xDesc,
+        size_t zDesc,
+        size_t yDesc,
+        size_t bnScaleBiasMeanVarDesc,
+        size_t activationDesc) except? 0:
+    cdef size_t sizeInBytes
+    status = cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
+        handle,
+        mode, bnOps,
+        xDesc,
+        zDesc,
+        yDesc,
+        bnScaleBiasMeanVarDesc,
+        activationDesc,
+        &sizeInBytes)
+    check_status(status)
+    return sizeInBytes
+
+
+# Extended backward entry point: every argument is forwarded unchanged to
+# cudnnBatchNormalizationBackwardEx on the current stream, GIL released.
+cpdef batchNormalizationBackwardEx(
+        intptr_t handle, int mode, int bnops,
+        size_t alphaDataDiff, size_t betaDataDiff,
+        size_t alphaParamDiff, size_t betaParamDiff,
+        size_t xDesc, size_t x,
+        size_t yDesc, size_t y,
+        size_t dyDesc, size_t dy,
+        size_t dzDesc, size_t dz,
+        size_t dxDesc, size_t dx,
+        size_t dBnScaleBiasDesc,
+        size_t bnScaleData, size_t bnBiasData,
+        size_t dBnScaleData, size_t dBnBiasData,
+        double epsilon,
+        size_t savedMean, size_t savedInvVariance,
+        size_t activationDesc,
+        size_t workSpace, size_t workSpaceSizeInBytes,
+        size_t reserveSpace, size_t reserveSpaceSizeInBytes):
+    _setStream(handle)
+    with nogil:
+        status = cudnnBatchNormalizationBackwardEx(
+            handle,
+            mode, bnops,
+            alphaDataDiff, betaDataDiff,
+            alphaParamDiff, betaParamDiff,
+            xDesc, x,
+            yDesc, y,
+            dyDesc, dy,
+            dzDesc, dz,
+            dxDesc, dx,
+            dBnScaleBiasDesc,
+            bnScaleData, bnBiasData,
+            dBnScaleData, dBnBiasData,
+            epsilon,
+            savedMean, savedInvVariance,
+            activationDesc,
+            workSpace, workSpaceSizeInBytes,
+            reserveSpace, reserveSpaceSizeInBytes)
+    check_status(status)
+
+
+cpdef size_t getBatchNormalizationBackwardExWorkspaceSize(
+        intptr_t handle, int mode, int bnOps,
+        size_t xDesc,
+        size_t yDesc,
+        size_t dyDesc,
+        size_t dzDesc,
+        size_t dxDesc,
+        size_t dBnScaleBiasDesc,
+        size_t activationDesc) except?
0:
+    cdef size_t sizeInBytes
+    status = cudnnGetBatchNormalizationBackwardExWorkspaceSize(
+        handle,
+        mode,
+        bnOps,
+        xDesc,
+        yDesc,
+        dyDesc,
+        dzDesc,
+        dxDesc,
+        dBnScaleBiasDesc,
+        activationDesc,
+        &sizeInBytes)
+    check_status(status)
+    return sizeInBytes
+
+
+# Return the reserve-space size in bytes reported for the extended training
+# calls with these descriptors.
+cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize(
+        intptr_t handle, int mode, int bnOps,
+        size_t activationDesc,
+        size_t xDesc) except? 0:
+    cdef size_t sizeInBytes
+    status = cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
+        handle,
+        mode,
+        bnOps,
+        activationDesc,
+        xDesc,
+        &sizeInBytes)
+    check_status(status)
+    return sizeInBytes
+
+
+###############################################################################
+# Activation
+###############################################################################
+
+# Create an activation descriptor with miopenCreateActivationDescriptor.
+cpdef size_t createActivationDescriptor() except? 0:
+    cdef ActivationDescriptor activationDesc
+    status = miopenCreateActivationDescriptor(&activationDesc)
+    check_status(status)
+    return activationDesc
+
+
+# Set mode, NaN option and ceiling of an activation descriptor.
+cpdef setActivationDescriptor(
+        size_t activationDesc, int mode, int reluNanOpt, double reluCeiling):
+    status = cudnnSetActivationDescriptor(
+        activationDesc, mode,
+        reluNanOpt, reluCeiling)
+    check_status(status)
+
+
+# Release an activation descriptor with miopenDestroyActivationDescriptor.
+cpdef destroyActivationDescriptor(size_t activationDesc):
+    status = miopenDestroyActivationDescriptor(
+        activationDesc)
+    check_status(status)
+
+
+# Run miopenSoftmaxForward on the current stream with the GIL released.
+# NOTE(review): `algorithm` and `mode` are accepted but not passed to
+# miopenSoftmaxForward.
+cpdef softmaxForward(
+        intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc,
+        size_t srcData, size_t beta, size_t dstDesc, size_t dstData):
+    _setStream(handle)
+    with nogil:
+        status = miopenSoftmaxForward(
+            handle,
+            alpha, srcDesc, srcData,
+            beta, dstDesc, dstData)
+    check_status(status)
+
+
+# Run miopenSoftmaxBackward on the current stream with the GIL released.
+# NOTE(review): `algorithm` and `mode` are accepted but not passed to
+# miopenSoftmaxBackward.
+cpdef softmaxBackward(
+        intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc,
+        size_t srcData, size_t srcDiffDesc, size_t srcDiffData, size_t beta,
+        size_t destDiffDesc, size_t destDiffData):
+    _setStream(handle)
+    with nogil:
+        status = miopenSoftmaxBackward(
+            
handle,
+            alpha, srcDesc, srcData,
+            srcDiffDesc, srcDiffData, beta,
+            destDiffDesc, destDiffData)
+    check_status(status)
+
+
+# Run cudnnActivationForward_v4 on the current stream with the GIL released.
+cpdef activationForward_v4(
+        intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc,
+        size_t srcData, size_t beta, size_t dstDesc, size_t dstData):
+    _setStream(handle)
+    with nogil:
+        status = cudnnActivationForward_v4(
+            handle, activationDesc, alpha,
+            srcDesc, srcData, beta,
+            dstDesc, dstData)
+    check_status(status)
+
+
+# Run cudnnActivationBackward_v4 on the current stream with the GIL released.
+cpdef activationBackward_v4(
+        intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc,
+        size_t srcData, size_t srcDiffDesc, size_t srcDiffData,
+        size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc,
+        size_t destDiffData):
+    _setStream(handle)
+    with nogil:
+        status = cudnnActivationBackward_v4(
+            handle, activationDesc, alpha,
+            srcDesc, srcData,
+            srcDiffDesc, srcDiffData,
+            destDesc, destData, beta,
+            destDiffDesc, destDiffData)
+    check_status(status)
+
+
+###############################################################################
+# Dropout
+###############################################################################
+
+# Create a dropout descriptor with miopenCreateDropoutDescriptor.
+cpdef size_t createDropoutDescriptor() except? 0:
+    cdef DropoutDescriptor desc
+    status = miopenCreateDropoutDescriptor(&desc)
+    check_status(status)
+    return desc
+
+
+# Release a dropout descriptor with miopenDestroyDropoutDescriptor.
+cpdef destroyDropoutDescriptor(size_t dropoutDesc):
+    status = miopenDestroyDropoutDescriptor(dropoutDesc)
+    check_status(status)
+
+
+# Return the size in bytes reported by miopenDropoutGetStatesSize.
+cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? -1:
+    cdef size_t sizeInBytes
+    status = miopenDropoutGetStatesSize(
+        handle, &sizeInBytes)
+    check_status(status)
+    return sizeInBytes
+
+
+# Configure a dropout descriptor with the dropout value, the states buffer
+# (address and size) and the seed.
+cpdef setDropoutDescriptor(
+        size_t dropoutDesc, intptr_t handle, float dropout,
+        size_t states, size_t stateSizeInBytes, unsigned long long seed):
+    status = cudnnSetDropoutDescriptor(
+        dropoutDesc, handle, dropout,
+        states, stateSizeInBytes, seed)
+    check_status(status)
+
+
+cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except?
0: + cdef size_t sizeInBytes + status = miopenDropoutGetReserveSpaceSize( + xDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + + +cpdef dropoutForward( + intptr_t handle, size_t dropoutDesc, + size_t srcDesc, size_t srcData, + size_t dstDesc, size_t dstData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnDropoutForward( + handle, dropoutDesc, + srcDesc, srcData, + dstDesc, dstData, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + +cpdef dropoutBackward( + intptr_t handle, size_t dropoutDesc, + size_t dyDesc, size_t dyData, + size_t dxDesc, size_t dxData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnDropoutBackward( + handle, dropoutDesc, + dyDesc, dyData, + dxDesc, dxData, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + +############################################################################### +# CTC +############################################################################### +cpdef size_t createCTCLossDescriptor() except? 0: + cdef CTCLossDescriptor desc + status = miopenCreateCTCLossDescriptor(&desc) + check_status(status) + return desc + +cpdef destroyCTCLossDescriptor(size_t ctcLossDesc): + status = miopenDestroyCTCLossDescriptor(ctcLossDesc) + check_status(status) + +cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType): + status = cudnnSetCTCLossDescriptor( + ctcLossDesc, dataType) + check_status(status) + +cpdef getCTCLossDescriptor(size_t ctcLossDesc): + cdef DataType compType + status = cudnnGetCTCLossDescriptor( + ctcLossDesc, &compType) + check_status(status) + return compType + +cpdef size_t getCTCLossWorkspaceSize( + intptr_t handle, size_t probsDesc, size_t gradientsDesc, + size_t labels, size_t labelLengths, size_t inputLengths, + int algo, size_t ctcLossDesc) except? 
0: + cdef size_t sizeInBytes + status = miopenGetCTCLossWorkspaceSize( + handle, probsDesc, + gradientsDesc, + labels, labelLengths, inputLengths, + algo, ctcLossDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + +cpdef CTCLoss( + intptr_t handle, size_t probsDesc, + size_t probs, size_t labels, size_t labelLengths, size_t inputLengths, + size_t costs, size_t gradientsDesc, size_t gradients, + int algo, size_t ctcLossDesc, + size_t workspace, size_t workSpaceSizeInBytes): + status = miopenCTCLoss( + handle, probsDesc, probs, + labels, labelLengths, inputLengths, + costs, gradientsDesc, gradients, + algo, ctcLossDesc, + workspace, workSpaceSizeInBytes) + check_status(status) + + +############################################################################### +# RNN +############################################################################### + +cpdef size_t createRNNDescriptor() except? 0: + cdef RNNDescriptor desc + status = miopenCreateRNNDescriptor(&desc) + check_status(status) + return desc + + +cpdef destroyRNNDescriptor(size_t rnnDesc): + status = miopenDestroyRNNDescriptor(rnnDesc) + check_status(status) + + +cpdef size_t createPersistentRNNPlan(size_t rnnDesc, int minibatch, + int dataType) except? 
0: + cdef PersistentRNNPlan plan + status = cudnnCreatePersistentRNNPlan( + rnnDesc, + minibatch, dataType, &plan) + check_status(status) + return plan + + +cpdef setPersistentRNNPlan(size_t rnnDesc, size_t plan): + status = cudnnSetPersistentRNNPlan( + rnnDesc, plan) + check_status(status) + + +cpdef destroyPersistentRNNPlan(size_t plan): + status = cudnnDestroyPersistentRNNPlan(plan) + check_status(status) + + +cpdef setRNNDescriptor_v5( + size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int dataType): + status = cudnnSetRNNDescriptor_v5( + rnnDesc, hiddenSize, numLayers, + dropoutDesc, inputMode, + direction, mode, dataType) + check_status(status) + + +cpdef setRNNDescriptor_v6( + intptr_t handle, size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int algo, int dataType): + status = cudnnSetRNNDescriptor_v6( + handle, rnnDesc, hiddenSize, numLayers, + dropoutDesc, inputMode, + direction, mode, algo, + dataType) + check_status(status) + + +cpdef setRNNPaddingMode( + size_t rnnDesc, int paddingMode): + status = cudnnSetRNNPaddingMode( + rnnDesc, paddingMode) + check_status(status) + + +cpdef getRNNPaddingMode(size_t rnnDesc): + cdef RNNPaddingMode paddingMode + status = cudnnGetRNNPaddingMode( + rnnDesc, &paddingMode) + check_status(status) + return paddingMode + + +cpdef size_t createRNNDataDescriptor() except? 
0: + cdef RNNDataDescriptor desc + status = cudnnCreateRNNDataDescriptor(&desc) + check_status(status) + return desc + + +cpdef destroyRNNDataDescriptor(size_t RNNDataDesc): + status = cudnnDestroyRNNDataDescriptor(RNNDataDesc) + check_status(status) + + +cpdef setRNNDataDescriptor( + size_t RNNDataDesc, int dataType, size_t layout, + int maxSeqLength, int batchSize, int vectorSize, + size_t seqLengthArray, size_t paddingFill): + status = cudnnSetRNNDataDescriptor( + RNNDataDesc, dataType, + layout, maxSeqLength, batchSize, vectorSize, + seqLengthArray, paddingFill) + check_status(status) + + +cpdef getRNNDataDescriptor( + size_t RNNDataDesc, size_t dataType, + size_t layout, size_t maxSeqLength, size_t batchSize, + size_t vectorSize, int arrayLengthRequested, size_t seqLengthArray, + size_t paddingFill): + status = cudnnGetRNNDataDescriptor( + RNNDataDesc, dataType, + layout, maxSeqLength, batchSize, + vectorSize, arrayLengthRequested, seqLengthArray, + paddingFill) + check_status(status) + + +cpdef getRNNWorkspaceSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): + cdef size_t sizeInBytes + status = miopenGetRNNWorkspaceSize( + handle, rnnDesc, seqLength, + xDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + + +cpdef getRNNTrainingReserveSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): + cdef size_t sizeInBytes + status = miopenGetRNNTrainingReserveSize( + handle, rnnDesc, seqLength, + xDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + + +cpdef getRNNParamsSize( + intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType): + cdef size_t sizeInBytes + status = miopenGetRNNParamsSize( + handle, rnnDesc, xDesc, + &sizeInBytes, dataType) + check_status(status) + return sizeInBytes + + +cpdef getRNNLinLayerMatrixParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t linLayerMatDesc, size_t linLayerMat): + status = 
cudnnGetRNNLinLayerMatrixParams( + handle, rnnDesc, layer, + xDesc, wDesc, w, + linLayerID, linLayerMatDesc, linLayerMat) + check_status(status) + + +cpdef getRNNLinLayerBiasParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t linLayerBiasDesc, + size_t linLayerBias): + status = cudnnGetRNNLinLayerBiasParams( + handle, rnnDesc, layer, + xDesc, wDesc, w, + linLayerID, linLayerBiasDesc, linLayerBias) + check_status(status) + + +cpdef RNNForwardInference( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, + size_t x, size_t hxDesc, size_t hx, size_t cxDesc, + size_t cx, size_t wDesc, size_t w, size_t yDesc, + size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t workspace, size_t workSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = miopenRNNForwardInference( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + workspace, workSpaceSizeInBytes) + check_status(status) + + +cpdef RNNForwardTraining( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t wDesc, size_t w, size_t yDesc, size_t y, + size_t hyDesc, size_t hy, size_t cyDesc, size_t cy, + size_t workspace, size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = miopenRNNForwardTraining( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + workspace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + +cpdef RNNBackwardData( + intptr_t handle, size_t rnnDesc, int seqLength, size_t yDesc, size_t y, + size_t dyDesc, size_t dy, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t dxDesc, 
size_t dx, size_t dhxDesc, size_t dhx, + size_t dcxDesc, size_t dcx, size_t workspace, + size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardData( + handle, rnnDesc, seqLength, + yDesc, y, + dyDesc, dy, + dhyDesc, dhy, + dcyDesc, dcy, + wDesc, w, + hxDesc, hx, + cxDesc, cx, + dxDesc, dx, + dhxDesc, dhx, + dcxDesc, dcx, + workspace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + +cpdef RNNBackwardWeights( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workspace, size_t workSpaceSizeInBytes, size_t dwDesc, + size_t dw, size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardWeights( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + yDesc, y, + workspace, workSpaceSizeInBytes, + dwDesc, dw, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + +cpdef RNNForwardInferenceEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNForwardInferenceEx( + handle, rnnDesc, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + kDesc, keys, + cDesc, cAttn, + iDesc, iAttn, + qDesc, queries, + workSpace, workSpaceSizeInBytes) + check_status(status) + + +cpdef RNNForwardTrainingEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t hyDesc, size_t 
hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNForwardTrainingEx( + handle, rnnDesc, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + kDesc, keys, + cDesc, cAttn, + iDesc, iAttn, + qDesc, queries, + workSpace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + +cpdef RNNBackwardDataEx( + intptr_t handle, size_t rnnDesc, size_t yDesc, size_t y, size_t dyDesc, + size_t dy, size_t dcDesc, size_t dcAttn, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t dxDesc, size_t dx, + size_t dhxDesc, size_t dhx, size_t dcxDesc, size_t dcx, + size_t dkDesc, size_t dkeys, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardDataEx( + handle, rnnDesc, + yDesc, y, + dyDesc, dy, + dcDesc, dcAttn, + dhyDesc, dhy, + dcyDesc, dcy, + wDesc, w, + hxDesc, hx, + cxDesc, cx, + dxDesc, dx, + dhxDesc, dhx, + dcxDesc, dcx, + dkDesc, dkeys, + workSpace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + +cpdef RNNBackwardWeightsEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t dwDesc, size_t dw, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardWeightsEx( + handle, rnnDesc, + xDesc, x, + hxDesc, hx, + yDesc, y, + workSpace, workSpaceSizeInBytes, + dwDesc, dw, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + 
+############################################################################### +# Spatial Transformer +############################################################################### + +cpdef size_t createSpatialTransformerDescriptor() except? 0: + cdef SpatialTransformerDescriptor stDesc + status = cudnnCreateSpatialTransformerDescriptor(&stDesc) + check_status(status) + return stDesc + + +cpdef destroySpatialTransformerDescriptor(size_t stDesc): + status = cudnnDestroySpatialTransformerDescriptor( + stDesc) + check_status(status) + + +cpdef setSpatialTransformerDescriptor( + size_t stDesc, size_t samplerType, int dataType, + int nbDims, size_t dimA): + status = cudnnSetSpatialTransformerNdDescriptor( + stDesc, samplerType, + dataType, nbDims, dimA) + check_status(status) + + +cpdef spatialTfGridGeneratorForward( + intptr_t handle, size_t stDesc, size_t theta, size_t grid): + _setStream(handle) + with nogil: + status = cudnnSpatialTfGridGeneratorForward( + handle, stDesc, + theta, grid) + check_status(status) + + +cpdef spatialTfGridGeneratorBackward( + intptr_t handle, size_t stDesc, size_t dgrid, size_t dtheta): + _setStream(handle) + with nogil: + status = cudnnSpatialTfGridGeneratorBackward( + handle, stDesc, + dgrid, dtheta) + check_status(status) + + +cpdef spatialTfSamplerForward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t grid, size_t beta, size_t yDesc, size_t y): + _setStream(handle) + with nogil: + status = cudnnSpatialTfSamplerForward( + handle, stDesc, + alpha, xDesc, x, grid, + beta, yDesc, y) + check_status(status) + + +cpdef spatialTfSamplerBackward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t beta, size_t dxDesc, size_t dx, size_t alphaDgrid, + size_t dyDesc, size_t dy, size_t grid, size_t betaDgrid, size_t dgrid): + _setStream(handle) + with nogil: + status = cudnnSpatialTfSamplerBackward( + handle, stDesc, + alpha, xDesc, x, beta, + dxDesc, dx, alphaDgrid, + dyDesc, dy, 
grid, + betaDgrid, dgrid) + check_status(status) + +############################################################################### +# Fused Ops +############################################################################### + +cpdef createFusedOpsConstParamPack(int ops): + cdef FusedOpsConstParamPack constPack + with nogil: + status = cudnnCreateFusedOpsConstParamPack(&constPack, ops) + check_status(status) + return constPack + +cpdef destroyFusedOpsConstParamPack(size_t constPack): + with nogil: + status = cudnnDestroyFusedOpsConstParamPack( + constPack) + check_status(status) + +cpdef setFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param): + with nogil: + status = cudnnSetFusedOpsConstParamPackAttribute( + constPack, + paramLabel, param) + check_status(status) + +cpdef getFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param): + cdef int isNULL = 0 + with nogil: + status = cudnnGetFusedOpsConstParamPackAttribute( + constPack, + paramLabel, param, &isNULL) + check_status(status) + return isNULL + +cpdef createFusedOpsVariantParamPack(int ops): + cdef FusedOpsVariantParamPack varPack + with nogil: + status = cudnnCreateFusedOpsVariantParamPack(&varPack, ops) + check_status(status) + return varPack + +cpdef destroyFusedOpsVariantParamPack(size_t varPack): + with nogil: + status = cudnnDestroyFusedOpsVariantParamPack( + varPack) + check_status(status) + +cpdef setFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr): + with nogil: + status = cudnnSetFusedOpsVariantParamPackAttribute( + varPack, + paramLabel, ptr) + check_status(status) + +cpdef getFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr): + with nogil: + status = cudnnGetFusedOpsVariantParamPackAttribute( + varPack, + paramLabel, ptr) + check_status(status) + +cpdef createFusedOpsPlan(int ops): + cdef FusedOpsPlan plan + with nogil: + status = cudnnCreateFusedOpsPlan(&plan, ops) + 
check_status(status) + return plan + +cpdef destroyFusedOpsPlan(size_t plan): + with nogil: + status = cudnnDestroyFusedOpsPlan(plan) + check_status(status) + +cpdef makeFusedOpsPlan(intptr_t handle, size_t plan, size_t constPack): + cdef size_t workspaceSizeInBytes + _setStream(handle) + with nogil: + status = cudnnMakeFusedOpsPlan(handle, plan, + constPack, + &workspaceSizeInBytes) + check_status(status) + return workspaceSizeInBytes + +cpdef fusedOpsExecute(intptr_t handle, size_t plan, size_t varPack): + _setStream(handle) + with nogil: + status = cudnnFusedOpsExecute(handle, plan, + varPack) + check_status(status) + From 0c0f0bed5b73c9fac35af5df808f4619a9b4e2ef Mon Sep 17 00:00:00 2001 From: bmedishe Date: Mon, 27 Nov 2023 21:47:19 +0000 Subject: [PATCH 05/26] update miopen.pyx --- cupy_backends/cuda/libs/miopen.pyx | 1802 ---------------------------- 1 file changed, 1802 deletions(-) diff --git a/cupy_backends/cuda/libs/miopen.pyx b/cupy_backends/cuda/libs/miopen.pyx index c7c3811c885..cd68ca9f693 100644 --- a/cupy_backends/cuda/libs/miopen.pyx +++ b/cupy_backends/cuda/libs/miopen.pyx @@ -739,1805 +739,3 @@ cdef extern from '../../cupy_cudnn.h' nogil: # Constants double _CUDNN_BN_MIN_EPSILON 'CUDNN_BN_MIN_EPSILON' - -cdef class CuDNNAlgoPerf: - - def __init__(self, algo, status, time, memory, determinism, mathType): - self.algo = algo - self.status = status - self.time = time - self.memory = memory - self.determinism = determinism - self.mathType = mathType - - -############################################################################### -# Error handling -############################################################################### - -class CuDNNError(RuntimeError): - - def __init__(self, int status): - self.status = status - msg = cudnnGetErrorString(status) - super(CuDNNError, self).__init__( - 'cuDNN Error: {}'.format(msg.decode())) - self._infos = [] - - def add_info(self, info): - assert isinstance(info, str) - self._infos.append(info) - - def 
add_infos(self, infos): - assert isinstance(infos, list) - self._infos.extend(infos) - - def __str__(self): - base = super(CuDNNError, self).__str__() - return base + ''.join( - '\n ' + info for info in self._infos) - - def __reduce__(self): - return (type(self), (self.status,)) - - -@cython.profile(False) -cpdef inline check_status(int status): - if status != 0: - raise CuDNNError(status) - - -############################################################################### -# Build-time version -############################################################################### - -def get_build_version(): - return CUDNN_VERSION - - -############################################################################### -# Version -############################################################################### - -cpdef size_t getVersion() except? 0: - return cudnnGetVersion() - - -############################################################################### -# Runtime error checking -############################################################################### - -cpdef queryRuntimeError(intptr_t handle, int mode): - cdef Status rstatus - with nogil: - status = cudnnQueryRuntimeError(handle, &rstatus, - mode, 0) - check_status(status) - return rstatus - - -############################################################################### -# Initialization and CUDA cooperation -############################################################################### - -cpdef intptr_t create() except? 0: - cdef Handle handle - with nogil: - status = miopenCreate(&handle) - check_status(status) - return handle - - -cpdef destroy(intptr_t handle): - with nogil: - status = miopenDestroy(handle) - check_status(status) - - -cpdef setStream(intptr_t handle, size_t stream): - # TODO(leofang): The support of stream capture is not mentioned at all in - # the cuDNN docs (as of CUDA 11.5), so we disable this functionality. 
- if not runtime._is_hip_environment and runtime.streamIsCapturing(stream): - raise NotImplementedError( - 'calling cuDNN API during stream capture is currently ' - 'unsupported') - - status = miopenSetStream(handle, stream) - check_status(status) - - -cpdef size_t getStream(intptr_t handle) except? 0: - cdef driver.Stream stream - status = miopenGetStream(handle, &stream) - check_status(status) - return stream - - -cdef _setStream(intptr_t handle): - """Set current stream""" - setStream(handle, stream_module.get_current_stream_ptr()) - -############################################################################### -# Tensor manipulation -############################################################################### - -cpdef size_t createTensorDescriptor() except? 0: - cdef TensorDescriptor descriptor - status = miopenCreateTensorDescriptor(&descriptor) - check_status(status) - return descriptor - - -cpdef setTensor4dDescriptor(size_t tensorDesc, int format, int dataType, - int n, int c, int h, int w): - status = miopenSet4dTensorDescriptor( - tensorDesc, - dataType, n, c, h, w) - check_status(status) - - -cpdef setTensor4dDescriptorEx(size_t tensorDesc, int dataType, - int n, int c, int h, int w, int nStride, - int cStride, int hStride, int wStride): - status = miopenSet4dTensorDescriptorEx( - tensorDesc, dataType, n, c, h, w, - nStride, cStride, hStride, wStride) - check_status(status) - - -cpdef tuple getTensor4dDescriptor(size_t tensorDesc): - cdef DataType dataType - cdef int n, c, h, w, nStride, cStride, hStride, wStride - status = miopenGet4dTensorDescriptor( - tensorDesc, &dataType, - &n, &c, &h, &w, &nStride, &cStride, &hStride, &wStride) - check_status(status) - return dataType, n, c, h, w, nStride, cStride, hStride, wStride - - -cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, - size_t dimA, size_t strideA): - status = cudnnSetTensorNdDescriptor( - tensorDesc, dataType, nbDims, - dimA, strideA) - check_status(status) - - -cpdef 
destroyTensorDescriptor(size_t tensorDesc): - status = miopenDestroyTensorDescriptor(tensorDesc) - check_status(status) - - -cpdef addTensor_v3(intptr_t handle, size_t alpha, size_t bDesc, - size_t b, size_t beta, size_t yDesc, size_t y): - _setStream(handle) - with nogil: - status = cudnnAddTensor_v3( - handle, alpha, bDesc, - b, beta, yDesc, y) - check_status(status) - - -############################################################################### -# Tensor operations -############################################################################### - -cpdef size_t createOpTensorDescriptor() except? 0: - cdef OpTensorDescriptor opTensorDesc - status = cudnnCreateOpTensorDescriptor(&opTensorDesc) - check_status(status) - return opTensorDesc - - -cpdef setOpTensorDescriptor(size_t opTensorDesc, int opTensorOp, - int opTensorCompType, int opTensorNanOpt): - status = cudnnSetOpTensorDescriptor( - opTensorDesc, opTensorOp, - opTensorCompType, opTensorNanOpt) - check_status(status) - - -cpdef getOpTensorDescriptor(size_t opTensorDesc): - cdef OpTensorOp opTensorOp - cdef DataType opTensorCompType - cdef NanPropagation opTensorNanOpt - status = cudnnGetOpTensorDescriptor( - opTensorDesc, &opTensorOp, &opTensorCompType, - &opTensorNanOpt) - check_status(status) - return opTensorOp, opTensorCompType, opTensorNanOpt - - -cpdef destroyOpTensorDescriptor(size_t opTensorDesc): - status = cudnnDestroyOpTensorDescriptor(opTensorDesc) - check_status(status) - - -cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, - size_t aDesc, size_t A, size_t alpha2, size_t bDesc, - size_t B, size_t beta, size_t cDesc, size_t C): - _setStream(handle) - with nogil: - status = cudnnOpTensor( - handle, opTensorDesc, alpha1, - aDesc, A, alpha2, - bDesc, B, beta, - cDesc, C) - check_status(status) - - -############################################################################### -# Tensor reductions -############################################################################### 
- -cpdef size_t createReduceTensorDescriptor() except? 0: - cdef ReduceTensorDescriptor reduceTensorDesc - status = cudnnCreateReduceTensorDescriptor(&reduceTensorDesc) - check_status(status) - return reduceTensorDesc - -cpdef setReduceTensorDescriptor( - size_t reduceTensorDesc, int reduceTensorOp, int reduceTensorCompType, - int reduceTensorNanOpt, int reduceTensorIndices, - int reduceTensorIndicesType): - status = cudnnSetReduceTensorDescriptor( - reduceTensorDesc, - reduceTensorOp, - reduceTensorCompType, reduceTensorNanOpt, - reduceTensorIndices, - reduceTensorIndicesType) - check_status(status) - - -cpdef getReduceTensorDescriptor(size_t reduceTensorDesc): - cdef ReduceTensorOp redOp - cdef DataType redCompType - cdef NanPropagation redNanOpt - cdef ReduceTensorIndices redIndices - cdef IndicesType redIndicesType - status = cudnnGetReduceTensorDescriptor( - reduceTensorDesc, &redOp, - &redCompType, &redNanOpt, &redIndices, &redIndicesType) - check_status(status) - return redOp, redCompType, redNanOpt, redIndices, redIndicesType - - -cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc): - status = cudnnDestroyReduceTensorDescriptor( - reduceTensorDesc) - check_status(status) - - -cpdef size_t getReductionIndicesSize(intptr_t handle, size_t reduceTensorDesc, - size_t aDesc, size_t cDesc) except? 0: - cdef size_t sizeInBytes - status = cudnnGetReductionIndicesSize( - handle, reduceTensorDesc, - aDesc, cDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef size_t getReductionWorkspaceSize(intptr_t handle, - size_t reduceTensorDesc, - size_t aDesc, size_t cDesc) except? 
0: - cdef size_t sizeInBytes - status = cudnnGetReductionWorkspaceSize( - handle, reduceTensorDesc, - aDesc, cDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef reduceTensor(intptr_t handle, size_t reduceTensorDesc, size_t indices, - size_t indicesSizeInBytes, size_t workspace, - size_t workspaceSizeInBytes, size_t alpha, size_t aDesc, - size_t A, size_t beta, size_t cDesc, size_t C): - _setStream(handle) - with nogil: - status = cudnnReduceTensor( - handle, reduceTensorDesc, - indices, indicesSizeInBytes, workspace, - workspaceSizeInBytes, alpha, aDesc, - A, beta, cDesc, C) - check_status(status) - - -cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr): - _setStream(handle) - with nogil: - status = cudnnSetTensor( - handle, yDesc, y, - valuePtr) - check_status(status) - - -cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha): - _setStream(handle) - with nogil: - status = cudnnScaleTensor( - handle, yDesc, y, - alpha) - check_status(status) - - -############################################################################### -# Filter manipulation -############################################################################### - -cpdef size_t createFilterDescriptor() except? 
0: - cdef FilterDescriptor desc - status = cudnnCreateFilterDescriptor(&desc) - check_status(status) - return desc - - -cpdef setFilter4dDescriptor_v4( - size_t filterDesc, int dataType, - int format, int k, int c, int h, int w): - status = cudnnSetFilter4dDescriptor_v4( - filterDesc, dataType, - format, k, c, h, w) - check_status(status) - - -cpdef setFilterNdDescriptor_v4( - size_t filterDesc, int dataType, - int format, int nbDims, size_t filterDimA): - status = cudnnSetFilterNdDescriptor_v4( - filterDesc, dataType, - format, nbDims, filterDimA) - check_status(status) - - -cpdef getFilterNdDescriptor(size_t wDesc, int nbDimsRequested): - cdef DataType dataType - cdef TensorFormat format - cdef int nbDims - cdef vector.vector[int] filterDimA - filterDimA.resize(nbDimsRequested) - - status = cudnnGetFilterNdDescriptor_v4( - wDesc, nbDimsRequested, &dataType, - &format, &nbDims, filterDimA.data()) - check_status(status) - return dataType, format, nbDims, tuple(filterDimA) - - -cpdef destroyFilterDescriptor(size_t filterDesc): - status = cudnnDestroyFilterDescriptor(filterDesc) - check_status(status) - - -############################################################################### -# Convolution -############################################################################### - -cpdef size_t createConvolutionDescriptor() except? 0: - cdef ConvolutionDescriptor desc - status = miopenCreateConvolutionDescriptor(&desc) - check_status(status) - return desc - - -cpdef setConvolutionMathType(size_t convDesc, size_t mathType): - status = cudnnSetConvolutionMathType( - convDesc, mathType) - check_status(status) - - -cpdef size_t getConvolutionMathType(size_t convDesc) except? 
0: - cdef MathType mathType - status = cudnnGetConvolutionMathType( - convDesc, &mathType) - check_status(status) - return mathType - - -cpdef setConvolutionGroupCount(size_t convDesc, int groupCount): - status = miopenSetConvolutionGroupCount( - convDesc, groupCount) - check_status(status) - - -cpdef int getConvolutionGroupCount(size_t convDesc) except? -1: - cdef int groupCount - status = cudnnGetConvolutionGroupCount( - convDesc, &groupCount) - check_status(status) - return groupCount - - -cpdef setConvolution2dDescriptor_v4( - size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, - int dilation_w, int mode): - status = cudnnSetConvolution2dDescriptor_v4( - convDesc, pad_h, pad_w, u, v, dilation_h, - dilation_w, mode) - check_status(status) - - -cpdef setConvolution2dDescriptor_v5( - size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, - int dilation_w, int mode, size_t computeType): - status = cudnnSetConvolution2dDescriptor_v5( - convDesc, pad_h, pad_w, u, v, dilation_h, - dilation_w, mode, computeType) - check_status(status) - - -cpdef setConvolutionNdDescriptor_v3( - size_t convDesc, int arrayLength, size_t padA, size_t filterStrideA, - size_t dilationA, int mode, int dataType): - status = cudnnSetConvolutionNdDescriptor_v3( - convDesc, arrayLength, padA, - filterStrideA, dilationA, mode, - dataType) - check_status(status) - - -cpdef destroyConvolutionDescriptor(size_t convDesc): - status = miopenDestroyConvolutionDescriptor( - convDesc) - check_status(status) - - -cpdef findConvolutionForwardAlgorithm( - intptr_t handle, size_t xDesc, size_t wDesc, size_t convDesc, - size_t yDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionForwardAlgorithm( - handle, xDesc, wDesc, - convDesc, yDesc, - requestedAlgoCount, &returnedAlgoCount, perfResults.data()) - check_status(status) - 
perfResults.resize(returnedAlgoCount) - return perfResults - - -cpdef list findConvolutionForwardAlgorithmEx( - intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, - size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionForwardAlgorithmEx( - handle, xDesc, x, - wDesc, w, convDesc, - yDesc, y, requestedAlgoCount, - &returnedAlgoCount, perfResults.data(), workSpace, - workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) - for p in perfResults] - - -cpdef list findConvolutionForwardAlgorithmEx_v7( - intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, - size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionForwardAlgorithmEx_v7( - handle, xDesc, x, - wDesc, w, convDesc, - yDesc, y, requestedAlgoCount, - &returnedAlgoCount, perfResults.data(), workSpace, - workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef int getConvolutionForwardAlgorithm_v6( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int preference, size_t memoryLimitInbytes) except? 
-1: - cdef ConvolutionFwdAlgo algo - status = cudnnGetConvolutionForwardAlgorithm_v6( - handle, srcDesc, - filterDesc, convDesc, - destDesc, preference, - memoryLimitInbytes, &algo) - check_status(status) - return algo - - -cpdef list getConvolutionForwardAlgorithm_v7( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnGetConvolutionForwardAlgorithm_v7( - handle, srcDesc, - filterDesc, convDesc, - destDesc, requestedAlgoCount, - &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef Py_ssize_t getConvolutionForwardWorkspaceSize( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int algo) except? 
-1: - cdef size_t sizeInBytes - status = miopenConvolutionForwardGetWorkSpaceSize( - handle, srcDesc, - filterDesc, convDesc, - destDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef convolutionForward( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t filterDesc, size_t filterData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t destDesc, size_t destData): - _setStream(handle) - with nogil: - status = cudnnConvolutionForward( - handle, alpha, - srcDesc, srcData, - filterDesc, filterData, - convDesc, algo, - workSpace, workSpaceSizeInBytes, beta, - destDesc, destData) - check_status(status) - - -cpdef convolutionBackwardBias( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t beta, size_t destDesc, size_t destData): - _setStream(handle) - with nogil: - status = cudnnConvolutionBackwardBias( - handle, alpha, - srcDesc, srcData, beta, - destDesc, destData) - check_status(status) - - -cpdef findConvolutionBackwardFilterAlgorithm( - intptr_t handle, size_t xDesc, size_t dyDesc, size_t convDesc, - size_t dwDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionBwdFilterAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardFilterAlgorithm( - handle, xDesc, dyDesc, - convDesc, dwDesc, - requestedAlgoCount, &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return perfResults - - -cpdef list findConvolutionBackwardFilterAlgorithmEx( - intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, - size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionBwdFilterAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardFilterAlgorithmEx( - handle, 
xDesc, x, - dyDesc, dy, convDesc, - dwDesc, dw, - requestedAlgoCount, &returnedAlgoCount, perfResults.data(), - workSpace, workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) - for p in perfResults] - - -cpdef list findConvolutionBackwardFilterAlgorithmEx_v7( - intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, - size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionBwdFilterAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardFilterAlgorithmEx_v7( - handle, xDesc, x, - dyDesc, dy, convDesc, - dwDesc, dw, - requestedAlgoCount, &returnedAlgoCount, perfResults.data(), - workSpace, workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef int getConvolutionBackwardFilterAlgorithm_v6( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t filterDesc, int preference, - size_t memoryLimitInbytes) except? 
-1: - cdef ConvolutionBwdFilterAlgo algo - status = cudnnGetConvolutionBackwardFilterAlgorithm_v6( - handle, srcDesc, - diffDesc, convDesc, - filterDesc, - preference, - memoryLimitInbytes, &algo) - check_status(status) - return algo - - -cpdef list getConvolutionBackwardFilterAlgorithm_v7( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionBwdFilterAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnGetConvolutionBackwardFilterAlgorithm_v7( - handle, srcDesc, diffDesc, - convDesc, gradDesc, - requestedAlgoCount, &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef Py_ssize_t getConvolutionBackwardFilterWorkspaceSize( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t filterDesc, int algo) except? 
-1: - cdef size_t sizeInBytes - status = cudnnGetConvolutionBackwardFilterWorkspaceSize( - handle, srcDesc, - diffDesc, convDesc, - filterDesc, algo, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef convolutionBackwardFilter_v3( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t diffDesc, size_t diffData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t gradDesc, size_t gradData): - _setStream(handle) - with nogil: - status = cudnnConvolutionBackwardFilter_v3( - handle, alpha, - srcDesc, srcData, - diffDesc, diffData, - convDesc, algo, - workSpace, workSpaceSizeInBytes, beta, - gradDesc, gradData) - check_status(status) - - -cpdef findConvolutionBackwardDataAlgorithm( - intptr_t handle, size_t wDesc, size_t dyDesc, size_t convDesc, - size_t dxDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionBwdDataAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardDataAlgorithm( - handle, wDesc, dyDesc, - convDesc, dxDesc, - requestedAlgoCount, &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return perfResults - - -cpdef list findConvolutionBackwardDataAlgorithmEx( - intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, - size_t convDesc, size_t dxDesc, size_t dx, - int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionBwdDataAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardDataAlgorithmEx( - handle, wDesc, w, - dyDesc, dy, convDesc, - dxDesc, dx, - requestedAlgoCount, &returnedAlgoCount, perfResults.data(), - workSpace, workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) - for p in perfResults] - - 
-cpdef list findConvolutionBackwardDataAlgorithmEx_v7( - intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, - size_t convDesc, size_t dxDesc, size_t dx, - int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionBwdDataAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardDataAlgorithmEx_v7( - handle, wDesc, w, - dyDesc, dy, convDesc, - dxDesc, dx, - requestedAlgoCount, &returnedAlgoCount, perfResults.data(), - workSpace, workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef int getConvolutionBackwardDataAlgorithm_v6( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, size_t preference, - size_t memoryLimitInbytes) except? -1: - cdef ConvolutionBwdDataAlgo algo - status = cudnnGetConvolutionBackwardDataAlgorithm_v6( - handle, filterDesc, - diffDesc, convDesc, - gradDesc, preference, - memoryLimitInbytes, &algo) - check_status(status) - return algo - - -cpdef list getConvolutionBackwardDataAlgorithm_v7( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionBwdDataAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnGetConvolutionBackwardDataAlgorithm_v7( - handle, filterDesc, - diffDesc, convDesc, - gradDesc, requestedAlgoCount, - &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef Py_ssize_t getConvolutionBackwardDataWorkspaceSize( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t 
gradDesc, int algo) except? -1: - cdef size_t sizeInBytes - status = miopenConvolutionBackwardDataGetWorkSpaceSize( - handle, filterDesc, - diffDesc, - convDesc, gradDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef convolutionBackwardData_v3( - intptr_t handle, size_t alpha, size_t filterDesc, size_t filterData, - size_t diffDesc, size_t diffData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t gradDesc, size_t gradData): - _setStream(handle) - with nogil: - status = cudnnConvolutionBackwardData_v3( - handle, alpha, - filterDesc, filterData, - diffDesc, diffData, - convDesc, algo, - workSpace, workSpaceSizeInBytes, beta, - gradDesc, gradData) - check_status(status) - -############################################################################### -# Pooling -############################################################################### - -cpdef size_t createPoolingDescriptor() except? 0: - cdef PoolingDescriptor desc - status = miopenCreatePoolingDescriptor(&desc) - check_status(status) - return desc - - -cpdef setPooling2dDescriptor_v4( - size_t poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, - int windowWidth, int verticalPadding, int horizontalPadding, - int verticalStride, int horizontalStride): - status = cudnnSetPooling2dDescriptor_v4( - poolingDesc, mode, - maxpoolingNanOpt, windowHeight, windowWidth, - verticalPadding, horizontalPadding, verticalStride, horizontalStride) - check_status(status) - - -cpdef setPoolingNdDescriptor_v4( - size_t poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, - size_t windowDimA, size_t paddingA, size_t strideA): - status = cudnnSetPoolingNdDescriptor_v4( - poolingDesc, mode, - maxpoolingNanOpt, nbDims, - windowDimA, paddingA, strideA) - check_status(status) - - -cpdef destroyPoolingDescriptor(size_t poolingDesc): - status = miopenDestroyPoolingDescriptor(poolingDesc) - check_status(status) - - -cpdef poolingForward( - intptr_t 
handle, size_t poolingDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData): - _setStream(handle) - with nogil: - status = cudnnPoolingForward( - handle, poolingDesc, alpha, - srcDesc, srcData, beta, - dstDesc, dstData) - check_status(status) - - -cpdef poolingBackward( - intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, - size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, - size_t destDiffData): - _setStream(handle) - with nogil: - status = cudnnPoolingBackward( - handle, poolingDesc, alpha, - srcDesc, srcData, - srcDiffDesc, srcDiffData, - destDesc, destData, beta, - destDiffDesc, destDiffData) - check_status(status) - -############################################################################### -# Batch Normalization -############################################################################### - -CUDNN_BN_MIN_EPSILON = _CUDNN_BN_MIN_EPSILON - -cpdef deriveBNTensorDescriptor( - size_t derivedBnDesc, size_t xDesc, int mode): - status = miopenDeriveBNTensorDescriptor( - derivedBnDesc, xDesc, - mode) - check_status(status) - - -cpdef batchNormalizationForwardTraining( - intptr_t handle, int mode, - size_t alpha, size_t beta, size_t xDesc, - size_t x, size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, size_t bnScale, - size_t bnBias, double exponentialAverageFactor, - size_t resultRunningMean, size_t resultRunningVariance, - double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance): - _setStream(handle) - with nogil: - status = miopenBatchNormalizationForwardTraining( - handle, mode, - alpha, beta, xDesc, - x, yDesc, y, - bnScaleBiasMeanVarDesc, bnScale, - bnBias, exponentialAverageFactor, - resultRunningMean, resultRunningVariance, - epsilon, resultSaveMean, resultSaveInvVariance) - check_status(status) - - -cpdef batchNormalizationForwardInference( - intptr_t handle, int mode, - size_t alpha, size_t beta, 
size_t xDesc, - size_t x, size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, size_t bnScale, - size_t bnBias, size_t estimatedMean, size_t estimatedVariance, - double epsilon): - _setStream(handle) - with nogil: - status = miopenBatchNormalizationForwardInference( - handle, mode, - alpha, beta, xDesc, - x, yDesc, y, - bnScaleBiasMeanVarDesc, bnScale, - bnBias, estimatedMean, estimatedVariance, - epsilon) - check_status(status) - - -cpdef batchNormalizationBackward( - intptr_t handle, int mode, - size_t alphaDataDiff, size_t betaDataDiff, - size_t alphaParamDiff, size_t betaParamDiff, - size_t xDesc, size_t x, size_t dyDesc, - size_t dy, size_t dxDesc, size_t dx, - size_t dBnScaleBiasDesc, size_t bnScale, - size_t dBnScaleResult, size_t dBnBiasResult, - double epsilon, size_t savedMean, size_t savedInvVariance): - _setStream(handle) - with nogil: - status = miopenBatchNormalizationBackward( - handle, mode, - alphaDataDiff, betaDataDiff, - alphaParamDiff, betaParamDiff, - xDesc, x, - dyDesc, dy, - dxDesc, dx, - dBnScaleBiasDesc, bnScale, - dBnScaleResult, dBnBiasResult, - epsilon, savedMean, savedInvVariance) - check_status(status) - - -cpdef batchNormalizationForwardTrainingEx( - intptr_t handle, int mode, int bnOps, - size_t alpha, size_t beta, - size_t xDesc, size_t x, - size_t zDesc, size_t z, - size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, - size_t bnScale, size_t bnBias, - double exponentialAverageFactor, - size_t resultRunningMean, size_t resultRunningVariance, - double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance, - size_t activationDesc, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnBatchNormalizationForwardTrainingEx( - handle, mode, bnOps, - alpha, beta, - xDesc, x, - zDesc, z, - yDesc, y, - bnScaleBiasMeanVarDesc, - bnScale, bnBias, - exponentialAverageFactor, - resultRunningMean, resultRunningVariance, - epsilon, 
resultSaveMean, resultSaveInvVariance, - activationDesc, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef size_t getBatchNormalizationForwardTrainingExWorkspaceSize( - intptr_t handle, int mode, int bnOps, - size_t xDesc, - size_t zDesc, - size_t yDesc, - size_t bnScaleBiasMeanVarDesc, - size_t activationDesc) except? 0: - cdef size_t sizeInBytes - status = cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - handle, - mode, bnOps, - xDesc, - zDesc, - yDesc, - bnScaleBiasMeanVarDesc, - activationDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef batchNormalizationBackwardEx( - intptr_t handle, int mode, int bnops, - size_t alphaDataDiff, size_t betaDataDiff, - size_t alphaParamDiff, size_t betaParamDiff, - size_t xDesc, size_t x, - size_t yDesc, size_t y, - size_t dyDesc, size_t dy, - size_t dzDesc, size_t dz, - size_t dxDesc, size_t dx, - size_t dBnScaleBiasDesc, - size_t bnScaleData, size_t bnBiasData, - size_t dBnScaleData, size_t dBnBiasData, - double epsilon, - size_t savedMean, size_t savedInvVariance, - size_t activationDesc, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnBatchNormalizationBackwardEx( - handle, - mode, bnops, - alphaDataDiff, betaDataDiff, - alphaParamDiff, betaParamDiff, - xDesc, x, - yDesc, y, - dyDesc, dy, - dzDesc, dz, - dxDesc, dx, - dBnScaleBiasDesc, - bnScaleData, bnBiasData, - dBnScaleData, dBnBiasData, - epsilon, - savedMean, savedInvVariance, - activationDesc, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef size_t getBatchNormalizationBackwardExWorkspaceSize( - intptr_t handle, int mode, int bnOps, - size_t xDesc, - size_t yDesc, - size_t dyDesc, - size_t dzDesc, - size_t dxDesc, - size_t dBnScaleBiasDesc, - size_t activationDesc) except? 
0: - cdef size_t sizeInBytes - status = cudnnGetBatchNormalizationBackwardExWorkspaceSize( - handle, - mode, - bnOps, - xDesc, - yDesc, - dyDesc, - dzDesc, - dxDesc, - dBnScaleBiasDesc, - activationDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( - intptr_t handle, int mode, int bnOps, - size_t activationDesc, - size_t xDesc) except? 0: - cdef size_t sizeInBytes - status = cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - handle, - mode, - bnOps, - activationDesc, - xDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -############################################################################### -# Activation -############################################################################### - -cpdef size_t createActivationDescriptor() except? 0: - cdef ActivationDescriptor activationDesc - status = miopenCreateActivationDescriptor(&activationDesc) - check_status(status) - return activationDesc - - -cpdef setActivationDescriptor( - size_t activationDesc, int mode, int reluNanOpt, double reluCeiling): - status = cudnnSetActivationDescriptor( - activationDesc, mode, - reluNanOpt, reluCeiling) - check_status(status) - - -cpdef destroyActivationDescriptor(size_t activationDesc): - status = miopenDestroyActivationDescriptor( - activationDesc) - check_status(status) - - -cpdef softmaxForward( - intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData): - _setStream(handle) - with nogil: - status = miopenSoftmaxForward( - handle, - alpha, srcDesc, srcData, - beta, dstDesc, dstData) - check_status(status) - - -cpdef softmaxBackward( - intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, size_t beta, - size_t destDiffDesc, size_t destDiffData): - _setStream(handle) - with nogil: - status = miopenSoftmaxBackward( - 
handle, - alpha, srcDesc, srcData, - srcDiffDesc, srcDiffData, beta, - destDiffDesc, destDiffData) - check_status(status) - - -cpdef activationForward_v4( - intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData): - _setStream(handle) - with nogil: - status = cudnnActivationForward_v4( - handle, activationDesc, alpha, - srcDesc, srcData, beta, - dstDesc, dstData) - check_status(status) - - -cpdef activationBackward_v4( - intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, - size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, - size_t destDiffData): - _setStream(handle) - with nogil: - status = cudnnActivationBackward_v4( - handle, activationDesc, alpha, - srcDesc, srcData, - srcDiffDesc, srcDiffData, - destDesc, destData, beta, - destDiffDesc, destDiffData) - check_status(status) - - -############################################################################### -# Dropout -############################################################################### - -cpdef size_t createDropoutDescriptor() except? 0: - cdef DropoutDescriptor desc - status = miopenCreateDropoutDescriptor(&desc) - check_status(status) - return desc - - -cpdef destroyDropoutDescriptor(size_t dropoutDesc): - status = miopenDestroyDropoutDescriptor(dropoutDesc) - check_status(status) - - -cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? -1: - cdef size_t sizeInBytes - status = miopenDropoutGetStatesSize( - handle, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef setDropoutDescriptor( - size_t dropoutDesc, intptr_t handle, float dropout, - size_t states, size_t stateSizeInBytes, unsigned long long seed): - status = cudnnSetDropoutDescriptor( - dropoutDesc, handle, dropout, - states, stateSizeInBytes, seed) - check_status(status) - - -cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 
0: - cdef size_t sizeInBytes - status = miopenDropoutGetReserveSpaceSize( - xDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef dropoutForward( - intptr_t handle, size_t dropoutDesc, - size_t srcDesc, size_t srcData, - size_t dstDesc, size_t dstData, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnDropoutForward( - handle, dropoutDesc, - srcDesc, srcData, - dstDesc, dstData, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef dropoutBackward( - intptr_t handle, size_t dropoutDesc, - size_t dyDesc, size_t dyData, - size_t dxDesc, size_t dxData, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnDropoutBackward( - handle, dropoutDesc, - dyDesc, dyData, - dxDesc, dxData, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -############################################################################### -# CTC -############################################################################### -cpdef size_t createCTCLossDescriptor() except? 0: - cdef CTCLossDescriptor desc - status = miopenCreateCTCLossDescriptor(&desc) - check_status(status) - return desc - -cpdef destroyCTCLossDescriptor(size_t ctcLossDesc): - status = miopenDestroyCTCLossDescriptor(ctcLossDesc) - check_status(status) - -cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType): - status = cudnnSetCTCLossDescriptor( - ctcLossDesc, dataType) - check_status(status) - -cpdef getCTCLossDescriptor(size_t ctcLossDesc): - cdef DataType compType - status = cudnnGetCTCLossDescriptor( - ctcLossDesc, &compType) - check_status(status) - return compType - -cpdef size_t getCTCLossWorkspaceSize( - intptr_t handle, size_t probsDesc, size_t gradientsDesc, - size_t labels, size_t labelLengths, size_t inputLengths, - int algo, size_t ctcLossDesc) except? 
0: - cdef size_t sizeInBytes - status = miopenGetCTCLossWorkspaceSize( - handle, probsDesc, - gradientsDesc, - labels, labelLengths, inputLengths, - algo, ctcLossDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - -cpdef CTCLoss( - intptr_t handle, size_t probsDesc, - size_t probs, size_t labels, size_t labelLengths, size_t inputLengths, - size_t costs, size_t gradientsDesc, size_t gradients, - int algo, size_t ctcLossDesc, - size_t workspace, size_t workSpaceSizeInBytes): - status = miopenCTCLoss( - handle, probsDesc, probs, - labels, labelLengths, inputLengths, - costs, gradientsDesc, gradients, - algo, ctcLossDesc, - workspace, workSpaceSizeInBytes) - check_status(status) - - -############################################################################### -# RNN -############################################################################### - -cpdef size_t createRNNDescriptor() except? 0: - cdef RNNDescriptor desc - status = miopenCreateRNNDescriptor(&desc) - check_status(status) - return desc - - -cpdef destroyRNNDescriptor(size_t rnnDesc): - status = miopenDestroyRNNDescriptor(rnnDesc) - check_status(status) - - -cpdef size_t createPersistentRNNPlan(size_t rnnDesc, int minibatch, - int dataType) except? 
0: - cdef PersistentRNNPlan plan - status = cudnnCreatePersistentRNNPlan( - rnnDesc, - minibatch, dataType, &plan) - check_status(status) - return plan - - -cpdef setPersistentRNNPlan(size_t rnnDesc, size_t plan): - status = cudnnSetPersistentRNNPlan( - rnnDesc, plan) - check_status(status) - - -cpdef destroyPersistentRNNPlan(size_t plan): - status = cudnnDestroyPersistentRNNPlan(plan) - check_status(status) - - -cpdef setRNNDescriptor_v5( - size_t rnnDesc, int hiddenSize, int numLayers, - size_t dropoutDesc, int inputMode, int direction, int mode, - int dataType): - status = cudnnSetRNNDescriptor_v5( - rnnDesc, hiddenSize, numLayers, - dropoutDesc, inputMode, - direction, mode, dataType) - check_status(status) - - -cpdef setRNNDescriptor_v6( - intptr_t handle, size_t rnnDesc, int hiddenSize, int numLayers, - size_t dropoutDesc, int inputMode, int direction, int mode, - int algo, int dataType): - status = cudnnSetRNNDescriptor_v6( - handle, rnnDesc, hiddenSize, numLayers, - dropoutDesc, inputMode, - direction, mode, algo, - dataType) - check_status(status) - - -cpdef setRNNPaddingMode( - size_t rnnDesc, int paddingMode): - status = cudnnSetRNNPaddingMode( - rnnDesc, paddingMode) - check_status(status) - - -cpdef getRNNPaddingMode(size_t rnnDesc): - cdef RNNPaddingMode paddingMode - status = cudnnGetRNNPaddingMode( - rnnDesc, &paddingMode) - check_status(status) - return paddingMode - - -cpdef size_t createRNNDataDescriptor() except? 
0: - cdef RNNDataDescriptor desc - status = cudnnCreateRNNDataDescriptor(&desc) - check_status(status) - return desc - - -cpdef destroyRNNDataDescriptor(size_t RNNDataDesc): - status = cudnnDestroyRNNDataDescriptor(RNNDataDesc) - check_status(status) - - -cpdef setRNNDataDescriptor( - size_t RNNDataDesc, int dataType, size_t layout, - int maxSeqLength, int batchSize, int vectorSize, - size_t seqLengthArray, size_t paddingFill): - status = cudnnSetRNNDataDescriptor( - RNNDataDesc, dataType, - layout, maxSeqLength, batchSize, vectorSize, - seqLengthArray, paddingFill) - check_status(status) - - -cpdef getRNNDataDescriptor( - size_t RNNDataDesc, size_t dataType, - size_t layout, size_t maxSeqLength, size_t batchSize, - size_t vectorSize, int arrayLengthRequested, size_t seqLengthArray, - size_t paddingFill): - status = cudnnGetRNNDataDescriptor( - RNNDataDesc, dataType, - layout, maxSeqLength, batchSize, - vectorSize, arrayLengthRequested, seqLengthArray, - paddingFill) - check_status(status) - - -cpdef getRNNWorkspaceSize( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): - cdef size_t sizeInBytes - status = miopenGetRNNWorkspaceSize( - handle, rnnDesc, seqLength, - xDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef getRNNTrainingReserveSize( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): - cdef size_t sizeInBytes - status = miopenGetRNNTrainingReserveSize( - handle, rnnDesc, seqLength, - xDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef getRNNParamsSize( - intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType): - cdef size_t sizeInBytes - status = miopenGetRNNParamsSize( - handle, rnnDesc, xDesc, - &sizeInBytes, dataType) - check_status(status) - return sizeInBytes - - -cpdef getRNNLinLayerMatrixParams( - intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, - size_t w, int linLayerID, size_t linLayerMatDesc, size_t linLayerMat): - status = 
cudnnGetRNNLinLayerMatrixParams( - handle, rnnDesc, layer, - xDesc, wDesc, w, - linLayerID, linLayerMatDesc, linLayerMat) - check_status(status) - - -cpdef getRNNLinLayerBiasParams( - intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, - size_t w, int linLayerID, size_t linLayerBiasDesc, - size_t linLayerBias): - status = cudnnGetRNNLinLayerBiasParams( - handle, rnnDesc, layer, - xDesc, wDesc, w, - linLayerID, linLayerBiasDesc, linLayerBias) - check_status(status) - - -cpdef RNNForwardInference( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, - size_t x, size_t hxDesc, size_t hx, size_t cxDesc, - size_t cx, size_t wDesc, size_t w, size_t yDesc, - size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t workspace, size_t workSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = miopenRNNForwardInference( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - workspace, workSpaceSizeInBytes) - check_status(status) - - -cpdef RNNForwardTraining( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, - size_t wDesc, size_t w, size_t yDesc, size_t y, - size_t hyDesc, size_t hy, size_t cyDesc, size_t cy, - size_t workspace, size_t workSpaceSizeInBytes, size_t reserveSpace, - size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = miopenRNNForwardTraining( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - workspace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNBackwardData( - intptr_t handle, size_t rnnDesc, int seqLength, size_t yDesc, size_t y, - size_t dyDesc, size_t dy, size_t dhyDesc, size_t dhy, - size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, - size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, - size_t dxDesc, 
size_t dx, size_t dhxDesc, size_t dhx, - size_t dcxDesc, size_t dcx, size_t workspace, - size_t workSpaceSizeInBytes, size_t reserveSpace, - size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardData( - handle, rnnDesc, seqLength, - yDesc, y, - dyDesc, dy, - dhyDesc, dhy, - dcyDesc, dcy, - wDesc, w, - hxDesc, hx, - cxDesc, cx, - dxDesc, dx, - dhxDesc, dhx, - dcxDesc, dcx, - workspace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNBackwardWeights( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t yDesc, size_t y, - size_t workspace, size_t workSpaceSizeInBytes, size_t dwDesc, - size_t dw, size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardWeights( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - yDesc, y, - workspace, workSpaceSizeInBytes, - dwDesc, dw, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNForwardInferenceEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, - size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, - size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, - size_t workSpace, size_t workSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNForwardInferenceEx( - handle, rnnDesc, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - kDesc, keys, - cDesc, cAttn, - iDesc, iAttn, - qDesc, queries, - workSpace, workSpaceSizeInBytes) - check_status(status) - - -cpdef RNNForwardTrainingEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, - size_t yDesc, size_t y, size_t hyDesc, size_t 
hy, size_t cyDesc, - size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, - size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNForwardTrainingEx( - handle, rnnDesc, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - kDesc, keys, - cDesc, cAttn, - iDesc, iAttn, - qDesc, queries, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNBackwardDataEx( - intptr_t handle, size_t rnnDesc, size_t yDesc, size_t y, size_t dyDesc, - size_t dy, size_t dcDesc, size_t dcAttn, size_t dhyDesc, size_t dhy, - size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t dxDesc, size_t dx, - size_t dhxDesc, size_t dhx, size_t dcxDesc, size_t dcx, - size_t dkDesc, size_t dkeys, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardDataEx( - handle, rnnDesc, - yDesc, y, - dyDesc, dy, - dcDesc, dcAttn, - dhyDesc, dhy, - dcyDesc, dcy, - wDesc, w, - hxDesc, hx, - cxDesc, cx, - dxDesc, dx, - dhxDesc, dhx, - dcxDesc, dcx, - dkDesc, dkeys, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNBackwardWeightsEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t yDesc, size_t y, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t dwDesc, size_t dw, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardWeightsEx( - handle, rnnDesc, - xDesc, x, - hxDesc, hx, - yDesc, y, - workSpace, workSpaceSizeInBytes, - dwDesc, dw, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - 
-############################################################################### -# Spatial Transformer -############################################################################### - -cpdef size_t createSpatialTransformerDescriptor() except? 0: - cdef SpatialTransformerDescriptor stDesc - status = cudnnCreateSpatialTransformerDescriptor(&stDesc) - check_status(status) - return stDesc - - -cpdef destroySpatialTransformerDescriptor(size_t stDesc): - status = cudnnDestroySpatialTransformerDescriptor( - stDesc) - check_status(status) - - -cpdef setSpatialTransformerDescriptor( - size_t stDesc, size_t samplerType, int dataType, - int nbDims, size_t dimA): - status = cudnnSetSpatialTransformerNdDescriptor( - stDesc, samplerType, - dataType, nbDims, dimA) - check_status(status) - - -cpdef spatialTfGridGeneratorForward( - intptr_t handle, size_t stDesc, size_t theta, size_t grid): - _setStream(handle) - with nogil: - status = cudnnSpatialTfGridGeneratorForward( - handle, stDesc, - theta, grid) - check_status(status) - - -cpdef spatialTfGridGeneratorBackward( - intptr_t handle, size_t stDesc, size_t dgrid, size_t dtheta): - _setStream(handle) - with nogil: - status = cudnnSpatialTfGridGeneratorBackward( - handle, stDesc, - dgrid, dtheta) - check_status(status) - - -cpdef spatialTfSamplerForward( - intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, - size_t x, size_t grid, size_t beta, size_t yDesc, size_t y): - _setStream(handle) - with nogil: - status = cudnnSpatialTfSamplerForward( - handle, stDesc, - alpha, xDesc, x, grid, - beta, yDesc, y) - check_status(status) - - -cpdef spatialTfSamplerBackward( - intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, - size_t x, size_t beta, size_t dxDesc, size_t dx, size_t alphaDgrid, - size_t dyDesc, size_t dy, size_t grid, size_t betaDgrid, size_t dgrid): - _setStream(handle) - with nogil: - status = cudnnSpatialTfSamplerBackward( - handle, stDesc, - alpha, xDesc, x, beta, - dxDesc, dx, alphaDgrid, - dyDesc, dy, 
grid, - betaDgrid, dgrid) - check_status(status) - -############################################################################### -# Fused Ops -############################################################################### - -cpdef createFusedOpsConstParamPack(int ops): - cdef FusedOpsConstParamPack constPack - with nogil: - status = cudnnCreateFusedOpsConstParamPack(&constPack, ops) - check_status(status) - return constPack - -cpdef destroyFusedOpsConstParamPack(size_t constPack): - with nogil: - status = cudnnDestroyFusedOpsConstParamPack( - constPack) - check_status(status) - -cpdef setFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, - size_t param): - with nogil: - status = cudnnSetFusedOpsConstParamPackAttribute( - constPack, - paramLabel, param) - check_status(status) - -cpdef getFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, - size_t param): - cdef int isNULL = 0 - with nogil: - status = cudnnGetFusedOpsConstParamPackAttribute( - constPack, - paramLabel, param, &isNULL) - check_status(status) - return isNULL - -cpdef createFusedOpsVariantParamPack(int ops): - cdef FusedOpsVariantParamPack varPack - with nogil: - status = cudnnCreateFusedOpsVariantParamPack(&varPack, ops) - check_status(status) - return varPack - -cpdef destroyFusedOpsVariantParamPack(size_t varPack): - with nogil: - status = cudnnDestroyFusedOpsVariantParamPack( - varPack) - check_status(status) - -cpdef setFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, - size_t ptr): - with nogil: - status = cudnnSetFusedOpsVariantParamPackAttribute( - varPack, - paramLabel, ptr) - check_status(status) - -cpdef getFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, - size_t ptr): - with nogil: - status = cudnnGetFusedOpsVariantParamPackAttribute( - varPack, - paramLabel, ptr) - check_status(status) - -cpdef createFusedOpsPlan(int ops): - cdef FusedOpsPlan plan - with nogil: - status = cudnnCreateFusedOpsPlan(&plan, ops) - 
check_status(status) - return plan - -cpdef destroyFusedOpsPlan(size_t plan): - with nogil: - status = cudnnDestroyFusedOpsPlan(plan) - check_status(status) - -cpdef makeFusedOpsPlan(intptr_t handle, size_t plan, size_t constPack): - cdef size_t workspaceSizeInBytes - _setStream(handle) - with nogil: - status = cudnnMakeFusedOpsPlan(handle, plan, - constPack, - &workspaceSizeInBytes) - check_status(status) - return workspaceSizeInBytes - -cpdef fusedOpsExecute(intptr_t handle, size_t plan, size_t varPack): - _setStream(handle) - with nogil: - status = cudnnFusedOpsExecute(handle, plan, - varPack) - check_status(status) - From fd2b3220dc3b7dffe7c8333e78c1154f3fe14caf Mon Sep 17 00:00:00 2001 From: bmedishe Date: Mon, 27 Nov 2023 21:49:34 +0000 Subject: [PATCH 06/26] do not skip tests --- tests/cupyx_tests/test_cudnn.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/cupyx_tests/test_cudnn.py b/tests/cupyx_tests/test_cudnn.py index 84ef7b02071..0087a1c661b 100644 --- a/tests/cupyx_tests/test_cudnn.py +++ b/tests/cupyx_tests/test_cudnn.py @@ -40,7 +40,6 @@ 'dtype': [numpy.float32, numpy.float64], 'mode': modes, })) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestCudnnActivation: @pytest.fixture(autouse=True) @@ -60,7 +59,6 @@ def test_activation_backward(self): 'dtype': [numpy.float32, numpy.float64], 'mode': coef_modes, })) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestCudnnActivationCoef: @pytest.fixture(autouse=True) @@ -83,7 +81,6 @@ def test_activation_backward(self): 'ratio': [0.0, 0.1, 0.2, 0.5], 'seed': [0, 100] })) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestCudnnDropout: @pytest.fixture(autouse=True) @@ -136,7 +133,6 @@ def test_dropout_seed(self): 'bias': [True, False], 'layout': layouts, }))) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestConvolutionForward: @pytest.fixture(autouse=True) @@ -224,7 
+220,6 @@ def test_call(self): 'auto_tune': [True, False], 'deterministic': [True, False], }))) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestConvolutionBackwardFilter: @pytest.fixture(autouse=True) @@ -303,7 +298,6 @@ def test_call(self): 'deterministic': [True, False], 'bias': [True, False], }))) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestConvolutionBackwardData: @pytest.fixture(autouse=True) From 334557c5f74df9953da504cddc7d9eb075f24d43 Mon Sep 17 00:00:00 2001 From: bmedishe Date: Tue, 28 Nov 2023 20:27:52 +0000 Subject: [PATCH 07/26] update _feature.py with miopen lib, include --- install/cupy_builder/_features.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/install/cupy_builder/_features.py b/install/cupy_builder/_features.py index 4ad926b4249..a239e483ac0 100644 --- a/install/cupy_builder/_features.py +++ b/install/cupy_builder/_features.py @@ -164,6 +164,7 @@ def get_features(ctx: Context) -> Dict[str, Feature]: 'cupyx.cusolver', 'cupy_backends.cuda.libs.curand_hip', 'cupy_backends.cuda.libs.nvrtc_hip', + 'cupy_backends.cuda.libs.miopen', ], 'include': [ 'hip/hip_runtime_api.h', @@ -175,6 +176,7 @@ def get_features(ctx: Context) -> Dict[str, Feature]: 'roctx.h', 'rocsolver/rocsolver.h' if rocm_version >= 560 else 'rocsolver.h', 'hipsolver/hipsolver.h' if rocm_version >= 560 else 'hipsolver.h', + 'miopen/miopen.h', ], 'libraries': [ 'amdhip64', # was hiprtc and hip_hcc before ROCm 3.8.0 @@ -188,6 +190,7 @@ def get_features(ctx: Context) -> Dict[str, Feature]: 'rocsolver', 'rocsparse', 'hipsolver', + 'MIOpen', ], 'check_method': build.check_hip_version, 'version_method': build.get_hip_version, From d396e57361ccaa5af4d862144af58d8bc4d1622b Mon Sep 17 00:00:00 2001 From: bmedishe Date: Tue, 28 Nov 2023 20:37:01 +0000 Subject: [PATCH 08/26] add cudnn in _features.py --- install/cupy_builder/_features.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/install/cupy_builder/_features.py b/install/cupy_builder/_features.py index a239e483ac0..078460816e4 100644 --- a/install/cupy_builder/_features.py +++ b/install/cupy_builder/_features.py @@ -164,6 +164,7 @@ def get_features(ctx: Context) -> Dict[str, Feature]: 'cupyx.cusolver', 'cupy_backends.cuda.libs.curand_hip', 'cupy_backends.cuda.libs.nvrtc_hip', + 'cupy_backends.cuda.libs.cudnn', 'cupy_backends.cuda.libs.miopen', ], 'include': [ From 02ab1ff5871780cb949708b23f383ef536fb2055 Mon Sep 17 00:00:00 2001 From: bmedishe Date: Tue, 28 Nov 2023 20:43:23 +0000 Subject: [PATCH 09/26] _is_hip_env replaced with hip_env --- cupy_backends/cuda/libs/cudnn.pyx | 100 +++++++++++++++--------------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/cupy_backends/cuda/libs/cudnn.pyx b/cupy_backends/cuda/libs/cudnn.pyx index bd4c50f3d41..2f567cc6c5e 100644 --- a/cupy_backends/cuda/libs/cudnn.pyx +++ b/cupy_backends/cuda/libs/cudnn.pyx @@ -759,7 +759,7 @@ class CuDNNError(RuntimeError): def __init__(self, int status): self.status = status - if runtime._is_hip_environment: + if runtime.hip_environment: msg = miopenGetErrorString(status) else: msg = cudnnGetErrorString(status) @@ -803,7 +803,7 @@ def get_build_version(): ############################################################################### cpdef size_t getVersion() except? 0: - if runtime._is_hip_environment: + if runtime.hip_environment: return miopenGetVersion() else: return cudnnGetVersion() @@ -829,7 +829,7 @@ cpdef queryRuntimeError(intptr_t handle, int mode): cpdef intptr_t create() except? 0: cdef Handle handle with nogil: - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenCreate(&handle) else: status = cudnnCreate(&handle) @@ -839,7 +839,7 @@ cpdef intptr_t create() except? 
0: cpdef destroy(intptr_t handle): with nogil: - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenDestroy(handle) else: status = cudnnDestroy(handle) @@ -849,11 +849,11 @@ cpdef destroy(intptr_t handle): cpdef setStream(intptr_t handle, size_t stream): # TODO(leofang): The support of stream capture is not mentioned at all in # the cuDNN docs (as of CUDA 11.5), so we disable this functionality. - if not runtime._is_hip_environment and runtime.streamIsCapturing(stream): + if not runtime.hip_environment and runtime.streamIsCapturing(stream): raise NotImplementedError( 'calling cuDNN API during stream capture is currently ' 'unsupported') - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenSetStream(handle, stream) else: status = cudnnSetStream(handle, stream) @@ -862,7 +862,7 @@ cpdef setStream(intptr_t handle, size_t stream): cpdef size_t getStream(intptr_t handle) except? 0: cdef driver.Stream stream - if runtime._is_hip_environment: + if runtime.hip_environment: status = cudnnGetStream(handle, &stream) else: status = miopenGetStream(handle, &stream) @@ -880,7 +880,7 @@ cdef _setStream(intptr_t handle): cpdef size_t createTensorDescriptor() except? 
0: cdef TensorDescriptor descriptor - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenCreateTensorDescriptor(&descriptor) else: status = cudnnCreateTensorDescriptor(&descriptor) @@ -924,7 +924,7 @@ cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, cpdef destroyTensorDescriptor(size_t tensorDesc): - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenDestroyTensorDescriptor(tensorDesc) else: status = cudnnDestroyTensorDescriptor(tensorDesc) @@ -981,7 +981,7 @@ cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, size_t B, size_t beta, size_t cDesc, size_t C): _setStream(handle) with nogil: - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenOpTensor( handle, opTensorDesc, alpha1, aDesc, A, alpha2, @@ -1002,7 +1002,7 @@ cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, cpdef size_t createReduceTensorDescriptor() except? 0: cdef ReduceTensorDescriptor reduceTensorDesc - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenCreateReduceTensorDescriptor(&reduceTensorDesc) else: status = cudnnCreateReduceTensorDescriptor(&reduceTensorDesc) @@ -1013,7 +1013,7 @@ cpdef setReduceTensorDescriptor( size_t reduceTensorDesc, int reduceTensorOp, int reduceTensorCompType, int reduceTensorNanOpt, int reduceTensorIndices, int reduceTensorIndicesType): - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenSetReduceTensorDescriptor( reduceTensorDesc, reduceTensorOp, @@ -1036,7 +1036,7 @@ cpdef getReduceTensorDescriptor(size_t reduceTensorDesc): cdef NanPropagation redNanOpt cdef ReduceTensorIndices redIndices cdef IndicesType redIndicesType - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenGetReduceTensorDescriptor( reduceTensorDesc, &redOp, &redCompType, &redNanOpt, &redIndices, &redIndicesType) @@ -1049,7 +1049,7 @@ cpdef getReduceTensorDescriptor(size_t 
reduceTensorDesc): cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc): - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenDestroyReduceTensorDescriptor( reduceTensorDesc) else: @@ -1061,7 +1061,7 @@ cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc): cpdef size_t getReductionIndicesSize(intptr_t handle, size_t reduceTensorDesc, size_t aDesc, size_t cDesc) except? 0: cdef size_t sizeInBytes - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenGetReductionIndicesSize( handle, reduceTensorDesc, aDesc, cDesc, &sizeInBytes) @@ -1077,7 +1077,7 @@ cpdef size_t getReductionWorkspaceSize(intptr_t handle, size_t reduceTensorDesc, size_t aDesc, size_t cDesc) except? 0: cdef size_t sizeInBytes - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenGetReductionWorkspaceSize( handle, reduceTensorDesc, aDesc, cDesc, @@ -1097,7 +1097,7 @@ cpdef reduceTensor(intptr_t handle, size_t reduceTensorDesc, size_t indices, size_t A, size_t beta, size_t cDesc, size_t C): _setStream(handle) with nogil: - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenReduceTensor( handle, reduceTensorDesc, indices, indicesSizeInBytes, workspace, @@ -1115,7 +1115,7 @@ cpdef reduceTensor(intptr_t handle, size_t reduceTensorDesc, size_t indices, cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr): _setStream(handle) with nogil: - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenSetTensor( handle, yDesc, y, valuePtr) @@ -1129,7 +1129,7 @@ cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr): cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha): _setStream(handle) with nogil: - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenScaleTensor( handle, yDesc, y, alpha) @@ -1194,7 +1194,7 @@ cpdef destroyFilterDescriptor(size_t filterDesc): cpdef size_t 
createConvolutionDescriptor() except? 0: cdef ConvolutionDescriptor desc - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenCreateConvolutionDescriptor(&desc) else: status = cudnnCreateConvolutionDescriptor(&desc) @@ -1216,7 +1216,7 @@ cpdef size_t getConvolutionMathType(size_t convDesc) except? 0: cpdef setConvolutionGroupCount(size_t convDesc, int groupCount): - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenSetConvolutionGroupCount( convDesc, groupCount) else: @@ -1227,7 +1227,7 @@ cpdef setConvolutionGroupCount(size_t convDesc, int groupCount): cpdef int getConvolutionGroupCount(size_t convDesc) except? -1: cdef int groupCount - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenGetConvolutionGroupCount( convDesc, &groupCount) else: @@ -1265,7 +1265,7 @@ cpdef setConvolutionNdDescriptor_v3( cpdef destroyConvolutionDescriptor(size_t convDesc): - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenDestroyConvolutionDescriptor( convDesc) else: @@ -1378,7 +1378,7 @@ cpdef convolutionForward( size_t destDesc, size_t destData): _setStream(handle) with nogil: - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenConvolutionForward(handle, alpha, srcDesc, srcData, filterDesc, filterData, @@ -1401,7 +1401,7 @@ cpdef convolutionBackwardBias( size_t beta, size_t destDesc, size_t destData): _setStream(handle) with nogil: - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenConvolutionBackwardBias( handle, alpha, srcDesc, srcData, beta, @@ -1651,7 +1651,7 @@ cpdef convolutionBackwardData_v3( cpdef size_t createPoolingDescriptor() except? 
0: cdef PoolingDescriptor desc - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenCreatePoolingDescriptor(&desc) else: status = cudnnCreatePoolingDescriptor(&desc) @@ -1681,7 +1681,7 @@ cpdef setPoolingNdDescriptor_v4( cpdef destroyPoolingDescriptor(size_t poolingDesc): - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenDestroyPoolingDescriptor(poolingDesc) else: status = cudnnDestroyPoolingDescriptor(poolingDesc) @@ -1723,7 +1723,7 @@ CUDNN_BN_MIN_EPSILON = _CUDNN_BN_MIN_EPSILON cpdef deriveBNTensorDescriptor( size_t derivedBnDesc, size_t xDesc, int mode): - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenDeriveBNTensorDescriptor( derivedBnDesc, xDesc, mode) @@ -1744,7 +1744,7 @@ cpdef batchNormalizationForwardTraining( double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance): _setStream(handle) with nogil: - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenBatchNormalizationForwardTraining( handle, mode, alpha, beta, xDesc, @@ -1774,7 +1774,7 @@ cpdef batchNormalizationForwardInference( double epsilon): _setStream(handle) with nogil: - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenBatchNormalizationForwardInference( handle, mode, alpha, beta, xDesc, @@ -1804,7 +1804,7 @@ cpdef batchNormalizationBackward( double epsilon, size_t savedMean, size_t savedInvVariance): _setStream(handle) with nogil: - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenBatchNormalizationBackward( handle, mode, alphaDataDiff, betaDataDiff, @@ -1971,7 +1971,7 @@ cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( cpdef size_t createActivationDescriptor() except? 
0: cdef ActivationDescriptor activationDesc - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenCreateActivationDescriptor(&activationDesc) else: status = cudnnCreateActivationDescriptor(&activationDesc) @@ -1988,7 +1988,7 @@ cpdef setActivationDescriptor( cpdef destroyActivationDescriptor(size_t activationDesc): - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenDestroyActivationDescriptor( activationDesc) else: @@ -2002,7 +2002,7 @@ cpdef softmaxForward( size_t srcData, size_t beta, size_t dstDesc, size_t dstData): _setStream(handle) with nogil: - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenSoftmaxForward( handle, algorithm, mode, alpha, srcDesc, srcData, @@ -2021,7 +2021,7 @@ cpdef softmaxBackward( size_t destDiffDesc, size_t destDiffData): _setStream(handle) with nogil: - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenSoftmaxBackward( handle, algorithm, mode, alpha, srcDesc, srcData, @@ -2070,7 +2070,7 @@ cpdef activationBackward_v4( cpdef size_t createDropoutDescriptor() except? 0: cdef DropoutDescriptor desc - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenCreateDropoutDescriptor(&desc) else: status = cudnnCreateDropoutDescriptor(&desc) @@ -2079,7 +2079,7 @@ cpdef size_t createDropoutDescriptor() except? 0: cpdef destroyDropoutDescriptor(size_t dropoutDesc): - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenDestroyDropoutDescriptor(dropoutDesc) else: status = cudnnDestroyDropoutDescriptor(dropoutDesc) @@ -2088,7 +2088,7 @@ cpdef destroyDropoutDescriptor(size_t dropoutDesc): cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? 
-1: cdef size_t sizeInBytes - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenDropoutGetStatesSize( handle, &sizeInBytes) else: @@ -2109,7 +2109,7 @@ cpdef setDropoutDescriptor( cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 0: cdef size_t sizeInBytes - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenDropoutGetReserveSpaceSize( xDesc, &sizeInBytes) else: @@ -2154,7 +2154,7 @@ cpdef dropoutBackward( ############################################################################### cpdef size_t createCTCLossDescriptor() except? 0: cdef CTCLossDescriptor desc - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenCreateCTCLossDescriptor(&desc) else: status = cudnnCreateCTCLossDescriptor(&desc) @@ -2162,7 +2162,7 @@ cpdef size_t createCTCLossDescriptor() except? 0: return desc cpdef destroyCTCLossDescriptor(size_t ctcLossDesc): - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenDestroyCTCLossDescriptor(ctcLossDesc) else: status = cudnnDestroyCTCLossDescriptor(ctcLossDesc) @@ -2185,7 +2185,7 @@ cpdef size_t getCTCLossWorkspaceSize( size_t labels, size_t labelLengths, size_t inputLengths, int algo, size_t ctcLossDesc) except? 0: cdef size_t sizeInBytes - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenGetCTCLossWorkspaceSize( handle, probsDesc, gradientsDesc, @@ -2206,7 +2206,7 @@ cpdef CTCLoss( size_t costs, size_t gradientsDesc, size_t gradients, int algo, size_t ctcLossDesc, size_t workspace, size_t workSpaceSizeInBytes): - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenCTCLoss( handle, probsDesc, probs, labels, labelLengths, inputLengths, @@ -2229,7 +2229,7 @@ cpdef CTCLoss( cpdef size_t createRNNDescriptor() except? 
0: cdef RNNDescriptor desc - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenCreateRNNDescriptor(&desc) else: status = cudnnCreateRNNDescriptor(&desc) @@ -2238,7 +2238,7 @@ cpdef size_t createRNNDescriptor() except? 0: cpdef destroyRNNDescriptor(size_t rnnDesc): - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenDestroyRNNDescriptor(rnnDesc) else: status = cudnnDestroyRNNDescriptor(rnnDesc) @@ -2343,7 +2343,7 @@ cpdef getRNNDataDescriptor( cpdef getRNNWorkspaceSize( intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): cdef size_t sizeInBytes - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenGetRNNWorkspaceSize( handle, rnnDesc, seqLength, xDesc, &sizeInBytes) @@ -2358,7 +2358,7 @@ cpdef getRNNWorkspaceSize( cpdef getRNNTrainingReserveSize( intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): cdef size_t sizeInBytes - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenGetRNNTrainingReserveSize( handle, rnnDesc, seqLength, xDesc, &sizeInBytes) @@ -2373,7 +2373,7 @@ cpdef getRNNTrainingReserveSize( cpdef getRNNParamsSize( intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType): cdef size_t sizeInBytes - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenGetRNNParamsSize( handle, rnnDesc, xDesc, &sizeInBytes, dataType) @@ -2414,7 +2414,7 @@ cpdef RNNForwardInference( size_t cy, size_t workspace, size_t workSpaceSizeInBytes): _setStream(handle) with nogil: - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenRNNForwardInference( handle, rnnDesc, seqLength, xDesc, x, @@ -2448,7 +2448,7 @@ cpdef RNNForwardTraining( size_t reserveSpaceSizeInBytes): _setStream(handle) with nogil: - if runtime._is_hip_environment: + if runtime.hip_environment: status = miopenRNNForwardTraining( handle, rnnDesc, seqLength, xDesc, x, From 9d2148fa08529be4ad914d2d44edafcfed8ef8a3 Mon Sep 17 00:00:00 2001 
From: bmedishe Date: Wed, 29 Nov 2023 02:32:59 +0000 Subject: [PATCH 10/26] tabs error --- cupy_backends/cuda/libs/cudnn.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cupy_backends/cuda/libs/cudnn.pyx b/cupy_backends/cuda/libs/cudnn.pyx index 2f567cc6c5e..64dc7c72615 100644 --- a/cupy_backends/cuda/libs/cudnn.pyx +++ b/cupy_backends/cuda/libs/cudnn.pyx @@ -1378,8 +1378,8 @@ cpdef convolutionForward( size_t destDesc, size_t destData): _setStream(handle) with nogil: - if runtime.hip_environment: - status = miopenConvolutionForward(handle, alpha, + if runtime.hip_environment: + status = miopenConvolutionForward(handle, alpha, srcDesc, srcData, filterDesc, filterData, convDesc, algo, @@ -1681,7 +1681,7 @@ cpdef setPoolingNdDescriptor_v4( cpdef destroyPoolingDescriptor(size_t poolingDesc): - if runtime.hip_environment: + if runtime.hip_environment: status = miopenDestroyPoolingDescriptor(poolingDesc) else: status = cudnnDestroyPoolingDescriptor(poolingDesc) From 8e0c0a1e27d159efe8b96c641a007d53b12a82a5 Mon Sep 17 00:00:00 2001 From: bmedishe Date: Wed, 29 Nov 2023 04:48:56 +0000 Subject: [PATCH 11/26] runtime.hip_env replaced with runtime._is_hip_env --- cupy_backends/cuda/libs/cudnn.pyx | 100 +++++++++++++++--------------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/cupy_backends/cuda/libs/cudnn.pyx b/cupy_backends/cuda/libs/cudnn.pyx index 64dc7c72615..fb082641d36 100644 --- a/cupy_backends/cuda/libs/cudnn.pyx +++ b/cupy_backends/cuda/libs/cudnn.pyx @@ -759,7 +759,7 @@ class CuDNNError(RuntimeError): def __init__(self, int status): self.status = status - if runtime.hip_environment: + if runtime._is_hip_environment: msg = miopenGetErrorString(status) else: msg = cudnnGetErrorString(status) @@ -803,7 +803,7 @@ def get_build_version(): ############################################################################### cpdef size_t getVersion() except? 
0: - if runtime.hip_environment: + if runtime._is_hip_environment: return miopenGetVersion() else: return cudnnGetVersion() @@ -829,7 +829,7 @@ cpdef queryRuntimeError(intptr_t handle, int mode): cpdef intptr_t create() except? 0: cdef Handle handle with nogil: - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenCreate(&handle) else: status = cudnnCreate(&handle) @@ -839,7 +839,7 @@ cpdef intptr_t create() except? 0: cpdef destroy(intptr_t handle): with nogil: - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenDestroy(handle) else: status = cudnnDestroy(handle) @@ -849,11 +849,11 @@ cpdef destroy(intptr_t handle): cpdef setStream(intptr_t handle, size_t stream): # TODO(leofang): The support of stream capture is not mentioned at all in # the cuDNN docs (as of CUDA 11.5), so we disable this functionality. - if not runtime.hip_environment and runtime.streamIsCapturing(stream): + if not runtime._is_hip_environment and runtime.streamIsCapturing(stream): raise NotImplementedError( 'calling cuDNN API during stream capture is currently ' 'unsupported') - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenSetStream(handle, stream) else: status = cudnnSetStream(handle, stream) @@ -862,7 +862,7 @@ cpdef setStream(intptr_t handle, size_t stream): cpdef size_t getStream(intptr_t handle) except? 0: cdef driver.Stream stream - if runtime.hip_environment: + if runtime._is_hip_environment: status = cudnnGetStream(handle, &stream) else: status = miopenGetStream(handle, &stream) @@ -880,7 +880,7 @@ cdef _setStream(intptr_t handle): cpdef size_t createTensorDescriptor() except? 
0: cdef TensorDescriptor descriptor - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenCreateTensorDescriptor(&descriptor) else: status = cudnnCreateTensorDescriptor(&descriptor) @@ -924,7 +924,7 @@ cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, cpdef destroyTensorDescriptor(size_t tensorDesc): - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenDestroyTensorDescriptor(tensorDesc) else: status = cudnnDestroyTensorDescriptor(tensorDesc) @@ -981,7 +981,7 @@ cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, size_t B, size_t beta, size_t cDesc, size_t C): _setStream(handle) with nogil: - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenOpTensor( handle, opTensorDesc, alpha1, aDesc, A, alpha2, @@ -1002,7 +1002,7 @@ cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, cpdef size_t createReduceTensorDescriptor() except? 0: cdef ReduceTensorDescriptor reduceTensorDesc - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenCreateReduceTensorDescriptor(&reduceTensorDesc) else: status = cudnnCreateReduceTensorDescriptor(&reduceTensorDesc) @@ -1013,7 +1013,7 @@ cpdef setReduceTensorDescriptor( size_t reduceTensorDesc, int reduceTensorOp, int reduceTensorCompType, int reduceTensorNanOpt, int reduceTensorIndices, int reduceTensorIndicesType): - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenSetReduceTensorDescriptor( reduceTensorDesc, reduceTensorOp, @@ -1036,7 +1036,7 @@ cpdef getReduceTensorDescriptor(size_t reduceTensorDesc): cdef NanPropagation redNanOpt cdef ReduceTensorIndices redIndices cdef IndicesType redIndicesType - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenGetReduceTensorDescriptor( reduceTensorDesc, &redOp, &redCompType, &redNanOpt, &redIndices, &redIndicesType) @@ -1049,7 +1049,7 @@ cpdef getReduceTensorDescriptor(size_t 
reduceTensorDesc): cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc): - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenDestroyReduceTensorDescriptor( reduceTensorDesc) else: @@ -1061,7 +1061,7 @@ cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc): cpdef size_t getReductionIndicesSize(intptr_t handle, size_t reduceTensorDesc, size_t aDesc, size_t cDesc) except? 0: cdef size_t sizeInBytes - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenGetReductionIndicesSize( handle, reduceTensorDesc, aDesc, cDesc, &sizeInBytes) @@ -1077,7 +1077,7 @@ cpdef size_t getReductionWorkspaceSize(intptr_t handle, size_t reduceTensorDesc, size_t aDesc, size_t cDesc) except? 0: cdef size_t sizeInBytes - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenGetReductionWorkspaceSize( handle, reduceTensorDesc, aDesc, cDesc, @@ -1097,7 +1097,7 @@ cpdef reduceTensor(intptr_t handle, size_t reduceTensorDesc, size_t indices, size_t A, size_t beta, size_t cDesc, size_t C): _setStream(handle) with nogil: - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenReduceTensor( handle, reduceTensorDesc, indices, indicesSizeInBytes, workspace, @@ -1115,7 +1115,7 @@ cpdef reduceTensor(intptr_t handle, size_t reduceTensorDesc, size_t indices, cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr): _setStream(handle) with nogil: - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenSetTensor( handle, yDesc, y, valuePtr) @@ -1129,7 +1129,7 @@ cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr): cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha): _setStream(handle) with nogil: - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenScaleTensor( handle, yDesc, y, alpha) @@ -1194,7 +1194,7 @@ cpdef destroyFilterDescriptor(size_t filterDesc): cpdef size_t 
createConvolutionDescriptor() except? 0: cdef ConvolutionDescriptor desc - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenCreateConvolutionDescriptor(&desc) else: status = cudnnCreateConvolutionDescriptor(&desc) @@ -1216,7 +1216,7 @@ cpdef size_t getConvolutionMathType(size_t convDesc) except? 0: cpdef setConvolutionGroupCount(size_t convDesc, int groupCount): - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenSetConvolutionGroupCount( convDesc, groupCount) else: @@ -1227,7 +1227,7 @@ cpdef setConvolutionGroupCount(size_t convDesc, int groupCount): cpdef int getConvolutionGroupCount(size_t convDesc) except? -1: cdef int groupCount - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenGetConvolutionGroupCount( convDesc, &groupCount) else: @@ -1265,7 +1265,7 @@ cpdef setConvolutionNdDescriptor_v3( cpdef destroyConvolutionDescriptor(size_t convDesc): - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenDestroyConvolutionDescriptor( convDesc) else: @@ -1378,7 +1378,7 @@ cpdef convolutionForward( size_t destDesc, size_t destData): _setStream(handle) with nogil: - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenConvolutionForward(handle, alpha, srcDesc, srcData, filterDesc, filterData, @@ -1401,7 +1401,7 @@ cpdef convolutionBackwardBias( size_t beta, size_t destDesc, size_t destData): _setStream(handle) with nogil: - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenConvolutionBackwardBias( handle, alpha, srcDesc, srcData, beta, @@ -1651,7 +1651,7 @@ cpdef convolutionBackwardData_v3( cpdef size_t createPoolingDescriptor() except? 
0: cdef PoolingDescriptor desc - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenCreatePoolingDescriptor(&desc) else: status = cudnnCreatePoolingDescriptor(&desc) @@ -1681,7 +1681,7 @@ cpdef setPoolingNdDescriptor_v4( cpdef destroyPoolingDescriptor(size_t poolingDesc): - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenDestroyPoolingDescriptor(poolingDesc) else: status = cudnnDestroyPoolingDescriptor(poolingDesc) @@ -1723,7 +1723,7 @@ CUDNN_BN_MIN_EPSILON = _CUDNN_BN_MIN_EPSILON cpdef deriveBNTensorDescriptor( size_t derivedBnDesc, size_t xDesc, int mode): - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenDeriveBNTensorDescriptor( derivedBnDesc, xDesc, mode) @@ -1744,7 +1744,7 @@ cpdef batchNormalizationForwardTraining( double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance): _setStream(handle) with nogil: - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenBatchNormalizationForwardTraining( handle, mode, alpha, beta, xDesc, @@ -1774,7 +1774,7 @@ cpdef batchNormalizationForwardInference( double epsilon): _setStream(handle) with nogil: - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenBatchNormalizationForwardInference( handle, mode, alpha, beta, xDesc, @@ -1804,7 +1804,7 @@ cpdef batchNormalizationBackward( double epsilon, size_t savedMean, size_t savedInvVariance): _setStream(handle) with nogil: - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenBatchNormalizationBackward( handle, mode, alphaDataDiff, betaDataDiff, @@ -1971,7 +1971,7 @@ cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( cpdef size_t createActivationDescriptor() except? 
0: cdef ActivationDescriptor activationDesc - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenCreateActivationDescriptor(&activationDesc) else: status = cudnnCreateActivationDescriptor(&activationDesc) @@ -1988,7 +1988,7 @@ cpdef setActivationDescriptor( cpdef destroyActivationDescriptor(size_t activationDesc): - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenDestroyActivationDescriptor( activationDesc) else: @@ -2002,7 +2002,7 @@ cpdef softmaxForward( size_t srcData, size_t beta, size_t dstDesc, size_t dstData): _setStream(handle) with nogil: - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenSoftmaxForward( handle, algorithm, mode, alpha, srcDesc, srcData, @@ -2021,7 +2021,7 @@ cpdef softmaxBackward( size_t destDiffDesc, size_t destDiffData): _setStream(handle) with nogil: - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenSoftmaxBackward( handle, algorithm, mode, alpha, srcDesc, srcData, @@ -2070,7 +2070,7 @@ cpdef activationBackward_v4( cpdef size_t createDropoutDescriptor() except? 0: cdef DropoutDescriptor desc - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenCreateDropoutDescriptor(&desc) else: status = cudnnCreateDropoutDescriptor(&desc) @@ -2079,7 +2079,7 @@ cpdef size_t createDropoutDescriptor() except? 0: cpdef destroyDropoutDescriptor(size_t dropoutDesc): - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenDestroyDropoutDescriptor(dropoutDesc) else: status = cudnnDestroyDropoutDescriptor(dropoutDesc) @@ -2088,7 +2088,7 @@ cpdef destroyDropoutDescriptor(size_t dropoutDesc): cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? 
-1: cdef size_t sizeInBytes - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenDropoutGetStatesSize( handle, &sizeInBytes) else: @@ -2109,7 +2109,7 @@ cpdef setDropoutDescriptor( cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 0: cdef size_t sizeInBytes - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenDropoutGetReserveSpaceSize( xDesc, &sizeInBytes) else: @@ -2154,7 +2154,7 @@ cpdef dropoutBackward( ############################################################################### cpdef size_t createCTCLossDescriptor() except? 0: cdef CTCLossDescriptor desc - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenCreateCTCLossDescriptor(&desc) else: status = cudnnCreateCTCLossDescriptor(&desc) @@ -2162,7 +2162,7 @@ cpdef size_t createCTCLossDescriptor() except? 0: return desc cpdef destroyCTCLossDescriptor(size_t ctcLossDesc): - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenDestroyCTCLossDescriptor(ctcLossDesc) else: status = cudnnDestroyCTCLossDescriptor(ctcLossDesc) @@ -2185,7 +2185,7 @@ cpdef size_t getCTCLossWorkspaceSize( size_t labels, size_t labelLengths, size_t inputLengths, int algo, size_t ctcLossDesc) except? 0: cdef size_t sizeInBytes - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenGetCTCLossWorkspaceSize( handle, probsDesc, gradientsDesc, @@ -2206,7 +2206,7 @@ cpdef CTCLoss( size_t costs, size_t gradientsDesc, size_t gradients, int algo, size_t ctcLossDesc, size_t workspace, size_t workSpaceSizeInBytes): - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenCTCLoss( handle, probsDesc, probs, labels, labelLengths, inputLengths, @@ -2229,7 +2229,7 @@ cpdef CTCLoss( cpdef size_t createRNNDescriptor() except? 
0: cdef RNNDescriptor desc - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenCreateRNNDescriptor(&desc) else: status = cudnnCreateRNNDescriptor(&desc) @@ -2238,7 +2238,7 @@ cpdef size_t createRNNDescriptor() except? 0: cpdef destroyRNNDescriptor(size_t rnnDesc): - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenDestroyRNNDescriptor(rnnDesc) else: status = cudnnDestroyRNNDescriptor(rnnDesc) @@ -2343,7 +2343,7 @@ cpdef getRNNDataDescriptor( cpdef getRNNWorkspaceSize( intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): cdef size_t sizeInBytes - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenGetRNNWorkspaceSize( handle, rnnDesc, seqLength, xDesc, &sizeInBytes) @@ -2358,7 +2358,7 @@ cpdef getRNNWorkspaceSize( cpdef getRNNTrainingReserveSize( intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): cdef size_t sizeInBytes - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenGetRNNTrainingReserveSize( handle, rnnDesc, seqLength, xDesc, &sizeInBytes) @@ -2373,7 +2373,7 @@ cpdef getRNNTrainingReserveSize( cpdef getRNNParamsSize( intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType): cdef size_t sizeInBytes - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenGetRNNParamsSize( handle, rnnDesc, xDesc, &sizeInBytes, dataType) @@ -2414,7 +2414,7 @@ cpdef RNNForwardInference( size_t cy, size_t workspace, size_t workSpaceSizeInBytes): _setStream(handle) with nogil: - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenRNNForwardInference( handle, rnnDesc, seqLength, xDesc, x, @@ -2448,7 +2448,7 @@ cpdef RNNForwardTraining( size_t reserveSpaceSizeInBytes): _setStream(handle) with nogil: - if runtime.hip_environment: + if runtime._is_hip_environment: status = miopenRNNForwardTraining( handle, rnnDesc, seqLength, xDesc, x, From d81e48c10a26abc0d2bf301542e2679130dbb03b Mon Sep 17 00:00:00 2001 
From: bmedishe Date: Wed, 29 Nov 2023 05:19:36 +0000 Subject: [PATCH 12/26] update cudnn.pyx debug errors --- cupy_backends/cuda/libs/cudnn.pyx | 105 +++++++++++++++--------------- 1 file changed, 54 insertions(+), 51 deletions(-) diff --git a/cupy_backends/cuda/libs/cudnn.pyx b/cupy_backends/cuda/libs/cudnn.pyx index fb082641d36..84d10d5b874 100644 --- a/cupy_backends/cuda/libs/cudnn.pyx +++ b/cupy_backends/cuda/libs/cudnn.pyx @@ -9,7 +9,7 @@ from cupy_backends.cuda.api cimport driver from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module -from cupy_backends.cuda.libs.miopen import * +from cupy_backends.cuda.libs import miopen ############################################################################### # Extern ############################################################################### @@ -760,7 +760,7 @@ class CuDNNError(RuntimeError): def __init__(self, int status): self.status = status if runtime._is_hip_environment: - msg = miopenGetErrorString(status) + msg = miopen.miopenGetErrorString(status) else: msg = cudnnGetErrorString(status) super(CuDNNError, self).__init__( @@ -804,7 +804,7 @@ def get_build_version(): cpdef size_t getVersion() except? 0: if runtime._is_hip_environment: - return miopenGetVersion() + return miopen.miopenGetVersion() else: return cudnnGetVersion() @@ -827,10 +827,13 @@ cpdef queryRuntimeError(intptr_t handle, int mode): ############################################################################### cpdef intptr_t create() except? 0: - cdef Handle handle + IF CUPY_HIP_VERSION != 0: + cdef miopen.Handle handle + ELSE: + cdef Handle handle with nogil: if runtime._is_hip_environment: - status = miopenCreate(&handle) + status = miopen.miopenCreate(&handle) else: status = cudnnCreate(&handle) check_status(status) @@ -840,7 +843,7 @@ cpdef intptr_t create() except? 
0: cpdef destroy(intptr_t handle): with nogil: if runtime._is_hip_environment: - status = miopenDestroy(handle) + status = miopen.miopenDestroy(handle) else: status = cudnnDestroy(handle) check_status(status) @@ -854,7 +857,7 @@ cpdef setStream(intptr_t handle, size_t stream): 'calling cuDNN API during stream capture is currently ' 'unsupported') if runtime._is_hip_environment: - status = miopenSetStream(handle, stream) + status = miopen.miopenSetStream(handle, stream) else: status = cudnnSetStream(handle, stream) check_status(status) @@ -865,7 +868,7 @@ cpdef size_t getStream(intptr_t handle) except? 0: if runtime._is_hip_environment: status = cudnnGetStream(handle, &stream) else: - status = miopenGetStream(handle, &stream) + status = miopen.miopenGetStream(handle, &stream) check_status(status) return stream @@ -881,7 +884,7 @@ cdef _setStream(intptr_t handle): cpdef size_t createTensorDescriptor() except? 0: cdef TensorDescriptor descriptor if runtime._is_hip_environment: - status = miopenCreateTensorDescriptor(&descriptor) + status = miopen.miopenCreateTensorDescriptor(&descriptor) else: status = cudnnCreateTensorDescriptor(&descriptor) check_status(status) @@ -925,7 +928,7 @@ cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, cpdef destroyTensorDescriptor(size_t tensorDesc): if runtime._is_hip_environment: - status = miopenDestroyTensorDescriptor(tensorDesc) + status = miopen.miopenDestroyTensorDescriptor(tensorDesc) else: status = cudnnDestroyTensorDescriptor(tensorDesc) check_status(status) @@ -982,7 +985,7 @@ cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, _setStream(handle) with nogil: if runtime._is_hip_environment: - status = miopenOpTensor( + status = miopen.miopenOpTensor( handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, beta, @@ -1003,7 +1006,7 @@ cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, cpdef size_t createReduceTensorDescriptor() except? 
0: cdef ReduceTensorDescriptor reduceTensorDesc if runtime._is_hip_environment: - status = miopenCreateReduceTensorDescriptor(&reduceTensorDesc) + status = miopen.miopenCreateReduceTensorDescriptor(&reduceTensorDesc) else: status = cudnnCreateReduceTensorDescriptor(&reduceTensorDesc) check_status(status) @@ -1014,7 +1017,7 @@ cpdef setReduceTensorDescriptor( int reduceTensorNanOpt, int reduceTensorIndices, int reduceTensorIndicesType): if runtime._is_hip_environment: - status = miopenSetReduceTensorDescriptor( + status = miopen.miopenSetReduceTensorDescriptor( reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, @@ -1037,7 +1040,7 @@ cpdef getReduceTensorDescriptor(size_t reduceTensorDesc): cdef ReduceTensorIndices redIndices cdef IndicesType redIndicesType if runtime._is_hip_environment: - status = miopenGetReduceTensorDescriptor( + status = miopen.miopenGetReduceTensorDescriptor( reduceTensorDesc, &redOp, &redCompType, &redNanOpt, &redIndices, &redIndicesType) else: @@ -1050,7 +1053,7 @@ cpdef getReduceTensorDescriptor(size_t reduceTensorDesc): cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc): if runtime._is_hip_environment: - status = miopenDestroyReduceTensorDescriptor( + status = miopen.miopenDestroyReduceTensorDescriptor( reduceTensorDesc) else: status = cudnnDestroyReduceTensorDescriptor( @@ -1062,7 +1065,7 @@ cpdef size_t getReductionIndicesSize(intptr_t handle, size_t reduceTensorDesc, size_t aDesc, size_t cDesc) except? 0: cdef size_t sizeInBytes if runtime._is_hip_environment: - status = miopenGetReductionIndicesSize( + status = miopen.miopenGetReductionIndicesSize( handle, reduceTensorDesc, aDesc, cDesc, &sizeInBytes) else: @@ -1078,7 +1081,7 @@ cpdef size_t getReductionWorkspaceSize(intptr_t handle, size_t aDesc, size_t cDesc) except? 
0: cdef size_t sizeInBytes if runtime._is_hip_environment: - status = miopenGetReductionWorkspaceSize( + status = miopen.miopenGetReductionWorkspaceSize( handle, reduceTensorDesc, aDesc, cDesc, &sizeInBytes) @@ -1098,7 +1101,7 @@ cpdef reduceTensor(intptr_t handle, size_t reduceTensorDesc, size_t indices, _setStream(handle) with nogil: if runtime._is_hip_environment: - status = miopenReduceTensor( + status = miopen.miopenReduceTensor( handle, reduceTensorDesc, indices, indicesSizeInBytes, workspace, workspaceSizeInBytes, alpha, aDesc, @@ -1116,7 +1119,7 @@ cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr): _setStream(handle) with nogil: if runtime._is_hip_environment: - status = miopenSetTensor( + status = miopen.miopenSetTensor( handle, yDesc, y, valuePtr) else: @@ -1130,7 +1133,7 @@ cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha): _setStream(handle) with nogil: if runtime._is_hip_environment: - status = miopenScaleTensor( + status = miopen.miopenScaleTensor( handle, yDesc, y, alpha) else: @@ -1195,7 +1198,7 @@ cpdef destroyFilterDescriptor(size_t filterDesc): cpdef size_t createConvolutionDescriptor() except? 0: cdef ConvolutionDescriptor desc if runtime._is_hip_environment: - status = miopenCreateConvolutionDescriptor(&desc) + status = miopen.miopenCreateConvolutionDescriptor(&desc) else: status = cudnnCreateConvolutionDescriptor(&desc) check_status(status) @@ -1217,7 +1220,7 @@ cpdef size_t getConvolutionMathType(size_t convDesc) except? 0: cpdef setConvolutionGroupCount(size_t convDesc, int groupCount): if runtime._is_hip_environment: - status = miopenSetConvolutionGroupCount( + status = miopen.miopenSetConvolutionGroupCount( convDesc, groupCount) else: status = cudnnSetConvolutionGroupCount( @@ -1228,7 +1231,7 @@ cpdef setConvolutionGroupCount(size_t convDesc, int groupCount): cpdef int getConvolutionGroupCount(size_t convDesc) except? 
-1: cdef int groupCount if runtime._is_hip_environment: - status = miopenGetConvolutionGroupCount( + status = miopen.miopenGetConvolutionGroupCount( convDesc, &groupCount) else: status = cudnnGetConvolutionGroupCount( @@ -1266,7 +1269,7 @@ cpdef setConvolutionNdDescriptor_v3( cpdef destroyConvolutionDescriptor(size_t convDesc): if runtime._is_hip_environment: - status = miopenDestroyConvolutionDescriptor( + status = miopen.miopenDestroyConvolutionDescriptor( convDesc) else: status = cudnnDestroyConvolutionDescriptor( @@ -1379,7 +1382,7 @@ cpdef convolutionForward( _setStream(handle) with nogil: if runtime._is_hip_environment: - status = miopenConvolutionForward(handle, alpha, + status = miopen.miopenConvolutionForward(handle, alpha, srcDesc, srcData, filterDesc, filterData, convDesc, algo, @@ -1402,7 +1405,7 @@ cpdef convolutionBackwardBias( _setStream(handle) with nogil: if runtime._is_hip_environment: - status = miopenConvolutionBackwardBias( + status = miopen.miopenConvolutionBackwardBias( handle, alpha, srcDesc, srcData, beta, destDesc, destData) @@ -1652,7 +1655,7 @@ cpdef convolutionBackwardData_v3( cpdef size_t createPoolingDescriptor() except? 
0: cdef PoolingDescriptor desc if runtime._is_hip_environment: - status = miopenCreatePoolingDescriptor(&desc) + status = miopen.miopenCreatePoolingDescriptor(&desc) else: status = cudnnCreatePoolingDescriptor(&desc) check_status(status) @@ -1682,7 +1685,7 @@ cpdef setPoolingNdDescriptor_v4( cpdef destroyPoolingDescriptor(size_t poolingDesc): if runtime._is_hip_environment: - status = miopenDestroyPoolingDescriptor(poolingDesc) + status = miopen.miopenDestroyPoolingDescriptor(poolingDesc) else: status = cudnnDestroyPoolingDescriptor(poolingDesc) check_status(status) @@ -1724,7 +1727,7 @@ CUDNN_BN_MIN_EPSILON = _CUDNN_BN_MIN_EPSILON cpdef deriveBNTensorDescriptor( size_t derivedBnDesc, size_t xDesc, int mode): if runtime._is_hip_environment: - status = miopenDeriveBNTensorDescriptor( + status = miopen.miopenDeriveBNTensorDescriptor( derivedBnDesc, xDesc, mode) else: @@ -1745,7 +1748,7 @@ cpdef batchNormalizationForwardTraining( _setStream(handle) with nogil: if runtime._is_hip_environment: - status = miopenBatchNormalizationForwardTraining( + status = miopen.miopenBatchNormalizationForwardTraining( handle, mode, alpha, beta, xDesc, x, yDesc, y, @@ -1775,7 +1778,7 @@ cpdef batchNormalizationForwardInference( _setStream(handle) with nogil: if runtime._is_hip_environment: - status = miopenBatchNormalizationForwardInference( + status = miopen.miopenBatchNormalizationForwardInference( handle, mode, alpha, beta, xDesc, x, yDesc, y, @@ -1805,7 +1808,7 @@ cpdef batchNormalizationBackward( _setStream(handle) with nogil: if runtime._is_hip_environment: - status = miopenBatchNormalizationBackward( + status = miopen.miopenBatchNormalizationBackward( handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, @@ -1972,7 +1975,7 @@ cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( cpdef size_t createActivationDescriptor() except? 
0: cdef ActivationDescriptor activationDesc if runtime._is_hip_environment: - status = miopenCreateActivationDescriptor(&activationDesc) + status = miopen.miopenCreateActivationDescriptor(&activationDesc) else: status = cudnnCreateActivationDescriptor(&activationDesc) check_status(status) @@ -1989,7 +1992,7 @@ cpdef setActivationDescriptor( cpdef destroyActivationDescriptor(size_t activationDesc): if runtime._is_hip_environment: - status = miopenDestroyActivationDescriptor( + status = miopen.miopenDestroyActivationDescriptor( activationDesc) else: status = cudnnDestroyActivationDescriptor( @@ -2003,7 +2006,7 @@ cpdef softmaxForward( _setStream(handle) with nogil: if runtime._is_hip_environment: - status = miopenSoftmaxForward( + status = miopen.miopenSoftmaxForward( handle, algorithm, mode, alpha, srcDesc, srcData, beta, dstDesc, dstData) @@ -2022,7 +2025,7 @@ cpdef softmaxBackward( _setStream(handle) with nogil: if runtime._is_hip_environment: - status = miopenSoftmaxBackward( + status = miopen.miopenSoftmaxBackward( handle, algorithm, mode, alpha, srcDesc, srcData, srcDiffDesc, srcDiffData, beta, @@ -2071,7 +2074,7 @@ cpdef activationBackward_v4( cpdef size_t createDropoutDescriptor() except? 0: cdef DropoutDescriptor desc if runtime._is_hip_environment: - status = miopenCreateDropoutDescriptor(&desc) + status = miopen.miopenCreateDropoutDescriptor(&desc) else: status = cudnnCreateDropoutDescriptor(&desc) check_status(status) @@ -2080,7 +2083,7 @@ cpdef size_t createDropoutDescriptor() except? 0: cpdef destroyDropoutDescriptor(size_t dropoutDesc): if runtime._is_hip_environment: - status = miopenDestroyDropoutDescriptor(dropoutDesc) + status = miopen.miopenDestroyDropoutDescriptor(dropoutDesc) else: status = cudnnDestroyDropoutDescriptor(dropoutDesc) check_status(status) @@ -2089,7 +2092,7 @@ cpdef destroyDropoutDescriptor(size_t dropoutDesc): cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? 
-1: cdef size_t sizeInBytes if runtime._is_hip_environment: - status = miopenDropoutGetStatesSize( + status = miopen.miopenDropoutGetStatesSize( handle, &sizeInBytes) else: status = cudnnDropoutGetStatesSize( @@ -2110,7 +2113,7 @@ cpdef setDropoutDescriptor( cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 0: cdef size_t sizeInBytes if runtime._is_hip_environment: - status = miopenDropoutGetReserveSpaceSize( + status = miopen.miopenDropoutGetReserveSpaceSize( xDesc, &sizeInBytes) else: status = cudnnDropoutGetReserveSpaceSize( @@ -2155,7 +2158,7 @@ cpdef dropoutBackward( cpdef size_t createCTCLossDescriptor() except? 0: cdef CTCLossDescriptor desc if runtime._is_hip_environment: - status = miopenCreateCTCLossDescriptor(&desc) + status = miopen.miopenCreateCTCLossDescriptor(&desc) else: status = cudnnCreateCTCLossDescriptor(&desc) check_status(status) @@ -2163,7 +2166,7 @@ cpdef size_t createCTCLossDescriptor() except? 0: cpdef destroyCTCLossDescriptor(size_t ctcLossDesc): if runtime._is_hip_environment: - status = miopenDestroyCTCLossDescriptor(ctcLossDesc) + status = miopen.miopenDestroyCTCLossDescriptor(ctcLossDesc) else: status = cudnnDestroyCTCLossDescriptor(ctcLossDesc) check_status(status) @@ -2186,7 +2189,7 @@ cpdef size_t getCTCLossWorkspaceSize( int algo, size_t ctcLossDesc) except? 0: cdef size_t sizeInBytes if runtime._is_hip_environment: - status = miopenGetCTCLossWorkspaceSize( + status = miopen.miopenGetCTCLossWorkspaceSize( handle, probsDesc, gradientsDesc, labels, labelLengths, inputLengths, @@ -2207,7 +2210,7 @@ cpdef CTCLoss( int algo, size_t ctcLossDesc, size_t workspace, size_t workSpaceSizeInBytes): if runtime._is_hip_environment: - status = miopenCTCLoss( + status = miopen.miopenCTCLoss( handle, probsDesc, probs, labels, labelLengths, inputLengths, costs, gradientsDesc, gradients, @@ -2230,7 +2233,7 @@ cpdef CTCLoss( cpdef size_t createRNNDescriptor() except? 
0: cdef RNNDescriptor desc if runtime._is_hip_environment: - status = miopenCreateRNNDescriptor(&desc) + status = miopen.miopenCreateRNNDescriptor(&desc) else: status = cudnnCreateRNNDescriptor(&desc) check_status(status) @@ -2239,7 +2242,7 @@ cpdef size_t createRNNDescriptor() except? 0: cpdef destroyRNNDescriptor(size_t rnnDesc): if runtime._is_hip_environment: - status = miopenDestroyRNNDescriptor(rnnDesc) + status = miopen.miopenDestroyRNNDescriptor(rnnDesc) else: status = cudnnDestroyRNNDescriptor(rnnDesc) check_status(status) @@ -2344,7 +2347,7 @@ cpdef getRNNWorkspaceSize( intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): cdef size_t sizeInBytes if runtime._is_hip_environment: - status = miopenGetRNNWorkspaceSize( + status = miopen.miopenGetRNNWorkspaceSize( handle, rnnDesc, seqLength, xDesc, &sizeInBytes) else: @@ -2359,7 +2362,7 @@ cpdef getRNNTrainingReserveSize( intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): cdef size_t sizeInBytes if runtime._is_hip_environment: - status = miopenGetRNNTrainingReserveSize( + status = miopen.miopenGetRNNTrainingReserveSize( handle, rnnDesc, seqLength, xDesc, &sizeInBytes) else: @@ -2374,7 +2377,7 @@ cpdef getRNNParamsSize( intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType): cdef size_t sizeInBytes if runtime._is_hip_environment: - status = miopenGetRNNParamsSize( + status = miopen.miopenGetRNNParamsSize( handle, rnnDesc, xDesc, &sizeInBytes, dataType) else: @@ -2415,7 +2418,7 @@ cpdef RNNForwardInference( _setStream(handle) with nogil: if runtime._is_hip_environment: - status = miopenRNNForwardInference( + status = miopen.miopenRNNForwardInference( handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, @@ -2449,7 +2452,7 @@ cpdef RNNForwardTraining( _setStream(handle) with nogil: if runtime._is_hip_environment: - status = miopenRNNForwardTraining( + status = miopen.miopenRNNForwardTraining( handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, From ee228f33c7f65608ffbdb4faa410198e63e33df0 Mon 
Sep 17 00:00:00 2001 From: bmedishe Date: Wed, 29 Nov 2023 19:41:06 +0000 Subject: [PATCH 13/26] add miopen.pxd --- cupy_backends/cuda/libs/miopen.pxd | 1019 ++++++++++++++++++++++++++++ 1 file changed, 1019 insertions(+) create mode 100644 cupy_backends/cuda/libs/miopen.pxd diff --git a/cupy_backends/cuda/libs/miopen.pxd b/cupy_backends/cuda/libs/miopen.pxd new file mode 100644 index 00000000000..8fcd754470f --- /dev/null +++ b/cupy_backends/cuda/libs/miopen.pxd @@ -0,0 +1,1019 @@ +from libc.stdint cimport intptr_t + + +############################################################################### +# Enum +############################################################################### +IF CUPY_HIP_VERSION != 0: + cpdef enum: + CUDNN_DATA_FLOAT = 200 + CUDNN_DATA_DOUBLE = 201 + CUDNN_DATA_HALF = 202 + + CUDNN_DEFAULT_MATH = 210 + CUDNN_TENSOR_OP_MATH = 211 + + CUDNN_NOT_PROPAGATE_NAN = 220 + CUDNN_PROPAGATE_NAN = 221 + + CUDNN_NON_DETERMINISTIC = 230 + CUDNN_DETERMINISTIC = 231 + + CUDNN_TENSOR_NCHW = 240 + CUDNN_TENSOR_NHWC = 241 + + CUDNN_OP_TENSOR_ADD = 250 + CUDNN_OP_TENSOR_MUL = 251 + CUDNN_OP_TENSOR_MIN = 252 + CUDNN_OP_TENSOR_MAX = 253 + CUDNN_OP_TENSOR_SQRT = 254 + CUDNN_OP_TENSOR_NOT = 255 + + CUDNN_REDUCE_TENSOR_ADD = 260 + CUDNN_REDUCE_TENSOR_MUL = 261 + CUDNN_REDUCE_TENSOR_MIN = 262 + CUDNN_REDUCE_TENSOR_MAX = 263 + CUDNN_REDUCE_TENSOR_AMAX = 264 + CUDNN_REDUCE_TENSOR_AVG = 265 + CUDNN_REDUCE_TENSOR_NORM1 = 266 + CUDNN_REDUCE_TENSOR_NORM2 = 267 + CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 268 + + CUDNN_REDUCE_TENSOR_NO_INDICES = 270 + CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 271 + + CUDNN_32BIT_INDICES = 280 + CUDNN_64BIT_INDICES = 281 + CUDNN_16BIT_INDICES = 282 + CUDNN_8BIT_INDICES = 283 + + CUDNN_ADD_IMAGE = 290 + CUDNN_ADD_SAME_HW = 290 + CUDNN_ADD_FEATURE_MAP = 291 + CUDNN_ADD_SAME_CHW = 291 + CUDNN_ADD_SAME_C = 292 + CUDNN_ADD_FULL_TENSOR = 293 + + CUDNN_CONVOLUTION = 300 + CUDNN_CROSS_CORRELATION = 301 + + CUDNN_CONVOLUTION_FWD_NO_WORKSPACE = 310 + 
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST = 311 + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT = 312 + + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 320 + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 321 + CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 322 + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 323 + CUDNN_CONVOLUTION_FWD_ALGO_FFT = 324 + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 325 + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 326 + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 327 + + CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE = 330 + CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST = 331 + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT = 332 + + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 340 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 341 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 342 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 343 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 344 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 345 + + CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE = 350 + CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST = 351 + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = 352 + + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 360 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 361 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 362 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 363 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 364 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 365 + + CUDNN_SOFTMAX_FAST = 370 + CUDNN_SOFTMAX_ACCURATE = 371 + CUDNN_SOFTMAX_LOG = 372 + + CUDNN_SOFTMAX_MODE_INSTANCE = 380 + CUDNN_SOFTMAX_MODE_CHANNEL = 381 + + CUDNN_POOLING_MAX = 390 + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 391 + CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 392 + CUDNN_POOLING_MAX_DETERMINISTIC = 393 + + CUDNN_ACTIVATION_SIGMOID = 400 + CUDNN_ACTIVATION_RELU = 401 + CUDNN_ACTIVATION_TANH = 402 + CUDNN_ACTIVATION_CLIPPED_RELU = 403 + CUDNN_ACTIVATION_ELU = 404 + CUDNN_ACTIVATION_IDENTITY = 405 + + CUDNN_LRN_CROSS_CHANNEL_DIM1 = 410 + + CUDNN_DIVNORM_PRECOMPUTED_MEANS = 420 + + 
CUDNN_BATCHNORM_PER_ACTIVATION = 430 + CUDNN_BATCHNORM_SPATIAL = 431 + CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 432 + + CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 440 + CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 441 + + CUDNN_BATCHNORM_OPS_BN = 450 + CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 451 + CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 452 + + CUDNN_RNN_RELU = 460 + CUDNN_RNN_TANH = 461 + CUDNN_LSTM = 462 + CUDNN_GRU = 463 + + CUDNN_UNIDIRECTIONAL = 470 + CUDNN_BIDIRECTIONAL = 471 + + CUDNN_RNN_ALGO_STANDARD = 480 + CUDNN_RNN_ALGO_PERSIST_STATIC = 481 + CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 482 + + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 490 + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 491 + CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 492 + + CUDNN_RNN_PADDED_IO_DISABLED = 500 + CUDNN_RNN_PADDED_IO_ENABLED = 501 + + CUDNN_LINEAR_INPUT = 510 + CUDNN_SKIP_INPUT = 511 + + CUDNN_SAMPLER_BILINEAR = 520 + + CUDNN_STATUS_SUCCESS = 530 + CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 541 + CUDNN_STATUS_RUNTIME_IN_PROGRESS = 542 + CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 543 + + CUDNN_ERRQUERY_RAWCODE = 550 + CUDNN_ERRQUERY_NONBLOCKING = 551 + CUDNN_ERRQUERY_BLOCKING = 552 + + # cudnnFusedOps_t + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 560 + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 561 + CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 562 + CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 563 + CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 564 + CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 565 + CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 566 + + # cudnnFusedOpsConstParamLabel_t + CUDNN_PARAM_XDESC = 570 + CUDNN_PARAM_XDATA_PLACEHOLDER = 571 + CUDNN_PARAM_BN_MODE = 572 + CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 573 + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 574 + CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 575 + CUDNN_PARAM_ACTIVATION_DESC = 576 + CUDNN_PARAM_CONV_DESC = 577 + CUDNN_PARAM_WDESC = 578 + CUDNN_PARAM_WDATA_PLACEHOLDER = 579 + CUDNN_PARAM_DWDESC = 580 + CUDNN_PARAM_DWDATA_PLACEHOLDER = 581 + 
CUDNN_PARAM_YDESC = 582 + CUDNN_PARAM_YDATA_PLACEHOLDER = 583 + CUDNN_PARAM_DYDESC = 584 + CUDNN_PARAM_DYDATA_PLACEHOLDER = 585 + CUDNN_PARAM_YSTATS_DESC = 586 + CUDNN_PARAM_YSUM_PLACEHOLDER = 587 + CUDNN_PARAM_YSQSUM_PLACEHOLDER = 588 + CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 589 + CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 590 + CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 591 + CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 592 + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 593 + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 594 + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 595 + CUDNN_PARAM_ZDESC = 596 + CUDNN_PARAM_ZDATA_PLACEHOLDER = 597 + CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 598 + CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 599 + CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 600 + CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 601 + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 602 + CUDNN_PARAM_DXDESC = 603 + CUDNN_PARAM_DXDATA_PLACEHOLDER = 604 + CUDNN_PARAM_DZDESC = 605 + CUDNN_PARAM_DZDATA_PLACEHOLDER = 606 + CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 607 + CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 608 + + # cudnnFusedOpsPointerPlaceHolder_t + CUDNN_PTR_NULL = 610 + CUDNN_PTR_ELEM_ALIGNED = 611 + CUDNN_PTR_16B_ALIGNED = 612 + + # cudnnFusedOpsVariantParamLabel_t + CUDNN_PTR_XDATA = 620 + CUDNN_PTR_BN_EQSCALE = 621 + CUDNN_PTR_BN_EQBIAS = 622 + CUDNN_PTR_WDATA = 623 + CUDNN_PTR_DWDATA = 624 + CUDNN_PTR_YDATA = 625 + CUDNN_PTR_DYDATA = 626 + CUDNN_PTR_YSUM = 627 + CUDNN_PTR_YSQSUM = 628 + CUDNN_PTR_WORKSPACE = 629 + CUDNN_PTR_BN_SCALE = 630 + CUDNN_PTR_BN_BIAS = 631 + CUDNN_PTR_BN_SAVED_MEAN = 632 + CUDNN_PTR_BN_SAVED_INVSTD = 633 + CUDNN_PTR_BN_RUNNING_MEAN = 634 + CUDNN_PTR_BN_RUNNING_VAR = 635 + CUDNN_PTR_ZDATA = 636 + CUDNN_PTR_BN_Z_EQSCALE = 637 + CUDNN_PTR_BN_Z_EQBIAS = 638 + CUDNN_PTR_ACTIVATION_BITMASK = 639 + CUDNN_PTR_DXDATA = 640 + CUDNN_PTR_DZDATA = 641 + CUDNN_PTR_BN_DSCALE = 642 + CUDNN_PTR_BN_DBIAS = 643 + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 720 + CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 721 + 
CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 722 + CUDNN_SCALAR_DOUBLE_BN_EPSILON = 723 +ELSE: + cpdef enum: + CUDNN_DATA_FLOAT = 0 + CUDNN_DATA_DOUBLE = 1 + CUDNN_DATA_HALF = 2 + + CUDNN_DEFAULT_MATH = 0 + CUDNN_TENSOR_OP_MATH = 1 + + CUDNN_NOT_PROPAGATE_NAN = 0 + CUDNN_PROPAGATE_NAN = 1 + + CUDNN_NON_DETERMINISTIC = 0 + CUDNN_DETERMINISTIC = 1 + + CUDNN_TENSOR_NCHW = 0 + CUDNN_TENSOR_NHWC = 1 + + CUDNN_OP_TENSOR_ADD = 0 + CUDNN_OP_TENSOR_MUL = 1 + CUDNN_OP_TENSOR_MIN = 2 + CUDNN_OP_TENSOR_MAX = 3 + CUDNN_OP_TENSOR_SQRT = 4 + CUDNN_OP_TENSOR_NOT = 5 + + CUDNN_REDUCE_TENSOR_ADD = 0 + CUDNN_REDUCE_TENSOR_MUL = 1 + CUDNN_REDUCE_TENSOR_MIN = 2 + CUDNN_REDUCE_TENSOR_MAX = 3 + CUDNN_REDUCE_TENSOR_AMAX = 4 + CUDNN_REDUCE_TENSOR_AVG = 5 + CUDNN_REDUCE_TENSOR_NORM1 = 6 + CUDNN_REDUCE_TENSOR_NORM2 = 7 + CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8 + + CUDNN_REDUCE_TENSOR_NO_INDICES = 0 + CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1 + + CUDNN_32BIT_INDICES = 0 + CUDNN_64BIT_INDICES = 1 + CUDNN_16BIT_INDICES = 2 + CUDNN_8BIT_INDICES = 3 + + CUDNN_ADD_IMAGE = 0 + CUDNN_ADD_SAME_HW = 0 + CUDNN_ADD_FEATURE_MAP = 1 + CUDNN_ADD_SAME_CHW = 1 + CUDNN_ADD_SAME_C = 2 + CUDNN_ADD_FULL_TENSOR = 3 + + CUDNN_CONVOLUTION = 0 + CUDNN_CROSS_CORRELATION = 1 + + CUDNN_CONVOLUTION_FWD_NO_WORKSPACE = 0 + CUDNN_CONVOLUTION_FWD_PREFER_FASTEST = 1 + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT = 2 + + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0 + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1 + CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2 + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3 + CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4 + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5 + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6 + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7 + + CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE = 0 + CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST = 1 + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT = 2 + + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1 + 
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5 + + CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE = 0 + CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST = 1 + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = 2 + + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5 + + CUDNN_SOFTMAX_FAST = 0 + CUDNN_SOFTMAX_ACCURATE = 1 + CUDNN_SOFTMAX_LOG = 2 + + CUDNN_SOFTMAX_MODE_INSTANCE = 0 + CUDNN_SOFTMAX_MODE_CHANNEL = 1 + + CUDNN_POOLING_MAX = 0 + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1 + CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2 + CUDNN_POOLING_MAX_DETERMINISTIC = 3 + + CUDNN_ACTIVATION_SIGMOID = 0 + CUDNN_ACTIVATION_RELU = 1 + CUDNN_ACTIVATION_TANH = 2 + CUDNN_ACTIVATION_CLIPPED_RELU = 3 + CUDNN_ACTIVATION_ELU = 4 + CUDNN_ACTIVATION_IDENTITY = 5 + + CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0 + + CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0 + + CUDNN_BATCHNORM_PER_ACTIVATION = 0 + CUDNN_BATCHNORM_SPATIAL = 1 + CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2 + + CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0 + CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 + + CUDNN_BATCHNORM_OPS_BN = 0 + CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1 + CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2 + + CUDNN_RNN_RELU = 0 + CUDNN_RNN_TANH = 1 + CUDNN_LSTM = 2 + CUDNN_GRU = 3 + + CUDNN_UNIDIRECTIONAL = 0 + CUDNN_BIDIRECTIONAL = 1 + + CUDNN_RNN_ALGO_STANDARD = 0 + CUDNN_RNN_ALGO_PERSIST_STATIC = 1 + CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2 + + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0 + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1 + CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2 + + CUDNN_RNN_PADDED_IO_DISABLED = 0 + CUDNN_RNN_PADDED_IO_ENABLED = 1 + + CUDNN_LINEAR_INPUT = 0 + 
CUDNN_SKIP_INPUT = 1 + + CUDNN_SAMPLER_BILINEAR = 0 + + CUDNN_STATUS_SUCCESS = 0 + CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11 + CUDNN_STATUS_RUNTIME_IN_PROGRESS = 12 + CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 13 + + CUDNN_ERRQUERY_RAWCODE = 0 + CUDNN_ERRQUERY_NONBLOCKING = 1 + CUDNN_ERRQUERY_BLOCKING = 2 + + # cudnnFusedOps_t + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0 + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1 + CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2 + CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3 + CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4 + CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5 + CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6 + + # cudnnFusedOpsConstParamLabel_t + CUDNN_PARAM_XDESC = 0 + CUDNN_PARAM_XDATA_PLACEHOLDER = 1 + CUDNN_PARAM_BN_MODE = 2 + CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3 + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4 + CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5 + CUDNN_PARAM_ACTIVATION_DESC = 6 + CUDNN_PARAM_CONV_DESC = 7 + CUDNN_PARAM_WDESC = 8 + CUDNN_PARAM_WDATA_PLACEHOLDER = 9 + CUDNN_PARAM_DWDESC = 10 + CUDNN_PARAM_DWDATA_PLACEHOLDER = 11 + CUDNN_PARAM_YDESC = 12 + CUDNN_PARAM_YDATA_PLACEHOLDER = 13 + CUDNN_PARAM_DYDESC = 14 + CUDNN_PARAM_DYDATA_PLACEHOLDER = 15 + CUDNN_PARAM_YSTATS_DESC = 16 + CUDNN_PARAM_YSUM_PLACEHOLDER = 17 + CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18 + CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19 + CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20 + CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21 + CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22 + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23 + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24 + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25 + CUDNN_PARAM_ZDESC = 26 + CUDNN_PARAM_ZDATA_PLACEHOLDER = 27 + CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28 + CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29 + CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30 + CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31 + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32 + CUDNN_PARAM_DXDESC = 33 + CUDNN_PARAM_DXDATA_PLACEHOLDER = 34 + 
CUDNN_PARAM_DZDESC = 35 + CUDNN_PARAM_DZDATA_PLACEHOLDER = 36 + CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37 + CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38 + + # cudnnFusedOpsPointerPlaceHolder_t + CUDNN_PTR_NULL = 0 + CUDNN_PTR_ELEM_ALIGNED = 1 + CUDNN_PTR_16B_ALIGNED = 2 + + # cudnnFusedOpsVariantParamLabel_t + CUDNN_PTR_XDATA = 0 + CUDNN_PTR_BN_EQSCALE = 1 + CUDNN_PTR_BN_EQBIAS = 2 + CUDNN_PTR_WDATA = 3 + CUDNN_PTR_DWDATA = 4 + CUDNN_PTR_YDATA = 5 + CUDNN_PTR_DYDATA = 6 + CUDNN_PTR_YSUM = 7 + CUDNN_PTR_YSQSUM = 8 + CUDNN_PTR_WORKSPACE = 9 + CUDNN_PTR_BN_SCALE = 10 + CUDNN_PTR_BN_BIAS = 11 + CUDNN_PTR_BN_SAVED_MEAN = 12 + CUDNN_PTR_BN_SAVED_INVSTD = 13 + CUDNN_PTR_BN_RUNNING_MEAN = 14 + CUDNN_PTR_BN_RUNNING_VAR = 15 + CUDNN_PTR_ZDATA = 16 + CUDNN_PTR_BN_Z_EQSCALE = 17 + CUDNN_PTR_BN_Z_EQBIAS = 18 + CUDNN_PTR_ACTIVATION_BITMASK = 19 + CUDNN_PTR_DXDATA = 20 + CUDNN_PTR_DZDATA = 21 + CUDNN_PTR_BN_DSCALE = 22 + CUDNN_PTR_BN_DBIAS = 23 + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100 + CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101 + CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102 + CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103 + + + +############################################################################### +# Class +############################################################################### + +cdef class CuDNNAlgoPerf: + cdef: + int algo + int status + float time + size_t memory + int determinism + int mathType + +IF CUPY_HIP_VERSION == 0: + ############################################################################### + # Version + ############################################################################### + + cpdef size_t getVersion() except? 
0 + + ############################################################################### + # Runtime error checking + ############################################################################### + cpdef queryRuntimeError(intptr_t handle, int mode) + + ############################################################################### + # Initialization and CUDA cooperation + ############################################################################### + + cpdef intptr_t create() except? 0 + cpdef destroy(intptr_t handle) + cpdef setStream(intptr_t handle, size_t stream) + cpdef size_t getStream(intptr_t handle) except? 0 + + + ############################################################################### + # Tensor manipulation + ############################################################################### + + cpdef size_t createTensorDescriptor() except? 0 + cpdef setTensor4dDescriptor(size_t tensorDesc, int format, int dataType, + int n, int c, int h, int w) + cpdef setTensor4dDescriptorEx(size_t tensorDesc, int dataType, + int n, int c, int h, int w, int nStride, + int cStride, int hStride, int wStride) + cpdef tuple getTensor4dDescriptor(size_t tensorDesc) + cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, + size_t dimA, size_t strideA) + cpdef destroyTensorDescriptor(size_t tensorDesc) + cpdef addTensor_v3(intptr_t handle, size_t alpha, size_t bDesc, + size_t b, size_t beta, size_t yDesc, size_t y) + + + ############################################################################### + # Tensor operations + ############################################################################### + + cpdef size_t createOpTensorDescriptor() except? 
0 + cpdef setOpTensorDescriptor(size_t opTensorDesc, int opTensorOp, + int opTensorCompType, int opTensorNanOpt) + cpdef getOpTensorDescriptor(size_t opTensorDesc) + cpdef destroyOpTensorDescriptor(size_t opTensorDesc) + cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, + size_t aDesc, size_t A, size_t alpha2, size_t bDesc, + size_t B, size_t beta, size_t cDesc, size_t C) + + + ############################################################################### + # Tensor reductions + ############################################################################### + + cpdef size_t createReduceTensorDescriptor() except? 0 + cpdef setReduceTensorDescriptor( + size_t reduceTensorDesc, int reduceTensorOp, + int reduceTensorCompType, int reduceTensorNanOpt, + int reduceTensorIndices, int reduceTensorIndicesType) + cpdef getReduceTensorDescriptor(size_t reduceTensorDesc) + cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc) + cpdef size_t getReductionIndicesSize( + intptr_t handle, size_t reduceTensorDesc, size_t aDesc, + size_t cDesc) except? 0 + cpdef size_t getReductionWorkspaceSize( + intptr_t handle, size_t reduceTensorDesc, size_t aDesc, + size_t cDesc) except? 0 + cpdef reduceTensor( + intptr_t handle, size_t reduceTensorDesc, size_t indices, + size_t indicesSizeInBytes, size_t workspace, + size_t workspaceSizeInBytes, size_t alpha, size_t aDesc, + size_t A, size_t beta, size_t cDesc, size_t C) + cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr) + cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha) + + + ############################################################################### + # Filter manipulation + ############################################################################### + + cpdef size_t createFilterDescriptor() except? 
0 + cpdef setFilter4dDescriptor_v4( + size_t filterDesc, int dataType, int format, int k, int c, int h, int w) + cpdef setFilterNdDescriptor_v4( + size_t filterDesc, int dataType, int format, int nbDims, size_t filterDimA) + cpdef getFilterNdDescriptor(size_t wDesc, int nbDimsRequested) + cpdef destroyFilterDescriptor(size_t filterDesc) + + + ############################################################################### + # Convolution + ############################################################################### + + cpdef size_t createConvolutionDescriptor() except? 0 + cpdef setConvolutionMathType( + size_t convDesc, size_t mathType) + cpdef size_t getConvolutionMathType(size_t convDesc) except? 0 + cpdef setConvolutionGroupCount( + size_t convDesc, int groupCount) + cpdef int getConvolutionGroupCount(size_t convDesc) except? -1 + cpdef setConvolution2dDescriptor_v4( + size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, + int dilation_w, int mode) + cpdef setConvolution2dDescriptor_v5( + size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, + int dilation_w, int mode, size_t computeType) + cpdef setConvolutionNdDescriptor_v3( + size_t convDesc, int arrayLength, size_t padA, size_t filterStrideA, + size_t dilationA, int mode, int dataType) + cpdef destroyConvolutionDescriptor(size_t convDesc) + cpdef findConvolutionForwardAlgorithm( + intptr_t handle, size_t xDesc, size_t wDesc, size_t convDesc, size_t yDesc, + int requestedAlgoCount) + cpdef list findConvolutionForwardAlgorithmEx( + intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, + size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef list findConvolutionForwardAlgorithmEx_v7( + intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, + size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef int 
getConvolutionForwardAlgorithm_v6( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int preference, size_t memoryLimitInbytes) except? -1 + cpdef list getConvolutionForwardAlgorithm_v7( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int requestedAlgoCount) + cpdef Py_ssize_t getConvolutionForwardWorkspaceSize( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int algo) except? -1 + cpdef convolutionForward( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t filterDesc, size_t filterData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t destDesc, size_t destData) + cpdef convolutionBackwardBias( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t beta, size_t destDesc, size_t destData) + cpdef findConvolutionBackwardFilterAlgorithm( + intptr_t handle, size_t xDesc, size_t dyDesc, size_t convDesc, + size_t dwDesc, int requestedAlgoCount) + cpdef list findConvolutionBackwardFilterAlgorithmEx( + intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, + size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef list findConvolutionBackwardFilterAlgorithmEx_v7( + intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, + size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef int getConvolutionBackwardFilterAlgorithm_v6( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t filterDesc, int preference, size_t memoryLimitInbytes) except? 
-1 + cpdef list getConvolutionBackwardFilterAlgorithm_v7( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int requestedAlgoCount) + cpdef Py_ssize_t getConvolutionBackwardFilterWorkspaceSize( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t filterDesc, int algo) except? -1 + cpdef convolutionBackwardFilter_v3( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t diffDesc, size_t diffData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t gradDesc, size_t gradData) + cpdef findConvolutionBackwardDataAlgorithm( + intptr_t handle, size_t wDesc, size_t dyDesc, size_t convDesc, + size_t dxDesc, int requestedAlgoCount) + cpdef list findConvolutionBackwardDataAlgorithmEx( + intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, + size_t convDesc, size_t dxDesc, size_t dx, + int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes) + cpdef list findConvolutionBackwardDataAlgorithmEx_v7( + intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, + size_t convDesc, size_t dxDesc, size_t dx, + int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes) + cpdef int getConvolutionBackwardDataAlgorithm_v6( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, size_t preference, + size_t memoryLimitInbytes) except? -1 + cpdef list getConvolutionBackwardDataAlgorithm_v7( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int requestedAlgoCount) + cpdef Py_ssize_t getConvolutionBackwardDataWorkspaceSize( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int algo) except? 
-1 + cpdef convolutionBackwardData_v3( + intptr_t handle, size_t alpha, size_t filterDesc, size_t filterData, + size_t diffDesc, size_t diffData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t gradDesc, size_t gradData) + + + ############################################################################### + # Pooling + ############################################################################### + + cpdef size_t createPoolingDescriptor() except? 0 + cpdef setPooling2dDescriptor_v4( + size_t poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, + int windowWidth, int verticalPadding, int horizontalPadding, + int verticalStride, int horizontalStride) + cpdef setPoolingNdDescriptor_v4( + size_t poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, + size_t windowDimA, size_t paddingA, size_t strideA) + cpdef destroyPoolingDescriptor(size_t poolingDesc) + cpdef poolingForward( + intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData) + cpdef poolingBackward( + intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, + size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, + size_t destDiffData) + + ############################################################################### + # Batch Normalization + ############################################################################### + + cpdef deriveBNTensorDescriptor( + size_t derivedBnDesc, size_t xDesc, int mode) + + cpdef batchNormalizationForwardTraining( + intptr_t handle, int mode, + size_t alpha, size_t beta, size_t xDesc, + size_t x, size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, size_t bnScale, + size_t bnBias, double exponentialAverageFactor, + size_t resultRunningMean, size_t resultRunningVariance, + double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance) + + cpdef 
batchNormalizationForwardInference( + intptr_t handle, int mode, + size_t alpha, size_t beta, size_t xDesc, + size_t x, size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, size_t bnScale, + size_t bnBias, size_t estimatedMean, size_t estimatedVariance, + double epsilon) + + cpdef batchNormalizationBackward( + intptr_t handle, int mode, + size_t alphaDataDiff, size_t betaDataDiff, + size_t alphaParamDiff, size_t betaParamDiff, + size_t xDesc, size_t x, size_t dyDesc, + size_t dy, size_t dxDesc, size_t dx, + size_t dBnScaleBiasDesc, size_t bnScale, + size_t dBnScaleResult, size_t dBnBiasResult, + double epsilon, size_t savedMean, size_t savedInvVariance) + + cpdef batchNormalizationForwardTrainingEx( + intptr_t handle, int mode, int bnOps, + size_t alpha, size_t beta, + size_t xDesc, size_t x, + size_t zDesc, size_t z, + size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, + size_t bnScale, size_t bnBias, + double exponentialAverageFactor, + size_t resultRunningMean, size_t resultRunningVariance, + double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance, + size_t activationDesc, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + cpdef size_t getBatchNormalizationForwardTrainingExWorkspaceSize( + intptr_t handle, int mode, int bnOps, + size_t xDesc, + size_t zDesc, + size_t yDesc, + size_t bnScaleBiasMeanVarDesc, + size_t activationDesc) except? 
0 + + cpdef batchNormalizationBackwardEx( + intptr_t handle, int mode, int bnops, + size_t alphaDataDiff, size_t betaDataDiff, + size_t alphaParamDiff, size_t betaParamDiff, + size_t xDesc, size_t x, + size_t yDesc, size_t y, + size_t dyDesc, size_t dy, + size_t dzDesc, size_t dz, + size_t dxDesc, size_t dx, + size_t dBnScaleBiasDesc, + size_t bnScaleData, size_t bnBiasData, + size_t dBnScaleData, size_t dBnBiasData, + double epsilon, + size_t savedMean, size_t savedInvVariance, + size_t activationDesc, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + cpdef size_t getBatchNormalizationBackwardExWorkspaceSize( + intptr_t handle, int mode, int bnOps, + size_t xDesc, + size_t yDesc, + size_t dyDesc, + size_t dzDesc, + size_t dxDesc, + size_t dBnScaleBiasDesc, + size_t activationDesc) except? 0 + + cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( + intptr_t handle, int mode, int bnOps, + size_t activationDesc, + size_t xDesc) except? 0 + + + ############################################################################### + # Activation + ############################################################################### + + cpdef size_t createActivationDescriptor() except? 
0 + cpdef setActivationDescriptor( + size_t activationDesc, int mode, int reluNanOpt, double reluCeiling) + cpdef destroyActivationDescriptor(size_t activationDesc) + cpdef softmaxForward( + intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData) + cpdef softmaxBackward( + intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, size_t beta, + size_t destDiffDesc, size_t destDiffData) + cpdef activationForward_v4( + intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData) + cpdef activationBackward_v4( + intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, + size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, + size_t destDiffData) + + + ############################################################################### + # Dropout + ############################################################################### + cpdef size_t createDropoutDescriptor() except? 0 + cpdef destroyDropoutDescriptor(size_t dropoutDesc) + cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? -1 + cpdef setDropoutDescriptor( + size_t dropoutDesc, intptr_t handle, float dropout, + size_t states, size_t stateSizeInBytes, unsigned long long seed) + cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 
0 + cpdef dropoutForward( + intptr_t handle, size_t dropoutDesc, + size_t srcDesc, size_t srcData, + size_t dstDesc, size_t dstData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + cpdef dropoutBackward( + intptr_t handle, size_t dropoutDesc, + size_t dyDesc, size_t dyData, + size_t dxtDesc, size_t dxData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + + ############################################################################### + # CTC + ############################################################################### + + cpdef size_t createCTCLossDescriptor() except? 0 + cpdef destroyCTCLossDescriptor(size_t ctcLossDesc) + cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType) + cpdef getCTCLossDescriptor(size_t ctcLossDesc) + cpdef size_t getCTCLossWorkspaceSize( + intptr_t handle, size_t probsDesc, size_t gradientsDesc, + size_t labels, size_t labelLengths, size_t inputLengths, + int algo, size_t ctcLossDesc) except? 0 + cpdef CTCLoss( + intptr_t handle, size_t probsDesc, + size_t probs, size_t labels, size_t labelLengths, size_t inputLengths, + size_t costs, size_t gradientsDesc, size_t gradients, int algo, + size_t ctcLossDesc, size_t workspace, size_t workSpaceSizeInBytes) + + + ############################################################################### + # RNN + ############################################################################### + + cpdef size_t createRNNDescriptor() except? 0 + cpdef destroyRNNDescriptor(size_t rnnDesc) + cpdef size_t createPersistentRNNPlan( + size_t rnnDesc, int minibatch, int dataType) except? 
0 + cpdef setPersistentRNNPlan(size_t rnnDesc, size_t plan) + cpdef destroyPersistentRNNPlan(size_t plan) + cpdef setRNNDescriptor_v5( + size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int dataType) + cpdef setRNNDescriptor_v6( + intptr_t handle, size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int algo, int dataType) + cpdef setRNNPaddingMode(size_t rnnDesc, int paddingMode) + cpdef getRNNPaddingMode(size_t rnnDesc) + cpdef size_t createRNNDataDescriptor() except? 0 + cpdef destroyRNNDataDescriptor(size_t RNNDataDesc) + cpdef setRNNDataDescriptor( + size_t RNNDataDesc, int dataType, size_t layout, + int maxSeqLength, int batchSize, int vectorSize, + size_t seqLengthArray, size_t paddingFill) + cpdef getRNNDataDescriptor( + size_t RNNDataDesc, size_t dataType, + size_t layout, size_t maxSeqLength, size_t batchSize, + size_t vectorSize, int arrayLengthRequested, size_t seqLengthArray, + size_t paddingFill) + cpdef getRNNWorkspaceSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc) + cpdef getRNNTrainingReserveSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc) + cpdef getRNNParamsSize( + intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType) + cpdef getRNNLinLayerMatrixParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t linLayerMatDesc, size_t linLayerMat) + cpdef getRNNLinLayerBiasParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t linLayerBiasDesc, + size_t linLayerBias) + cpdef RNNForwardInference( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, + size_t x, size_t hxDesc, size_t hx, size_t cxDesc, + size_t cx, size_t wDesc, size_t w, size_t yDesc, + size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t workspace, size_t 
workSpaceSizeInBytes) + cpdef RNNForwardTraining( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t wDesc, size_t w, size_t yDesc, size_t y, + size_t hyDesc, size_t hy, size_t cyDesc, size_t cy, + size_t workspace, size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes) + cpdef RNNBackwardData( + intptr_t handle, size_t rnnDesc, int seqLength, size_t yDesc, size_t y, + size_t dyDesc, size_t dy, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t dxDesc, size_t dx, size_t dhxDesc, size_t dhx, + size_t dcxDesc, size_t dcx, size_t workspace, + size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes) + cpdef RNNBackwardWeights( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workspace, size_t workSpaceSizeInBytes, size_t dwDesc, + size_t dw, size_t reserveSpace, size_t reserveSpaceSizeInBytes) + cpdef RNNForwardInferenceEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef RNNForwardTrainingEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t 
reserveSpaceSizeInBytes) + cpdef RNNBackwardDataEx( + intptr_t handle, size_t rnnDesc, size_t yDesc, size_t y, size_t dyDesc, + size_t dy, size_t dcDesc, size_t dcAttn, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t dxDesc, size_t dx, + size_t dhxDesc, size_t dhx, size_t dcxDesc, size_t dcx, + size_t dkDesc, size_t dkeys, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + cpdef RNNBackwardWeightsEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t dwDesc, size_t dw, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + + ############################################################################### + # Spatial Transformer + ############################################################################### + + cpdef size_t createSpatialTransformerDescriptor() except? 
0 + cpdef destroySpatialTransformerDescriptor(size_t stDesc) + cpdef setSpatialTransformerDescriptor( + size_t stDesc, size_t samplerType, int dataType, + int nbDims, size_t dimA) + cpdef spatialTfGridGeneratorForward( + intptr_t handle, size_t stDesc, size_t theta, size_t grid) + cpdef spatialTfGridGeneratorBackward( + intptr_t handle, size_t stDesc, size_t dgrid, size_t dtheta) + cpdef spatialTfSamplerForward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t grid, size_t beta, size_t yDesc, size_t y) + cpdef spatialTfSamplerBackward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t beta, size_t dxDesc, size_t dx, size_t alphaDgrid, + size_t dyDesc, size_t dy, size_t grid, size_t betaDgrid, size_t dgrid) + + ############################################################################### + # Fused Ops + ############################################################################### + + cpdef createFusedOpsConstParamPack(int ops) + cpdef destroyFusedOpsConstParamPack(size_t constPack) + cpdef setFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param) + cpdef getFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param) + cpdef createFusedOpsVariantParamPack(int ops) + cpdef destroyFusedOpsVariantParamPack(size_t varPack) + cpdef setFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr) + cpdef getFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr) + cpdef createFusedOpsPlan(int ops) + cpdef destroyFusedOpsPlan(size_t plan) + cpdef makeFusedOpsPlan(intptr_t handle, size_t plan, size_t constPack) + cpdef fusedOpsExecute(intptr_t handle, size_t plan, size_t varPack) + From 67b677d5c3c2ce3072cf454fd38ae23cd6db0f11 Mon Sep 17 00:00:00 2001 From: bmedishe Date: Wed, 29 Nov 2023 20:59:46 +0000 Subject: [PATCH 14/26] update for cupy_miopen build --- cupy_backends/cuda/libs/cudnn.pxd | 1751 
+++++++++++++++----------- cupy_backends/cuda/libs/cudnn.pyx | 1950 +---------------------------- cupy_backends/cupy_cudnn.h | 9 +- 3 files changed, 1003 insertions(+), 2707 deletions(-) diff --git a/cupy_backends/cuda/libs/cudnn.pxd b/cupy_backends/cuda/libs/cudnn.pxd index 5f7430ab3f6..8fcd754470f 100644 --- a/cupy_backends/cuda/libs/cudnn.pxd +++ b/cupy_backends/cuda/libs/cudnn.pxd @@ -4,245 +4,485 @@ from libc.stdint cimport intptr_t ############################################################################### # Enum ############################################################################### +IF CUPY_HIP_VERSION != 0: + cpdef enum: + CUDNN_DATA_FLOAT = 200 + CUDNN_DATA_DOUBLE = 201 + CUDNN_DATA_HALF = 202 + + CUDNN_DEFAULT_MATH = 210 + CUDNN_TENSOR_OP_MATH = 211 + + CUDNN_NOT_PROPAGATE_NAN = 220 + CUDNN_PROPAGATE_NAN = 221 + + CUDNN_NON_DETERMINISTIC = 230 + CUDNN_DETERMINISTIC = 231 + + CUDNN_TENSOR_NCHW = 240 + CUDNN_TENSOR_NHWC = 241 + + CUDNN_OP_TENSOR_ADD = 250 + CUDNN_OP_TENSOR_MUL = 251 + CUDNN_OP_TENSOR_MIN = 252 + CUDNN_OP_TENSOR_MAX = 253 + CUDNN_OP_TENSOR_SQRT = 254 + CUDNN_OP_TENSOR_NOT = 255 + + CUDNN_REDUCE_TENSOR_ADD = 260 + CUDNN_REDUCE_TENSOR_MUL = 261 + CUDNN_REDUCE_TENSOR_MIN = 262 + CUDNN_REDUCE_TENSOR_MAX = 263 + CUDNN_REDUCE_TENSOR_AMAX = 264 + CUDNN_REDUCE_TENSOR_AVG = 265 + CUDNN_REDUCE_TENSOR_NORM1 = 266 + CUDNN_REDUCE_TENSOR_NORM2 = 267 + CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 268 + + CUDNN_REDUCE_TENSOR_NO_INDICES = 270 + CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 271 + + CUDNN_32BIT_INDICES = 280 + CUDNN_64BIT_INDICES = 281 + CUDNN_16BIT_INDICES = 282 + CUDNN_8BIT_INDICES = 283 + + CUDNN_ADD_IMAGE = 290 + CUDNN_ADD_SAME_HW = 290 + CUDNN_ADD_FEATURE_MAP = 291 + CUDNN_ADD_SAME_CHW = 291 + CUDNN_ADD_SAME_C = 292 + CUDNN_ADD_FULL_TENSOR = 293 + + CUDNN_CONVOLUTION = 300 + CUDNN_CROSS_CORRELATION = 301 + + CUDNN_CONVOLUTION_FWD_NO_WORKSPACE = 310 + CUDNN_CONVOLUTION_FWD_PREFER_FASTEST = 311 + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT = 
312 + + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 320 + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 321 + CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 322 + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 323 + CUDNN_CONVOLUTION_FWD_ALGO_FFT = 324 + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 325 + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 326 + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 327 + + CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE = 330 + CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST = 331 + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT = 332 + + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 340 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 341 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 342 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 343 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 344 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 345 + + CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE = 350 + CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST = 351 + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = 352 + + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 360 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 361 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 362 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 363 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 364 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 365 + + CUDNN_SOFTMAX_FAST = 370 + CUDNN_SOFTMAX_ACCURATE = 371 + CUDNN_SOFTMAX_LOG = 372 + + CUDNN_SOFTMAX_MODE_INSTANCE = 380 + CUDNN_SOFTMAX_MODE_CHANNEL = 381 + + CUDNN_POOLING_MAX = 390 + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 391 + CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 392 + CUDNN_POOLING_MAX_DETERMINISTIC = 393 + + CUDNN_ACTIVATION_SIGMOID = 400 + CUDNN_ACTIVATION_RELU = 401 + CUDNN_ACTIVATION_TANH = 402 + CUDNN_ACTIVATION_CLIPPED_RELU = 403 + CUDNN_ACTIVATION_ELU = 404 + CUDNN_ACTIVATION_IDENTITY = 405 + + CUDNN_LRN_CROSS_CHANNEL_DIM1 = 410 + + CUDNN_DIVNORM_PRECOMPUTED_MEANS = 420 + + CUDNN_BATCHNORM_PER_ACTIVATION = 430 + CUDNN_BATCHNORM_SPATIAL = 431 + CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 
432 + + CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 440 + CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 441 + + CUDNN_BATCHNORM_OPS_BN = 450 + CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 451 + CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 452 + + CUDNN_RNN_RELU = 460 + CUDNN_RNN_TANH = 461 + CUDNN_LSTM = 462 + CUDNN_GRU = 463 + + CUDNN_UNIDIRECTIONAL = 470 + CUDNN_BIDIRECTIONAL = 471 + + CUDNN_RNN_ALGO_STANDARD = 480 + CUDNN_RNN_ALGO_PERSIST_STATIC = 481 + CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 482 + + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 490 + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 491 + CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 492 + + CUDNN_RNN_PADDED_IO_DISABLED = 500 + CUDNN_RNN_PADDED_IO_ENABLED = 501 + + CUDNN_LINEAR_INPUT = 510 + CUDNN_SKIP_INPUT = 511 + + CUDNN_SAMPLER_BILINEAR = 520 + + CUDNN_STATUS_SUCCESS = 530 + CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 541 + CUDNN_STATUS_RUNTIME_IN_PROGRESS = 542 + CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 543 + + CUDNN_ERRQUERY_RAWCODE = 550 + CUDNN_ERRQUERY_NONBLOCKING = 551 + CUDNN_ERRQUERY_BLOCKING = 552 + + # cudnnFusedOps_t + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 560 + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 561 + CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 562 + CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 563 + CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 564 + CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 565 + CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 566 + + # cudnnFusedOpsConstParamLabel_t + CUDNN_PARAM_XDESC = 570 + CUDNN_PARAM_XDATA_PLACEHOLDER = 571 + CUDNN_PARAM_BN_MODE = 572 + CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 573 + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 574 + CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 575 + CUDNN_PARAM_ACTIVATION_DESC = 576 + CUDNN_PARAM_CONV_DESC = 577 + CUDNN_PARAM_WDESC = 578 + CUDNN_PARAM_WDATA_PLACEHOLDER = 579 + CUDNN_PARAM_DWDESC = 580 + CUDNN_PARAM_DWDATA_PLACEHOLDER = 581 + CUDNN_PARAM_YDESC = 582 + CUDNN_PARAM_YDATA_PLACEHOLDER = 583 + CUDNN_PARAM_DYDESC = 584 + 
CUDNN_PARAM_DYDATA_PLACEHOLDER = 585 + CUDNN_PARAM_YSTATS_DESC = 586 + CUDNN_PARAM_YSUM_PLACEHOLDER = 587 + CUDNN_PARAM_YSQSUM_PLACEHOLDER = 588 + CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 589 + CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 590 + CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 591 + CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 592 + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 593 + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 594 + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 595 + CUDNN_PARAM_ZDESC = 596 + CUDNN_PARAM_ZDATA_PLACEHOLDER = 597 + CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 598 + CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 599 + CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 600 + CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 601 + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 602 + CUDNN_PARAM_DXDESC = 603 + CUDNN_PARAM_DXDATA_PLACEHOLDER = 604 + CUDNN_PARAM_DZDESC = 605 + CUDNN_PARAM_DZDATA_PLACEHOLDER = 606 + CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 607 + CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 608 + + # cudnnFusedOpsPointerPlaceHolder_t + CUDNN_PTR_NULL = 610 + CUDNN_PTR_ELEM_ALIGNED = 611 + CUDNN_PTR_16B_ALIGNED = 612 + + # cudnnFusedOpsVariantParamLabel_t + CUDNN_PTR_XDATA = 620 + CUDNN_PTR_BN_EQSCALE = 621 + CUDNN_PTR_BN_EQBIAS = 622 + CUDNN_PTR_WDATA = 623 + CUDNN_PTR_DWDATA = 624 + CUDNN_PTR_YDATA = 625 + CUDNN_PTR_DYDATA = 626 + CUDNN_PTR_YSUM = 627 + CUDNN_PTR_YSQSUM = 628 + CUDNN_PTR_WORKSPACE = 629 + CUDNN_PTR_BN_SCALE = 630 + CUDNN_PTR_BN_BIAS = 631 + CUDNN_PTR_BN_SAVED_MEAN = 632 + CUDNN_PTR_BN_SAVED_INVSTD = 633 + CUDNN_PTR_BN_RUNNING_MEAN = 634 + CUDNN_PTR_BN_RUNNING_VAR = 635 + CUDNN_PTR_ZDATA = 636 + CUDNN_PTR_BN_Z_EQSCALE = 637 + CUDNN_PTR_BN_Z_EQBIAS = 638 + CUDNN_PTR_ACTIVATION_BITMASK = 639 + CUDNN_PTR_DXDATA = 640 + CUDNN_PTR_DZDATA = 641 + CUDNN_PTR_BN_DSCALE = 642 + CUDNN_PTR_BN_DBIAS = 643 + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 720 + CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 721 + CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 722 + CUDNN_SCALAR_DOUBLE_BN_EPSILON = 723 +ELSE: + cpdef 
enum: + CUDNN_DATA_FLOAT = 0 + CUDNN_DATA_DOUBLE = 1 + CUDNN_DATA_HALF = 2 + + CUDNN_DEFAULT_MATH = 0 + CUDNN_TENSOR_OP_MATH = 1 + + CUDNN_NOT_PROPAGATE_NAN = 0 + CUDNN_PROPAGATE_NAN = 1 + + CUDNN_NON_DETERMINISTIC = 0 + CUDNN_DETERMINISTIC = 1 + + CUDNN_TENSOR_NCHW = 0 + CUDNN_TENSOR_NHWC = 1 + + CUDNN_OP_TENSOR_ADD = 0 + CUDNN_OP_TENSOR_MUL = 1 + CUDNN_OP_TENSOR_MIN = 2 + CUDNN_OP_TENSOR_MAX = 3 + CUDNN_OP_TENSOR_SQRT = 4 + CUDNN_OP_TENSOR_NOT = 5 + + CUDNN_REDUCE_TENSOR_ADD = 0 + CUDNN_REDUCE_TENSOR_MUL = 1 + CUDNN_REDUCE_TENSOR_MIN = 2 + CUDNN_REDUCE_TENSOR_MAX = 3 + CUDNN_REDUCE_TENSOR_AMAX = 4 + CUDNN_REDUCE_TENSOR_AVG = 5 + CUDNN_REDUCE_TENSOR_NORM1 = 6 + CUDNN_REDUCE_TENSOR_NORM2 = 7 + CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8 + + CUDNN_REDUCE_TENSOR_NO_INDICES = 0 + CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1 + + CUDNN_32BIT_INDICES = 0 + CUDNN_64BIT_INDICES = 1 + CUDNN_16BIT_INDICES = 2 + CUDNN_8BIT_INDICES = 3 + + CUDNN_ADD_IMAGE = 0 + CUDNN_ADD_SAME_HW = 0 + CUDNN_ADD_FEATURE_MAP = 1 + CUDNN_ADD_SAME_CHW = 1 + CUDNN_ADD_SAME_C = 2 + CUDNN_ADD_FULL_TENSOR = 3 + + CUDNN_CONVOLUTION = 0 + CUDNN_CROSS_CORRELATION = 1 + + CUDNN_CONVOLUTION_FWD_NO_WORKSPACE = 0 + CUDNN_CONVOLUTION_FWD_PREFER_FASTEST = 1 + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT = 2 + + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0 + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1 + CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2 + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3 + CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4 + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5 + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6 + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7 + + CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE = 0 + CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST = 1 + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT = 2 + + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3 + 
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5 + + CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE = 0 + CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST = 1 + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = 2 + + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5 + + CUDNN_SOFTMAX_FAST = 0 + CUDNN_SOFTMAX_ACCURATE = 1 + CUDNN_SOFTMAX_LOG = 2 + + CUDNN_SOFTMAX_MODE_INSTANCE = 0 + CUDNN_SOFTMAX_MODE_CHANNEL = 1 + + CUDNN_POOLING_MAX = 0 + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1 + CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2 + CUDNN_POOLING_MAX_DETERMINISTIC = 3 + + CUDNN_ACTIVATION_SIGMOID = 0 + CUDNN_ACTIVATION_RELU = 1 + CUDNN_ACTIVATION_TANH = 2 + CUDNN_ACTIVATION_CLIPPED_RELU = 3 + CUDNN_ACTIVATION_ELU = 4 + CUDNN_ACTIVATION_IDENTITY = 5 + + CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0 + + CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0 + + CUDNN_BATCHNORM_PER_ACTIVATION = 0 + CUDNN_BATCHNORM_SPATIAL = 1 + CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2 + + CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0 + CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 + + CUDNN_BATCHNORM_OPS_BN = 0 + CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1 + CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2 + + CUDNN_RNN_RELU = 0 + CUDNN_RNN_TANH = 1 + CUDNN_LSTM = 2 + CUDNN_GRU = 3 + + CUDNN_UNIDIRECTIONAL = 0 + CUDNN_BIDIRECTIONAL = 1 + + CUDNN_RNN_ALGO_STANDARD = 0 + CUDNN_RNN_ALGO_PERSIST_STATIC = 1 + CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2 + + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0 + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1 + CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2 + + CUDNN_RNN_PADDED_IO_DISABLED = 0 + CUDNN_RNN_PADDED_IO_ENABLED = 1 + + CUDNN_LINEAR_INPUT = 0 + CUDNN_SKIP_INPUT = 1 + + CUDNN_SAMPLER_BILINEAR = 0 + + CUDNN_STATUS_SUCCESS = 0 + 
CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11 + CUDNN_STATUS_RUNTIME_IN_PROGRESS = 12 + CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 13 + + CUDNN_ERRQUERY_RAWCODE = 0 + CUDNN_ERRQUERY_NONBLOCKING = 1 + CUDNN_ERRQUERY_BLOCKING = 2 + + # cudnnFusedOps_t + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0 + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1 + CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2 + CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3 + CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4 + CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5 + CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6 + + # cudnnFusedOpsConstParamLabel_t + CUDNN_PARAM_XDESC = 0 + CUDNN_PARAM_XDATA_PLACEHOLDER = 1 + CUDNN_PARAM_BN_MODE = 2 + CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3 + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4 + CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5 + CUDNN_PARAM_ACTIVATION_DESC = 6 + CUDNN_PARAM_CONV_DESC = 7 + CUDNN_PARAM_WDESC = 8 + CUDNN_PARAM_WDATA_PLACEHOLDER = 9 + CUDNN_PARAM_DWDESC = 10 + CUDNN_PARAM_DWDATA_PLACEHOLDER = 11 + CUDNN_PARAM_YDESC = 12 + CUDNN_PARAM_YDATA_PLACEHOLDER = 13 + CUDNN_PARAM_DYDESC = 14 + CUDNN_PARAM_DYDATA_PLACEHOLDER = 15 + CUDNN_PARAM_YSTATS_DESC = 16 + CUDNN_PARAM_YSUM_PLACEHOLDER = 17 + CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18 + CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19 + CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20 + CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21 + CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22 + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23 + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24 + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25 + CUDNN_PARAM_ZDESC = 26 + CUDNN_PARAM_ZDATA_PLACEHOLDER = 27 + CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28 + CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29 + CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30 + CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31 + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32 + CUDNN_PARAM_DXDESC = 33 + CUDNN_PARAM_DXDATA_PLACEHOLDER = 34 + CUDNN_PARAM_DZDESC = 35 + CUDNN_PARAM_DZDATA_PLACEHOLDER = 36 + 
CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37 + CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38 + + # cudnnFusedOpsPointerPlaceHolder_t + CUDNN_PTR_NULL = 0 + CUDNN_PTR_ELEM_ALIGNED = 1 + CUDNN_PTR_16B_ALIGNED = 2 + + # cudnnFusedOpsVariantParamLabel_t + CUDNN_PTR_XDATA = 0 + CUDNN_PTR_BN_EQSCALE = 1 + CUDNN_PTR_BN_EQBIAS = 2 + CUDNN_PTR_WDATA = 3 + CUDNN_PTR_DWDATA = 4 + CUDNN_PTR_YDATA = 5 + CUDNN_PTR_DYDATA = 6 + CUDNN_PTR_YSUM = 7 + CUDNN_PTR_YSQSUM = 8 + CUDNN_PTR_WORKSPACE = 9 + CUDNN_PTR_BN_SCALE = 10 + CUDNN_PTR_BN_BIAS = 11 + CUDNN_PTR_BN_SAVED_MEAN = 12 + CUDNN_PTR_BN_SAVED_INVSTD = 13 + CUDNN_PTR_BN_RUNNING_MEAN = 14 + CUDNN_PTR_BN_RUNNING_VAR = 15 + CUDNN_PTR_ZDATA = 16 + CUDNN_PTR_BN_Z_EQSCALE = 17 + CUDNN_PTR_BN_Z_EQBIAS = 18 + CUDNN_PTR_ACTIVATION_BITMASK = 19 + CUDNN_PTR_DXDATA = 20 + CUDNN_PTR_DZDATA = 21 + CUDNN_PTR_BN_DSCALE = 22 + CUDNN_PTR_BN_DBIAS = 23 + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100 + CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101 + CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102 + CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103 -cpdef enum: - CUDNN_DATA_FLOAT = 0 - CUDNN_DATA_DOUBLE = 1 - CUDNN_DATA_HALF = 2 - - CUDNN_DEFAULT_MATH = 0 - CUDNN_TENSOR_OP_MATH = 1 - - CUDNN_NOT_PROPAGATE_NAN = 0 - CUDNN_PROPAGATE_NAN = 1 - - CUDNN_NON_DETERMINISTIC = 0 - CUDNN_DETERMINISTIC = 1 - - CUDNN_TENSOR_NCHW = 0 - CUDNN_TENSOR_NHWC = 1 - - CUDNN_OP_TENSOR_ADD = 0 - CUDNN_OP_TENSOR_MUL = 1 - CUDNN_OP_TENSOR_MIN = 2 - CUDNN_OP_TENSOR_MAX = 3 - CUDNN_OP_TENSOR_SQRT = 4 - CUDNN_OP_TENSOR_NOT = 5 - - CUDNN_REDUCE_TENSOR_ADD = 0 - CUDNN_REDUCE_TENSOR_MUL = 1 - CUDNN_REDUCE_TENSOR_MIN = 2 - CUDNN_REDUCE_TENSOR_MAX = 3 - CUDNN_REDUCE_TENSOR_AMAX = 4 - CUDNN_REDUCE_TENSOR_AVG = 5 - CUDNN_REDUCE_TENSOR_NORM1 = 6 - CUDNN_REDUCE_TENSOR_NORM2 = 7 - CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8 - - CUDNN_REDUCE_TENSOR_NO_INDICES = 0 - CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1 - - CUDNN_32BIT_INDICES = 0 - CUDNN_64BIT_INDICES = 1 - CUDNN_16BIT_INDICES = 2 - CUDNN_8BIT_INDICES = 3 - - 
CUDNN_ADD_IMAGE = 0 - CUDNN_ADD_SAME_HW = 0 - CUDNN_ADD_FEATURE_MAP = 1 - CUDNN_ADD_SAME_CHW = 1 - CUDNN_ADD_SAME_C = 2 - CUDNN_ADD_FULL_TENSOR = 3 - - CUDNN_CONVOLUTION = 0 - CUDNN_CROSS_CORRELATION = 1 - - CUDNN_CONVOLUTION_FWD_NO_WORKSPACE = 0 - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST = 1 - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT = 2 - - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0 - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1 - CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2 - CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3 - CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4 - CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5 - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6 - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7 - - CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE = 0 - CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST = 1 - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT = 2 - - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5 - - CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE = 0 - CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST = 1 - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = 2 - - CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5 - - CUDNN_SOFTMAX_FAST = 0 - CUDNN_SOFTMAX_ACCURATE = 1 - CUDNN_SOFTMAX_LOG = 2 - - CUDNN_SOFTMAX_MODE_INSTANCE = 0 - CUDNN_SOFTMAX_MODE_CHANNEL = 1 - - CUDNN_POOLING_MAX = 0 - CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1 - CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2 - CUDNN_POOLING_MAX_DETERMINISTIC = 3 - - CUDNN_ACTIVATION_SIGMOID = 0 - CUDNN_ACTIVATION_RELU = 1 - CUDNN_ACTIVATION_TANH = 2 - CUDNN_ACTIVATION_CLIPPED_RELU 
= 3 - CUDNN_ACTIVATION_ELU = 4 - CUDNN_ACTIVATION_IDENTITY = 5 - - CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0 - - CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0 - - CUDNN_BATCHNORM_PER_ACTIVATION = 0 - CUDNN_BATCHNORM_SPATIAL = 1 - CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2 - - CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0 - CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 - - CUDNN_BATCHNORM_OPS_BN = 0 - CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1 - CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2 - - CUDNN_RNN_RELU = 0 - CUDNN_RNN_TANH = 1 - CUDNN_LSTM = 2 - CUDNN_GRU = 3 - - CUDNN_UNIDIRECTIONAL = 0 - CUDNN_BIDIRECTIONAL = 1 - - CUDNN_RNN_ALGO_STANDARD = 0 - CUDNN_RNN_ALGO_PERSIST_STATIC = 1 - CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2 - - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0 - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1 - CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2 - - CUDNN_RNN_PADDED_IO_DISABLED = 0 - CUDNN_RNN_PADDED_IO_ENABLED = 1 - - CUDNN_LINEAR_INPUT = 0 - CUDNN_SKIP_INPUT = 1 - - CUDNN_SAMPLER_BILINEAR = 0 - - CUDNN_STATUS_SUCCESS = 0 - CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11 - CUDNN_STATUS_RUNTIME_IN_PROGRESS = 12 - CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 13 - - CUDNN_ERRQUERY_RAWCODE = 0 - CUDNN_ERRQUERY_NONBLOCKING = 1 - CUDNN_ERRQUERY_BLOCKING = 2 - - # cudnnFusedOps_t - CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0 - CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1 - CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2 - CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3 - CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4 - CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5 - CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6 - - # cudnnFusedOpsConstParamLabel_t - CUDNN_PARAM_XDESC = 0 - CUDNN_PARAM_XDATA_PLACEHOLDER = 1 - CUDNN_PARAM_BN_MODE = 2 - CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3 - CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4 - CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5 - CUDNN_PARAM_ACTIVATION_DESC = 6 - CUDNN_PARAM_CONV_DESC = 7 - CUDNN_PARAM_WDESC = 8 - CUDNN_PARAM_WDATA_PLACEHOLDER = 9 - CUDNN_PARAM_DWDESC = 10 - 
CUDNN_PARAM_DWDATA_PLACEHOLDER = 11 - CUDNN_PARAM_YDESC = 12 - CUDNN_PARAM_YDATA_PLACEHOLDER = 13 - CUDNN_PARAM_DYDESC = 14 - CUDNN_PARAM_DYDATA_PLACEHOLDER = 15 - CUDNN_PARAM_YSTATS_DESC = 16 - CUDNN_PARAM_YSUM_PLACEHOLDER = 17 - CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18 - CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19 - CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20 - CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21 - CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22 - CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23 - CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24 - CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25 - CUDNN_PARAM_ZDESC = 26 - CUDNN_PARAM_ZDATA_PLACEHOLDER = 27 - CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28 - CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29 - CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30 - CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31 - CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32 - CUDNN_PARAM_DXDESC = 33 - CUDNN_PARAM_DXDATA_PLACEHOLDER = 34 - CUDNN_PARAM_DZDESC = 35 - CUDNN_PARAM_DZDATA_PLACEHOLDER = 36 - CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37 - CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38 - - # cudnnFusedOpsPointerPlaceHolder_t - CUDNN_PTR_NULL = 0 - CUDNN_PTR_ELEM_ALIGNED = 1 - CUDNN_PTR_16B_ALIGNED = 2 - - # cudnnFusedOpsVariantParamLabel_t - CUDNN_PTR_XDATA = 0 - CUDNN_PTR_BN_EQSCALE = 1 - CUDNN_PTR_BN_EQBIAS = 2 - CUDNN_PTR_WDATA = 3 - CUDNN_PTR_DWDATA = 4 - CUDNN_PTR_YDATA = 5 - CUDNN_PTR_DYDATA = 6 - CUDNN_PTR_YSUM = 7 - CUDNN_PTR_YSQSUM = 8 - CUDNN_PTR_WORKSPACE = 9 - CUDNN_PTR_BN_SCALE = 10 - CUDNN_PTR_BN_BIAS = 11 - CUDNN_PTR_BN_SAVED_MEAN = 12 - CUDNN_PTR_BN_SAVED_INVSTD = 13 - CUDNN_PTR_BN_RUNNING_MEAN = 14 - CUDNN_PTR_BN_RUNNING_VAR = 15 - CUDNN_PTR_ZDATA = 16 - CUDNN_PTR_BN_Z_EQSCALE = 17 - CUDNN_PTR_BN_Z_EQBIAS = 18 - CUDNN_PTR_ACTIVATION_BITMASK = 19 - CUDNN_PTR_DXDATA = 20 - CUDNN_PTR_DZDATA = 21 - CUDNN_PTR_BN_DSCALE = 22 - CUDNN_PTR_BN_DBIAS = 23 - CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100 - CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101 - 
CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102 - CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103 ############################################################################### @@ -258,521 +498,522 @@ cdef class CuDNNAlgoPerf: int determinism int mathType +IF CUPY_HIP_VERSION == 0: + ############################################################################### + # Version + ############################################################################### + + cpdef size_t getVersion() except? 0 + + ############################################################################### + # Runtime error checking + ############################################################################### + cpdef queryRuntimeError(intptr_t handle, int mode) + + ############################################################################### + # Initialization and CUDA cooperation + ############################################################################### + + cpdef intptr_t create() except? 0 + cpdef destroy(intptr_t handle) + cpdef setStream(intptr_t handle, size_t stream) + cpdef size_t getStream(intptr_t handle) except? 0 + + + ############################################################################### + # Tensor manipulation + ############################################################################### + + cpdef size_t createTensorDescriptor() except? 
0 + cpdef setTensor4dDescriptor(size_t tensorDesc, int format, int dataType, + int n, int c, int h, int w) + cpdef setTensor4dDescriptorEx(size_t tensorDesc, int dataType, + int n, int c, int h, int w, int nStride, + int cStride, int hStride, int wStride) + cpdef tuple getTensor4dDescriptor(size_t tensorDesc) + cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, + size_t dimA, size_t strideA) + cpdef destroyTensorDescriptor(size_t tensorDesc) + cpdef addTensor_v3(intptr_t handle, size_t alpha, size_t bDesc, + size_t b, size_t beta, size_t yDesc, size_t y) + + + ############################################################################### + # Tensor operations + ############################################################################### + + cpdef size_t createOpTensorDescriptor() except? 0 + cpdef setOpTensorDescriptor(size_t opTensorDesc, int opTensorOp, + int opTensorCompType, int opTensorNanOpt) + cpdef getOpTensorDescriptor(size_t opTensorDesc) + cpdef destroyOpTensorDescriptor(size_t opTensorDesc) + cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, + size_t aDesc, size_t A, size_t alpha2, size_t bDesc, + size_t B, size_t beta, size_t cDesc, size_t C) + + + ############################################################################### + # Tensor reductions + ############################################################################### + + cpdef size_t createReduceTensorDescriptor() except? 0 + cpdef setReduceTensorDescriptor( + size_t reduceTensorDesc, int reduceTensorOp, + int reduceTensorCompType, int reduceTensorNanOpt, + int reduceTensorIndices, int reduceTensorIndicesType) + cpdef getReduceTensorDescriptor(size_t reduceTensorDesc) + cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc) + cpdef size_t getReductionIndicesSize( + intptr_t handle, size_t reduceTensorDesc, size_t aDesc, + size_t cDesc) except? 
0 + cpdef size_t getReductionWorkspaceSize( + intptr_t handle, size_t reduceTensorDesc, size_t aDesc, + size_t cDesc) except? 0 + cpdef reduceTensor( + intptr_t handle, size_t reduceTensorDesc, size_t indices, + size_t indicesSizeInBytes, size_t workspace, + size_t workspaceSizeInBytes, size_t alpha, size_t aDesc, + size_t A, size_t beta, size_t cDesc, size_t C) + cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr) + cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha) + + + ############################################################################### + # Filter manipulation + ############################################################################### + + cpdef size_t createFilterDescriptor() except? 0 + cpdef setFilter4dDescriptor_v4( + size_t filterDesc, int dataType, int format, int k, int c, int h, int w) + cpdef setFilterNdDescriptor_v4( + size_t filterDesc, int dataType, int format, int nbDims, size_t filterDimA) + cpdef getFilterNdDescriptor(size_t wDesc, int nbDimsRequested) + cpdef destroyFilterDescriptor(size_t filterDesc) + + + ############################################################################### + # Convolution + ############################################################################### + + cpdef size_t createConvolutionDescriptor() except? 0 + cpdef setConvolutionMathType( + size_t convDesc, size_t mathType) + cpdef size_t getConvolutionMathType(size_t convDesc) except? 0 + cpdef setConvolutionGroupCount( + size_t convDesc, int groupCount) + cpdef int getConvolutionGroupCount(size_t convDesc) except? 
-1 + cpdef setConvolution2dDescriptor_v4( + size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, + int dilation_w, int mode) + cpdef setConvolution2dDescriptor_v5( + size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, + int dilation_w, int mode, size_t computeType) + cpdef setConvolutionNdDescriptor_v3( + size_t convDesc, int arrayLength, size_t padA, size_t filterStrideA, + size_t dilationA, int mode, int dataType) + cpdef destroyConvolutionDescriptor(size_t convDesc) + cpdef findConvolutionForwardAlgorithm( + intptr_t handle, size_t xDesc, size_t wDesc, size_t convDesc, size_t yDesc, + int requestedAlgoCount) + cpdef list findConvolutionForwardAlgorithmEx( + intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, + size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef list findConvolutionForwardAlgorithmEx_v7( + intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, + size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef int getConvolutionForwardAlgorithm_v6( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int preference, size_t memoryLimitInbytes) except? -1 + cpdef list getConvolutionForwardAlgorithm_v7( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int requestedAlgoCount) + cpdef Py_ssize_t getConvolutionForwardWorkspaceSize( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int algo) except? 
-1 + cpdef convolutionForward( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t filterDesc, size_t filterData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t destDesc, size_t destData) + cpdef convolutionBackwardBias( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t beta, size_t destDesc, size_t destData) + cpdef findConvolutionBackwardFilterAlgorithm( + intptr_t handle, size_t xDesc, size_t dyDesc, size_t convDesc, + size_t dwDesc, int requestedAlgoCount) + cpdef list findConvolutionBackwardFilterAlgorithmEx( + intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, + size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef list findConvolutionBackwardFilterAlgorithmEx_v7( + intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, + size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef int getConvolutionBackwardFilterAlgorithm_v6( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t filterDesc, int preference, size_t memoryLimitInbytes) except? -1 + cpdef list getConvolutionBackwardFilterAlgorithm_v7( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int requestedAlgoCount) + cpdef Py_ssize_t getConvolutionBackwardFilterWorkspaceSize( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t filterDesc, int algo) except? 
-1 + cpdef convolutionBackwardFilter_v3( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t diffDesc, size_t diffData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t gradDesc, size_t gradData) + cpdef findConvolutionBackwardDataAlgorithm( + intptr_t handle, size_t wDesc, size_t dyDesc, size_t convDesc, + size_t dxDesc, int requestedAlgoCount) + cpdef list findConvolutionBackwardDataAlgorithmEx( + intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, + size_t convDesc, size_t dxDesc, size_t dx, + int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes) + cpdef list findConvolutionBackwardDataAlgorithmEx_v7( + intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, + size_t convDesc, size_t dxDesc, size_t dx, + int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes) + cpdef int getConvolutionBackwardDataAlgorithm_v6( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, size_t preference, + size_t memoryLimitInbytes) except? -1 + cpdef list getConvolutionBackwardDataAlgorithm_v7( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int requestedAlgoCount) + cpdef Py_ssize_t getConvolutionBackwardDataWorkspaceSize( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int algo) except? -1 + cpdef convolutionBackwardData_v3( + intptr_t handle, size_t alpha, size_t filterDesc, size_t filterData, + size_t diffDesc, size_t diffData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t gradDesc, size_t gradData) + + + ############################################################################### + # Pooling + ############################################################################### + + cpdef size_t createPoolingDescriptor() except? 
0 + cpdef setPooling2dDescriptor_v4( + size_t poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, + int windowWidth, int verticalPadding, int horizontalPadding, + int verticalStride, int horizontalStride) + cpdef setPoolingNdDescriptor_v4( + size_t poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, + size_t windowDimA, size_t paddingA, size_t strideA) + cpdef destroyPoolingDescriptor(size_t poolingDesc) + cpdef poolingForward( + intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData) + cpdef poolingBackward( + intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, + size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, + size_t destDiffData) + + ############################################################################### + # Batch Normalization + ############################################################################### + + cpdef deriveBNTensorDescriptor( + size_t derivedBnDesc, size_t xDesc, int mode) + + cpdef batchNormalizationForwardTraining( + intptr_t handle, int mode, + size_t alpha, size_t beta, size_t xDesc, + size_t x, size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, size_t bnScale, + size_t bnBias, double exponentialAverageFactor, + size_t resultRunningMean, size_t resultRunningVariance, + double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance) + + cpdef batchNormalizationForwardInference( + intptr_t handle, int mode, + size_t alpha, size_t beta, size_t xDesc, + size_t x, size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, size_t bnScale, + size_t bnBias, size_t estimatedMean, size_t estimatedVariance, + double epsilon) + + cpdef batchNormalizationBackward( + intptr_t handle, int mode, + size_t alphaDataDiff, size_t betaDataDiff, + size_t alphaParamDiff, size_t betaParamDiff, + size_t xDesc, size_t x, size_t dyDesc, + size_t dy, size_t dxDesc, 
size_t dx, + size_t dBnScaleBiasDesc, size_t bnScale, + size_t dBnScaleResult, size_t dBnBiasResult, + double epsilon, size_t savedMean, size_t savedInvVariance) + + cpdef batchNormalizationForwardTrainingEx( + intptr_t handle, int mode, int bnOps, + size_t alpha, size_t beta, + size_t xDesc, size_t x, + size_t zDesc, size_t z, + size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, + size_t bnScale, size_t bnBias, + double exponentialAverageFactor, + size_t resultRunningMean, size_t resultRunningVariance, + double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance, + size_t activationDesc, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + cpdef size_t getBatchNormalizationForwardTrainingExWorkspaceSize( + intptr_t handle, int mode, int bnOps, + size_t xDesc, + size_t zDesc, + size_t yDesc, + size_t bnScaleBiasMeanVarDesc, + size_t activationDesc) except? 0 + + cpdef batchNormalizationBackwardEx( + intptr_t handle, int mode, int bnops, + size_t alphaDataDiff, size_t betaDataDiff, + size_t alphaParamDiff, size_t betaParamDiff, + size_t xDesc, size_t x, + size_t yDesc, size_t y, + size_t dyDesc, size_t dy, + size_t dzDesc, size_t dz, + size_t dxDesc, size_t dx, + size_t dBnScaleBiasDesc, + size_t bnScaleData, size_t bnBiasData, + size_t dBnScaleData, size_t dBnBiasData, + double epsilon, + size_t savedMean, size_t savedInvVariance, + size_t activationDesc, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + cpdef size_t getBatchNormalizationBackwardExWorkspaceSize( + intptr_t handle, int mode, int bnOps, + size_t xDesc, + size_t yDesc, + size_t dyDesc, + size_t dzDesc, + size_t dxDesc, + size_t dBnScaleBiasDesc, + size_t activationDesc) except? 0 + + cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( + intptr_t handle, int mode, int bnOps, + size_t activationDesc, + size_t xDesc) except? 
0 + + + ############################################################################### + # Activation + ############################################################################### + + cpdef size_t createActivationDescriptor() except? 0 + cpdef setActivationDescriptor( + size_t activationDesc, int mode, int reluNanOpt, double reluCeiling) + cpdef destroyActivationDescriptor(size_t activationDesc) + cpdef softmaxForward( + intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData) + cpdef softmaxBackward( + intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, size_t beta, + size_t destDiffDesc, size_t destDiffData) + cpdef activationForward_v4( + intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData) + cpdef activationBackward_v4( + intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, + size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, + size_t destDiffData) + + + ############################################################################### + # Dropout + ############################################################################### + cpdef size_t createDropoutDescriptor() except? 0 + cpdef destroyDropoutDescriptor(size_t dropoutDesc) + cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? -1 + cpdef setDropoutDescriptor( + size_t dropoutDesc, intptr_t handle, float dropout, + size_t states, size_t stateSizeInBytes, unsigned long long seed) + cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 
0 + cpdef dropoutForward( + intptr_t handle, size_t dropoutDesc, + size_t srcDesc, size_t srcData, + size_t dstDesc, size_t dstData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + cpdef dropoutBackward( + intptr_t handle, size_t dropoutDesc, + size_t dyDesc, size_t dyData, + size_t dxtDesc, size_t dxData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + + ############################################################################### + # CTC + ############################################################################### + + cpdef size_t createCTCLossDescriptor() except? 0 + cpdef destroyCTCLossDescriptor(size_t ctcLossDesc) + cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType) + cpdef getCTCLossDescriptor(size_t ctcLossDesc) + cpdef size_t getCTCLossWorkspaceSize( + intptr_t handle, size_t probsDesc, size_t gradientsDesc, + size_t labels, size_t labelLengths, size_t inputLengths, + int algo, size_t ctcLossDesc) except? 0 + cpdef CTCLoss( + intptr_t handle, size_t probsDesc, + size_t probs, size_t labels, size_t labelLengths, size_t inputLengths, + size_t costs, size_t gradientsDesc, size_t gradients, int algo, + size_t ctcLossDesc, size_t workspace, size_t workSpaceSizeInBytes) + + + ############################################################################### + # RNN + ############################################################################### + + cpdef size_t createRNNDescriptor() except? 0 + cpdef destroyRNNDescriptor(size_t rnnDesc) + cpdef size_t createPersistentRNNPlan( + size_t rnnDesc, int minibatch, int dataType) except? 
0 + cpdef setPersistentRNNPlan(size_t rnnDesc, size_t plan) + cpdef destroyPersistentRNNPlan(size_t plan) + cpdef setRNNDescriptor_v5( + size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int dataType) + cpdef setRNNDescriptor_v6( + intptr_t handle, size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int algo, int dataType) + cpdef setRNNPaddingMode(size_t rnnDesc, int paddingMode) + cpdef getRNNPaddingMode(size_t rnnDesc) + cpdef size_t createRNNDataDescriptor() except? 0 + cpdef destroyRNNDataDescriptor(size_t RNNDataDesc) + cpdef setRNNDataDescriptor( + size_t RNNDataDesc, int dataType, size_t layout, + int maxSeqLength, int batchSize, int vectorSize, + size_t seqLengthArray, size_t paddingFill) + cpdef getRNNDataDescriptor( + size_t RNNDataDesc, size_t dataType, + size_t layout, size_t maxSeqLength, size_t batchSize, + size_t vectorSize, int arrayLengthRequested, size_t seqLengthArray, + size_t paddingFill) + cpdef getRNNWorkspaceSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc) + cpdef getRNNTrainingReserveSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc) + cpdef getRNNParamsSize( + intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType) + cpdef getRNNLinLayerMatrixParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t linLayerMatDesc, size_t linLayerMat) + cpdef getRNNLinLayerBiasParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t linLayerBiasDesc, + size_t linLayerBias) + cpdef RNNForwardInference( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, + size_t x, size_t hxDesc, size_t hx, size_t cxDesc, + size_t cx, size_t wDesc, size_t w, size_t yDesc, + size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t workspace, size_t 
workSpaceSizeInBytes) + cpdef RNNForwardTraining( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t wDesc, size_t w, size_t yDesc, size_t y, + size_t hyDesc, size_t hy, size_t cyDesc, size_t cy, + size_t workspace, size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes) + cpdef RNNBackwardData( + intptr_t handle, size_t rnnDesc, int seqLength, size_t yDesc, size_t y, + size_t dyDesc, size_t dy, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t dxDesc, size_t dx, size_t dhxDesc, size_t dhx, + size_t dcxDesc, size_t dcx, size_t workspace, + size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes) + cpdef RNNBackwardWeights( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workspace, size_t workSpaceSizeInBytes, size_t dwDesc, + size_t dw, size_t reserveSpace, size_t reserveSpaceSizeInBytes) + cpdef RNNForwardInferenceEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef RNNForwardTrainingEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t 
reserveSpaceSizeInBytes) + cpdef RNNBackwardDataEx( + intptr_t handle, size_t rnnDesc, size_t yDesc, size_t y, size_t dyDesc, + size_t dy, size_t dcDesc, size_t dcAttn, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t dxDesc, size_t dx, + size_t dhxDesc, size_t dhx, size_t dcxDesc, size_t dcx, + size_t dkDesc, size_t dkeys, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + cpdef RNNBackwardWeightsEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t dwDesc, size_t dw, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + + ############################################################################### + # Spatial Transformer + ############################################################################### + + cpdef size_t createSpatialTransformerDescriptor() except? 
0 + cpdef destroySpatialTransformerDescriptor(size_t stDesc) + cpdef setSpatialTransformerDescriptor( + size_t stDesc, size_t samplerType, int dataType, + int nbDims, size_t dimA) + cpdef spatialTfGridGeneratorForward( + intptr_t handle, size_t stDesc, size_t theta, size_t grid) + cpdef spatialTfGridGeneratorBackward( + intptr_t handle, size_t stDesc, size_t dgrid, size_t dtheta) + cpdef spatialTfSamplerForward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t grid, size_t beta, size_t yDesc, size_t y) + cpdef spatialTfSamplerBackward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t beta, size_t dxDesc, size_t dx, size_t alphaDgrid, + size_t dyDesc, size_t dy, size_t grid, size_t betaDgrid, size_t dgrid) + + ############################################################################### + # Fused Ops + ############################################################################### + + cpdef createFusedOpsConstParamPack(int ops) + cpdef destroyFusedOpsConstParamPack(size_t constPack) + cpdef setFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param) + cpdef getFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param) + cpdef createFusedOpsVariantParamPack(int ops) + cpdef destroyFusedOpsVariantParamPack(size_t varPack) + cpdef setFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr) + cpdef getFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr) + cpdef createFusedOpsPlan(int ops) + cpdef destroyFusedOpsPlan(size_t plan) + cpdef makeFusedOpsPlan(intptr_t handle, size_t plan, size_t constPack) + cpdef fusedOpsExecute(intptr_t handle, size_t plan, size_t varPack) -############################################################################### -# Version -############################################################################### - -cpdef size_t getVersion() except? 
0 - -############################################################################### -# Runtime error checking -############################################################################### -cpdef queryRuntimeError(intptr_t handle, int mode) - -############################################################################### -# Initialization and CUDA cooperation -############################################################################### - -cpdef intptr_t create() except? 0 -cpdef destroy(intptr_t handle) -cpdef setStream(intptr_t handle, size_t stream) -cpdef size_t getStream(intptr_t handle) except? 0 - - -############################################################################### -# Tensor manipulation -############################################################################### - -cpdef size_t createTensorDescriptor() except? 0 -cpdef setTensor4dDescriptor(size_t tensorDesc, int format, int dataType, - int n, int c, int h, int w) -cpdef setTensor4dDescriptorEx(size_t tensorDesc, int dataType, - int n, int c, int h, int w, int nStride, - int cStride, int hStride, int wStride) -cpdef tuple getTensor4dDescriptor(size_t tensorDesc) -cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, - size_t dimA, size_t strideA) -cpdef destroyTensorDescriptor(size_t tensorDesc) -cpdef addTensor_v3(intptr_t handle, size_t alpha, size_t bDesc, - size_t b, size_t beta, size_t yDesc, size_t y) - - -############################################################################### -# Tensor operations -############################################################################### - -cpdef size_t createOpTensorDescriptor() except? 
0 -cpdef setOpTensorDescriptor(size_t opTensorDesc, int opTensorOp, - int opTensorCompType, int opTensorNanOpt) -cpdef getOpTensorDescriptor(size_t opTensorDesc) -cpdef destroyOpTensorDescriptor(size_t opTensorDesc) -cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, - size_t aDesc, size_t A, size_t alpha2, size_t bDesc, - size_t B, size_t beta, size_t cDesc, size_t C) - - -############################################################################### -# Tensor reductions -############################################################################### - -cpdef size_t createReduceTensorDescriptor() except? 0 -cpdef setReduceTensorDescriptor( - size_t reduceTensorDesc, int reduceTensorOp, - int reduceTensorCompType, int reduceTensorNanOpt, - int reduceTensorIndices, int reduceTensorIndicesType) -cpdef getReduceTensorDescriptor(size_t reduceTensorDesc) -cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc) -cpdef size_t getReductionIndicesSize( - intptr_t handle, size_t reduceTensorDesc, size_t aDesc, - size_t cDesc) except? 0 -cpdef size_t getReductionWorkspaceSize( - intptr_t handle, size_t reduceTensorDesc, size_t aDesc, - size_t cDesc) except? 0 -cpdef reduceTensor( - intptr_t handle, size_t reduceTensorDesc, size_t indices, - size_t indicesSizeInBytes, size_t workspace, - size_t workspaceSizeInBytes, size_t alpha, size_t aDesc, - size_t A, size_t beta, size_t cDesc, size_t C) -cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr) -cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha) - - -############################################################################### -# Filter manipulation -############################################################################### - -cpdef size_t createFilterDescriptor() except? 
0 -cpdef setFilter4dDescriptor_v4( - size_t filterDesc, int dataType, int format, int k, int c, int h, int w) -cpdef setFilterNdDescriptor_v4( - size_t filterDesc, int dataType, int format, int nbDims, size_t filterDimA) -cpdef getFilterNdDescriptor(size_t wDesc, int nbDimsRequested) -cpdef destroyFilterDescriptor(size_t filterDesc) - - -############################################################################### -# Convolution -############################################################################### - -cpdef size_t createConvolutionDescriptor() except? 0 -cpdef setConvolutionMathType( - size_t convDesc, size_t mathType) -cpdef size_t getConvolutionMathType(size_t convDesc) except? 0 -cpdef setConvolutionGroupCount( - size_t convDesc, int groupCount) -cpdef int getConvolutionGroupCount(size_t convDesc) except? -1 -cpdef setConvolution2dDescriptor_v4( - size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, - int dilation_w, int mode) -cpdef setConvolution2dDescriptor_v5( - size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, - int dilation_w, int mode, size_t computeType) -cpdef setConvolutionNdDescriptor_v3( - size_t convDesc, int arrayLength, size_t padA, size_t filterStrideA, - size_t dilationA, int mode, int dataType) -cpdef destroyConvolutionDescriptor(size_t convDesc) -cpdef findConvolutionForwardAlgorithm( - intptr_t handle, size_t xDesc, size_t wDesc, size_t convDesc, size_t yDesc, - int requestedAlgoCount) -cpdef list findConvolutionForwardAlgorithmEx( - intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, - size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes) -cpdef list findConvolutionForwardAlgorithmEx_v7( - intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, - size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes) -cpdef int getConvolutionForwardAlgorithm_v6( - intptr_t 
handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int preference, size_t memoryLimitInbytes) except? -1 -cpdef list getConvolutionForwardAlgorithm_v7( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int requestedAlgoCount) -cpdef Py_ssize_t getConvolutionForwardWorkspaceSize( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int algo) except? -1 -cpdef convolutionForward( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t filterDesc, size_t filterData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t destDesc, size_t destData) -cpdef convolutionBackwardBias( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t beta, size_t destDesc, size_t destData) -cpdef findConvolutionBackwardFilterAlgorithm( - intptr_t handle, size_t xDesc, size_t dyDesc, size_t convDesc, - size_t dwDesc, int requestedAlgoCount) -cpdef list findConvolutionBackwardFilterAlgorithmEx( - intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, - size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes) -cpdef list findConvolutionBackwardFilterAlgorithmEx_v7( - intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, - size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes) -cpdef int getConvolutionBackwardFilterAlgorithm_v6( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t filterDesc, int preference, size_t memoryLimitInbytes) except? 
-1 -cpdef list getConvolutionBackwardFilterAlgorithm_v7( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, int requestedAlgoCount) -cpdef Py_ssize_t getConvolutionBackwardFilterWorkspaceSize( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t filterDesc, int algo) except? -1 -cpdef convolutionBackwardFilter_v3( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t diffDesc, size_t diffData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t gradDesc, size_t gradData) -cpdef findConvolutionBackwardDataAlgorithm( - intptr_t handle, size_t wDesc, size_t dyDesc, size_t convDesc, - size_t dxDesc, int requestedAlgoCount) -cpdef list findConvolutionBackwardDataAlgorithmEx( - intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, - size_t convDesc, size_t dxDesc, size_t dx, - int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes) -cpdef list findConvolutionBackwardDataAlgorithmEx_v7( - intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, - size_t convDesc, size_t dxDesc, size_t dx, - int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes) -cpdef int getConvolutionBackwardDataAlgorithm_v6( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, size_t preference, - size_t memoryLimitInbytes) except? -1 -cpdef list getConvolutionBackwardDataAlgorithm_v7( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, int requestedAlgoCount) -cpdef Py_ssize_t getConvolutionBackwardDataWorkspaceSize( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, int algo) except? 
-1 -cpdef convolutionBackwardData_v3( - intptr_t handle, size_t alpha, size_t filterDesc, size_t filterData, - size_t diffDesc, size_t diffData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t gradDesc, size_t gradData) - - -############################################################################### -# Pooling -############################################################################### - -cpdef size_t createPoolingDescriptor() except? 0 -cpdef setPooling2dDescriptor_v4( - size_t poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, - int windowWidth, int verticalPadding, int horizontalPadding, - int verticalStride, int horizontalStride) -cpdef setPoolingNdDescriptor_v4( - size_t poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, - size_t windowDimA, size_t paddingA, size_t strideA) -cpdef destroyPoolingDescriptor(size_t poolingDesc) -cpdef poolingForward( - intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData) -cpdef poolingBackward( - intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, - size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, - size_t destDiffData) - -############################################################################### -# Batch Normalization -############################################################################### - -cpdef deriveBNTensorDescriptor( - size_t derivedBnDesc, size_t xDesc, int mode) - -cpdef batchNormalizationForwardTraining( - intptr_t handle, int mode, - size_t alpha, size_t beta, size_t xDesc, - size_t x, size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, size_t bnScale, - size_t bnBias, double exponentialAverageFactor, - size_t resultRunningMean, size_t resultRunningVariance, - double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance) - -cpdef 
batchNormalizationForwardInference( - intptr_t handle, int mode, - size_t alpha, size_t beta, size_t xDesc, - size_t x, size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, size_t bnScale, - size_t bnBias, size_t estimatedMean, size_t estimatedVariance, - double epsilon) - -cpdef batchNormalizationBackward( - intptr_t handle, int mode, - size_t alphaDataDiff, size_t betaDataDiff, - size_t alphaParamDiff, size_t betaParamDiff, - size_t xDesc, size_t x, size_t dyDesc, - size_t dy, size_t dxDesc, size_t dx, - size_t dBnScaleBiasDesc, size_t bnScale, - size_t dBnScaleResult, size_t dBnBiasResult, - double epsilon, size_t savedMean, size_t savedInvVariance) - -cpdef batchNormalizationForwardTrainingEx( - intptr_t handle, int mode, int bnOps, - size_t alpha, size_t beta, - size_t xDesc, size_t x, - size_t zDesc, size_t z, - size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, - size_t bnScale, size_t bnBias, - double exponentialAverageFactor, - size_t resultRunningMean, size_t resultRunningVariance, - double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance, - size_t activationDesc, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) - -cpdef size_t getBatchNormalizationForwardTrainingExWorkspaceSize( - intptr_t handle, int mode, int bnOps, - size_t xDesc, - size_t zDesc, - size_t yDesc, - size_t bnScaleBiasMeanVarDesc, - size_t activationDesc) except? 
0 - -cpdef batchNormalizationBackwardEx( - intptr_t handle, int mode, int bnops, - size_t alphaDataDiff, size_t betaDataDiff, - size_t alphaParamDiff, size_t betaParamDiff, - size_t xDesc, size_t x, - size_t yDesc, size_t y, - size_t dyDesc, size_t dy, - size_t dzDesc, size_t dz, - size_t dxDesc, size_t dx, - size_t dBnScaleBiasDesc, - size_t bnScaleData, size_t bnBiasData, - size_t dBnScaleData, size_t dBnBiasData, - double epsilon, - size_t savedMean, size_t savedInvVariance, - size_t activationDesc, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) - -cpdef size_t getBatchNormalizationBackwardExWorkspaceSize( - intptr_t handle, int mode, int bnOps, - size_t xDesc, - size_t yDesc, - size_t dyDesc, - size_t dzDesc, - size_t dxDesc, - size_t dBnScaleBiasDesc, - size_t activationDesc) except? 0 - -cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( - intptr_t handle, int mode, int bnOps, - size_t activationDesc, - size_t xDesc) except? 0 - - -############################################################################### -# Activation -############################################################################### - -cpdef size_t createActivationDescriptor() except? 
0 -cpdef setActivationDescriptor( - size_t activationDesc, int mode, int reluNanOpt, double reluCeiling) -cpdef destroyActivationDescriptor(size_t activationDesc) -cpdef softmaxForward( - intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData) -cpdef softmaxBackward( - intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, size_t beta, - size_t destDiffDesc, size_t destDiffData) -cpdef activationForward_v4( - intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData) -cpdef activationBackward_v4( - intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, - size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, - size_t destDiffData) - - -############################################################################### -# Dropout -############################################################################### -cpdef size_t createDropoutDescriptor() except? 0 -cpdef destroyDropoutDescriptor(size_t dropoutDesc) -cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? -1 -cpdef setDropoutDescriptor( - size_t dropoutDesc, intptr_t handle, float dropout, - size_t states, size_t stateSizeInBytes, unsigned long long seed) -cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 
0 -cpdef dropoutForward( - intptr_t handle, size_t dropoutDesc, - size_t srcDesc, size_t srcData, - size_t dstDesc, size_t dstData, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) -cpdef dropoutBackward( - intptr_t handle, size_t dropoutDesc, - size_t dyDesc, size_t dyData, - size_t dxtDesc, size_t dxData, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) - - -############################################################################### -# CTC -############################################################################### - -cpdef size_t createCTCLossDescriptor() except? 0 -cpdef destroyCTCLossDescriptor(size_t ctcLossDesc) -cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType) -cpdef getCTCLossDescriptor(size_t ctcLossDesc) -cpdef size_t getCTCLossWorkspaceSize( - intptr_t handle, size_t probsDesc, size_t gradientsDesc, - size_t labels, size_t labelLengths, size_t inputLengths, - int algo, size_t ctcLossDesc) except? 0 -cpdef CTCLoss( - intptr_t handle, size_t probsDesc, - size_t probs, size_t labels, size_t labelLengths, size_t inputLengths, - size_t costs, size_t gradientsDesc, size_t gradients, int algo, - size_t ctcLossDesc, size_t workspace, size_t workSpaceSizeInBytes) - - -############################################################################### -# RNN -############################################################################### - -cpdef size_t createRNNDescriptor() except? 0 -cpdef destroyRNNDescriptor(size_t rnnDesc) -cpdef size_t createPersistentRNNPlan( - size_t rnnDesc, int minibatch, int dataType) except? 
0 -cpdef setPersistentRNNPlan(size_t rnnDesc, size_t plan) -cpdef destroyPersistentRNNPlan(size_t plan) -cpdef setRNNDescriptor_v5( - size_t rnnDesc, int hiddenSize, int numLayers, - size_t dropoutDesc, int inputMode, int direction, int mode, - int dataType) -cpdef setRNNDescriptor_v6( - intptr_t handle, size_t rnnDesc, int hiddenSize, int numLayers, - size_t dropoutDesc, int inputMode, int direction, int mode, - int algo, int dataType) -cpdef setRNNPaddingMode(size_t rnnDesc, int paddingMode) -cpdef getRNNPaddingMode(size_t rnnDesc) -cpdef size_t createRNNDataDescriptor() except? 0 -cpdef destroyRNNDataDescriptor(size_t RNNDataDesc) -cpdef setRNNDataDescriptor( - size_t RNNDataDesc, int dataType, size_t layout, - int maxSeqLength, int batchSize, int vectorSize, - size_t seqLengthArray, size_t paddingFill) -cpdef getRNNDataDescriptor( - size_t RNNDataDesc, size_t dataType, - size_t layout, size_t maxSeqLength, size_t batchSize, - size_t vectorSize, int arrayLengthRequested, size_t seqLengthArray, - size_t paddingFill) -cpdef getRNNWorkspaceSize( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc) -cpdef getRNNTrainingReserveSize( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc) -cpdef getRNNParamsSize( - intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType) -cpdef getRNNLinLayerMatrixParams( - intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, - size_t w, int linLayerID, size_t linLayerMatDesc, size_t linLayerMat) -cpdef getRNNLinLayerBiasParams( - intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, - size_t w, int linLayerID, size_t linLayerBiasDesc, - size_t linLayerBias) -cpdef RNNForwardInference( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, - size_t x, size_t hxDesc, size_t hx, size_t cxDesc, - size_t cx, size_t wDesc, size_t w, size_t yDesc, - size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t workspace, size_t workSpaceSizeInBytes) -cpdef 
RNNForwardTraining( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, - size_t wDesc, size_t w, size_t yDesc, size_t y, - size_t hyDesc, size_t hy, size_t cyDesc, size_t cy, - size_t workspace, size_t workSpaceSizeInBytes, size_t reserveSpace, - size_t reserveSpaceSizeInBytes) -cpdef RNNBackwardData( - intptr_t handle, size_t rnnDesc, int seqLength, size_t yDesc, size_t y, - size_t dyDesc, size_t dy, size_t dhyDesc, size_t dhy, - size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, - size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, - size_t dxDesc, size_t dx, size_t dhxDesc, size_t dhx, - size_t dcxDesc, size_t dcx, size_t workspace, - size_t workSpaceSizeInBytes, size_t reserveSpace, - size_t reserveSpaceSizeInBytes) -cpdef RNNBackwardWeights( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t yDesc, size_t y, - size_t workspace, size_t workSpaceSizeInBytes, size_t dwDesc, - size_t dw, size_t reserveSpace, size_t reserveSpaceSizeInBytes) -cpdef RNNForwardInferenceEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, - size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, - size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, - size_t workSpace, size_t workSpaceSizeInBytes) -cpdef RNNForwardTrainingEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, - size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, - size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) -cpdef RNNBackwardDataEx( - 
intptr_t handle, size_t rnnDesc, size_t yDesc, size_t y, size_t dyDesc, - size_t dy, size_t dcDesc, size_t dcAttn, size_t dhyDesc, size_t dhy, - size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t dxDesc, size_t dx, - size_t dhxDesc, size_t dhx, size_t dcxDesc, size_t dcx, - size_t dkDesc, size_t dkeys, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) -cpdef RNNBackwardWeightsEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t yDesc, size_t y, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t dwDesc, size_t dw, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) - - -############################################################################### -# Spatial Transformer -############################################################################### - -cpdef size_t createSpatialTransformerDescriptor() except? 0 -cpdef destroySpatialTransformerDescriptor(size_t stDesc) -cpdef setSpatialTransformerDescriptor( - size_t stDesc, size_t samplerType, int dataType, - int nbDims, size_t dimA) -cpdef spatialTfGridGeneratorForward( - intptr_t handle, size_t stDesc, size_t theta, size_t grid) -cpdef spatialTfGridGeneratorBackward( - intptr_t handle, size_t stDesc, size_t dgrid, size_t dtheta) -cpdef spatialTfSamplerForward( - intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, - size_t x, size_t grid, size_t beta, size_t yDesc, size_t y) -cpdef spatialTfSamplerBackward( - intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, - size_t x, size_t beta, size_t dxDesc, size_t dx, size_t alphaDgrid, - size_t dyDesc, size_t dy, size_t grid, size_t betaDgrid, size_t dgrid) - -############################################################################### -# Fused Ops -############################################################################### - -cpdef createFusedOpsConstParamPack(int ops) -cpdef 
destroyFusedOpsConstParamPack(size_t constPack) -cpdef setFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, - size_t param) -cpdef getFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, - size_t param) -cpdef createFusedOpsVariantParamPack(int ops) -cpdef destroyFusedOpsVariantParamPack(size_t varPack) -cpdef setFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, - size_t ptr) -cpdef getFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, - size_t ptr) -cpdef createFusedOpsPlan(int ops) -cpdef destroyFusedOpsPlan(size_t plan) -cpdef makeFusedOpsPlan(intptr_t handle, size_t plan, size_t constPack) -cpdef fusedOpsExecute(intptr_t handle, size_t plan, size_t varPack) diff --git a/cupy_backends/cuda/libs/cudnn.pyx b/cupy_backends/cuda/libs/cudnn.pyx index 84d10d5b874..bc49c090ea8 100644 --- a/cupy_backends/cuda/libs/cudnn.pyx +++ b/cupy_backends/cuda/libs/cudnn.pyx @@ -9,7 +9,7 @@ from cupy_backends.cuda.api cimport driver from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module -from cupy_backends.cuda.libs import miopen +from cupy_backends.cuda.libs cimport miopen ############################################################################### # Extern ############################################################################### @@ -760,7 +760,7 @@ class CuDNNError(RuntimeError): def __init__(self, int status): self.status = status if runtime._is_hip_environment: - msg = miopen.miopenGetErrorString(status) + msg = miopenGetErrorString(status) else: msg = cudnnGetErrorString(status) super(CuDNNError, self).__init__( @@ -827,10 +827,7 @@ cpdef queryRuntimeError(intptr_t handle, int mode): ############################################################################### cpdef intptr_t create() except? 
0: - IF CUPY_HIP_VERSION != 0: - cdef miopen.Handle handle - ELSE: - cdef Handle handle + cdef Handle handle with nogil: if runtime._is_hip_environment: status = miopen.miopenCreate(&handle) @@ -849,1945 +846,4 @@ cpdef destroy(intptr_t handle): check_status(status) -cpdef setStream(intptr_t handle, size_t stream): - # TODO(leofang): The support of stream capture is not mentioned at all in - # the cuDNN docs (as of CUDA 11.5), so we disable this functionality. - if not runtime._is_hip_environment and runtime.streamIsCapturing(stream): - raise NotImplementedError( - 'calling cuDNN API during stream capture is currently ' - 'unsupported') - if runtime._is_hip_environment: - status = miopen.miopenSetStream(handle, stream) - else: - status = cudnnSetStream(handle, stream) - check_status(status) - - -cpdef size_t getStream(intptr_t handle) except? 0: - cdef driver.Stream stream - if runtime._is_hip_environment: - status = cudnnGetStream(handle, &stream) - else: - status = miopen.miopenGetStream(handle, &stream) - check_status(status) - return stream - - -cdef _setStream(intptr_t handle): - """Set current stream""" - setStream(handle, stream_module.get_current_stream_ptr()) - -############################################################################### -# Tensor manipulation -############################################################################### - -cpdef size_t createTensorDescriptor() except? 
0: - cdef TensorDescriptor descriptor - if runtime._is_hip_environment: - status = miopen.miopenCreateTensorDescriptor(&descriptor) - else: - status = cudnnCreateTensorDescriptor(&descriptor) - check_status(status) - return descriptor - - -cpdef setTensor4dDescriptor(size_t tensorDesc, int format, int dataType, - int n, int c, int h, int w): - status = cudnnSetTensor4dDescriptor( - tensorDesc, format, - dataType, n, c, h, w) - check_status(status) - - -cpdef setTensor4dDescriptorEx(size_t tensorDesc, int dataType, - int n, int c, int h, int w, int nStride, - int cStride, int hStride, int wStride): - status = cudnnSetTensor4dDescriptorEx( - tensorDesc, dataType, n, c, h, w, - nStride, cStride, hStride, wStride) - check_status(status) - - -cpdef tuple getTensor4dDescriptor(size_t tensorDesc): - cdef DataType dataType - cdef int n, c, h, w, nStride, cStride, hStride, wStride - status = cudnnGetTensor4dDescriptor( - tensorDesc, &dataType, - &n, &c, &h, &w, &nStride, &cStride, &hStride, &wStride) - check_status(status) - return dataType, n, c, h, w, nStride, cStride, hStride, wStride - - -cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, - size_t dimA, size_t strideA): - status = cudnnSetTensorNdDescriptor( - tensorDesc, dataType, nbDims, - dimA, strideA) - check_status(status) - - -cpdef destroyTensorDescriptor(size_t tensorDesc): - if runtime._is_hip_environment: - status = miopen.miopenDestroyTensorDescriptor(tensorDesc) - else: - status = cudnnDestroyTensorDescriptor(tensorDesc) - check_status(status) - - -cpdef addTensor_v3(intptr_t handle, size_t alpha, size_t bDesc, - size_t b, size_t beta, size_t yDesc, size_t y): - _setStream(handle) - with nogil: - status = cudnnAddTensor_v3( - handle, alpha, bDesc, - b, beta, yDesc, y) - check_status(status) - - -############################################################################### -# Tensor operations -############################################################################### - -cpdef 
size_t createOpTensorDescriptor() except? 0: - cdef OpTensorDescriptor opTensorDesc - status = cudnnCreateOpTensorDescriptor(&opTensorDesc) - check_status(status) - return opTensorDesc - - -cpdef setOpTensorDescriptor(size_t opTensorDesc, int opTensorOp, - int opTensorCompType, int opTensorNanOpt): - status = cudnnSetOpTensorDescriptor( - opTensorDesc, opTensorOp, - opTensorCompType, opTensorNanOpt) - check_status(status) - - -cpdef getOpTensorDescriptor(size_t opTensorDesc): - cdef OpTensorOp opTensorOp - cdef DataType opTensorCompType - cdef NanPropagation opTensorNanOpt - status = cudnnGetOpTensorDescriptor( - opTensorDesc, &opTensorOp, &opTensorCompType, - &opTensorNanOpt) - check_status(status) - return opTensorOp, opTensorCompType, opTensorNanOpt - - -cpdef destroyOpTensorDescriptor(size_t opTensorDesc): - status = cudnnDestroyOpTensorDescriptor(opTensorDesc) - check_status(status) - - -cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, - size_t aDesc, size_t A, size_t alpha2, size_t bDesc, - size_t B, size_t beta, size_t cDesc, size_t C): - _setStream(handle) - with nogil: - if runtime._is_hip_environment: - status = miopen.miopenOpTensor( - handle, opTensorDesc, alpha1, - aDesc, A, alpha2, - bDesc, B, beta, - cDesc, C) - else: - status = cudnnOpTensor( - handle, opTensorDesc, alpha1, - aDesc, A, alpha2, - bDesc, B, beta, - cDesc, C) - check_status(status) - - -############################################################################### -# Tensor reductions -############################################################################### - -cpdef size_t createReduceTensorDescriptor() except? 
0: - cdef ReduceTensorDescriptor reduceTensorDesc - if runtime._is_hip_environment: - status = miopen.miopenCreateReduceTensorDescriptor(&reduceTensorDesc) - else: - status = cudnnCreateReduceTensorDescriptor(&reduceTensorDesc) - check_status(status) - return reduceTensorDesc - -cpdef setReduceTensorDescriptor( - size_t reduceTensorDesc, int reduceTensorOp, int reduceTensorCompType, - int reduceTensorNanOpt, int reduceTensorIndices, - int reduceTensorIndicesType): - if runtime._is_hip_environment: - status = miopen.miopenSetReduceTensorDescriptor( - reduceTensorDesc, - reduceTensorOp, - reduceTensorCompType, reduceTensorNanOpt, - reduceTensorIndices, - reduceTensorIndicesType) - else: - status = cudnnSetReduceTensorDescriptor( - reduceTensorDesc, - reduceTensorOp, - reduceTensorCompType, reduceTensorNanOpt, - reduceTensorIndices, - reduceTensorIndicesType) - check_status(status) - - -cpdef getReduceTensorDescriptor(size_t reduceTensorDesc): - cdef ReduceTensorOp redOp - cdef DataType redCompType - cdef NanPropagation redNanOpt - cdef ReduceTensorIndices redIndices - cdef IndicesType redIndicesType - if runtime._is_hip_environment: - status = miopen.miopenGetReduceTensorDescriptor( - reduceTensorDesc, &redOp, - &redCompType, &redNanOpt, &redIndices, &redIndicesType) - else: - status = cudnnGetReduceTensorDescriptor( - reduceTensorDesc, &redOp, - &redCompType, &redNanOpt, &redIndices, &redIndicesType) - check_status(status) - return redOp, redCompType, redNanOpt, redIndices, redIndicesType - - -cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc): - if runtime._is_hip_environment: - status = miopen.miopenDestroyReduceTensorDescriptor( - reduceTensorDesc) - else: - status = cudnnDestroyReduceTensorDescriptor( - reduceTensorDesc) - check_status(status) - - -cpdef size_t getReductionIndicesSize(intptr_t handle, size_t reduceTensorDesc, - size_t aDesc, size_t cDesc) except? 
0: - cdef size_t sizeInBytes - if runtime._is_hip_environment: - status = miopen.miopenGetReductionIndicesSize( - handle, reduceTensorDesc, - aDesc, cDesc, &sizeInBytes) - else: - status = cudnnGetReductionIndicesSize( - handle, reduceTensorDesc, - aDesc, cDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef size_t getReductionWorkspaceSize(intptr_t handle, - size_t reduceTensorDesc, - size_t aDesc, size_t cDesc) except? 0: - cdef size_t sizeInBytes - if runtime._is_hip_environment: - status = miopen.miopenGetReductionWorkspaceSize( - handle, reduceTensorDesc, - aDesc, cDesc, - &sizeInBytes) - else: - status = cudnnGetReductionWorkspaceSize( - handle, reduceTensorDesc, - aDesc, cDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef reduceTensor(intptr_t handle, size_t reduceTensorDesc, size_t indices, - size_t indicesSizeInBytes, size_t workspace, - size_t workspaceSizeInBytes, size_t alpha, size_t aDesc, - size_t A, size_t beta, size_t cDesc, size_t C): - _setStream(handle) - with nogil: - if runtime._is_hip_environment: - status = miopen.miopenReduceTensor( - handle, reduceTensorDesc, - indices, indicesSizeInBytes, workspace, - workspaceSizeInBytes, alpha, aDesc, - A, beta, cDesc, C) - else: - status = cudnnReduceTensor( - handle, reduceTensorDesc, - indices, indicesSizeInBytes, workspace, - workspaceSizeInBytes, alpha, aDesc, - A, beta, cDesc, C) - check_status(status) - - -cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr): - _setStream(handle) - with nogil: - if runtime._is_hip_environment: - status = miopen.miopenSetTensor( - handle, yDesc, y, - valuePtr) - else: - status = cudnnSetTensor( - handle, yDesc, y, - valuePtr) - check_status(status) - - -cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha): - _setStream(handle) - with nogil: - if runtime._is_hip_environment: - status = miopen.miopenScaleTensor( - handle, yDesc, y, - alpha) - else: - status = cudnnScaleTensor( - 
handle, yDesc, y, - alpha) - check_status(status) - - -############################################################################### -# Filter manipulation -############################################################################### - -cpdef size_t createFilterDescriptor() except? 0: - cdef FilterDescriptor desc - status = cudnnCreateFilterDescriptor(&desc) - check_status(status) - return desc - - -cpdef setFilter4dDescriptor_v4( - size_t filterDesc, int dataType, - int format, int k, int c, int h, int w): - status = cudnnSetFilter4dDescriptor_v4( - filterDesc, dataType, - format, k, c, h, w) - check_status(status) - - -cpdef setFilterNdDescriptor_v4( - size_t filterDesc, int dataType, - int format, int nbDims, size_t filterDimA): - status = cudnnSetFilterNdDescriptor_v4( - filterDesc, dataType, - format, nbDims, filterDimA) - check_status(status) - - -cpdef getFilterNdDescriptor(size_t wDesc, int nbDimsRequested): - cdef DataType dataType - cdef TensorFormat format - cdef int nbDims - cdef vector.vector[int] filterDimA - filterDimA.resize(nbDimsRequested) - - status = cudnnGetFilterNdDescriptor_v4( - wDesc, nbDimsRequested, &dataType, - &format, &nbDims, filterDimA.data()) - check_status(status) - return dataType, format, nbDims, tuple(filterDimA) - - -cpdef destroyFilterDescriptor(size_t filterDesc): - status = cudnnDestroyFilterDescriptor(filterDesc) - check_status(status) - - -############################################################################### -# Convolution -############################################################################### - -cpdef size_t createConvolutionDescriptor() except? 
0: - cdef ConvolutionDescriptor desc - if runtime._is_hip_environment: - status = miopen.miopenCreateConvolutionDescriptor(&desc) - else: - status = cudnnCreateConvolutionDescriptor(&desc) - check_status(status) - return desc - - -cpdef setConvolutionMathType(size_t convDesc, size_t mathType): - status = cudnnSetConvolutionMathType( - convDesc, mathType) - check_status(status) - - -cpdef size_t getConvolutionMathType(size_t convDesc) except? 0: - cdef MathType mathType - status = cudnnGetConvolutionMathType( - convDesc, &mathType) - return mathType - - -cpdef setConvolutionGroupCount(size_t convDesc, int groupCount): - if runtime._is_hip_environment: - status = miopen.miopenSetConvolutionGroupCount( - convDesc, groupCount) - else: - status = cudnnSetConvolutionGroupCount( - convDesc, groupCount) - check_status(status) - - -cpdef int getConvolutionGroupCount(size_t convDesc) except? -1: - cdef int groupCount - if runtime._is_hip_environment: - status = miopen.miopenGetConvolutionGroupCount( - convDesc, &groupCount) - else: - status = cudnnGetConvolutionGroupCount( - convDesc, &groupCount) - return groupCount - - -cpdef setConvolution2dDescriptor_v4( - size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, - int dilation_w, int mode): - status = cudnnSetConvolution2dDescriptor_v4( - convDesc, pad_h, pad_w, u, v, dilation_h, - dilation_w, mode) - check_status(status) - - -cpdef setConvolution2dDescriptor_v5( - size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, - int dilation_w, int mode, size_t computeType): - status = cudnnSetConvolution2dDescriptor_v5( - convDesc, pad_h, pad_w, u, v, dilation_h, - dilation_w, mode, computeType) - check_status(status) - - -cpdef setConvolutionNdDescriptor_v3( - size_t convDesc, int arrayLength, size_t padA, size_t filterStrideA, - size_t dilationA, int mode, int dataType): - status = cudnnSetConvolutionNdDescriptor_v3( - convDesc, arrayLength, padA, - filterStrideA, dilationA, mode, - dataType) - 
check_status(status) - - -cpdef destroyConvolutionDescriptor(size_t convDesc): - if runtime._is_hip_environment: - status = miopen.miopenDestroyConvolutionDescriptor( - convDesc) - else: - status = cudnnDestroyConvolutionDescriptor( - convDesc) - check_status(status) - - -cpdef findConvolutionForwardAlgorithm( - intptr_t handle, size_t xDesc, size_t wDesc, size_t convDesc, - size_t yDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionForwardAlgorithm( - handle, xDesc, wDesc, - convDesc, yDesc, - requestedAlgoCount, &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return perfResults - - -cpdef list findConvolutionForwardAlgorithmEx( - intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, - size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionForwardAlgorithmEx( - handle, xDesc, x, - wDesc, w, convDesc, - yDesc, y, requestedAlgoCount, - &returnedAlgoCount, perfResults.data(), workSpace, - workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) - for p in perfResults] - - -cpdef list findConvolutionForwardAlgorithmEx_v7( - intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, - size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionForwardAlgorithmEx_v7( - handle, xDesc, x, - wDesc, w, convDesc, - yDesc, y, requestedAlgoCount, - 
&returnedAlgoCount, perfResults.data(), workSpace, - workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef int getConvolutionForwardAlgorithm_v6( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int preference, size_t memoryLimitInbytes) except? -1: - cdef ConvolutionFwdAlgo algo - status = cudnnGetConvolutionForwardAlgorithm_v6( - handle, srcDesc, - filterDesc, convDesc, - destDesc, preference, - memoryLimitInbytes, &algo) - check_status(status) - return algo - - -cpdef list getConvolutionForwardAlgorithm_v7( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnGetConvolutionForwardAlgorithm_v7( - handle, srcDesc, - filterDesc, convDesc, - destDesc, requestedAlgoCount, - &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef Py_ssize_t getConvolutionForwardWorkspaceSize( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int algo) except? 
-1: - cdef size_t sizeInBytes - status = cudnnGetConvolutionForwardWorkspaceSize( - handle, srcDesc, - filterDesc, convDesc, - destDesc, algo, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef convolutionForward( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t filterDesc, size_t filterData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t destDesc, size_t destData): - _setStream(handle) - with nogil: - if runtime._is_hip_environment: - status = miopen.miopenConvolutionForward(handle, alpha, - srcDesc, srcData, - filterDesc, filterData, - convDesc, algo, - workSpace, workSpaceSizeInBytes, beta, - destDesc, destData) - else: - status = cudnnConvolutionForward( - handle, alpha, - srcDesc, srcData, - filterDesc, filterData, - convDesc, algo, - workSpace, workSpaceSizeInBytes, beta, - destDesc, destData) - check_status(status) - - -cpdef convolutionBackwardBias( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t beta, size_t destDesc, size_t destData): - _setStream(handle) - with nogil: - if runtime._is_hip_environment: - status = miopen.miopenConvolutionBackwardBias( - handle, alpha, - srcDesc, srcData, beta, - destDesc, destData) - else: - status = cudnnConvolutionBackwardBias( - handle, alpha, - srcDesc, srcData, beta, - destDesc, destData) - check_status(status) - - -cpdef findConvolutionBackwardFilterAlgorithm( - intptr_t handle, size_t xDesc, size_t dyDesc, size_t convDesc, - size_t dwDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionBwdFilterAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardFilterAlgorithm( - handle, xDesc, dyDesc, - convDesc, dwDesc, - requestedAlgoCount, &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return perfResults - - -cpdef list findConvolutionBackwardFilterAlgorithmEx( - 
intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, - size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionBwdFilterAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardFilterAlgorithmEx( - handle, xDesc, x, - dyDesc, dy, convDesc, - dwDesc, dw, - requestedAlgoCount, &returnedAlgoCount, perfResults.data(), - workSpace, workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) - for p in perfResults] - - -cpdef list findConvolutionBackwardFilterAlgorithmEx_v7( - intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, - size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionBwdFilterAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardFilterAlgorithmEx_v7( - handle, xDesc, x, - dyDesc, dy, convDesc, - dwDesc, dw, - requestedAlgoCount, &returnedAlgoCount, perfResults.data(), - workSpace, workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef int getConvolutionBackwardFilterAlgorithm_v6( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t filterDesc, int preference, - size_t memoryLimitInbytes) except? 
-1: - cdef ConvolutionBwdFilterAlgo algo - status = cudnnGetConvolutionBackwardFilterAlgorithm_v6( - handle, srcDesc, - diffDesc, convDesc, - filterDesc, - preference, - memoryLimitInbytes, &algo) - check_status(status) - return algo - - -cpdef list getConvolutionBackwardFilterAlgorithm_v7( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionBwdFilterAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnGetConvolutionBackwardFilterAlgorithm_v7( - handle, srcDesc, diffDesc, - convDesc, gradDesc, - requestedAlgoCount, &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef Py_ssize_t getConvolutionBackwardFilterWorkspaceSize( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t filterDesc, int algo) except? 
-1: - cdef size_t sizeInBytes - status = cudnnGetConvolutionBackwardFilterWorkspaceSize( - handle, srcDesc, - diffDesc, convDesc, - filterDesc, algo, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef convolutionBackwardFilter_v3( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t diffDesc, size_t diffData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t gradDesc, size_t gradData): - _setStream(handle) - with nogil: - status = cudnnConvolutionBackwardFilter_v3( - handle, alpha, - srcDesc, srcData, - diffDesc, diffData, - convDesc, algo, - workSpace, workSpaceSizeInBytes, beta, - gradDesc, gradData) - check_status(status) - - -cpdef findConvolutionBackwardDataAlgorithm( - intptr_t handle, size_t wDesc, size_t dyDesc, size_t convDesc, - size_t dxDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionBwdDataAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardDataAlgorithm( - handle, wDesc, dyDesc, - convDesc, dxDesc, - requestedAlgoCount, &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return perfResults - - -cpdef list findConvolutionBackwardDataAlgorithmEx( - intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, - size_t convDesc, size_t dxDesc, size_t dx, - int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionBwdDataAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardDataAlgorithmEx( - handle, wDesc, w, - dyDesc, dy, convDesc, - dxDesc, dx, - requestedAlgoCount, &returnedAlgoCount, perfResults.data(), - workSpace, workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) - for p in perfResults] - - 
-cpdef list findConvolutionBackwardDataAlgorithmEx_v7( - intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, - size_t convDesc, size_t dxDesc, size_t dx, - int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes): - cdef vector.vector[ConvolutionBwdDataAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionBackwardDataAlgorithmEx_v7( - handle, wDesc, w, - dyDesc, dy, convDesc, - dxDesc, dx, - requestedAlgoCount, &returnedAlgoCount, perfResults.data(), - workSpace, workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef int getConvolutionBackwardDataAlgorithm_v6( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, size_t preference, - size_t memoryLimitInbytes) except? -1: - cdef ConvolutionBwdDataAlgo algo - status = cudnnGetConvolutionBackwardDataAlgorithm_v6( - handle, filterDesc, - diffDesc, convDesc, - gradDesc, preference, - memoryLimitInbytes, &algo) - check_status(status) - return algo - - -cpdef list getConvolutionBackwardDataAlgorithm_v7( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionBwdDataAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnGetConvolutionBackwardDataAlgorithm_v7( - handle, filterDesc, - diffDesc, convDesc, - gradDesc, requestedAlgoCount, - &returnedAlgoCount, perfResults.data()) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef Py_ssize_t getConvolutionBackwardDataWorkspaceSize( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t 
gradDesc, int algo) except? -1: - cdef size_t sizeInBytes - status = cudnnGetConvolutionBackwardDataWorkspaceSize( - handle, filterDesc, - diffDesc, - convDesc, gradDesc, - algo, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef convolutionBackwardData_v3( - intptr_t handle, size_t alpha, size_t filterDesc, size_t filterData, - size_t diffDesc, size_t diffData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t gradDesc, size_t gradData): - _setStream(handle) - with nogil: - status = cudnnConvolutionBackwardData_v3( - handle, alpha, - filterDesc, filterData, - diffDesc, diffData, - convDesc, algo, - workSpace, workSpaceSizeInBytes, beta, - gradDesc, gradData) - check_status(status) - -############################################################################### -# Pooling -############################################################################### - -cpdef size_t createPoolingDescriptor() except? 0: - cdef PoolingDescriptor desc - if runtime._is_hip_environment: - status = miopen.miopenCreatePoolingDescriptor(&desc) - else: - status = cudnnCreatePoolingDescriptor(&desc) - check_status(status) - return desc - - -cpdef setPooling2dDescriptor_v4( - size_t poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, - int windowWidth, int verticalPadding, int horizontalPadding, - int verticalStride, int horizontalStride): - status = cudnnSetPooling2dDescriptor_v4( - poolingDesc, mode, - maxpoolingNanOpt, windowHeight, windowWidth, - verticalPadding, horizontalPadding, verticalStride, horizontalStride) - check_status(status) - - -cpdef setPoolingNdDescriptor_v4( - size_t poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, - size_t windowDimA, size_t paddingA, size_t strideA): - status = cudnnSetPoolingNdDescriptor_v4( - poolingDesc, mode, - maxpoolingNanOpt, nbDims, - windowDimA, paddingA, strideA) - check_status(status) - - -cpdef destroyPoolingDescriptor(size_t poolingDesc): - if 
runtime._is_hip_environment: - status = miopen.miopenDestroyPoolingDescriptor(poolingDesc) - else: - status = cudnnDestroyPoolingDescriptor(poolingDesc) - check_status(status) - - -cpdef poolingForward( - intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData): - _setStream(handle) - with nogil: - status = cudnnPoolingForward( - handle, poolingDesc, alpha, - srcDesc, srcData, beta, - dstDesc, dstData) - check_status(status) - - -cpdef poolingBackward( - intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, - size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, - size_t destDiffData): - _setStream(handle) - with nogil: - status = cudnnPoolingBackward( - handle, poolingDesc, alpha, - srcDesc, srcData, - srcDiffDesc, srcDiffData, - destDesc, destData, beta, - destDiffDesc, destDiffData) - check_status(status) - -############################################################################### -# Batch Normalization -############################################################################### - -CUDNN_BN_MIN_EPSILON = _CUDNN_BN_MIN_EPSILON - -cpdef deriveBNTensorDescriptor( - size_t derivedBnDesc, size_t xDesc, int mode): - if runtime._is_hip_environment: - status = miopen.miopenDeriveBNTensorDescriptor( - derivedBnDesc, xDesc, - mode) - else: - status = cudnnDeriveBNTensorDescriptor( - derivedBnDesc, xDesc, - mode) - check_status(status) - - -cpdef batchNormalizationForwardTraining( - intptr_t handle, int mode, - size_t alpha, size_t beta, size_t xDesc, - size_t x, size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, size_t bnScale, - size_t bnBias, double exponentialAverageFactor, - size_t resultRunningMean, size_t resultRunningVariance, - double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance): - _setStream(handle) - with nogil: - if runtime._is_hip_environment: - status = 
miopen.miopenBatchNormalizationForwardTraining( - handle, mode, - alpha, beta, xDesc, - x, yDesc, y, - bnScaleBiasMeanVarDesc, bnScale, - bnBias, exponentialAverageFactor, - resultRunningMean, resultRunningVariance, - epsilon, resultSaveMean, resultSaveInvVariance) - else: - status = cudnnBatchNormalizationForwardTraining( - handle, mode, - alpha, beta, xDesc, - x, yDesc, y, - bnScaleBiasMeanVarDesc, bnScale, - bnBias, exponentialAverageFactor, - resultRunningMean, resultRunningVariance, - epsilon, resultSaveMean, resultSaveInvVariance) - check_status(status) - - -cpdef batchNormalizationForwardInference( - intptr_t handle, int mode, - size_t alpha, size_t beta, size_t xDesc, - size_t x, size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, size_t bnScale, - size_t bnBias, size_t estimatedMean, size_t estimatedVariance, - double epsilon): - _setStream(handle) - with nogil: - if runtime._is_hip_environment: - status = miopen.miopenBatchNormalizationForwardInference( - handle, mode, - alpha, beta, xDesc, - x, yDesc, y, - bnScaleBiasMeanVarDesc, bnScale, - bnBias, estimatedMean, estimatedVariance, - epsilon) - else: - status = cudnnBatchNormalizationForwardInference( - handle, mode, - alpha, beta, xDesc, - x, yDesc, y, - bnScaleBiasMeanVarDesc, bnScale, - bnBias, estimatedMean, estimatedVariance, - epsilon) - check_status(status) - - -cpdef batchNormalizationBackward( - intptr_t handle, int mode, - size_t alphaDataDiff, size_t betaDataDiff, - size_t alphaParamDiff, size_t betaParamDiff, - size_t xDesc, size_t x, size_t dyDesc, - size_t dy, size_t dxDesc, size_t dx, - size_t dBnScaleBiasDesc, size_t bnScale, - size_t dBnScaleResult, size_t dBnBiasResult, - double epsilon, size_t savedMean, size_t savedInvVariance): - _setStream(handle) - with nogil: - if runtime._is_hip_environment: - status = miopen.miopenBatchNormalizationBackward( - handle, mode, - alphaDataDiff, betaDataDiff, - alphaParamDiff, betaParamDiff, - xDesc, x, - dyDesc, dy, - dxDesc, dx, - 
dBnScaleBiasDesc, bnScale, - dBnScaleResult, dBnBiasResult, - epsilon, savedMean, savedInvVariance) - else: - status = cudnnBatchNormalizationBackward( - handle, mode, - alphaDataDiff, betaDataDiff, - alphaParamDiff, betaParamDiff, - xDesc, x, - dyDesc, dy, - dxDesc, dx, - dBnScaleBiasDesc, bnScale, - dBnScaleResult, dBnBiasResult, - epsilon, savedMean, savedInvVariance) - check_status(status) - - -cpdef batchNormalizationForwardTrainingEx( - intptr_t handle, int mode, int bnOps, - size_t alpha, size_t beta, - size_t xDesc, size_t x, - size_t zDesc, size_t z, - size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, - size_t bnScale, size_t bnBias, - double exponentialAverageFactor, - size_t resultRunningMean, size_t resultRunningVariance, - double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance, - size_t activationDesc, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnBatchNormalizationForwardTrainingEx( - handle, mode, bnOps, - alpha, beta, - xDesc, x, - zDesc, z, - yDesc, y, - bnScaleBiasMeanVarDesc, - bnScale, bnBias, - exponentialAverageFactor, - resultRunningMean, resultRunningVariance, - epsilon, resultSaveMean, resultSaveInvVariance, - activationDesc, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef size_t getBatchNormalizationForwardTrainingExWorkspaceSize( - intptr_t handle, int mode, int bnOps, - size_t xDesc, - size_t zDesc, - size_t yDesc, - size_t bnScaleBiasMeanVarDesc, - size_t activationDesc) except? 
0: - cdef size_t sizeInBytes - status = cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - handle, - mode, bnOps, - xDesc, - zDesc, - yDesc, - bnScaleBiasMeanVarDesc, - activationDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef batchNormalizationBackwardEx( - intptr_t handle, int mode, int bnops, - size_t alphaDataDiff, size_t betaDataDiff, - size_t alphaParamDiff, size_t betaParamDiff, - size_t xDesc, size_t x, - size_t yDesc, size_t y, - size_t dyDesc, size_t dy, - size_t dzDesc, size_t dz, - size_t dxDesc, size_t dx, - size_t dBnScaleBiasDesc, - size_t bnScaleData, size_t bnBiasData, - size_t dBnScaleData, size_t dBnBiasData, - double epsilon, - size_t savedMean, size_t savedInvVariance, - size_t activationDesc, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnBatchNormalizationBackwardEx( - handle, - mode, bnops, - alphaDataDiff, betaDataDiff, - alphaParamDiff, betaParamDiff, - xDesc, x, - yDesc, y, - dyDesc, dy, - dzDesc, dz, - dxDesc, dx, - dBnScaleBiasDesc, - bnScaleData, bnBiasData, - dBnScaleData, dBnBiasData, - epsilon, - savedMean, savedInvVariance, - activationDesc, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef size_t getBatchNormalizationBackwardExWorkspaceSize( - intptr_t handle, int mode, int bnOps, - size_t xDesc, - size_t yDesc, - size_t dyDesc, - size_t dzDesc, - size_t dxDesc, - size_t dBnScaleBiasDesc, - size_t activationDesc) except? 
0: - cdef size_t sizeInBytes - status = cudnnGetBatchNormalizationBackwardExWorkspaceSize( - handle, - mode, - bnOps, - xDesc, - yDesc, - dyDesc, - dzDesc, - dxDesc, - dBnScaleBiasDesc, - activationDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( - intptr_t handle, int mode, int bnOps, - size_t activationDesc, - size_t xDesc) except? 0: - cdef size_t sizeInBytes - status = cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - handle, - mode, - bnOps, - activationDesc, - xDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -############################################################################### -# Activation -############################################################################### - -cpdef size_t createActivationDescriptor() except? 0: - cdef ActivationDescriptor activationDesc - if runtime._is_hip_environment: - status = miopen.miopenCreateActivationDescriptor(&activationDesc) - else: - status = cudnnCreateActivationDescriptor(&activationDesc) - check_status(status) - return activationDesc - - -cpdef setActivationDescriptor( - size_t activationDesc, int mode, int reluNanOpt, double reluCeiling): - status = cudnnSetActivationDescriptor( - activationDesc, mode, - reluNanOpt, reluCeiling) - check_status(status) - - -cpdef destroyActivationDescriptor(size_t activationDesc): - if runtime._is_hip_environment: - status = miopen.miopenDestroyActivationDescriptor( - activationDesc) - else: - status = cudnnDestroyActivationDescriptor( - activationDesc) - check_status(status) - - -cpdef softmaxForward( - intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData): - _setStream(handle) - with nogil: - if runtime._is_hip_environment: - status = miopen.miopenSoftmaxForward( - handle, algorithm, mode, - alpha, srcDesc, srcData, - beta, dstDesc, dstData) - else: - status = 
cudnnSoftmaxForward( - handle, algorithm, mode, - alpha, srcDesc, srcData, - beta, dstDesc, dstData) - check_status(status) - - -cpdef softmaxBackward( - intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, size_t beta, - size_t destDiffDesc, size_t destDiffData): - _setStream(handle) - with nogil: - if runtime._is_hip_environment: - status = miopen.miopenSoftmaxBackward( - handle, algorithm, mode, - alpha, srcDesc, srcData, - srcDiffDesc, srcDiffData, beta, - destDiffDesc, destDiffData) - else: - status = cudnnSoftmaxBackward( - handle, algorithm, mode, - alpha, srcDesc, srcData, - srcDiffDesc, srcDiffData, beta, - destDiffDesc, destDiffData) - check_status(status) - - -cpdef activationForward_v4( - intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData): - _setStream(handle) - with nogil: - status = cudnnActivationForward_v4( - handle, activationDesc, alpha, - srcDesc, srcData, beta, - dstDesc, dstData) - check_status(status) - - -cpdef activationBackward_v4( - intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, - size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, - size_t destDiffData): - _setStream(handle) - with nogil: - status = cudnnActivationBackward_v4( - handle, activationDesc, alpha, - srcDesc, srcData, - srcDiffDesc, srcDiffData, - destDesc, destData, beta, - destDiffDesc, destDiffData) - check_status(status) - - -############################################################################### -# Dropout -############################################################################### - -cpdef size_t createDropoutDescriptor() except? 
0: - cdef DropoutDescriptor desc - if runtime._is_hip_environment: - status = miopen.miopenCreateDropoutDescriptor(&desc) - else: - status = cudnnCreateDropoutDescriptor(&desc) - check_status(status) - return desc - - -cpdef destroyDropoutDescriptor(size_t dropoutDesc): - if runtime._is_hip_environment: - status = miopen.miopenDestroyDropoutDescriptor(dropoutDesc) - else: - status = cudnnDestroyDropoutDescriptor(dropoutDesc) - check_status(status) - - -cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? -1: - cdef size_t sizeInBytes - if runtime._is_hip_environment: - status = miopen.miopenDropoutGetStatesSize( - handle, &sizeInBytes) - else: - status = cudnnDropoutGetStatesSize( - handle, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef setDropoutDescriptor( - size_t dropoutDesc, intptr_t handle, float dropout, - size_t states, size_t stateSizeInBytes, unsigned long long seed): - status = cudnnSetDropoutDescriptor( - dropoutDesc, handle, dropout, - states, stateSizeInBytes, seed) - check_status(status) - - -cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 
0: - cdef size_t sizeInBytes - if runtime._is_hip_environment: - status = miopen.miopenDropoutGetReserveSpaceSize( - xDesc, &sizeInBytes) - else: - status = cudnnDropoutGetReserveSpaceSize( - xDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef dropoutForward( - intptr_t handle, size_t dropoutDesc, - size_t srcDesc, size_t srcData, - size_t dstDesc, size_t dstData, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnDropoutForward( - handle, dropoutDesc, - srcDesc, srcData, - dstDesc, dstData, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef dropoutBackward( - intptr_t handle, size_t dropoutDesc, - size_t dyDesc, size_t dyData, - size_t dxDesc, size_t dxData, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnDropoutBackward( - handle, dropoutDesc, - dyDesc, dyData, - dxDesc, dxData, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -############################################################################### -# CTC -############################################################################### -cpdef size_t createCTCLossDescriptor() except? 
0: - cdef CTCLossDescriptor desc - if runtime._is_hip_environment: - status = miopen.miopenCreateCTCLossDescriptor(&desc) - else: - status = cudnnCreateCTCLossDescriptor(&desc) - check_status(status) - return desc - -cpdef destroyCTCLossDescriptor(size_t ctcLossDesc): - if runtime._is_hip_environment: - status = miopen.miopenDestroyCTCLossDescriptor(ctcLossDesc) - else: - status = cudnnDestroyCTCLossDescriptor(ctcLossDesc) - check_status(status) - -cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType): - status = cudnnSetCTCLossDescriptor( - ctcLossDesc, dataType) - check_status(status) - -cpdef getCTCLossDescriptor(size_t ctcLossDesc): - cdef DataType compType - status = cudnnGetCTCLossDescriptor( - ctcLossDesc, &compType) - check_status(status) - return compType - -cpdef size_t getCTCLossWorkspaceSize( - intptr_t handle, size_t probsDesc, size_t gradientsDesc, - size_t labels, size_t labelLengths, size_t inputLengths, - int algo, size_t ctcLossDesc) except? 0: - cdef size_t sizeInBytes - if runtime._is_hip_environment: - status = miopen.miopenGetCTCLossWorkspaceSize( - handle, probsDesc, - gradientsDesc, - labels, labelLengths, inputLengths, - algo, ctcLossDesc, &sizeInBytes) - else: - status = cudnnGetCTCLossWorkspaceSize( - handle, probsDesc, - gradientsDesc, - labels, labelLengths, inputLengths, - algo, ctcLossDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - -cpdef CTCLoss( - intptr_t handle, size_t probsDesc, - size_t probs, size_t labels, size_t labelLengths, size_t inputLengths, - size_t costs, size_t gradientsDesc, size_t gradients, - int algo, size_t ctcLossDesc, - size_t workspace, size_t workSpaceSizeInBytes): - if runtime._is_hip_environment: - status = miopen.miopenCTCLoss( - handle, probsDesc, probs, - labels, labelLengths, inputLengths, - costs, gradientsDesc, gradients, - algo, ctcLossDesc, - workspace, workSpaceSizeInBytes) - else: - status = cudnnCTCLoss( - handle, probsDesc, probs, - labels, labelLengths, inputLengths, - 
costs, gradientsDesc, gradients, - algo, ctcLossDesc, - workspace, workSpaceSizeInBytes) - check_status(status) - - -############################################################################### -# RNN -############################################################################### - -cpdef size_t createRNNDescriptor() except? 0: - cdef RNNDescriptor desc - if runtime._is_hip_environment: - status = miopen.miopenCreateRNNDescriptor(&desc) - else: - status = cudnnCreateRNNDescriptor(&desc) - check_status(status) - return desc - - -cpdef destroyRNNDescriptor(size_t rnnDesc): - if runtime._is_hip_environment: - status = miopen.miopenDestroyRNNDescriptor(rnnDesc) - else: - status = cudnnDestroyRNNDescriptor(rnnDesc) - check_status(status) - - -cpdef size_t createPersistentRNNPlan(size_t rnnDesc, int minibatch, - int dataType) except? 0: - cdef PersistentRNNPlan plan - status = cudnnCreatePersistentRNNPlan( - rnnDesc, - minibatch, dataType, &plan) - check_status(status) - return plan - - -cpdef setPersistentRNNPlan(size_t rnnDesc, size_t plan): - status = cudnnSetPersistentRNNPlan( - rnnDesc, plan) - check_status(status) - - -cpdef destroyPersistentRNNPlan(size_t plan): - status = cudnnDestroyPersistentRNNPlan(plan) - check_status(status) - - -cpdef setRNNDescriptor_v5( - size_t rnnDesc, int hiddenSize, int numLayers, - size_t dropoutDesc, int inputMode, int direction, int mode, - int dataType): - status = cudnnSetRNNDescriptor_v5( - rnnDesc, hiddenSize, numLayers, - dropoutDesc, inputMode, - direction, mode, dataType) - check_status(status) - - -cpdef setRNNDescriptor_v6( - intptr_t handle, size_t rnnDesc, int hiddenSize, int numLayers, - size_t dropoutDesc, int inputMode, int direction, int mode, - int algo, int dataType): - status = cudnnSetRNNDescriptor_v6( - handle, rnnDesc, hiddenSize, numLayers, - dropoutDesc, inputMode, - direction, mode, algo, - dataType) - check_status(status) - - -cpdef setRNNPaddingMode( - size_t rnnDesc, int paddingMode): - status = 
cudnnSetRNNPaddingMode( - rnnDesc, paddingMode) - check_status(status) - - -cpdef getRNNPaddingMode(size_t rnnDesc): - cdef RNNPaddingMode paddingMode - status = cudnnGetRNNPaddingMode( - rnnDesc, &paddingMode) - check_status(status) - return paddingMode - - -cpdef size_t createRNNDataDescriptor() except? 0: - cdef RNNDataDescriptor desc - status = cudnnCreateRNNDataDescriptor(&desc) - check_status(status) - return desc - - -cpdef destroyRNNDataDescriptor(size_t RNNDataDesc): - status = cudnnDestroyRNNDataDescriptor(RNNDataDesc) - check_status(status) - - -cpdef setRNNDataDescriptor( - size_t RNNDataDesc, int dataType, size_t layout, - int maxSeqLength, int batchSize, int vectorSize, - size_t seqLengthArray, size_t paddingFill): - status = cudnnSetRNNDataDescriptor( - RNNDataDesc, dataType, - layout, maxSeqLength, batchSize, vectorSize, - seqLengthArray, paddingFill) - check_status(status) - - -cpdef getRNNDataDescriptor( - size_t RNNDataDesc, size_t dataType, - size_t layout, size_t maxSeqLength, size_t batchSize, - size_t vectorSize, int arrayLengthRequested, size_t seqLengthArray, - size_t paddingFill): - status = cudnnGetRNNDataDescriptor( - RNNDataDesc, dataType, - layout, maxSeqLength, batchSize, - vectorSize, arrayLengthRequested, seqLengthArray, - paddingFill) - check_status(status) - - -cpdef getRNNWorkspaceSize( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): - cdef size_t sizeInBytes - if runtime._is_hip_environment: - status = miopen.miopenGetRNNWorkspaceSize( - handle, rnnDesc, seqLength, - xDesc, &sizeInBytes) - else: - status = cudnnGetRNNWorkspaceSize( - handle, rnnDesc, seqLength, - xDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef getRNNTrainingReserveSize( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): - cdef size_t sizeInBytes - if runtime._is_hip_environment: - status = miopen.miopenGetRNNTrainingReserveSize( - handle, rnnDesc, seqLength, - xDesc, &sizeInBytes) - else: - status = 
cudnnGetRNNTrainingReserveSize( - handle, rnnDesc, seqLength, - xDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef getRNNParamsSize( - intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType): - cdef size_t sizeInBytes - if runtime._is_hip_environment: - status = miopen.miopenGetRNNParamsSize( - handle, rnnDesc, xDesc, - &sizeInBytes, dataType) - else: - status = cudnnGetRNNParamsSize( - handle, rnnDesc, xDesc, - &sizeInBytes, dataType) - check_status(status) - return sizeInBytes - - -cpdef getRNNLinLayerMatrixParams( - intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, - size_t w, int linLayerID, size_t linLayerMatDesc, size_t linLayerMat): - status = cudnnGetRNNLinLayerMatrixParams( - handle, rnnDesc, layer, - xDesc, wDesc, w, - linLayerID, linLayerMatDesc, linLayerMat) - check_status(status) - - -cpdef getRNNLinLayerBiasParams( - intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, - size_t w, int linLayerID, size_t linLayerBiasDesc, - size_t linLayerBias): - status = cudnnGetRNNLinLayerBiasParams( - handle, rnnDesc, layer, - xDesc, wDesc, w, - linLayerID, linLayerBiasDesc, linLayerBias) - check_status(status) - - -cpdef RNNForwardInference( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, - size_t x, size_t hxDesc, size_t hx, size_t cxDesc, - size_t cx, size_t wDesc, size_t w, size_t yDesc, - size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t workspace, size_t workSpaceSizeInBytes): - _setStream(handle) - with nogil: - if runtime._is_hip_environment: - status = miopen.miopenRNNForwardInference( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - workspace, workSpaceSizeInBytes) - else: - status = cudnnRNNForwardInference( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - workspace, workSpaceSizeInBytes) - 
check_status(status) - - -cpdef RNNForwardTraining( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, - size_t wDesc, size_t w, size_t yDesc, size_t y, - size_t hyDesc, size_t hy, size_t cyDesc, size_t cy, - size_t workspace, size_t workSpaceSizeInBytes, size_t reserveSpace, - size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - if runtime._is_hip_environment: - status = miopen.miopenRNNForwardTraining( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - workspace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - else: - status = cudnnRNNForwardTraining( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - workspace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNBackwardData( - intptr_t handle, size_t rnnDesc, int seqLength, size_t yDesc, size_t y, - size_t dyDesc, size_t dy, size_t dhyDesc, size_t dhy, - size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, - size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, - size_t dxDesc, size_t dx, size_t dhxDesc, size_t dhx, - size_t dcxDesc, size_t dcx, size_t workspace, - size_t workSpaceSizeInBytes, size_t reserveSpace, - size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardData( - handle, rnnDesc, seqLength, - yDesc, y, - dyDesc, dy, - dhyDesc, dhy, - dcyDesc, dcy, - wDesc, w, - hxDesc, hx, - cxDesc, cx, - dxDesc, dx, - dhxDesc, dhx, - dcxDesc, dcx, - workspace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNBackwardWeights( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t yDesc, size_t y, - size_t workspace, size_t workSpaceSizeInBytes, size_t dwDesc, - size_t dw, 
size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardWeights( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - yDesc, y, - workspace, workSpaceSizeInBytes, - dwDesc, dw, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNForwardInferenceEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, - size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, - size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, - size_t workSpace, size_t workSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNForwardInferenceEx( - handle, rnnDesc, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - kDesc, keys, - cDesc, cAttn, - iDesc, iAttn, - qDesc, queries, - workSpace, workSpaceSizeInBytes) - check_status(status) - - -cpdef RNNForwardTrainingEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, - size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, - size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNForwardTrainingEx( - handle, rnnDesc, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - kDesc, keys, - cDesc, cAttn, - iDesc, iAttn, - qDesc, queries, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNBackwardDataEx( - intptr_t handle, size_t rnnDesc, size_t yDesc, size_t y, size_t dyDesc, - size_t dy, size_t dcDesc, size_t dcAttn, 
size_t dhyDesc, size_t dhy, - size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t dxDesc, size_t dx, - size_t dhxDesc, size_t dhx, size_t dcxDesc, size_t dcx, - size_t dkDesc, size_t dkeys, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardDataEx( - handle, rnnDesc, - yDesc, y, - dyDesc, dy, - dcDesc, dcAttn, - dhyDesc, dhy, - dcyDesc, dcy, - wDesc, w, - hxDesc, hx, - cxDesc, cx, - dxDesc, dx, - dhxDesc, dhx, - dcxDesc, dcx, - dkDesc, dkeys, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNBackwardWeightsEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t yDesc, size_t y, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t dwDesc, size_t dw, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardWeightsEx( - handle, rnnDesc, - xDesc, x, - hxDesc, hx, - yDesc, y, - workSpace, workSpaceSizeInBytes, - dwDesc, dw, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -############################################################################### -# Spatial Transformer -############################################################################### - -cpdef size_t createSpatialTransformerDescriptor() except? 
0: - cdef SpatialTransformerDescriptor stDesc - status = cudnnCreateSpatialTransformerDescriptor(&stDesc) - check_status(status) - return stDesc - - -cpdef destroySpatialTransformerDescriptor(size_t stDesc): - status = cudnnDestroySpatialTransformerDescriptor( - stDesc) - check_status(status) - - -cpdef setSpatialTransformerDescriptor( - size_t stDesc, size_t samplerType, int dataType, - int nbDims, size_t dimA): - status = cudnnSetSpatialTransformerNdDescriptor( - stDesc, samplerType, - dataType, nbDims, dimA) - check_status(status) - - -cpdef spatialTfGridGeneratorForward( - intptr_t handle, size_t stDesc, size_t theta, size_t grid): - _setStream(handle) - with nogil: - status = cudnnSpatialTfGridGeneratorForward( - handle, stDesc, - theta, grid) - check_status(status) - - -cpdef spatialTfGridGeneratorBackward( - intptr_t handle, size_t stDesc, size_t dgrid, size_t dtheta): - _setStream(handle) - with nogil: - status = cudnnSpatialTfGridGeneratorBackward( - handle, stDesc, - dgrid, dtheta) - check_status(status) - - -cpdef spatialTfSamplerForward( - intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, - size_t x, size_t grid, size_t beta, size_t yDesc, size_t y): - _setStream(handle) - with nogil: - status = cudnnSpatialTfSamplerForward( - handle, stDesc, - alpha, xDesc, x, grid, - beta, yDesc, y) - check_status(status) - - -cpdef spatialTfSamplerBackward( - intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, - size_t x, size_t beta, size_t dxDesc, size_t dx, size_t alphaDgrid, - size_t dyDesc, size_t dy, size_t grid, size_t betaDgrid, size_t dgrid): - _setStream(handle) - with nogil: - status = cudnnSpatialTfSamplerBackward( - handle, stDesc, - alpha, xDesc, x, beta, - dxDesc, dx, alphaDgrid, - dyDesc, dy, grid, - betaDgrid, dgrid) - check_status(status) - -############################################################################### -# Fused Ops -############################################################################### - -cpdef 
createFusedOpsConstParamPack(int ops): - cdef FusedOpsConstParamPack constPack - with nogil: - status = cudnnCreateFusedOpsConstParamPack(&constPack, ops) - check_status(status) - return constPack -cpdef destroyFusedOpsConstParamPack(size_t constPack): - with nogil: - status = cudnnDestroyFusedOpsConstParamPack( - constPack) - check_status(status) - -cpdef setFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, - size_t param): - with nogil: - status = cudnnSetFusedOpsConstParamPackAttribute( - constPack, - paramLabel, param) - check_status(status) - -cpdef getFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, - size_t param): - cdef int isNULL = 0 - with nogil: - status = cudnnGetFusedOpsConstParamPackAttribute( - constPack, - paramLabel, param, &isNULL) - check_status(status) - return isNULL - -cpdef createFusedOpsVariantParamPack(int ops): - cdef FusedOpsVariantParamPack varPack - with nogil: - status = cudnnCreateFusedOpsVariantParamPack(&varPack, ops) - check_status(status) - return varPack - -cpdef destroyFusedOpsVariantParamPack(size_t varPack): - with nogil: - status = cudnnDestroyFusedOpsVariantParamPack( - varPack) - check_status(status) - -cpdef setFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, - size_t ptr): - with nogil: - status = cudnnSetFusedOpsVariantParamPackAttribute( - varPack, - paramLabel, ptr) - check_status(status) - -cpdef getFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, - size_t ptr): - with nogil: - status = cudnnGetFusedOpsVariantParamPackAttribute( - varPack, - paramLabel, ptr) - check_status(status) - -cpdef createFusedOpsPlan(int ops): - cdef FusedOpsPlan plan - with nogil: - status = cudnnCreateFusedOpsPlan(&plan, ops) - check_status(status) - return plan - -cpdef destroyFusedOpsPlan(size_t plan): - with nogil: - status = cudnnDestroyFusedOpsPlan(plan) - check_status(status) - -cpdef makeFusedOpsPlan(intptr_t handle, size_t plan, size_t constPack): - cdef size_t 
workspaceSizeInBytes - _setStream(handle) - with nogil: - status = cudnnMakeFusedOpsPlan(handle, plan, - constPack, - &workspaceSizeInBytes) - check_status(status) - return workspaceSizeInBytes - -cpdef fusedOpsExecute(intptr_t handle, size_t plan, size_t varPack): - _setStream(handle) - with nogil: - status = cudnnFusedOpsExecute(handle, plan, - varPack) - check_status(status) diff --git a/cupy_backends/cupy_cudnn.h b/cupy_backends/cupy_cudnn.h index a514f63d200..f30147ac9e8 100644 --- a/cupy_backends/cupy_cudnn.h +++ b/cupy_backends/cupy_cudnn.h @@ -2,8 +2,11 @@ #ifndef INCLUDE_GUARD_CUPY_CUDNN_H #define INCLUDE_GUARD_CUPY_CUDNN_H +#if CUPY_USE_HIP -#ifndef CUPY_NO_CUDA +#include "miopen/miopen.h" + +#elif !defined(CUPY_NO_CUDA) #include @@ -12,10 +15,6 @@ #include "stub/cupy_cuda_common.h" #include "stub/cupy_cudnn.h" -#else - -#include "hip/cupy_hip_common.h" -#include "stub/cupy_cudnn.h" #endif // #ifdef CUPY_NO_CUDA From e79cce9443ce1c7ddeedf10222b9844cc3f8cc4f Mon Sep 17 00:00:00 2001 From: bmedishe Date: Thu, 30 Nov 2023 05:15:16 +0000 Subject: [PATCH 15/26] update cudnn.pyx --- cupy_backends/cuda/libs/cudnn.pyx | 1462 +++++++++++++++-------------- install/cupy_builder/_features.py | 2 +- 2 files changed, 733 insertions(+), 731 deletions(-) diff --git a/cupy_backends/cuda/libs/cudnn.pyx b/cupy_backends/cuda/libs/cudnn.pyx index bc49c090ea8..835b8e0570c 100644 --- a/cupy_backends/cuda/libs/cudnn.pyx +++ b/cupy_backends/cuda/libs/cudnn.pyx @@ -8,736 +8,737 @@ from libcpp cimport vector from cupy_backends.cuda.api cimport driver from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module - -from cupy_backends.cuda.libs cimport miopen -############################################################################### -# Extern -############################################################################### - -cdef extern from '../../cupy_cudnn.h' nogil: - # Types - ctypedef int ActivationMode 'cudnnActivationMode_t' - 
ctypedef int AddMode 'cudnnAddMode_t' - ctypedef int BatchNormMode 'cudnnBatchNormMode_t' - ctypedef int BatchNormOps 'cudnnBatchNormOps_t' - ctypedef int ConvolutionBwdDataAlgo 'cudnnConvolutionBwdDataAlgo_t' - ctypedef int ConvolutionBwdDataPreference \ - 'cudnnConvolutionBwdDataPreference_t' - ctypedef struct ConvolutionBwdDataAlgoPerf \ - 'cudnnConvolutionBwdDataAlgoPerf_t': # NOQA: E125 - int algo - int status - float time - size_t memory - ctypedef struct ConvolutionBwdDataAlgoPerf_v7 \ - 'cudnnConvolutionBwdDataAlgoPerf_v7_t': # NOQA: E125 - int algo - int status - float time - size_t memory - int determinism - int mathType - ctypedef int ConvolutionBwdFilterAlgo 'cudnnConvolutionBwdFilterAlgo_t' - ctypedef int ConvolutionBwdFilterPreference \ - 'cudnnConvolutionBwdFilterPreference_t' - ctypedef struct ConvolutionBwdFilterAlgoPerf \ - 'cudnnConvolutionBwdFilterAlgoPerf_t': # NOQA: E125 - int algo - int status - float time - size_t memory - ctypedef struct ConvolutionBwdFilterAlgoPerf_v7 \ - 'cudnnConvolutionBwdFilterAlgoPerf_v7_t': # NOQA: E125 - int algo - int status - float time - size_t memory - int determinism - int mathType - ctypedef int ConvolutionFwdAlgo 'cudnnConvolutionFwdAlgo_t' - ctypedef int ConvolutionFwdPreference 'cudnnConvolutionFwdPreference_t' - ctypedef struct ConvolutionFwdAlgoPerf 'cudnnConvolutionFwdAlgoPerf_t': - int algo - int status - float time - size_t memory - ctypedef struct ConvolutionFwdAlgoPerf_v7 \ - 'cudnnConvolutionFwdAlgoPerf_v7_t': # NOQA: E125 - int algo - int status - float time - size_t memory - int determinism - int mathType - ctypedef int ConvolutionMode 'cudnnConvolutionMode_t' - ctypedef int DataType 'cudnnDataType_t' - ctypedef int MathType 'cudnnMathType_t' - ctypedef int DirectionMode 'cudnnDirectionMode_t' - ctypedef int NanPropagation 'cudnnNanPropagation_t' - ctypedef int PoolingMode 'cudnnPoolingMode_t' - ctypedef int RNNInputMode 'cudnnRNNInputMode_t' - ctypedef int CTCLossAlgo 'cudnnCTCLossAlgo_t' - 
ctypedef int RNNMode 'cudnnRNNMode_t' - ctypedef int RNNAlgo 'cudnnRNNAlgo_t' - ctypedef int RNNDataLayout 'cudnnRNNDataLayout_t' - ctypedef int RNNPaddingMode 'cudnnRNNPaddingMode_t' - ctypedef int SoftmaxAlgorithm 'cudnnSoftmaxAlgorithm_t' - ctypedef int SoftmaxMode 'cudnnSoftmaxMode_t' - ctypedef int Status 'cudnnStatus_t' - ctypedef int TensorFormat 'cudnnTensorFormat_t' - ctypedef int OpTensorOp 'cudnnOpTensorOp_t' - ctypedef int ReduceTensorOp 'cudnnReduceTensorOp_t' - ctypedef int ReduceTensorIndices 'cudnnReduceTensorIndices_t' - ctypedef int IndicesType 'cudnnIndicesType_t' - ctypedef int ErrQueryMode 'cudnnErrQueryMode_t' - ctypedef int FusedOps 'cudnnFusedOps_t' - ctypedef int FusedOpsConstParamLabel 'cudnnFusedOpsConstParamLabel_t' - ctypedef int FusedOpsPointerPlaceHolder 'cudnnFusedOpsPointerPlaceHolder_t' - ctypedef int FusedOpsVariantParamLabel 'cudnnFusedOpsVariantParamLabel_t' - ctypedef struct RuntimeTag 'cudnnRuntimeTag_t' - - ctypedef void* ActivationDescriptor 'cudnnActivationDescriptor_t' - ctypedef void* ConvolutionDescriptor 'cudnnConvolutionDescriptor_t' - ctypedef void* DropoutDescriptor 'cudnnDropoutDescriptor_t' - ctypedef void* FilterDescriptor 'cudnnFilterDescriptor_t' - ctypedef void* Handle 'cudnnHandle_t' - ctypedef void* PoolingDescriptor 'cudnnPoolingDescriptor_t' - ctypedef void* CTCLossDescriptor 'cudnnCTCLossDescriptor_t' - ctypedef void* RNNDescriptor 'cudnnRNNDescriptor_t' - ctypedef void* RNNDataDescriptor 'cudnnRNNDataDescriptor_t' - ctypedef void* PersistentRNNPlan 'cudnnPersistentRNNPlan_t' - ctypedef void* TensorDescriptor 'cudnnTensorDescriptor_t' - ctypedef void* OpTensorDescriptor 'cudnnOpTensorDescriptor_t' - ctypedef void* ReduceTensorDescriptor 'cudnnReduceTensorDescriptor_t' - ctypedef void* SpatialTransformerDescriptor \ - 'cudnnSpatialTransformerDescriptor_t' - ctypedef void* SamplerType 'cudnnSamplerType_t' - ctypedef void* FusedOpsConstParamPack 'cudnnFusedOpsConstParamPack_t' - ctypedef void* 
FusedOpsVariantParamPack 'cudnnFusedOpsVariantParamPack_t' - ctypedef void* FusedOpsPlan 'cudnnFusedOpsPlan_t' - - # Error handling - const char* cudnnGetErrorString(Status status) - - # Version - size_t cudnnGetVersion() - - # Runtime error checking - int cudnnQueryRuntimeError(Handle handle, Status *rstatus, - ErrQueryMode mode, RuntimeTag *tag) - - # Initialization and CUDA cooperation - int cudnnCreate(Handle* handle) - int cudnnDestroy(Handle handle) - int cudnnSetStream(Handle handle, driver.Stream stream) - int cudnnGetStream(Handle handle, driver.Stream* stream) - - # Tensor manipulation - int cudnnCreateTensorDescriptor(TensorDescriptor* descriptor) - int cudnnSetTensor4dDescriptor( - TensorDescriptor tensorDesc, TensorFormat format, - DataType dataType, int n, int c, int h, int w) - int cudnnSetTensor4dDescriptorEx( - TensorDescriptor tensorDesc, DataType dataType, - int n, int c, int h, int w, - int nStride, int cStride, int hStride, int wStride) - int cudnnGetTensor4dDescriptor( - TensorDescriptor tensorDesc, DataType* dataType, - int* n, int* c, int* h, int* w, - int* nStride, int* cStride, int* hStride, int* wStride) - int cudnnSetTensorNdDescriptor( - TensorDescriptor tensorDesc, DataType dataType, int nbDims, - int* dimA, int* strideA) - int cudnnDestroyTensorDescriptor(TensorDescriptor tensorDesc) - int cudnnAddTensor_v3( - Handle handle, void* alpha, TensorDescriptor bDesc, - void* b, void* beta, TensorDescriptor yDesc, void* y) - - # Tensor operations - int cudnnCreateOpTensorDescriptor(OpTensorDescriptor* opTensorDesc) - int cudnnSetOpTensorDescriptor( - OpTensorDescriptor opTensorDesc, OpTensorOp opTensorOp, - DataType opTensorCompType, NanPropagation opTensorNanOpt) - int cudnnGetOpTensorDescriptor( - OpTensorDescriptor opTensorDesc, OpTensorOp* opTensorOp, - DataType* opTensorCompType, NanPropagation* opTensorNanOpt) - int cudnnDestroyOpTensorDescriptor(OpTensorDescriptor opTensorDesc) - int cudnnOpTensor( - Handle handle, OpTensorDescriptor 
opTensorDesc, void* alpha1, - TensorDescriptor aDesc, void* A, void* alpha2, - TensorDescriptor bDesc, void* B, void* beta, - TensorDescriptor cDesc, void* C) - - # Tensor reductions - int cudnnCreateReduceTensorDescriptor( - ReduceTensorDescriptor* reduceTensorDesc) - int cudnnSetReduceTensorDescriptor( - ReduceTensorDescriptor reduceTensorDesc, ReduceTensorOp reduceTensorOp, - DataType reduceTensorCompType, NanPropagation reduceTensorNanOpt, - ReduceTensorIndices reduceTensorIndices, - IndicesType reduceTensorIndicesType) - int cudnnGetReduceTensorDescriptor( - ReduceTensorDescriptor reduceTensorDesc, - ReduceTensorOp* reduceTensorOp, DataType* reduceTensorCompType, - NanPropagation* reduceTensorNanOpt, - ReduceTensorIndices* reduceTensorIndices, - IndicesType* reduceTensorIndicesType) - int cudnnDestroyReduceTensorDescriptor( - ReduceTensorDescriptor reduceTensorDesc) - int cudnnGetReductionIndicesSize( - Handle handle, ReduceTensorDescriptor reduceTensorDesc, - TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* sizeInBytes) - int cudnnGetReductionWorkspaceSize( - Handle handle, ReduceTensorDescriptor reduceTensorDesc, - TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* sizeInBytes) - int cudnnReduceTensor( - Handle handle, ReduceTensorDescriptor reduceTensorDesc, void* indices, - size_t indicesSizeInBytes, void* workspace, - size_t workspaceSizeInBytes, void* alpha, TensorDescriptor aDesc, - void* A, void* beta, TensorDescriptor cDesc, void* c) - int cudnnSetTensor( - Handle handle, TensorDescriptor yDesc, void* y, void* valuePtr) - int cudnnScaleTensor( - Handle handle, TensorDescriptor yDesc, void* y, void* alpha) - - # Filter manipulation - int cudnnCreateFilterDescriptor(FilterDescriptor* filterDesc) - int cudnnSetFilter4dDescriptor_v4( - FilterDescriptor filterDesc, DataType dataType, - TensorFormat format, int k, int c, int h, int w) - int cudnnSetFilterNdDescriptor_v4( - FilterDescriptor filterDesc, DataType dataType, - TensorFormat format, 
int nbDims, const int filterDimA[]) - int cudnnGetFilterNdDescriptor_v4( - FilterDescriptor wDesc, int nbDimsRequested, DataType* dataType, - TensorFormat* format, int* nbDims, int filterDimA[]) - int cudnnDestroyFilterDescriptor(FilterDescriptor filterDesc) - - # Convolution - int cudnnCreateConvolutionDescriptor(ConvolutionDescriptor* convDesc) - int cudnnSetConvolutionMathType( - ConvolutionDescriptor convDesc, MathType mathType) - int cudnnGetConvolutionMathType( - ConvolutionDescriptor convDesc, MathType *mathType) - int cudnnSetConvolutionGroupCount( - ConvolutionDescriptor convDesc, int groupCount) - int cudnnGetConvolutionGroupCount( - ConvolutionDescriptor convDesc, int *groupCount) - int cudnnSetConvolution2dDescriptor_v4( - ConvolutionDescriptor convDesc, int pad_h, int pad_w, int u, - int v, int dilation_h, int dilation_w, ConvolutionMode mode) - int cudnnSetConvolution2dDescriptor_v5( - ConvolutionDescriptor convDesc, int pad_h, int pad_w, int u, - int v, int dilation_h, int dilation_w, ConvolutionMode mode, - DataType computeType) - int cudnnSetConvolutionNdDescriptor_v3( - ConvolutionDescriptor convDesc, int arrayLength, int* padA, - int* filterStrideA, int* dilationA, ConvolutionMode mode, - DataType dataType) - int cudnnDestroyConvolutionDescriptor(ConvolutionDescriptor conDesc) - int cudnnFindConvolutionForwardAlgorithm( - Handle handle, TensorDescriptor xDesc, FilterDescriptor wDesc, - ConvolutionDescriptor convDesc, TensorDescriptor yDesc, - int requestedAlgoCount, int* returnedAlgoCount, - ConvolutionFwdAlgoPerf* perfResults) - int cudnnFindConvolutionForwardAlgorithmEx( - Handle handle, TensorDescriptor xDesc, void* x, - FilterDescriptor wDesc, void* w, ConvolutionDescriptor convDesc, - TensorDescriptor yDesc, void* y, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionFwdAlgoPerf* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnFindConvolutionForwardAlgorithmEx_v7( - Handle handle, TensorDescriptor xDesc, 
void* x, - FilterDescriptor wDesc, void* w, ConvolutionDescriptor convDesc, - TensorDescriptor yDesc, void* y, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionFwdAlgoPerf_v7* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnGetConvolutionForwardAlgorithm_v6( - Handle handle, TensorDescriptor srcDesc, - FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, - TensorDescriptor destDesc, ConvolutionFwdPreference preference, - size_t memoryLimitInbytes, ConvolutionFwdAlgo* algo) - int cudnnGetConvolutionForwardAlgorithm_v7( - Handle handle, TensorDescriptor srcDesc, - FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, - TensorDescriptor destDesc, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionFwdAlgoPerf_v7* perfResults) - int cudnnGetConvolutionForwardWorkspaceSize( - Handle handle, TensorDescriptor srcDesc, - FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, - TensorDescriptor destDesc, ConvolutionFwdAlgo algo, - size_t* sizeInBytes) - int cudnnConvolutionForward( - Handle handle, void* alpha, TensorDescriptor srcDesc, - void* srcData, FilterDescriptor filterDesc, void* filterData, - ConvolutionDescriptor convDesc, ConvolutionFwdAlgo algo, - void* workSpace, size_t workSpaceSizeInBytes, void* beta, - TensorDescriptor destDesc, void* destData) - int cudnnConvolutionBackwardBias( - Handle handle, void* alpha, - TensorDescriptor srcDesc, void* srcData, void* beta, - TensorDescriptor destDesc, void* destData) - int cudnnFindConvolutionBackwardFilterAlgorithm( - Handle handle, TensorDescriptor xDesc, TensorDescriptor dyDesc, - ConvolutionDescriptor convDesc, FilterDescriptor dwDesc, - int requestedAlgoCount, int* returnedAlgoCount, - ConvolutionBwdFilterAlgoPerf* perfResults) - int cudnnFindConvolutionBackwardFilterAlgorithmEx( - Handle handle, TensorDescriptor xDesc, void* x, - TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, - FilterDescriptor dwDesc, void* dw, int 
requestedAlgoCount, - int* returnedAlgoCount, ConvolutionBwdFilterAlgoPerf* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnFindConvolutionBackwardFilterAlgorithmEx_v7( - Handle handle, TensorDescriptor xDesc, void* x, - TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, - FilterDescriptor dwDesc, void* dw, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionBwdFilterAlgoPerf_v7* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnGetConvolutionBackwardFilterAlgorithm_v6( - Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, FilterDescriptor filterDesc, - ConvolutionBwdFilterPreference preference, - size_t memoryLimitInbytes, ConvolutionBwdFilterAlgo* algo) - int cudnnGetConvolutionBackwardFilterAlgorithm_v7( - Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, FilterDescriptor gradDesc, - int requestedAlgoCount, int* returnedAlgoCount, - ConvolutionBwdFilterAlgoPerf_v7* perfResults) - int cudnnGetConvolutionBackwardFilterWorkspaceSize( - Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, FilterDescriptor filterDesc, - ConvolutionBwdFilterAlgo algo, size_t* sizeInBytes) - int cudnnConvolutionBackwardFilter_v3( - Handle handle, void* alpha, - TensorDescriptor srcDesc, void* srcData, - TensorDescriptor diffDesc, void* diffData, - ConvolutionDescriptor convDesc, ConvolutionBwdFilterAlgo algo, - void* workSpace, size_t workSpaceSizeInBytes, void* beta, - FilterDescriptor gradDesc, void* gradData) - int cudnnGetConvolutionBackwardDataAlgorithm_v6( - Handle handle, FilterDescriptor filterDesc, - TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, TensorDescriptor gradDesc, - ConvolutionBwdDataPreference preference, - size_t memoryLimitInbytes, ConvolutionBwdDataAlgo* algo) - int cudnnGetConvolutionBackwardDataAlgorithm_v7( - Handle handle, 
TensorDescriptor filterDesc, TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, FilterDescriptor gradDesc, - int requestedAlgoCount, int* returnedAlgoCount, - ConvolutionBwdDataAlgoPerf_v7* perfResults) - int cudnnFindConvolutionBackwardDataAlgorithm( - Handle handle, TensorDescriptor wDesc, TensorDescriptor dyDesc, - ConvolutionDescriptor convDesc, FilterDescriptor dxDesc, - int requestedAlgoCount, int* returnedAlgoCount, - ConvolutionBwdDataAlgoPerf* perfResults) - int cudnnFindConvolutionBackwardDataAlgorithmEx( - Handle handle, FilterDescriptor wDesc, void* w, - TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, - TensorDescriptor dxDesc, void* dx, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionBwdDataAlgoPerf* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnFindConvolutionBackwardDataAlgorithmEx_v7( - Handle handle, FilterDescriptor wDesc, void* w, - TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, - TensorDescriptor dxDesc, void* dx, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionBwdDataAlgoPerf_v7* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnGetConvolutionBackwardDataWorkspaceSize( - Handle handle, FilterDescriptor filterDesc, - TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, TensorDescriptor gradDesc, - ConvolutionBwdDataAlgo algo, size_t* sizeInBytes) - int cudnnConvolutionBackwardData_v3( - Handle handle, void* alpha, - FilterDescriptor filterDesc, void* filterData, - TensorDescriptor diffDesc, void* diffData, - ConvolutionDescriptor convDesc, ConvolutionBwdDataAlgo algo, - void* workSpace, size_t workSpaceSizeInBytes, void* beta, - TensorDescriptor gradDesc, void* gradData) - - # Pooling - int cudnnCreatePoolingDescriptor(PoolingDescriptor* desc) - int cudnnSetPooling2dDescriptor_v4( - PoolingDescriptor poolingDesc, PoolingMode mode, - NanPropagation maxpoolingNanOpt, int windowHeight, int windowWidth, - int 
verticalPadding, int horizontalPadding, int verticalStride, - int horizontalStride) - int cudnnSetPoolingNdDescriptor_v4( - PoolingDescriptor poolingDesc, PoolingMode mode, - NanPropagation maxpoolingNanOpt, int nbDims, - int* windowDimA, int* paddingA, int* strideA) - int cudnnDestroyPoolingDescriptor(PoolingDescriptor poolingDesc) - int cudnnPoolingForward( - Handle handle, PoolingDescriptor poolingDesc, void* alpha, - TensorDescriptor srcDesc, void* srcData, void* beta, - TensorDescriptor dstDesc, void* dstData) - int cudnnPoolingBackward( - Handle handle, PoolingDescriptor poolingDesc, void* alpha, - TensorDescriptor srcDesc, void* srcData, - TensorDescriptor srcDiffDesc, void* srcDiffData, - TensorDescriptor destDesc, void* destData, void* beta, - TensorDescriptor destDiffDesc, void* destDiffData) - - # Batch Normalization - int cudnnDeriveBNTensorDescriptor( - TensorDescriptor derivedBnDesc, TensorDescriptor xDesc, - BatchNormMode mode) - int cudnnBatchNormalizationForwardTraining( - Handle handle, BatchNormMode mode, - void* alpha, void* beta, TensorDescriptor xDesc, - void* x, TensorDescriptor yDesc, void* y, - TensorDescriptor bnScaleBiasMeanVarDesc, void* bnScale, - void* bnBias, double exponentialAverageFactor, - void* resultRunningMean, void* resultRunningVariance, - double epsilon, void* resultSaveMean, - void* resultSaveInvVariance) - int cudnnBatchNormalizationForwardInference( - Handle handle, BatchNormMode mode, - void* alpha, void* beta, TensorDescriptor xDesc, - void* x, TensorDescriptor yDesc, void* y, - TensorDescriptor bnScaleBiasMeanVarDesc, void* bnScale, - void* bnBias, void* estimatedMean, void* estimatedVariance, - double epsilon) - int cudnnBatchNormalizationBackward( - Handle handle, BatchNormMode mode, - void* alphaDataDiff, void* betaDataDiff, - void* alphaParamDiff, void* betaParamDiff, - TensorDescriptor xDesc, void* x, - TensorDescriptor dyDesc, void* dy, - TensorDescriptor dxDesc, void* dx, - TensorDescriptor dBnScaleBiasDesc, 
void* bnScale, - void* dBnScaleResult, void* dBnBiasResult, - double epsilon, void* savedMean, void* savedInvVariance) - - int cudnnBatchNormalizationForwardTrainingEx( - Handle handle, - BatchNormMode mode, BatchNormOps bnOps, - void* alpha, void* beta, - TensorDescriptor xDesc, void* x, - TensorDescriptor zDesc, void* z, - TensorDescriptor yDesc, void* y, - TensorDescriptor bnScaleBiasMeanVarDesc, - void* bnScale, void* bnBias, - double exponentialAverageFactor, - void* resultRunningMean, void* resultRunningVariance, - double epsilon, - void* resultSaveMean, void* resultSaveInvVariance, - ActivationDescriptor activationDesc, - void* workspace, size_t workSpaceSizeInBytes, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - int cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - Handle handle, - BatchNormMode mode, BatchNormOps bnOps, - TensorDescriptor xDesc, - TensorDescriptor zDesc, - TensorDescriptor yDesc, - TensorDescriptor bnScaleBiasMeanVarDesc, - ActivationDescriptor activationDesc, - size_t* sizeInBytes) - int cudnnBatchNormalizationBackwardEx( - Handle handle, - BatchNormMode mode, BatchNormOps bnops, - void* alphaDataDiff, void* betaDataDiff, - void* alphaParamDiff, void* betaParamDiff, - TensorDescriptor xDesc, void* x, - TensorDescriptor yDesc, void* y, - TensorDescriptor dyDesc, void* dy, - TensorDescriptor dzDesc, void* dz, - TensorDescriptor dxDesc, void* dx, - TensorDescriptor dBnScaleBiasDesc, - void* bnScaleData, void* bnBiasData, - void* dBnScaleData, void* dBnBiasData, - double epsilon, - void* savedMean, void* savedInvVariance, - ActivationDescriptor activationDesc, - void* workspace, size_t workSpaceSizeInBytes, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - int cudnnGetBatchNormalizationBackwardExWorkspaceSize( - Handle handle, - BatchNormMode mode, - BatchNormOps bnOps, - TensorDescriptor xDesc, - TensorDescriptor yDesc, - TensorDescriptor dyDesc, - TensorDescriptor dzDesc, - TensorDescriptor dxDesc, - 
TensorDescriptor dBnScaleBiasDesc, - ActivationDescriptor activationDesc, - size_t* sizeInBytes) - int cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - Handle handle, - BatchNormMode mode, - BatchNormOps bnOps, - ActivationDescriptor activationDesc, - TensorDescriptor xDesc, - size_t* sizeInBytes) - - # Activation - int cudnnCreateActivationDescriptor( - ActivationDescriptor* activationDesc) - int cudnnSetActivationDescriptor( - ActivationDescriptor activationDesc, ActivationMode mode, - NanPropagation reluNanOpt, double reluCeiling) - int cudnnDestroyActivationDescriptor( - ActivationDescriptor activationDesc) - int cudnnSoftmaxForward( - Handle handle, SoftmaxAlgorithm algorithm, SoftmaxMode mode, - void* alpha, TensorDescriptor srcDesc, void* srcData, - void* beta, TensorDescriptor dstDesc, void* dstData) - int cudnnSoftmaxBackward( - Handle handle, SoftmaxAlgorithm algorithm, SoftmaxMode mode, - void* alpha, TensorDescriptor srcDesc, void* srcData, - TensorDescriptor srcDiffDesc, void* srcDiffData, void* beta, - TensorDescriptor destDiffDesc, void* destDiffData) - int cudnnActivationForward_v4( - Handle handle, ActivationDescriptor activationDesc, void* alpha, - TensorDescriptor srcDesc, void* srcData, void* beta, - TensorDescriptor dstDesc, void* dstData) - int cudnnActivationBackward_v4( - Handle handle, ActivationDescriptor activationDesc, void* alpha, - TensorDescriptor srcDesc, void* srcData, - TensorDescriptor srcDiffDesc, void* srcDiffData, - TensorDescriptor destDesc, void* destData, void* beta, - TensorDescriptor destDiffDesc, void* destDiffData) - - # Dropout - int cudnnCreateDropoutDescriptor(DropoutDescriptor* desc) - int cudnnDestroyDropoutDescriptor(DropoutDescriptor dropoutDesc) - int cudnnDropoutGetStatesSize(Handle handle, size_t* sizeInBytes) - int cudnnDropoutGetReserveSpaceSize( - TensorDescriptor xDesc, size_t* sizeInBytes) - int cudnnSetDropoutDescriptor( - DropoutDescriptor dropoutDesc, Handle handle, float dropout, - void* states, 
size_t stateSizeInBytes, unsigned long long seed) - int cudnnDropoutForward( - Handle handle, DropoutDescriptor dropoutDesc, - TensorDescriptor srcDesc, void* srcData, - TensorDescriptor dstDesc, void* dstData, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - int cudnnDropoutBackward( - Handle handle, DropoutDescriptor dropoutDesc, - TensorDescriptor dydesc, void* dy, TensorDescriptor dxdesc, - void* dx, void* reserveSpace, size_t reserveSpaceSizeInBytes) - - # CTC - int cudnnCreateCTCLossDescriptor(CTCLossDescriptor* ctcLossDesc) - int cudnnDestroyCTCLossDescriptor(CTCLossDescriptor ctcLossDesc) - int cudnnSetCTCLossDescriptor( - CTCLossDescriptor ctcLossDesc, DataType dataType) - int cudnnGetCTCLossDescriptor( - CTCLossDescriptor ctcLossDesc, DataType* dataType) - int cudnnGetCTCLossWorkspaceSize( - Handle handle, TensorDescriptor probsDesc, - TensorDescriptor gradientsDesc, int* labels, - int* labelLengths, int* inputLengths, CTCLossAlgo algo, - CTCLossDescriptor ctcLossDesc, size_t* sizeInBytes) - int cudnnCTCLoss( - Handle handle, TensorDescriptor probsDesc, - void* probs, int* labels, int* labelLengths, int* inputLengths, - void* costs, TensorDescriptor gradientsDesc, void* gradients, - CTCLossAlgo algo, CTCLossDescriptor ctcLossDesc, - void* workspace, size_t workSpaceSizeInBytes) - # RNN - int cudnnCreateRNNDescriptor(RNNDescriptor* rnnDesc) - int cudnnDestroyRNNDescriptor(RNNDescriptor rnnDesc) - int cudnnCreatePersistentRNNPlan( - RNNDescriptor rnnDesc, - const int minibatch, DataType dataType, - PersistentRNNPlan* plan) - int cudnnSetPersistentRNNPlan( - RNNDescriptor rnnDesc, PersistentRNNPlan plan) - int cudnnDestroyPersistentRNNPlan(PersistentRNNPlan plan) - int cudnnSetRNNDescriptor_v5( - RNNDescriptor rnnDesc, int hiddenSize, - int numLayers, DropoutDescriptor dropoutDesc, RNNInputMode inputMode, - DirectionMode direction, RNNMode mode, DataType dataType) - int cudnnSetRNNDescriptor_v6( - Handle handle, RNNDescriptor rnnDesc, int hiddenSize, 
- int numLayers, DropoutDescriptor dropoutDesc, RNNInputMode inputMode, - DirectionMode direction, RNNMode mode, RNNAlgo algo, DataType dataType) - int cudnnSetRNNPaddingMode( - RNNDescriptor rnnDesc, RNNPaddingMode paddingMode) - int cudnnGetRNNPaddingMode( - RNNDescriptor rnnDesc, RNNPaddingMode* paddingMode) - int cudnnCreateRNNDataDescriptor(RNNDataDescriptor* RNNDataDesc) - int cudnnDestroyRNNDataDescriptor(RNNDataDescriptor RNNDataDesc) - int cudnnSetRNNDataDescriptor( - RNNDataDescriptor RNNDataDesc, DataType dataType, RNNDataLayout layout, - int maxSeqLength, int batchSize, int vectorSize, - const int seqLengthArray[], void *paddingFill) - int cudnnGetRNNDataDescriptor( - RNNDataDescriptor RNNDataDesc, DataType* dataType, - RNNDataLayout* layout, int* maxSeqLength, int* batchSize, - int* vectorSize, int arrayLengthRequested, int seqLengthArray[], - void* paddingFill) - int cudnnGetRNNWorkspaceSize( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* xDesc, size_t* sizeInBytes) - int cudnnGetRNNTrainingReserveSize( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* xDesc, size_t* sizeInBytes) - int cudnnGetRNNParamsSize( - Handle handle, RNNDescriptor rnnDesc, TensorDescriptor xDesc, - size_t* sizeInBytes, DataType dataType) - int cudnnGetRNNLinLayerMatrixParams( - Handle handle, RNNDescriptor rnnDesc, int layer, - TensorDescriptor xDesc, FilterDescriptor wDesc, void* w, - int linLayerID, FilterDescriptor linLayerMatDesc, - void** linLayerMat) - int cudnnGetRNNLinLayerBiasParams( - Handle handle, RNNDescriptor rnnDesc, int layer, - TensorDescriptor xDesc, FilterDescriptor wDesc, void* w, - int linLayerID, FilterDescriptor linLayerBiasDesc, - void** linLayerBias) - int cudnnRNNForwardInference( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* xDesc, - void* x, TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc, - void* cx, FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc, 
- void* y, TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, - void* cy, void* workspace, size_t workSpaceSizeInBytes) - int cudnnRNNForwardTraining( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* xDesc, void* x, - TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc, void* cx, - FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc, void* y, - TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, void* cy, - void* workspace, size_t workSpaceSizeInBytes, void* reserveSpace, - size_t reserveSpaceSizeInBytes) - int cudnnRNNBackwardData( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* yDesc, void* y, - TensorDescriptor* dyDesc, void* dy, - TensorDescriptor dhyDesc, void* dhy, - TensorDescriptor dcyDesc, void* dcy, - FilterDescriptor wDesc, void* w, - TensorDescriptor hxDesc, void* hx, - TensorDescriptor cxDesc, void* cx, - TensorDescriptor* dxDesc, void* dx, - TensorDescriptor dhxDesc, void* dhx, - TensorDescriptor dcxDesc, void* dcx, void* workspace, - size_t workSpaceSizeInBytes, void* reserveSpace, - size_t reserveSpaceSizeInBytes) - int cudnnRNNBackwardWeights( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* xDesc, void* x, TensorDescriptor hxDesc, void* hx, - TensorDescriptor* yDesc, void* y, - void* workspace, size_t workSpaceSizeInBytes, FilterDescriptor dwDesc, - void* dw, void* reserveSpace, size_t reserveSpaceSizeInBytes) - - int cudnnRNNForwardInferenceEx( - Handle handle, RNNDescriptor rnnDesc, - RNNDataDescriptor xDesc, const void* x, - TensorDescriptor hxDesc, const void* hx, - TensorDescriptor cxDesc, const void* cx, - FilterDescriptor wDesc, const void* w, - RNNDataDescriptor yDesc, void* y, - TensorDescriptor hyDesc, void* hy, - TensorDescriptor cyDesc, void* cy, - RNNDataDescriptor kDesc, const void* keys, - RNNDataDescriptor cDesc, void* cAttn, - RNNDataDescriptor iDesc, void* iAttn, - RNNDataDescriptor qDesc, void* queries, - void* workSpace, 
size_t workSpaceSizeInBytes) - int cudnnRNNForwardTrainingEx( - Handle handle, RNNDescriptor rnnDesc, - RNNDataDescriptor xDesc, const void* x, - TensorDescriptor hxDesc, const void* hx, - TensorDescriptor cxDesc, const void* cx, - FilterDescriptor wDesc, const void* w, - RNNDataDescriptor yDesc, void* y, - TensorDescriptor hyDesc, void* hy, - TensorDescriptor cyDesc, void* cy, - RNNDataDescriptor kDesc, const void* keys, - RNNDataDescriptor cDesc, void* cAttn, - RNNDataDescriptor iDesc, void* iAttn, - RNNDataDescriptor qDesc, void* queries, - void* workSpace, size_t workSpaceSizeInBytes, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - int cudnnRNNBackwardDataEx( - Handle handle, RNNDescriptor rnnDesc, - RNNDataDescriptor yDesc, const void* y, - RNNDataDescriptor dyDesc, const void* dy, - RNNDataDescriptor dcDesc, const void* dcAttn, - TensorDescriptor dhyDesc, const void* dhy, - TensorDescriptor dcyDesc, const void* dcy, - FilterDescriptor wDesc, const void* w, - TensorDescriptor hxDesc, const void* hx, - TensorDescriptor cxDesc, const void* cx, - RNNDataDescriptor dxDesc, void* dx, - TensorDescriptor dhxDesc, void* dhx, - TensorDescriptor dcxDesc, void* dcx, - RNNDataDescriptor dkDesc, void* dkeys, - void* workSpace, size_t workSpaceSizeInBytes, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - int cudnnRNNBackwardWeightsEx( - Handle handle, RNNDescriptor rnnDesc, - RNNDataDescriptor xDesc, const void* x, - TensorDescriptor hxDesc, const void* hx, - RNNDataDescriptor yDesc, const void* y, - void* workSpace, size_t workSpaceSizeInBytes, - FilterDescriptor dwDesc, void* dw, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - - # Spatial Transformer - int cudnnCreateSpatialTransformerDescriptor( - SpatialTransformerDescriptor* stDesc) - int cudnnDestroySpatialTransformerDescriptor( - SpatialTransformerDescriptor stDesc) - int cudnnSetSpatialTransformerNdDescriptor( - SpatialTransformerDescriptor stDesc, SamplerType samplerType, - DataType dataType, 
int nbDims, int dimA[]) - int cudnnSpatialTfGridGeneratorForward( - Handle handle, SpatialTransformerDescriptor stDesc, - void* theta, void* grid) - int cudnnSpatialTfGridGeneratorBackward( - Handle handle, SpatialTransformerDescriptor stDesc, - void* dgrid, void* dtheta) - int cudnnSpatialTfSamplerForward( - Handle handle, SpatialTransformerDescriptor stDesc, - void* alpha, TensorDescriptor xDesc, void* x, - void* grid, void* beta, TensorDescriptor yDesc, void* y) - int cudnnSpatialTfSamplerBackward( - Handle handle, SpatialTransformerDescriptor stDesc, - void* alpha, TensorDescriptor xDesc, void* x, void* beta, - TensorDescriptor dxDesc, void* dx, void* alphaDgrid, - TensorDescriptor dyDesc, void* dy, void* grid, - void* betaDgrid, void* dgrid) - - # Fused Ops - int cudnnCreateFusedOpsConstParamPack( - FusedOpsConstParamPack* constPack, int ops) - int cudnnDestroyFusedOpsConstParamPack(FusedOpsConstParamPack constPack) - int cudnnSetFusedOpsConstParamPackAttribute( - FusedOpsConstParamPack constPack, FusedOpsConstParamLabel paramLabel, - const void *param) - int cudnnGetFusedOpsConstParamPackAttribute( - const FusedOpsConstParamPack constPack, - FusedOpsConstParamLabel paramLabel, void *param, int *isNULL) - int cudnnCreateFusedOpsVariantParamPack( - FusedOpsVariantParamPack *varPack, FusedOps ops) - int cudnnDestroyFusedOpsVariantParamPack(FusedOpsVariantParamPack varPack) - int cudnnSetFusedOpsVariantParamPackAttribute( - FusedOpsVariantParamPack varPack, FusedOpsVariantParamLabel paramLabel, - void *ptr) - int cudnnGetFusedOpsVariantParamPackAttribute( - const FusedOpsVariantParamPack varPack, - FusedOpsVariantParamLabel paramLabel, void *ptr) - int cudnnCreateFusedOpsPlan(FusedOpsPlan *plan, FusedOps ops) - int cudnnDestroyFusedOpsPlan(FusedOpsPlan plan) - int cudnnMakeFusedOpsPlan( - Handle handle, FusedOpsPlan plan, - const FusedOpsConstParamPack constPack, size_t *workspaceSizeInBytes) - int cudnnFusedOpsExecute( - Handle handle, const FusedOpsPlan plan, - 
FusedOpsVariantParamPack varPack) - - # Build-time version - int CUDNN_VERSION - - # Constants - double _CUDNN_BN_MIN_EPSILON 'CUDNN_BN_MIN_EPSILON' +IF CUPY_USE_GEN_HIP_CODE: + from cupy_backends.cuda.libs.miopen import * +ELSE: + ############################################################################### + # Extern + ############################################################################### + + cdef extern from '../../cupy_cudnn.h' nogil: + # Types + ctypedef int ActivationMode 'cudnnActivationMode_t' + ctypedef int AddMode 'cudnnAddMode_t' + ctypedef int BatchNormMode 'cudnnBatchNormMode_t' + ctypedef int BatchNormOps 'cudnnBatchNormOps_t' + ctypedef int ConvolutionBwdDataAlgo 'cudnnConvolutionBwdDataAlgo_t' + ctypedef int ConvolutionBwdDataPreference \ + 'cudnnConvolutionBwdDataPreference_t' + ctypedef struct ConvolutionBwdDataAlgoPerf \ + 'cudnnConvolutionBwdDataAlgoPerf_t': # NOQA: E125 + int algo + int status + float time + size_t memory + ctypedef struct ConvolutionBwdDataAlgoPerf_v7 \ + 'cudnnConvolutionBwdDataAlgoPerf_v7_t': # NOQA: E125 + int algo + int status + float time + size_t memory + int determinism + int mathType + ctypedef int ConvolutionBwdFilterAlgo 'cudnnConvolutionBwdFilterAlgo_t' + ctypedef int ConvolutionBwdFilterPreference \ + 'cudnnConvolutionBwdFilterPreference_t' + ctypedef struct ConvolutionBwdFilterAlgoPerf \ + 'cudnnConvolutionBwdFilterAlgoPerf_t': # NOQA: E125 + int algo + int status + float time + size_t memory + ctypedef struct ConvolutionBwdFilterAlgoPerf_v7 \ + 'cudnnConvolutionBwdFilterAlgoPerf_v7_t': # NOQA: E125 + int algo + int status + float time + size_t memory + int determinism + int mathType + ctypedef int ConvolutionFwdAlgo 'cudnnConvolutionFwdAlgo_t' + ctypedef int ConvolutionFwdPreference 'cudnnConvolutionFwdPreference_t' + ctypedef struct ConvolutionFwdAlgoPerf 'cudnnConvolutionFwdAlgoPerf_t': + int algo + int status + float time + size_t memory + ctypedef struct ConvolutionFwdAlgoPerf_v7 \ + 
'cudnnConvolutionFwdAlgoPerf_v7_t': # NOQA: E125 + int algo + int status + float time + size_t memory + int determinism + int mathType + ctypedef int ConvolutionMode 'cudnnConvolutionMode_t' + ctypedef int DataType 'cudnnDataType_t' + ctypedef int MathType 'cudnnMathType_t' + ctypedef int DirectionMode 'cudnnDirectionMode_t' + ctypedef int NanPropagation 'cudnnNanPropagation_t' + ctypedef int PoolingMode 'cudnnPoolingMode_t' + ctypedef int RNNInputMode 'cudnnRNNInputMode_t' + ctypedef int CTCLossAlgo 'cudnnCTCLossAlgo_t' + ctypedef int RNNMode 'cudnnRNNMode_t' + ctypedef int RNNAlgo 'cudnnRNNAlgo_t' + ctypedef int RNNDataLayout 'cudnnRNNDataLayout_t' + ctypedef int RNNPaddingMode 'cudnnRNNPaddingMode_t' + ctypedef int SoftmaxAlgorithm 'cudnnSoftmaxAlgorithm_t' + ctypedef int SoftmaxMode 'cudnnSoftmaxMode_t' + ctypedef int Status 'cudnnStatus_t' + ctypedef int TensorFormat 'cudnnTensorFormat_t' + ctypedef int OpTensorOp 'cudnnOpTensorOp_t' + ctypedef int ReduceTensorOp 'cudnnReduceTensorOp_t' + ctypedef int ReduceTensorIndices 'cudnnReduceTensorIndices_t' + ctypedef int IndicesType 'cudnnIndicesType_t' + ctypedef int ErrQueryMode 'cudnnErrQueryMode_t' + ctypedef int FusedOps 'cudnnFusedOps_t' + ctypedef int FusedOpsConstParamLabel 'cudnnFusedOpsConstParamLabel_t' + ctypedef int FusedOpsPointerPlaceHolder 'cudnnFusedOpsPointerPlaceHolder_t' + ctypedef int FusedOpsVariantParamLabel 'cudnnFusedOpsVariantParamLabel_t' + ctypedef struct RuntimeTag 'cudnnRuntimeTag_t' + + ctypedef void* ActivationDescriptor 'cudnnActivationDescriptor_t' + ctypedef void* ConvolutionDescriptor 'cudnnConvolutionDescriptor_t' + ctypedef void* DropoutDescriptor 'cudnnDropoutDescriptor_t' + ctypedef void* FilterDescriptor 'cudnnFilterDescriptor_t' + ctypedef void* Handle 'cudnnHandle_t' + ctypedef void* PoolingDescriptor 'cudnnPoolingDescriptor_t' + ctypedef void* CTCLossDescriptor 'cudnnCTCLossDescriptor_t' + ctypedef void* RNNDescriptor 'cudnnRNNDescriptor_t' + ctypedef void* 
RNNDataDescriptor 'cudnnRNNDataDescriptor_t' + ctypedef void* PersistentRNNPlan 'cudnnPersistentRNNPlan_t' + ctypedef void* TensorDescriptor 'cudnnTensorDescriptor_t' + ctypedef void* OpTensorDescriptor 'cudnnOpTensorDescriptor_t' + ctypedef void* ReduceTensorDescriptor 'cudnnReduceTensorDescriptor_t' + ctypedef void* SpatialTransformerDescriptor \ + 'cudnnSpatialTransformerDescriptor_t' + ctypedef void* SamplerType 'cudnnSamplerType_t' + ctypedef void* FusedOpsConstParamPack 'cudnnFusedOpsConstParamPack_t' + ctypedef void* FusedOpsVariantParamPack 'cudnnFusedOpsVariantParamPack_t' + ctypedef void* FusedOpsPlan 'cudnnFusedOpsPlan_t' + + # Error handling + const char* cudnnGetErrorString(Status status) + + # Version + size_t cudnnGetVersion() + + # Runtime error checking + int cudnnQueryRuntimeError(Handle handle, Status *rstatus, + ErrQueryMode mode, RuntimeTag *tag) + + # Initialization and CUDA cooperation + int cudnnCreate(Handle* handle) + int cudnnDestroy(Handle handle) + int cudnnSetStream(Handle handle, driver.Stream stream) + int cudnnGetStream(Handle handle, driver.Stream* stream) + + # Tensor manipulation + int cudnnCreateTensorDescriptor(TensorDescriptor* descriptor) + int cudnnSetTensor4dDescriptor( + TensorDescriptor tensorDesc, TensorFormat format, + DataType dataType, int n, int c, int h, int w) + int cudnnSetTensor4dDescriptorEx( + TensorDescriptor tensorDesc, DataType dataType, + int n, int c, int h, int w, + int nStride, int cStride, int hStride, int wStride) + int cudnnGetTensor4dDescriptor( + TensorDescriptor tensorDesc, DataType* dataType, + int* n, int* c, int* h, int* w, + int* nStride, int* cStride, int* hStride, int* wStride) + int cudnnSetTensorNdDescriptor( + TensorDescriptor tensorDesc, DataType dataType, int nbDims, + int* dimA, int* strideA) + int cudnnDestroyTensorDescriptor(TensorDescriptor tensorDesc) + int cudnnAddTensor_v3( + Handle handle, void* alpha, TensorDescriptor bDesc, + void* b, void* beta, TensorDescriptor yDesc, void* 
y) + + # Tensor operations + int cudnnCreateOpTensorDescriptor(OpTensorDescriptor* opTensorDesc) + int cudnnSetOpTensorDescriptor( + OpTensorDescriptor opTensorDesc, OpTensorOp opTensorOp, + DataType opTensorCompType, NanPropagation opTensorNanOpt) + int cudnnGetOpTensorDescriptor( + OpTensorDescriptor opTensorDesc, OpTensorOp* opTensorOp, + DataType* opTensorCompType, NanPropagation* opTensorNanOpt) + int cudnnDestroyOpTensorDescriptor(OpTensorDescriptor opTensorDesc) + int cudnnOpTensor( + Handle handle, OpTensorDescriptor opTensorDesc, void* alpha1, + TensorDescriptor aDesc, void* A, void* alpha2, + TensorDescriptor bDesc, void* B, void* beta, + TensorDescriptor cDesc, void* C) + + # Tensor reductions + int cudnnCreateReduceTensorDescriptor( + ReduceTensorDescriptor* reduceTensorDesc) + int cudnnSetReduceTensorDescriptor( + ReduceTensorDescriptor reduceTensorDesc, ReduceTensorOp reduceTensorOp, + DataType reduceTensorCompType, NanPropagation reduceTensorNanOpt, + ReduceTensorIndices reduceTensorIndices, + IndicesType reduceTensorIndicesType) + int cudnnGetReduceTensorDescriptor( + ReduceTensorDescriptor reduceTensorDesc, + ReduceTensorOp* reduceTensorOp, DataType* reduceTensorCompType, + NanPropagation* reduceTensorNanOpt, + ReduceTensorIndices* reduceTensorIndices, + IndicesType* reduceTensorIndicesType) + int cudnnDestroyReduceTensorDescriptor( + ReduceTensorDescriptor reduceTensorDesc) + int cudnnGetReductionIndicesSize( + Handle handle, ReduceTensorDescriptor reduceTensorDesc, + TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* sizeInBytes) + int cudnnGetReductionWorkspaceSize( + Handle handle, ReduceTensorDescriptor reduceTensorDesc, + TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* sizeInBytes) + int cudnnReduceTensor( + Handle handle, ReduceTensorDescriptor reduceTensorDesc, void* indices, + size_t indicesSizeInBytes, void* workspace, + size_t workspaceSizeInBytes, void* alpha, TensorDescriptor aDesc, + void* A, void* beta, TensorDescriptor 
cDesc, void* c) + int cudnnSetTensor( + Handle handle, TensorDescriptor yDesc, void* y, void* valuePtr) + int cudnnScaleTensor( + Handle handle, TensorDescriptor yDesc, void* y, void* alpha) + + # Filter manipulation + int cudnnCreateFilterDescriptor(FilterDescriptor* filterDesc) + int cudnnSetFilter4dDescriptor_v4( + FilterDescriptor filterDesc, DataType dataType, + TensorFormat format, int k, int c, int h, int w) + int cudnnSetFilterNdDescriptor_v4( + FilterDescriptor filterDesc, DataType dataType, + TensorFormat format, int nbDims, const int filterDimA[]) + int cudnnGetFilterNdDescriptor_v4( + FilterDescriptor wDesc, int nbDimsRequested, DataType* dataType, + TensorFormat* format, int* nbDims, int filterDimA[]) + int cudnnDestroyFilterDescriptor(FilterDescriptor filterDesc) + + # Convolution + int cudnnCreateConvolutionDescriptor(ConvolutionDescriptor* convDesc) + int cudnnSetConvolutionMathType( + ConvolutionDescriptor convDesc, MathType mathType) + int cudnnGetConvolutionMathType( + ConvolutionDescriptor convDesc, MathType *mathType) + int cudnnSetConvolutionGroupCount( + ConvolutionDescriptor convDesc, int groupCount) + int cudnnGetConvolutionGroupCount( + ConvolutionDescriptor convDesc, int *groupCount) + int cudnnSetConvolution2dDescriptor_v4( + ConvolutionDescriptor convDesc, int pad_h, int pad_w, int u, + int v, int dilation_h, int dilation_w, ConvolutionMode mode) + int cudnnSetConvolution2dDescriptor_v5( + ConvolutionDescriptor convDesc, int pad_h, int pad_w, int u, + int v, int dilation_h, int dilation_w, ConvolutionMode mode, + DataType computeType) + int cudnnSetConvolutionNdDescriptor_v3( + ConvolutionDescriptor convDesc, int arrayLength, int* padA, + int* filterStrideA, int* dilationA, ConvolutionMode mode, + DataType dataType) + int cudnnDestroyConvolutionDescriptor(ConvolutionDescriptor conDesc) + int cudnnFindConvolutionForwardAlgorithm( + Handle handle, TensorDescriptor xDesc, FilterDescriptor wDesc, + ConvolutionDescriptor convDesc, 
TensorDescriptor yDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionFwdAlgoPerf* perfResults) + int cudnnFindConvolutionForwardAlgorithmEx( + Handle handle, TensorDescriptor xDesc, void* x, + FilterDescriptor wDesc, void* w, ConvolutionDescriptor convDesc, + TensorDescriptor yDesc, void* y, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionFwdAlgoPerf* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnFindConvolutionForwardAlgorithmEx_v7( + Handle handle, TensorDescriptor xDesc, void* x, + FilterDescriptor wDesc, void* w, ConvolutionDescriptor convDesc, + TensorDescriptor yDesc, void* y, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionFwdAlgoPerf_v7* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnGetConvolutionForwardAlgorithm_v6( + Handle handle, TensorDescriptor srcDesc, + FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, + TensorDescriptor destDesc, ConvolutionFwdPreference preference, + size_t memoryLimitInbytes, ConvolutionFwdAlgo* algo) + int cudnnGetConvolutionForwardAlgorithm_v7( + Handle handle, TensorDescriptor srcDesc, + FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, + TensorDescriptor destDesc, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionFwdAlgoPerf_v7* perfResults) + int cudnnGetConvolutionForwardWorkspaceSize( + Handle handle, TensorDescriptor srcDesc, + FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, + TensorDescriptor destDesc, ConvolutionFwdAlgo algo, + size_t* sizeInBytes) + int cudnnConvolutionForward( + Handle handle, void* alpha, TensorDescriptor srcDesc, + void* srcData, FilterDescriptor filterDesc, void* filterData, + ConvolutionDescriptor convDesc, ConvolutionFwdAlgo algo, + void* workSpace, size_t workSpaceSizeInBytes, void* beta, + TensorDescriptor destDesc, void* destData) + int cudnnConvolutionBackwardBias( + Handle handle, void* alpha, + TensorDescriptor srcDesc, void* srcData, void* beta, + 
TensorDescriptor destDesc, void* destData) + int cudnnFindConvolutionBackwardFilterAlgorithm( + Handle handle, TensorDescriptor xDesc, TensorDescriptor dyDesc, + ConvolutionDescriptor convDesc, FilterDescriptor dwDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionBwdFilterAlgoPerf* perfResults) + int cudnnFindConvolutionBackwardFilterAlgorithmEx( + Handle handle, TensorDescriptor xDesc, void* x, + TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, + FilterDescriptor dwDesc, void* dw, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionBwdFilterAlgoPerf* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnFindConvolutionBackwardFilterAlgorithmEx_v7( + Handle handle, TensorDescriptor xDesc, void* x, + TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, + FilterDescriptor dwDesc, void* dw, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionBwdFilterAlgoPerf_v7* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnGetConvolutionBackwardFilterAlgorithm_v6( + Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, FilterDescriptor filterDesc, + ConvolutionBwdFilterPreference preference, + size_t memoryLimitInbytes, ConvolutionBwdFilterAlgo* algo) + int cudnnGetConvolutionBackwardFilterAlgorithm_v7( + Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, FilterDescriptor gradDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionBwdFilterAlgoPerf_v7* perfResults) + int cudnnGetConvolutionBackwardFilterWorkspaceSize( + Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, FilterDescriptor filterDesc, + ConvolutionBwdFilterAlgo algo, size_t* sizeInBytes) + int cudnnConvolutionBackwardFilter_v3( + Handle handle, void* alpha, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor diffDesc, void* diffData, + 
ConvolutionDescriptor convDesc, ConvolutionBwdFilterAlgo algo, + void* workSpace, size_t workSpaceSizeInBytes, void* beta, + FilterDescriptor gradDesc, void* gradData) + int cudnnGetConvolutionBackwardDataAlgorithm_v6( + Handle handle, FilterDescriptor filterDesc, + TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, TensorDescriptor gradDesc, + ConvolutionBwdDataPreference preference, + size_t memoryLimitInbytes, ConvolutionBwdDataAlgo* algo) + int cudnnGetConvolutionBackwardDataAlgorithm_v7( + Handle handle, TensorDescriptor filterDesc, TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, FilterDescriptor gradDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionBwdDataAlgoPerf_v7* perfResults) + int cudnnFindConvolutionBackwardDataAlgorithm( + Handle handle, TensorDescriptor wDesc, TensorDescriptor dyDesc, + ConvolutionDescriptor convDesc, FilterDescriptor dxDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionBwdDataAlgoPerf* perfResults) + int cudnnFindConvolutionBackwardDataAlgorithmEx( + Handle handle, FilterDescriptor wDesc, void* w, + TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, + TensorDescriptor dxDesc, void* dx, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionBwdDataAlgoPerf* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnFindConvolutionBackwardDataAlgorithmEx_v7( + Handle handle, FilterDescriptor wDesc, void* w, + TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, + TensorDescriptor dxDesc, void* dx, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionBwdDataAlgoPerf_v7* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnGetConvolutionBackwardDataWorkspaceSize( + Handle handle, FilterDescriptor filterDesc, + TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, TensorDescriptor gradDesc, + ConvolutionBwdDataAlgo algo, size_t* sizeInBytes) + int cudnnConvolutionBackwardData_v3( + Handle handle, 
void* alpha, + FilterDescriptor filterDesc, void* filterData, + TensorDescriptor diffDesc, void* diffData, + ConvolutionDescriptor convDesc, ConvolutionBwdDataAlgo algo, + void* workSpace, size_t workSpaceSizeInBytes, void* beta, + TensorDescriptor gradDesc, void* gradData) + + # Pooling + int cudnnCreatePoolingDescriptor(PoolingDescriptor* desc) + int cudnnSetPooling2dDescriptor_v4( + PoolingDescriptor poolingDesc, PoolingMode mode, + NanPropagation maxpoolingNanOpt, int windowHeight, int windowWidth, + int verticalPadding, int horizontalPadding, int verticalStride, + int horizontalStride) + int cudnnSetPoolingNdDescriptor_v4( + PoolingDescriptor poolingDesc, PoolingMode mode, + NanPropagation maxpoolingNanOpt, int nbDims, + int* windowDimA, int* paddingA, int* strideA) + int cudnnDestroyPoolingDescriptor(PoolingDescriptor poolingDesc) + int cudnnPoolingForward( + Handle handle, PoolingDescriptor poolingDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, void* beta, + TensorDescriptor dstDesc, void* dstData) + int cudnnPoolingBackward( + Handle handle, PoolingDescriptor poolingDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor srcDiffDesc, void* srcDiffData, + TensorDescriptor destDesc, void* destData, void* beta, + TensorDescriptor destDiffDesc, void* destDiffData) + + # Batch Normalization + int cudnnDeriveBNTensorDescriptor( + TensorDescriptor derivedBnDesc, TensorDescriptor xDesc, + BatchNormMode mode) + int cudnnBatchNormalizationForwardTraining( + Handle handle, BatchNormMode mode, + void* alpha, void* beta, TensorDescriptor xDesc, + void* x, TensorDescriptor yDesc, void* y, + TensorDescriptor bnScaleBiasMeanVarDesc, void* bnScale, + void* bnBias, double exponentialAverageFactor, + void* resultRunningMean, void* resultRunningVariance, + double epsilon, void* resultSaveMean, + void* resultSaveInvVariance) + int cudnnBatchNormalizationForwardInference( + Handle handle, BatchNormMode mode, + void* alpha, void* beta, 
TensorDescriptor xDesc, + void* x, TensorDescriptor yDesc, void* y, + TensorDescriptor bnScaleBiasMeanVarDesc, void* bnScale, + void* bnBias, void* estimatedMean, void* estimatedVariance, + double epsilon) + int cudnnBatchNormalizationBackward( + Handle handle, BatchNormMode mode, + void* alphaDataDiff, void* betaDataDiff, + void* alphaParamDiff, void* betaParamDiff, + TensorDescriptor xDesc, void* x, + TensorDescriptor dyDesc, void* dy, + TensorDescriptor dxDesc, void* dx, + TensorDescriptor dBnScaleBiasDesc, void* bnScale, + void* dBnScaleResult, void* dBnBiasResult, + double epsilon, void* savedMean, void* savedInvVariance) + + int cudnnBatchNormalizationForwardTrainingEx( + Handle handle, + BatchNormMode mode, BatchNormOps bnOps, + void* alpha, void* beta, + TensorDescriptor xDesc, void* x, + TensorDescriptor zDesc, void* z, + TensorDescriptor yDesc, void* y, + TensorDescriptor bnScaleBiasMeanVarDesc, + void* bnScale, void* bnBias, + double exponentialAverageFactor, + void* resultRunningMean, void* resultRunningVariance, + double epsilon, + void* resultSaveMean, void* resultSaveInvVariance, + ActivationDescriptor activationDesc, + void* workspace, size_t workSpaceSizeInBytes, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + Handle handle, + BatchNormMode mode, BatchNormOps bnOps, + TensorDescriptor xDesc, + TensorDescriptor zDesc, + TensorDescriptor yDesc, + TensorDescriptor bnScaleBiasMeanVarDesc, + ActivationDescriptor activationDesc, + size_t* sizeInBytes) + int cudnnBatchNormalizationBackwardEx( + Handle handle, + BatchNormMode mode, BatchNormOps bnops, + void* alphaDataDiff, void* betaDataDiff, + void* alphaParamDiff, void* betaParamDiff, + TensorDescriptor xDesc, void* x, + TensorDescriptor yDesc, void* y, + TensorDescriptor dyDesc, void* dy, + TensorDescriptor dzDesc, void* dz, + TensorDescriptor dxDesc, void* dx, + TensorDescriptor dBnScaleBiasDesc, + void* bnScaleData, void* 
bnBiasData, + void* dBnScaleData, void* dBnBiasData, + double epsilon, + void* savedMean, void* savedInvVariance, + ActivationDescriptor activationDesc, + void* workspace, size_t workSpaceSizeInBytes, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnGetBatchNormalizationBackwardExWorkspaceSize( + Handle handle, + BatchNormMode mode, + BatchNormOps bnOps, + TensorDescriptor xDesc, + TensorDescriptor yDesc, + TensorDescriptor dyDesc, + TensorDescriptor dzDesc, + TensorDescriptor dxDesc, + TensorDescriptor dBnScaleBiasDesc, + ActivationDescriptor activationDesc, + size_t* sizeInBytes) + int cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + Handle handle, + BatchNormMode mode, + BatchNormOps bnOps, + ActivationDescriptor activationDesc, + TensorDescriptor xDesc, + size_t* sizeInBytes) + + # Activation + int cudnnCreateActivationDescriptor( + ActivationDescriptor* activationDesc) + int cudnnSetActivationDescriptor( + ActivationDescriptor activationDesc, ActivationMode mode, + NanPropagation reluNanOpt, double reluCeiling) + int cudnnDestroyActivationDescriptor( + ActivationDescriptor activationDesc) + int cudnnSoftmaxForward( + Handle handle, SoftmaxAlgorithm algorithm, SoftmaxMode mode, + void* alpha, TensorDescriptor srcDesc, void* srcData, + void* beta, TensorDescriptor dstDesc, void* dstData) + int cudnnSoftmaxBackward( + Handle handle, SoftmaxAlgorithm algorithm, SoftmaxMode mode, + void* alpha, TensorDescriptor srcDesc, void* srcData, + TensorDescriptor srcDiffDesc, void* srcDiffData, void* beta, + TensorDescriptor destDiffDesc, void* destDiffData) + int cudnnActivationForward_v4( + Handle handle, ActivationDescriptor activationDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, void* beta, + TensorDescriptor dstDesc, void* dstData) + int cudnnActivationBackward_v4( + Handle handle, ActivationDescriptor activationDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor srcDiffDesc, void* srcDiffData, + 
TensorDescriptor destDesc, void* destData, void* beta, + TensorDescriptor destDiffDesc, void* destDiffData) + + # Dropout + int cudnnCreateDropoutDescriptor(DropoutDescriptor* desc) + int cudnnDestroyDropoutDescriptor(DropoutDescriptor dropoutDesc) + int cudnnDropoutGetStatesSize(Handle handle, size_t* sizeInBytes) + int cudnnDropoutGetReserveSpaceSize( + TensorDescriptor xDesc, size_t* sizeInBytes) + int cudnnSetDropoutDescriptor( + DropoutDescriptor dropoutDesc, Handle handle, float dropout, + void* states, size_t stateSizeInBytes, unsigned long long seed) + int cudnnDropoutForward( + Handle handle, DropoutDescriptor dropoutDesc, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor dstDesc, void* dstData, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnDropoutBackward( + Handle handle, DropoutDescriptor dropoutDesc, + TensorDescriptor dydesc, void* dy, TensorDescriptor dxdesc, + void* dx, void* reserveSpace, size_t reserveSpaceSizeInBytes) + + # CTC + int cudnnCreateCTCLossDescriptor(CTCLossDescriptor* ctcLossDesc) + int cudnnDestroyCTCLossDescriptor(CTCLossDescriptor ctcLossDesc) + int cudnnSetCTCLossDescriptor( + CTCLossDescriptor ctcLossDesc, DataType dataType) + int cudnnGetCTCLossDescriptor( + CTCLossDescriptor ctcLossDesc, DataType* dataType) + int cudnnGetCTCLossWorkspaceSize( + Handle handle, TensorDescriptor probsDesc, + TensorDescriptor gradientsDesc, int* labels, + int* labelLengths, int* inputLengths, CTCLossAlgo algo, + CTCLossDescriptor ctcLossDesc, size_t* sizeInBytes) + int cudnnCTCLoss( + Handle handle, TensorDescriptor probsDesc, + void* probs, int* labels, int* labelLengths, int* inputLengths, + void* costs, TensorDescriptor gradientsDesc, void* gradients, + CTCLossAlgo algo, CTCLossDescriptor ctcLossDesc, + void* workspace, size_t workSpaceSizeInBytes) + # RNN + int cudnnCreateRNNDescriptor(RNNDescriptor* rnnDesc) + int cudnnDestroyRNNDescriptor(RNNDescriptor rnnDesc) + int cudnnCreatePersistentRNNPlan( + 
RNNDescriptor rnnDesc, + const int minibatch, DataType dataType, + PersistentRNNPlan* plan) + int cudnnSetPersistentRNNPlan( + RNNDescriptor rnnDesc, PersistentRNNPlan plan) + int cudnnDestroyPersistentRNNPlan(PersistentRNNPlan plan) + int cudnnSetRNNDescriptor_v5( + RNNDescriptor rnnDesc, int hiddenSize, + int numLayers, DropoutDescriptor dropoutDesc, RNNInputMode inputMode, + DirectionMode direction, RNNMode mode, DataType dataType) + int cudnnSetRNNDescriptor_v6( + Handle handle, RNNDescriptor rnnDesc, int hiddenSize, + int numLayers, DropoutDescriptor dropoutDesc, RNNInputMode inputMode, + DirectionMode direction, RNNMode mode, RNNAlgo algo, DataType dataType) + int cudnnSetRNNPaddingMode( + RNNDescriptor rnnDesc, RNNPaddingMode paddingMode) + int cudnnGetRNNPaddingMode( + RNNDescriptor rnnDesc, RNNPaddingMode* paddingMode) + int cudnnCreateRNNDataDescriptor(RNNDataDescriptor* RNNDataDesc) + int cudnnDestroyRNNDataDescriptor(RNNDataDescriptor RNNDataDesc) + int cudnnSetRNNDataDescriptor( + RNNDataDescriptor RNNDataDesc, DataType dataType, RNNDataLayout layout, + int maxSeqLength, int batchSize, int vectorSize, + const int seqLengthArray[], void *paddingFill) + int cudnnGetRNNDataDescriptor( + RNNDataDescriptor RNNDataDesc, DataType* dataType, + RNNDataLayout* layout, int* maxSeqLength, int* batchSize, + int* vectorSize, int arrayLengthRequested, int seqLengthArray[], + void* paddingFill) + int cudnnGetRNNWorkspaceSize( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, size_t* sizeInBytes) + int cudnnGetRNNTrainingReserveSize( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, size_t* sizeInBytes) + int cudnnGetRNNParamsSize( + Handle handle, RNNDescriptor rnnDesc, TensorDescriptor xDesc, + size_t* sizeInBytes, DataType dataType) + int cudnnGetRNNLinLayerMatrixParams( + Handle handle, RNNDescriptor rnnDesc, int layer, + TensorDescriptor xDesc, FilterDescriptor wDesc, void* w, + int linLayerID, 
FilterDescriptor linLayerMatDesc, + void** linLayerMat) + int cudnnGetRNNLinLayerBiasParams( + Handle handle, RNNDescriptor rnnDesc, int layer, + TensorDescriptor xDesc, FilterDescriptor wDesc, void* w, + int linLayerID, FilterDescriptor linLayerBiasDesc, + void** linLayerBias) + int cudnnRNNForwardInference( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, + void* x, TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc, + void* cx, FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc, + void* y, TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, + void* cy, void* workspace, size_t workSpaceSizeInBytes) + int cudnnRNNForwardTraining( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, void* x, + TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc, void* cx, + FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc, void* y, + TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, void* cy, + void* workspace, size_t workSpaceSizeInBytes, void* reserveSpace, + size_t reserveSpaceSizeInBytes) + int cudnnRNNBackwardData( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* yDesc, void* y, + TensorDescriptor* dyDesc, void* dy, + TensorDescriptor dhyDesc, void* dhy, + TensorDescriptor dcyDesc, void* dcy, + FilterDescriptor wDesc, void* w, + TensorDescriptor hxDesc, void* hx, + TensorDescriptor cxDesc, void* cx, + TensorDescriptor* dxDesc, void* dx, + TensorDescriptor dhxDesc, void* dhx, + TensorDescriptor dcxDesc, void* dcx, void* workspace, + size_t workSpaceSizeInBytes, void* reserveSpace, + size_t reserveSpaceSizeInBytes) + int cudnnRNNBackwardWeights( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, void* x, TensorDescriptor hxDesc, void* hx, + TensorDescriptor* yDesc, void* y, + void* workspace, size_t workSpaceSizeInBytes, FilterDescriptor dwDesc, + void* dw, void* reserveSpace, size_t reserveSpaceSizeInBytes) + + int 
cudnnRNNForwardInferenceEx( + Handle handle, RNNDescriptor rnnDesc, + RNNDataDescriptor xDesc, const void* x, + TensorDescriptor hxDesc, const void* hx, + TensorDescriptor cxDesc, const void* cx, + FilterDescriptor wDesc, const void* w, + RNNDataDescriptor yDesc, void* y, + TensorDescriptor hyDesc, void* hy, + TensorDescriptor cyDesc, void* cy, + RNNDataDescriptor kDesc, const void* keys, + RNNDataDescriptor cDesc, void* cAttn, + RNNDataDescriptor iDesc, void* iAttn, + RNNDataDescriptor qDesc, void* queries, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnRNNForwardTrainingEx( + Handle handle, RNNDescriptor rnnDesc, + RNNDataDescriptor xDesc, const void* x, + TensorDescriptor hxDesc, const void* hx, + TensorDescriptor cxDesc, const void* cx, + FilterDescriptor wDesc, const void* w, + RNNDataDescriptor yDesc, void* y, + TensorDescriptor hyDesc, void* hy, + TensorDescriptor cyDesc, void* cy, + RNNDataDescriptor kDesc, const void* keys, + RNNDataDescriptor cDesc, void* cAttn, + RNNDataDescriptor iDesc, void* iAttn, + RNNDataDescriptor qDesc, void* queries, + void* workSpace, size_t workSpaceSizeInBytes, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnRNNBackwardDataEx( + Handle handle, RNNDescriptor rnnDesc, + RNNDataDescriptor yDesc, const void* y, + RNNDataDescriptor dyDesc, const void* dy, + RNNDataDescriptor dcDesc, const void* dcAttn, + TensorDescriptor dhyDesc, const void* dhy, + TensorDescriptor dcyDesc, const void* dcy, + FilterDescriptor wDesc, const void* w, + TensorDescriptor hxDesc, const void* hx, + TensorDescriptor cxDesc, const void* cx, + RNNDataDescriptor dxDesc, void* dx, + TensorDescriptor dhxDesc, void* dhx, + TensorDescriptor dcxDesc, void* dcx, + RNNDataDescriptor dkDesc, void* dkeys, + void* workSpace, size_t workSpaceSizeInBytes, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnRNNBackwardWeightsEx( + Handle handle, RNNDescriptor rnnDesc, + RNNDataDescriptor xDesc, const void* x, + TensorDescriptor 
hxDesc, const void* hx, + RNNDataDescriptor yDesc, const void* y, + void* workSpace, size_t workSpaceSizeInBytes, + FilterDescriptor dwDesc, void* dw, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + + # Spatial Transformer + int cudnnCreateSpatialTransformerDescriptor( + SpatialTransformerDescriptor* stDesc) + int cudnnDestroySpatialTransformerDescriptor( + SpatialTransformerDescriptor stDesc) + int cudnnSetSpatialTransformerNdDescriptor( + SpatialTransformerDescriptor stDesc, SamplerType samplerType, + DataType dataType, int nbDims, int dimA[]) + int cudnnSpatialTfGridGeneratorForward( + Handle handle, SpatialTransformerDescriptor stDesc, + void* theta, void* grid) + int cudnnSpatialTfGridGeneratorBackward( + Handle handle, SpatialTransformerDescriptor stDesc, + void* dgrid, void* dtheta) + int cudnnSpatialTfSamplerForward( + Handle handle, SpatialTransformerDescriptor stDesc, + void* alpha, TensorDescriptor xDesc, void* x, + void* grid, void* beta, TensorDescriptor yDesc, void* y) + int cudnnSpatialTfSamplerBackward( + Handle handle, SpatialTransformerDescriptor stDesc, + void* alpha, TensorDescriptor xDesc, void* x, void* beta, + TensorDescriptor dxDesc, void* dx, void* alphaDgrid, + TensorDescriptor dyDesc, void* dy, void* grid, + void* betaDgrid, void* dgrid) + + # Fused Ops + int cudnnCreateFusedOpsConstParamPack( + FusedOpsConstParamPack* constPack, int ops) + int cudnnDestroyFusedOpsConstParamPack(FusedOpsConstParamPack constPack) + int cudnnSetFusedOpsConstParamPackAttribute( + FusedOpsConstParamPack constPack, FusedOpsConstParamLabel paramLabel, + const void *param) + int cudnnGetFusedOpsConstParamPackAttribute( + const FusedOpsConstParamPack constPack, + FusedOpsConstParamLabel paramLabel, void *param, int *isNULL) + int cudnnCreateFusedOpsVariantParamPack( + FusedOpsVariantParamPack *varPack, FusedOps ops) + int cudnnDestroyFusedOpsVariantParamPack(FusedOpsVariantParamPack varPack) + int cudnnSetFusedOpsVariantParamPackAttribute( + 
FusedOpsVariantParamPack varPack, FusedOpsVariantParamLabel paramLabel, + void *ptr) + int cudnnGetFusedOpsVariantParamPackAttribute( + const FusedOpsVariantParamPack varPack, + FusedOpsVariantParamLabel paramLabel, void *ptr) + int cudnnCreateFusedOpsPlan(FusedOpsPlan *plan, FusedOps ops) + int cudnnDestroyFusedOpsPlan(FusedOpsPlan plan) + int cudnnMakeFusedOpsPlan( + Handle handle, FusedOpsPlan plan, + const FusedOpsConstParamPack constPack, size_t *workspaceSizeInBytes) + int cudnnFusedOpsExecute( + Handle handle, const FusedOpsPlan plan, + FusedOpsVariantParamPack varPack) + + # Build-time version + int CUDNN_VERSION + + # Constants + double _CUDNN_BN_MIN_EPSILON 'CUDNN_BN_MIN_EPSILON' cdef class CuDNNAlgoPerf: @@ -847,3 +848,4 @@ cpdef destroy(intptr_t handle): + diff --git a/install/cupy_builder/_features.py b/install/cupy_builder/_features.py index 078460816e4..d8a29eb9924 100644 --- a/install/cupy_builder/_features.py +++ b/install/cupy_builder/_features.py @@ -164,8 +164,8 @@ def get_features(ctx: Context) -> Dict[str, Feature]: 'cupyx.cusolver', 'cupy_backends.cuda.libs.curand_hip', 'cupy_backends.cuda.libs.nvrtc_hip', - 'cupy_backends.cuda.libs.cudnn', 'cupy_backends.cuda.libs.miopen', + 'cupy_backends.cuda.libs.cudnn', ], 'include': [ 'hip/hip_runtime_api.h', From c1b539620e66b0cce5ccb913754a7a0d77c24fb4 Mon Sep 17 00:00:00 2001 From: bmedishe Date: Sun, 3 Dec 2023 06:18:00 +0000 Subject: [PATCH 16/26] almost working build --- cupy_backends/cuda/libs/cudnn.pyx | 130 ++++++- cupy_backends/cuda/libs/miopen.pyx | 598 +++++------------------------ cupy_backends/cupy_cudnn.h | 5 +- cupy_backends/cupy_miopen.h | 20 + install/cupy_builder/_features.py | 4 +- 5 files changed, 247 insertions(+), 510 deletions(-) create mode 100644 cupy_backends/cupy_miopen.h diff --git a/cupy_backends/cuda/libs/cudnn.pyx b/cupy_backends/cuda/libs/cudnn.pyx index 835b8e0570c..f0474899fa2 100644 --- a/cupy_backends/cuda/libs/cudnn.pyx +++ b/cupy_backends/cuda/libs/cudnn.pyx @@ 
-6,10 +6,124 @@ cimport cython # NOQA from libcpp cimport vector from cupy_backends.cuda.api cimport driver -from cupy_backends.cuda.api cimport runtime +from cupy_backends.cuda.api import runtime from cupy_backends.cuda cimport stream as stream_module IF CUPY_USE_GEN_HIP_CODE: from cupy_backends.cuda.libs.miopen import * + cdef extern from '../../cupy_cudnn.h' nogil: + # Types + ctypedef int ActivationMode 'miopenActivationMode_t' + ctypedef int AddMode 'cudnnAddMode_t' + ctypedef int BatchNormMode 'miopenBatchNormMode_t' + ctypedef int BatchNormOps 'cudnnBatchNormOps_t' + ctypedef int ConvolutionBwdDataAlgo 'miopenBwdDataAlgorithm_t' + ctypedef int ConvolutionBwdDataPreference \ + 'cudnnConvolutionBwdDataPreference_t' + ctypedef struct ConvolutionBwdDataAlgoPerf \ + 'cudnnConvolutionBwdDataAlgoPerf_t': # NOQA: E125 + int algo + int status + float time + size_t memory + ctypedef struct ConvolutionBwdDataAlgoPerf_v7 \ + 'cudnnConvolutionBwdDataAlgoPerf_v7_t': # NOQA: E125 + int algo + int status + float time + size_t memory + int determinism + int mathType + ctypedef int ConvolutionBwdFilterAlgo 'miopenConvBwdWeightsAlgorithm_t' + ctypedef int ConvolutionBwdFilterPreference \ + 'cudnnConvolutionBwdFilterPreference_t' + ctypedef struct ConvolutionBwdFilterAlgoPerf \ + 'cudnnConvolutionBwdFilterAlgoPerf_t': # NOQA: E125 + int algo + int status + float time + size_t memory + ctypedef struct ConvolutionBwdFilterAlgoPerf_v7 \ + 'cudnnConvolutionBwdFilterAlgoPerf_v7_t': # NOQA: E125 + int algo + int status + float time + size_t memory + int determinism + int mathType + ctypedef int ConvolutionFwdAlgo 'miopenConvolutionFwdAlgorithm_t' + ctypedef int ConvolutionFwdPreference 'cudnnConvolutionFwdPreference_t' + ctypedef struct ConvolutionFwdAlgoPerf 'cudnnConvolutionFwdAlgoPerf_t': + int algo + int status + float time + size_t memory + ctypedef struct ConvolutionFwdAlgoPerf_v7 \ + 'cudnnConvolutionFwdAlgoPerf_v7_t': # NOQA: E125 + int algo + int status + float time + size_t 
memory + int determinism + int mathType + ctypedef int ConvolutionMode 'miopenConvolutionMode_t' + ctypedef int DataType 'miopenDataType_t' + ctypedef int MathType 'cudnnMathType_t' + ctypedef int DirectionMode 'miopenRNNDirectionMode_t' + ctypedef int NanPropagation 'miopenNanPropagation_t' + ctypedef int PoolingMode 'miopenPoolingMode_t' + ctypedef int RNNInputMode 'miopenRNNInputMode_t' + ctypedef int CTCLossAlgo 'miopenCTCLossAlgo_t' + ctypedef int RNNMode 'miopenRNNMode_t' + ctypedef int RNNAlgo 'miopenRNNAlgo_t' + ctypedef int RNNDataLayout 'cudnnRNNDataLayout_t' + ctypedef int RNNPaddingMode 'cudnnRNNPaddingMode_t' + ctypedef int SoftmaxAlgorithm 'miopenSoftmaxAlgorithm_t' + ctypedef int SoftmaxMode 'miopenSoftmaxMode_t' + ctypedef int Status 'miopenStatus_t' + ctypedef int TensorFormat 'cudnnTensorFormat_t' + ctypedef int OpTensorOp 'miopenTensorOp_t' + + ctypedef int ReduceTensorOp 'miopenReduceTensorOp_t' + ctypedef int ReduceTensorIndices 'miopenReduceTensorIndices_t' + ctypedef int IndicesType 'miopenIndicesType_t' + ctypedef int ErrQueryMode 'cudnnErrQueryMode_t' + ctypedef int FusedOps 'cudnnFusedOps_t' + ctypedef int FusedOpsConstParamLabel 'cudnnFusedOpsConstParamLabel_t' + ctypedef int FusedOpsPointerPlaceHolder 'cudnnFusedOpsPointerPlaceHolder_t' + ctypedef int FusedOpsVariantParamLabel 'cudnnFusedOpsVariantParamLabel_t' + ctypedef struct RuntimeTag 'cudnnRuntimeTag_t' + + ctypedef void* ActivationDescriptor 'miopenActivationDescriptor_t' + ctypedef void* ConvolutionDescriptor 'miopenConvolutionDescriptor_t' + ctypedef void* DropoutDescriptor 'miopenDropoutDescriptor_t' + ctypedef void* FilterDescriptor 'cudnnFilterDescriptor_t' + ctypedef void* Handle 'miopenHandle_t' + #cdef void* Handle 'miopenHandle_t' + ctypedef void* PoolingDescriptor 'miopenPoolingDescriptor_t' + ctypedef void* CTCLossDescriptor 'miopenCTCLossDescriptor_t' + ctypedef void* RNNDescriptor 'miopenRNNDescriptor_t' + ctypedef void* RNNDataDescriptor 'miopenRNNDataDescriptor_t' + 
ctypedef void* PersistentRNNPlan 'cudnnPersistentRNNPlan_t' + ctypedef void* TensorDescriptor 'miopenTensorDescriptor_t' + ctypedef void* OpTensorDescriptor 'miopenTensorDescriptor_t' + ctypedef void* ReduceTensorDescriptor 'miopenReduceTensorDescriptor_t' + ctypedef void* SpatialTransformerDescriptor \ + 'cudnnSpatialTransformerDescriptor_t' + ctypedef void* SamplerType 'cudnnSamplerType_t' + ctypedef void* FusedOpsConstParamPack 'cudnnFusedOpsConstParamPack_t' + ctypedef void* FusedOpsVariantParamPack 'cudnnFusedOpsVariantParamPack_t' + ctypedef void* FusedOpsPlan 'cudnnFusedOpsPlan_t' + # Error handling + const char* miopenGetErrorString(Status status) + # Version + size_t miopenGetVersion() + # Runtime error checking + int cudnnQueryRuntimeError(Handle handle, Status *rstatus, + ErrQueryMode mode, RuntimeTag *tag) + # Initialization and CUDA cooperation + int miopenCreate(Handle* handle) + int miopenDestroy(Handle handle) + ELSE: ############################################################################### # Extern @@ -760,7 +874,7 @@ class CuDNNError(RuntimeError): def __init__(self, int status): self.status = status - if runtime._is_hip_environment: + if runtime._is_hip: msg = miopenGetErrorString(status) else: msg = cudnnGetErrorString(status) @@ -804,8 +918,8 @@ def get_build_version(): ############################################################################### cpdef size_t getVersion() except? 0: - if runtime._is_hip_environment: - return miopen.miopenGetVersion() + if runtime._is_hip: + return miopenGetVersion() else: return cudnnGetVersion() @@ -830,8 +944,8 @@ cpdef queryRuntimeError(intptr_t handle, int mode): cpdef intptr_t create() except? 0: cdef Handle handle with nogil: - if runtime._is_hip_environment: - status = miopen.miopenCreate(&handle) + if runtime._is_hip: + status = miopenCreate(&handle) else: status = cudnnCreate(&handle) check_status(status) @@ -840,8 +954,8 @@ cpdef intptr_t create() except? 
0: cpdef destroy(intptr_t handle): with nogil: - if runtime._is_hip_environment: - status = miopen.miopenDestroy(handle) + if runtime._is_hip: + status = miopenDestroy(handle) else: status = cudnnDestroy(handle) check_status(status) diff --git a/cupy_backends/cuda/libs/miopen.pyx b/cupy_backends/cuda/libs/miopen.pyx index cd68ca9f693..ab4b9b9b693 100644 --- a/cupy_backends/cuda/libs/miopen.pyx +++ b/cupy_backends/cuda/libs/miopen.pyx @@ -13,64 +13,15 @@ from cupy_backends.cuda cimport stream as stream_module # Extern ############################################################################### -cdef extern from '../../cupy_cudnn.h' nogil: +cdef extern from '../../cupy_miopen.h' nogil: # Types ctypedef int ActivationMode 'miopenActivationMode_t' - ctypedef int AddMode 'cudnnAddMode_t' ctypedef int BatchNormMode 'miopenBatchNormMode_t' - ctypedef int BatchNormOps 'cudnnBatchNormOps_t' ctypedef int ConvolutionBwdDataAlgo 'miopenBwdDataAlgorithm_t' - ctypedef int ConvolutionBwdDataPreference \ - 'cudnnConvolutionBwdDataPreference_t' - ctypedef struct ConvolutionBwdDataAlgoPerf \ - 'cudnnConvolutionBwdDataAlgoPerf_t': # NOQA: E125 - int algo - int status - float time - size_t memory - ctypedef struct ConvolutionBwdDataAlgoPerf_v7 \ - 'cudnnConvolutionBwdDataAlgoPerf_v7_t': # NOQA: E125 - int algo - int status - float time - size_t memory - int determinism - int mathType ctypedef int ConvolutionBwdFilterAlgo 'miopenConvBwdWeightsAlgorithm_t' - ctypedef int ConvolutionBwdFilterPreference \ - 'cudnnConvolutionBwdFilterPreference_t' - ctypedef struct ConvolutionBwdFilterAlgoPerf \ - 'cudnnConvolutionBwdFilterAlgoPerf_t': # NOQA: E125 - int algo - int status - float time - size_t memory - ctypedef struct ConvolutionBwdFilterAlgoPerf_v7 \ - 'cudnnConvolutionBwdFilterAlgoPerf_v7_t': # NOQA: E125 - int algo - int status - float time - size_t memory - int determinism - int mathType ctypedef int ConvolutionFwdAlgo 'miopenConvolutionFwdAlgorithm_t' - ctypedef int 
ConvolutionFwdPreference 'cudnnConvolutionFwdPreference_t' - ctypedef struct ConvolutionFwdAlgoPerf 'cudnnConvolutionFwdAlgoPerf_t': - int algo - int status - float time - size_t memory - ctypedef struct ConvolutionFwdAlgoPerf_v7 \ - 'cudnnConvolutionFwdAlgoPerf_v7_t': # NOQA: E125 - int algo - int status - float time - size_t memory - int determinism - int mathType ctypedef int ConvolutionMode 'miopenConvolutionMode_t' ctypedef int DataType 'miopenDataType_t' - ctypedef int MathType 'cudnnMathType_t' ctypedef int DirectionMode 'miopenRNNDirectionMode_t' ctypedef int NanPropagation 'miopenNanPropagation_t' ctypedef int PoolingMode 'miopenPoolingMode_t' @@ -78,43 +29,26 @@ cdef extern from '../../cupy_cudnn.h' nogil: ctypedef int CTCLossAlgo 'miopenCTCLossAlgo_t' ctypedef int RNNMode 'miopenRNNMode_t' ctypedef int RNNAlgo 'miopenRNNAlgo_t' - ctypedef int RNNDataLayout 'cudnnRNNDataLayout_t' - ctypedef int RNNPaddingMode 'cudnnRNNPaddingMode_t' ctypedef int SoftmaxAlgorithm 'miopenSoftmaxAlgorithm_t' ctypedef int SoftmaxMode 'miopenSoftmaxMode_t' ctypedef int Status 'miopenStatus_t' - ctypedef int TensorFormat 'cudnnTensorFormat_t' ctypedef int OpTensorOp 'miopenTensorOp_t' ctypedef int ReduceTensorOp 'miopenReduceTensorOp_t' ctypedef int ReduceTensorIndices 'miopenReduceTensorIndices_t' ctypedef int IndicesType 'miopenIndicesType_t' - ctypedef int ErrQueryMode 'cudnnErrQueryMode_t' - ctypedef int FusedOps 'cudnnFusedOps_t' - ctypedef int FusedOpsConstParamLabel 'cudnnFusedOpsConstParamLabel_t' - ctypedef int FusedOpsPointerPlaceHolder 'cudnnFusedOpsPointerPlaceHolder_t' - ctypedef int FusedOpsVariantParamLabel 'cudnnFusedOpsVariantParamLabel_t' - ctypedef struct RuntimeTag 'cudnnRuntimeTag_t' - ctypedef void* ActivationDescriptor 'miopenActivationDescriptor_t' ctypedef void* ConvolutionDescriptor 'miopenConvolutionDescriptor_t' ctypedef void* DropoutDescriptor 'miopenDropoutDescriptor_t' - ctypedef void* FilterDescriptor 'cudnnFilterDescriptor_t' ctypedef void* 
Handle 'miopenHandle_t' ctypedef void* PoolingDescriptor 'miopenPoolingDescriptor_t' ctypedef void* CTCLossDescriptor 'miopenCTCLossDescriptor_t' ctypedef void* RNNDescriptor 'miopenRNNDescriptor_t' ctypedef void* RNNDataDescriptor 'miopenRNNDataDescriptor_t' - ctypedef void* PersistentRNNPlan 'cudnnPersistentRNNPlan_t' ctypedef void* TensorDescriptor 'miopenTensorDescriptor_t' + ctypedef void* FilterDescriptor 'miopenTensorDescriptor_t' ctypedef void* OpTensorDescriptor 'miopenTensorDescriptor_t' ctypedef void* ReduceTensorDescriptor 'miopenReduceTensorDescriptor_t' - ctypedef void* SpatialTransformerDescriptor \ - 'cudnnSpatialTransformerDescriptor_t' - ctypedef void* SamplerType 'cudnnSamplerType_t' - ctypedef void* FusedOpsConstParamPack 'cudnnFusedOpsConstParamPack_t' - ctypedef void* FusedOpsVariantParamPack 'cudnnFusedOpsVariantParamPack_t' - ctypedef void* FusedOpsPlan 'cudnnFusedOpsPlan_t' # Error handling const char* miopenGetErrorString(Status status) @@ -123,8 +57,8 @@ cdef extern from '../../cupy_cudnn.h' nogil: size_t miopenGetVersion() # Runtime error checking - int cudnnQueryRuntimeError(Handle handle, Status *rstatus, - ErrQueryMode mode, RuntimeTag *tag) + #int cudnnQueryRuntimeError(Handle handle, Status *rstatus, + # ErrQueryMode mode, RuntimeTag *tag) # Initialization and CUDA cooperation int miopenCreate(Handle* handle) @@ -145,23 +79,9 @@ cdef extern from '../../cupy_cudnn.h' nogil: TensorDescriptor tensorDesc, DataType* dataType, int* n, int* c, int* h, int* w, int* nStride, int* cStride, int* hStride, int* wStride) - int cudnnSetTensorNdDescriptor( - TensorDescriptor tensorDesc, DataType dataType, int nbDims, - int* dimA, int* strideA) int miopenDestroyTensorDescriptor(TensorDescriptor tensorDesc) - int cudnnAddTensor_v3( - Handle handle, void* alpha, TensorDescriptor bDesc, - void* b, void* beta, TensorDescriptor yDesc, void* y) # Tensor operations - int cudnnCreateOpTensorDescriptor(OpTensorDescriptor* opTensorDesc) - int 
cudnnSetOpTensorDescriptor( - OpTensorDescriptor opTensorDesc, OpTensorOp opTensorOp, - DataType opTensorCompType, NanPropagation opTensorNanOpt) - int cudnnGetOpTensorDescriptor( - OpTensorDescriptor opTensorDesc, OpTensorOp* opTensorOp, - DataType* opTensorCompType, NanPropagation* opTensorNanOpt) - int cudnnDestroyOpTensorDescriptor(OpTensorDescriptor opTensorDesc) int miopenOpTensor( Handle handle, OpTensorDescriptor opTensorDesc, void* alpha1, TensorDescriptor aDesc, void* A, void* alpha2, @@ -201,184 +121,28 @@ cdef extern from '../../cupy_cudnn.h' nogil: Handle handle, TensorDescriptor yDesc, void* y, void* alpha) # Filter manipulation - int cudnnCreateFilterDescriptor(FilterDescriptor* filterDesc) - int cudnnSetFilter4dDescriptor_v4( - FilterDescriptor filterDesc, DataType dataType, - TensorFormat format, int k, int c, int h, int w) - int cudnnSetFilterNdDescriptor_v4( - FilterDescriptor filterDesc, DataType dataType, - TensorFormat format, int nbDims, const int filterDimA[]) - int cudnnGetFilterNdDescriptor_v4( - FilterDescriptor wDesc, int nbDimsRequested, DataType* dataType, - TensorFormat* format, int* nbDims, int filterDimA[]) - int cudnnDestroyFilterDescriptor(FilterDescriptor filterDesc) # Convolution int miopenCreateConvolutionDescriptor(ConvolutionDescriptor* convDesc) - int cudnnSetConvolutionMathType( - ConvolutionDescriptor convDesc, MathType mathType) - int cudnnGetConvolutionMathType( - ConvolutionDescriptor convDesc, MathType *mathType) int miopenSetConvolutionGroupCount( ConvolutionDescriptor convDesc, int groupCount) int miopenGetConvolutionGroupCount( ConvolutionDescriptor convDesc, int *groupCount) - int cudnnSetConvolution2dDescriptor_v4( - ConvolutionDescriptor convDesc, int pad_h, int pad_w, int u, - int v, int dilation_h, int dilation_w, ConvolutionMode mode) - int cudnnSetConvolution2dDescriptor_v5( - ConvolutionDescriptor convDesc, int pad_h, int pad_w, int u, - int v, int dilation_h, int dilation_w, ConvolutionMode mode, - DataType 
computeType) - int cudnnSetConvolutionNdDescriptor_v3( - ConvolutionDescriptor convDesc, int arrayLength, int* padA, - int* filterStrideA, int* dilationA, ConvolutionMode mode, - DataType dataType) int miopenDestroyConvolutionDescriptor(ConvolutionDescriptor conDesc) - int cudnnFindConvolutionForwardAlgorithm( - Handle handle, TensorDescriptor xDesc, FilterDescriptor wDesc, - ConvolutionDescriptor convDesc, TensorDescriptor yDesc, - int requestedAlgoCount, int* returnedAlgoCount, - ConvolutionFwdAlgoPerf* perfResults) - int cudnnFindConvolutionForwardAlgorithmEx( - Handle handle, TensorDescriptor xDesc, void* x, - FilterDescriptor wDesc, void* w, ConvolutionDescriptor convDesc, - TensorDescriptor yDesc, void* y, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionFwdAlgoPerf* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnFindConvolutionForwardAlgorithmEx_v7( - Handle handle, TensorDescriptor xDesc, void* x, - FilterDescriptor wDesc, void* w, ConvolutionDescriptor convDesc, - TensorDescriptor yDesc, void* y, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionFwdAlgoPerf_v7* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnGetConvolutionForwardAlgorithm_v6( - Handle handle, TensorDescriptor srcDesc, - FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, - TensorDescriptor destDesc, ConvolutionFwdPreference preference, - size_t memoryLimitInbytes, ConvolutionFwdAlgo* algo) - int cudnnGetConvolutionForwardAlgorithm_v7( - Handle handle, TensorDescriptor srcDesc, - FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, - TensorDescriptor destDesc, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionFwdAlgoPerf_v7* perfResults) int miopenConvolutionForwardGetWorkSpaceSize( Handle handle, TensorDescriptor srcDesc, FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, TensorDescriptor destDesc, size_t* sizeInBytes) - int cudnnConvolutionForward( - Handle handle, void* alpha, 
TensorDescriptor srcDesc, - void* srcData, FilterDescriptor filterDesc, void* filterData, - ConvolutionDescriptor convDesc, ConvolutionFwdAlgo algo, - void* workSpace, size_t workSpaceSizeInBytes, void* beta, - TensorDescriptor destDesc, void* destData) - int cudnnConvolutionBackwardBias( - Handle handle, void* alpha, - TensorDescriptor srcDesc, void* srcData, void* beta, - TensorDescriptor destDesc, void* destData) - int cudnnFindConvolutionBackwardFilterAlgorithm( - Handle handle, TensorDescriptor xDesc, TensorDescriptor dyDesc, - ConvolutionDescriptor convDesc, FilterDescriptor dwDesc, - int requestedAlgoCount, int* returnedAlgoCount, - ConvolutionBwdFilterAlgoPerf* perfResults) - int cudnnFindConvolutionBackwardFilterAlgorithmEx( - Handle handle, TensorDescriptor xDesc, void* x, - TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, - FilterDescriptor dwDesc, void* dw, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionBwdFilterAlgoPerf* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnFindConvolutionBackwardFilterAlgorithmEx_v7( - Handle handle, TensorDescriptor xDesc, void* x, - TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, - FilterDescriptor dwDesc, void* dw, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionBwdFilterAlgoPerf_v7* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnGetConvolutionBackwardFilterAlgorithm_v6( - Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, FilterDescriptor filterDesc, - ConvolutionBwdFilterPreference preference, - size_t memoryLimitInbytes, ConvolutionBwdFilterAlgo* algo) - int cudnnGetConvolutionBackwardFilterAlgorithm_v7( - Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, FilterDescriptor gradDesc, - int requestedAlgoCount, int* returnedAlgoCount, - ConvolutionBwdFilterAlgoPerf_v7* perfResults) - int 
cudnnGetConvolutionBackwardFilterWorkspaceSize( - Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, FilterDescriptor filterDesc, - ConvolutionBwdFilterAlgo algo, size_t* sizeInBytes) - int cudnnConvolutionBackwardFilter_v3( - Handle handle, void* alpha, - TensorDescriptor srcDesc, void* srcData, - TensorDescriptor diffDesc, void* diffData, - ConvolutionDescriptor convDesc, ConvolutionBwdFilterAlgo algo, - void* workSpace, size_t workSpaceSizeInBytes, void* beta, - FilterDescriptor gradDesc, void* gradData) - int cudnnGetConvolutionBackwardDataAlgorithm_v6( - Handle handle, FilterDescriptor filterDesc, - TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, TensorDescriptor gradDesc, - ConvolutionBwdDataPreference preference, - size_t memoryLimitInbytes, ConvolutionBwdDataAlgo* algo) - int cudnnGetConvolutionBackwardDataAlgorithm_v7( - Handle handle, TensorDescriptor filterDesc, TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, FilterDescriptor gradDesc, - int requestedAlgoCount, int* returnedAlgoCount, - ConvolutionBwdDataAlgoPerf_v7* perfResults) - int cudnnFindConvolutionBackwardDataAlgorithm( - Handle handle, TensorDescriptor wDesc, TensorDescriptor dyDesc, - ConvolutionDescriptor convDesc, FilterDescriptor dxDesc, - int requestedAlgoCount, int* returnedAlgoCount, - ConvolutionBwdDataAlgoPerf* perfResults) - int cudnnFindConvolutionBackwardDataAlgorithmEx( - Handle handle, FilterDescriptor wDesc, void* w, - TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, - TensorDescriptor dxDesc, void* dx, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionBwdDataAlgoPerf* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnFindConvolutionBackwardDataAlgorithmEx_v7( - Handle handle, FilterDescriptor wDesc, void* w, - TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, - TensorDescriptor dxDesc, void* dx, int requestedAlgoCount, - int* 
returnedAlgoCount, ConvolutionBwdDataAlgoPerf_v7* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) int miopenConvolutionBackwardDataGetWorkSpaceSize( Handle handle, FilterDescriptor filterDesc, TensorDescriptor diffDesc, ConvolutionDescriptor convDesc, TensorDescriptor gradDesc, size_t* sizeInBytes) - int cudnnConvolutionBackwardData_v3( - Handle handle, void* alpha, - FilterDescriptor filterDesc, void* filterData, - TensorDescriptor diffDesc, void* diffData, - ConvolutionDescriptor convDesc, ConvolutionBwdDataAlgo algo, - void* workSpace, size_t workSpaceSizeInBytes, void* beta, - TensorDescriptor gradDesc, void* gradData) # Pooling int miopenCreatePoolingDescriptor(PoolingDescriptor* desc) - int cudnnSetPooling2dDescriptor_v4( - PoolingDescriptor poolingDesc, PoolingMode mode, - NanPropagation maxpoolingNanOpt, int windowHeight, int windowWidth, - int verticalPadding, int horizontalPadding, int verticalStride, - int horizontalStride) - int cudnnSetPoolingNdDescriptor_v4( - PoolingDescriptor poolingDesc, PoolingMode mode, - NanPropagation maxpoolingNanOpt, int nbDims, - int* windowDimA, int* paddingA, int* strideA) int miopenDestroyPoolingDescriptor(PoolingDescriptor poolingDesc) - int cudnnPoolingForward( - Handle handle, PoolingDescriptor poolingDesc, void* alpha, - TensorDescriptor srcDesc, void* srcData, void* beta, - TensorDescriptor dstDesc, void* dstData) - int cudnnPoolingBackward( - Handle handle, PoolingDescriptor poolingDesc, void* alpha, - TensorDescriptor srcDesc, void* srcData, - TensorDescriptor srcDiffDesc, void* srcDiffData, - TensorDescriptor destDesc, void* destData, void* beta, - TensorDescriptor destDiffDesc, void* destDiffData) - # Batch Normalization int miopenDeriveBNTensorDescriptor( TensorDescriptor derivedBnDesc, TensorDescriptor xDesc, @@ -410,68 +174,6 @@ cdef extern from '../../cupy_cudnn.h' nogil: void* dBnScaleResult, void* dBnBiasResult, double epsilon, void* savedMean, void* savedInvVariance) - int 
cudnnBatchNormalizationForwardTrainingEx( - Handle handle, - BatchNormMode mode, BatchNormOps bnOps, - void* alpha, void* beta, - TensorDescriptor xDesc, void* x, - TensorDescriptor zDesc, void* z, - TensorDescriptor yDesc, void* y, - TensorDescriptor bnScaleBiasMeanVarDesc, - void* bnScale, void* bnBias, - double exponentialAverageFactor, - void* resultRunningMean, void* resultRunningVariance, - double epsilon, - void* resultSaveMean, void* resultSaveInvVariance, - ActivationDescriptor activationDesc, - void* workspace, size_t workSpaceSizeInBytes, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - int cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - Handle handle, - BatchNormMode mode, BatchNormOps bnOps, - TensorDescriptor xDesc, - TensorDescriptor zDesc, - TensorDescriptor yDesc, - TensorDescriptor bnScaleBiasMeanVarDesc, - ActivationDescriptor activationDesc, - size_t* sizeInBytes) - int cudnnBatchNormalizationBackwardEx( - Handle handle, - BatchNormMode mode, BatchNormOps bnops, - void* alphaDataDiff, void* betaDataDiff, - void* alphaParamDiff, void* betaParamDiff, - TensorDescriptor xDesc, void* x, - TensorDescriptor yDesc, void* y, - TensorDescriptor dyDesc, void* dy, - TensorDescriptor dzDesc, void* dz, - TensorDescriptor dxDesc, void* dx, - TensorDescriptor dBnScaleBiasDesc, - void* bnScaleData, void* bnBiasData, - void* dBnScaleData, void* dBnBiasData, - double epsilon, - void* savedMean, void* savedInvVariance, - ActivationDescriptor activationDesc, - void* workspace, size_t workSpaceSizeInBytes, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - int cudnnGetBatchNormalizationBackwardExWorkspaceSize( - Handle handle, - BatchNormMode mode, - BatchNormOps bnOps, - TensorDescriptor xDesc, - TensorDescriptor yDesc, - TensorDescriptor dyDesc, - TensorDescriptor dzDesc, - TensorDescriptor dxDesc, - TensorDescriptor dBnScaleBiasDesc, - ActivationDescriptor activationDesc, - size_t* sizeInBytes) - int 
cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - Handle handle, - BatchNormMode mode, - BatchNormOps bnOps, - ActivationDescriptor activationDesc, - TensorDescriptor xDesc, - size_t* sizeInBytes) # Activation int miopenCreateActivationDescriptor( @@ -490,16 +192,6 @@ cdef extern from '../../cupy_cudnn.h' nogil: void* alpha, TensorDescriptor srcDesc, void* srcData, TensorDescriptor srcDiffDesc, void* srcDiffData, void* beta, TensorDescriptor destDiffDesc, void* destDiffData) - int cudnnActivationForward_v4( - Handle handle, ActivationDescriptor activationDesc, void* alpha, - TensorDescriptor srcDesc, void* srcData, void* beta, - TensorDescriptor dstDesc, void* dstData) - int cudnnActivationBackward_v4( - Handle handle, ActivationDescriptor activationDesc, void* alpha, - TensorDescriptor srcDesc, void* srcData, - TensorDescriptor srcDiffDesc, void* srcDiffData, - TensorDescriptor destDesc, void* destData, void* beta, - TensorDescriptor destDiffDesc, void* destDiffData) # Dropout int miopenCreateDropoutDescriptor(DropoutDescriptor* desc) @@ -507,26 +199,10 @@ cdef extern from '../../cupy_cudnn.h' nogil: int miopenDropoutGetStatesSize(Handle handle, size_t* sizeInBytes) int miopenDropoutGetReserveSpaceSize( TensorDescriptor xDesc, size_t* sizeInBytes) - int cudnnSetDropoutDescriptor( - DropoutDescriptor dropoutDesc, Handle handle, float dropout, - void* states, size_t stateSizeInBytes, unsigned long long seed) - int cudnnDropoutForward( - Handle handle, DropoutDescriptor dropoutDesc, - TensorDescriptor srcDesc, void* srcData, - TensorDescriptor dstDesc, void* dstData, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - int cudnnDropoutBackward( - Handle handle, DropoutDescriptor dropoutDesc, - TensorDescriptor dydesc, void* dy, TensorDescriptor dxdesc, - void* dx, void* reserveSpace, size_t reserveSpaceSizeInBytes) # CTC int miopenCreateCTCLossDescriptor(CTCLossDescriptor* ctcLossDesc) int miopenDestroyCTCLossDescriptor(CTCLossDescriptor ctcLossDesc) - int 
cudnnSetCTCLossDescriptor( - CTCLossDescriptor ctcLossDesc, DataType dataType) - int cudnnGetCTCLossDescriptor( - CTCLossDescriptor ctcLossDesc, DataType* dataType) int miopenGetCTCLossWorkspaceSize( Handle handle, TensorDescriptor probsDesc, TensorDescriptor gradientsDesc, int* labels, @@ -541,36 +217,6 @@ cdef extern from '../../cupy_cudnn.h' nogil: # RNN int miopenCreateRNNDescriptor(RNNDescriptor* rnnDesc) int miopenDestroyRNNDescriptor(RNNDescriptor rnnDesc) - int cudnnCreatePersistentRNNPlan( - RNNDescriptor rnnDesc, - const int minibatch, DataType dataType, - PersistentRNNPlan* plan) - int cudnnSetPersistentRNNPlan( - RNNDescriptor rnnDesc, PersistentRNNPlan plan) - int cudnnDestroyPersistentRNNPlan(PersistentRNNPlan plan) - int cudnnSetRNNDescriptor_v5( - RNNDescriptor rnnDesc, int hiddenSize, - int numLayers, DropoutDescriptor dropoutDesc, RNNInputMode inputMode, - DirectionMode direction, RNNMode mode, DataType dataType) - int cudnnSetRNNDescriptor_v6( - Handle handle, RNNDescriptor rnnDesc, int hiddenSize, - int numLayers, DropoutDescriptor dropoutDesc, RNNInputMode inputMode, - DirectionMode direction, RNNMode mode, RNNAlgo algo, DataType dataType) - int cudnnSetRNNPaddingMode( - RNNDescriptor rnnDesc, RNNPaddingMode paddingMode) - int cudnnGetRNNPaddingMode( - RNNDescriptor rnnDesc, RNNPaddingMode* paddingMode) - int cudnnCreateRNNDataDescriptor(RNNDataDescriptor* RNNDataDesc) - int cudnnDestroyRNNDataDescriptor(RNNDataDescriptor RNNDataDesc) - int cudnnSetRNNDataDescriptor( - RNNDataDescriptor RNNDataDesc, DataType dataType, RNNDataLayout layout, - int maxSeqLength, int batchSize, int vectorSize, - const int seqLengthArray[], void *paddingFill) - int cudnnGetRNNDataDescriptor( - RNNDataDescriptor RNNDataDesc, DataType* dataType, - RNNDataLayout* layout, int* maxSeqLength, int* batchSize, - int* vectorSize, int arrayLengthRequested, int seqLengthArray[], - void* paddingFill) int miopenGetRNNWorkspaceSize( Handle handle, RNNDescriptor rnnDesc, int 
seqLength, TensorDescriptor* xDesc, size_t* sizeInBytes) @@ -580,16 +226,6 @@ cdef extern from '../../cupy_cudnn.h' nogil: int miopenGetRNNParamsSize( Handle handle, RNNDescriptor rnnDesc, TensorDescriptor xDesc, size_t* sizeInBytes, DataType dataType) - int cudnnGetRNNLinLayerMatrixParams( - Handle handle, RNNDescriptor rnnDesc, int layer, - TensorDescriptor xDesc, FilterDescriptor wDesc, void* w, - int linLayerID, FilterDescriptor linLayerMatDesc, - void** linLayerMat) - int cudnnGetRNNLinLayerBiasParams( - Handle handle, RNNDescriptor rnnDesc, int layer, - TensorDescriptor xDesc, FilterDescriptor wDesc, void* w, - int linLayerID, FilterDescriptor linLayerBiasDesc, - void** linLayerBias) int miopenRNNForwardInference( Handle handle, RNNDescriptor rnnDesc, int seqLength, TensorDescriptor* xDesc, @@ -605,137 +241,105 @@ cdef extern from '../../cupy_cudnn.h' nogil: TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, void* cy, void* workspace, size_t workSpaceSizeInBytes, void* reserveSpace, size_t reserveSpaceSizeInBytes) - int cudnnRNNBackwardData( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* yDesc, void* y, - TensorDescriptor* dyDesc, void* dy, - TensorDescriptor dhyDesc, void* dhy, - TensorDescriptor dcyDesc, void* dcy, - FilterDescriptor wDesc, void* w, - TensorDescriptor hxDesc, void* hx, - TensorDescriptor cxDesc, void* cx, - TensorDescriptor* dxDesc, void* dx, - TensorDescriptor dhxDesc, void* dhx, - TensorDescriptor dcxDesc, void* dcx, void* workspace, - size_t workSpaceSizeInBytes, void* reserveSpace, - size_t reserveSpaceSizeInBytes) - int cudnnRNNBackwardWeights( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* xDesc, void* x, TensorDescriptor hxDesc, void* hx, - TensorDescriptor* yDesc, void* y, - void* workspace, size_t workSpaceSizeInBytes, FilterDescriptor dwDesc, - void* dw, void* reserveSpace, size_t reserveSpaceSizeInBytes) - - int cudnnRNNForwardInferenceEx( - Handle handle, 
RNNDescriptor rnnDesc, - RNNDataDescriptor xDesc, const void* x, - TensorDescriptor hxDesc, const void* hx, - TensorDescriptor cxDesc, const void* cx, - FilterDescriptor wDesc, const void* w, - RNNDataDescriptor yDesc, void* y, - TensorDescriptor hyDesc, void* hy, - TensorDescriptor cyDesc, void* cy, - RNNDataDescriptor kDesc, const void* keys, - RNNDataDescriptor cDesc, void* cAttn, - RNNDataDescriptor iDesc, void* iAttn, - RNNDataDescriptor qDesc, void* queries, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnRNNForwardTrainingEx( - Handle handle, RNNDescriptor rnnDesc, - RNNDataDescriptor xDesc, const void* x, - TensorDescriptor hxDesc, const void* hx, - TensorDescriptor cxDesc, const void* cx, - FilterDescriptor wDesc, const void* w, - RNNDataDescriptor yDesc, void* y, - TensorDescriptor hyDesc, void* hy, - TensorDescriptor cyDesc, void* cy, - RNNDataDescriptor kDesc, const void* keys, - RNNDataDescriptor cDesc, void* cAttn, - RNNDataDescriptor iDesc, void* iAttn, - RNNDataDescriptor qDesc, void* queries, - void* workSpace, size_t workSpaceSizeInBytes, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - int cudnnRNNBackwardDataEx( - Handle handle, RNNDescriptor rnnDesc, - RNNDataDescriptor yDesc, const void* y, - RNNDataDescriptor dyDesc, const void* dy, - RNNDataDescriptor dcDesc, const void* dcAttn, - TensorDescriptor dhyDesc, const void* dhy, - TensorDescriptor dcyDesc, const void* dcy, - FilterDescriptor wDesc, const void* w, - TensorDescriptor hxDesc, const void* hx, - TensorDescriptor cxDesc, const void* cx, - RNNDataDescriptor dxDesc, void* dx, - TensorDescriptor dhxDesc, void* dhx, - TensorDescriptor dcxDesc, void* dcx, - RNNDataDescriptor dkDesc, void* dkeys, - void* workSpace, size_t workSpaceSizeInBytes, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - int cudnnRNNBackwardWeightsEx( - Handle handle, RNNDescriptor rnnDesc, - RNNDataDescriptor xDesc, const void* x, - TensorDescriptor hxDesc, const void* hx, - RNNDataDescriptor 
yDesc, const void* y, - void* workSpace, size_t workSpaceSizeInBytes, - FilterDescriptor dwDesc, void* dw, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - - # Spatial Transformer - int cudnnCreateSpatialTransformerDescriptor( - SpatialTransformerDescriptor* stDesc) - int cudnnDestroySpatialTransformerDescriptor( - SpatialTransformerDescriptor stDesc) - int cudnnSetSpatialTransformerNdDescriptor( - SpatialTransformerDescriptor stDesc, SamplerType samplerType, - DataType dataType, int nbDims, int dimA[]) - int cudnnSpatialTfGridGeneratorForward( - Handle handle, SpatialTransformerDescriptor stDesc, - void* theta, void* grid) - int cudnnSpatialTfGridGeneratorBackward( - Handle handle, SpatialTransformerDescriptor stDesc, - void* dgrid, void* dtheta) - int cudnnSpatialTfSamplerForward( - Handle handle, SpatialTransformerDescriptor stDesc, - void* alpha, TensorDescriptor xDesc, void* x, - void* grid, void* beta, TensorDescriptor yDesc, void* y) - int cudnnSpatialTfSamplerBackward( - Handle handle, SpatialTransformerDescriptor stDesc, - void* alpha, TensorDescriptor xDesc, void* x, void* beta, - TensorDescriptor dxDesc, void* dx, void* alphaDgrid, - TensorDescriptor dyDesc, void* dy, void* grid, - void* betaDgrid, void* dgrid) - - # Fused Ops - int cudnnCreateFusedOpsConstParamPack( - FusedOpsConstParamPack* constPack, int ops) - int cudnnDestroyFusedOpsConstParamPack(FusedOpsConstParamPack constPack) - int cudnnSetFusedOpsConstParamPackAttribute( - FusedOpsConstParamPack constPack, FusedOpsConstParamLabel paramLabel, - const void *param) - int cudnnGetFusedOpsConstParamPackAttribute( - const FusedOpsConstParamPack constPack, - FusedOpsConstParamLabel paramLabel, void *param, int *isNULL) - int cudnnCreateFusedOpsVariantParamPack( - FusedOpsVariantParamPack *varPack, FusedOps ops) - int cudnnDestroyFusedOpsVariantParamPack(FusedOpsVariantParamPack varPack) - int cudnnSetFusedOpsVariantParamPackAttribute( - FusedOpsVariantParamPack varPack, 
FusedOpsVariantParamLabel paramLabel, - void *ptr) - int cudnnGetFusedOpsVariantParamPackAttribute( - const FusedOpsVariantParamPack varPack, - FusedOpsVariantParamLabel paramLabel, void *ptr) - int cudnnCreateFusedOpsPlan(FusedOpsPlan *plan, FusedOps ops) - int cudnnDestroyFusedOpsPlan(FusedOpsPlan plan) - int cudnnMakeFusedOpsPlan( - Handle handle, FusedOpsPlan plan, - const FusedOpsConstParamPack constPack, size_t *workspaceSizeInBytes) - int cudnnFusedOpsExecute( - Handle handle, const FusedOpsPlan plan, - FusedOpsVariantParamPack varPack) # Build-time version - int CUDNN_VERSION + int HIP_VERSION # Constants - double _CUDNN_BN_MIN_EPSILON 'CUDNN_BN_MIN_EPSILON' + double _EPSILON 'EPSILON' + +cdef class CuDNNAlgoPerf: + + def __init__(self, algo, status, time, memory, determinism, mathType): + self.algo = algo + self.status = status + self.time = time + self.memory = memory + self.determinism = determinism + self.mathType = mathType + + +############################################################################### +# Error handling +############################################################################### + +class CuDNNError(RuntimeError): + + def __init__(self, int status): + self.status = status + msg = miopenGetErrorString(status) + super(CuDNNError, self).__init__( + 'cuDNN Error: {}'.format(msg.decode())) + self._infos = [] + + def add_info(self, info): + assert isinstance(info, str) + self._infos.append(info) + + def add_infos(self, infos): + assert isinstance(infos, list) + self._infos.extend(infos) + + def __str__(self): + base = super(CuDNNError, self).__str__() + return base + ''.join( + '\n ' + info for info in self._infos) + + def __reduce__(self): + return (type(self), (self.status,)) + + +@cython.profile(False) +cpdef inline check_status(int status): + if status != 0: + raise CuDNNError(status) + + +############################################################################### +# Build-time version 
+############################################################################### + +def get_build_version(): + return HIP_VERSION + + +############################################################################### +# Version +############################################################################### + +cpdef size_t getVersion() except? 0: + return miopenGetVersion() + + +############################################################################### +# Runtime error checking +############################################################################### + +#cpdef queryRuntimeError(intptr_t handle, int mode): +# cdef Status rstatus +# with nogil: +# status = cudnnQueryRuntimeError(handle, &rstatus, +# mode, 0) +# check_status(status) +# return rstatus + + +############################################################################### +# Initialization and CUDA cooperation +############################################################################### + +cpdef intptr_t create() except? 0: + cdef Handle handle + with nogil: + status = miopenCreate(&handle) + check_status(status) + return handle + + +cpdef destroy(intptr_t handle): + with nogil: + status = miopenDestroy(handle) + check_status(status) + + diff --git a/cupy_backends/cupy_cudnn.h b/cupy_backends/cupy_cudnn.h index f30147ac9e8..4e32789fe45 100644 --- a/cupy_backends/cupy_cudnn.h +++ b/cupy_backends/cupy_cudnn.h @@ -1,5 +1,6 @@ // This file is a stub header file of cudnn for Read the Docs. + #ifndef INCLUDE_GUARD_CUPY_CUDNN_H #define INCLUDE_GUARD_CUPY_CUDNN_H #if CUPY_USE_HIP @@ -17,15 +18,13 @@ #endif // #ifdef CUPY_NO_CUDA - - /////////////////////////////////////////////////////////////////////////////// // Definitions are for compatibility with cuDNN v5 and v6. 
/////////////////////////////////////////////////////////////////////////////// extern "C" { -#if defined(CUPY_NO_CUDA) || (CUDNN_VERSION < 6000) +#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 6000) typedef enum {} cudnnRNNAlgo_t; typedef enum {} cudnnReduceTensorOp_t; diff --git a/cupy_backends/cupy_miopen.h b/cupy_backends/cupy_miopen.h new file mode 100644 index 00000000000..9b6e5f74d79 --- /dev/null +++ b/cupy_backends/cupy_miopen.h @@ -0,0 +1,20 @@ +// This file is a stub header file of cudnn for Read the Docs. + + +#ifndef INCLUDE_GUARD_CUPY_CUDNN_H +#define INCLUDE_GUARD_CUPY_CUDNN_H +#if CUPY_USE_HIP + +#include "miopen/miopen.h" + +#elif !defined(CUPY_NO_CUDA) + +#include + +#elif defined(CUPY_NO_CUDA) + +#include "stub/cupy_cuda_common.h" +#include "stub/cupy_cudnn.h" + + +#endif // #ifdef CUPY_NO_CUDA diff --git a/install/cupy_builder/_features.py b/install/cupy_builder/_features.py index d8a29eb9924..2a26f842f7f 100644 --- a/install/cupy_builder/_features.py +++ b/install/cupy_builder/_features.py @@ -161,11 +161,11 @@ def get_features(ctx: Context) -> Dict[str, Feature]: 'cupy_backends.cuda.libs.nvtx', 'cupy_backends.cuda.libs.cusolver', 'cupy_backends.cuda.libs.cusolver_hip', - 'cupyx.cusolver', + #'cupyx.cusolver', 'cupy_backends.cuda.libs.curand_hip', 'cupy_backends.cuda.libs.nvrtc_hip', 'cupy_backends.cuda.libs.miopen', - 'cupy_backends.cuda.libs.cudnn', + #'cupy_backends.cuda.libs.cudnn', ], 'include': [ 'hip/hip_runtime_api.h', From 5f451a32cb664eee71e05cc9037a1339142bc82b Mon Sep 17 00:00:00 2001 From: bmedishe Date: Mon, 4 Dec 2023 18:54:36 +0000 Subject: [PATCH 17/26] update got miopenGetVersion --- cupy_backends/cuda/libs/cudnn.pyx | 2022 +++++++++++++++++++++++++--- cupy_backends/cuda/libs/miopen.pyx | 8 +- cupy_backends/cupy_miopen.h | 3 +- install/cupy_builder/_features.py | 4 +- 4 files changed, 1809 insertions(+), 228 deletions(-) diff --git a/cupy_backends/cuda/libs/cudnn.pyx b/cupy_backends/cuda/libs/cudnn.pyx index 
f0474899fa2..9e6a28d84da 100644 --- a/cupy_backends/cuda/libs/cudnn.pyx +++ b/cupy_backends/cuda/libs/cudnn.pyx @@ -6,124 +6,12 @@ cimport cython # NOQA from libcpp cimport vector from cupy_backends.cuda.api cimport driver -from cupy_backends.cuda.api import runtime +from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module IF CUPY_USE_GEN_HIP_CODE: from cupy_backends.cuda.libs.miopen import * - cdef extern from '../../cupy_cudnn.h' nogil: - # Types - ctypedef int ActivationMode 'miopenActivationMode_t' - ctypedef int AddMode 'cudnnAddMode_t' - ctypedef int BatchNormMode 'miopenBatchNormMode_t' - ctypedef int BatchNormOps 'cudnnBatchNormOps_t' - ctypedef int ConvolutionBwdDataAlgo 'miopenBwdDataAlgorithm_t' - ctypedef int ConvolutionBwdDataPreference \ - 'cudnnConvolutionBwdDataPreference_t' - ctypedef struct ConvolutionBwdDataAlgoPerf \ - 'cudnnConvolutionBwdDataAlgoPerf_t': # NOQA: E125 - int algo - int status - float time - size_t memory - ctypedef struct ConvolutionBwdDataAlgoPerf_v7 \ - 'cudnnConvolutionBwdDataAlgoPerf_v7_t': # NOQA: E125 - int algo - int status - float time - size_t memory - int determinism - int mathType - ctypedef int ConvolutionBwdFilterAlgo 'miopenConvBwdWeightsAlgorithm_t' - ctypedef int ConvolutionBwdFilterPreference \ - 'cudnnConvolutionBwdFilterPreference_t' - ctypedef struct ConvolutionBwdFilterAlgoPerf \ - 'cudnnConvolutionBwdFilterAlgoPerf_t': # NOQA: E125 - int algo - int status - float time - size_t memory - ctypedef struct ConvolutionBwdFilterAlgoPerf_v7 \ - 'cudnnConvolutionBwdFilterAlgoPerf_v7_t': # NOQA: E125 - int algo - int status - float time - size_t memory - int determinism - int mathType - ctypedef int ConvolutionFwdAlgo 'miopenConvolutionFwdAlgorithm_t' - ctypedef int ConvolutionFwdPreference 'cudnnConvolutionFwdPreference_t' - ctypedef struct ConvolutionFwdAlgoPerf 'cudnnConvolutionFwdAlgoPerf_t': - int algo - int status - float time - size_t memory - ctypedef struct 
ConvolutionFwdAlgoPerf_v7 \ - 'cudnnConvolutionFwdAlgoPerf_v7_t': # NOQA: E125 - int algo - int status - float time - size_t memory - int determinism - int mathType - ctypedef int ConvolutionMode 'miopenConvolutionMode_t' - ctypedef int DataType 'miopenDataType_t' - ctypedef int MathType 'cudnnMathType_t' - ctypedef int DirectionMode 'miopenRNNDirectionMode_t' - ctypedef int NanPropagation 'miopenNanPropagation_t' - ctypedef int PoolingMode 'miopenPoolingMode_t' - ctypedef int RNNInputMode 'miopenRNNInputMode_t' - ctypedef int CTCLossAlgo 'miopenCTCLossAlgo_t' - ctypedef int RNNMode 'miopenRNNMode_t' - ctypedef int RNNAlgo 'miopenRNNAlgo_t' - ctypedef int RNNDataLayout 'cudnnRNNDataLayout_t' - ctypedef int RNNPaddingMode 'cudnnRNNPaddingMode_t' - ctypedef int SoftmaxAlgorithm 'miopenSoftmaxAlgorithm_t' - ctypedef int SoftmaxMode 'miopenSoftmaxMode_t' - ctypedef int Status 'miopenStatus_t' - ctypedef int TensorFormat 'cudnnTensorFormat_t' - ctypedef int OpTensorOp 'miopenTensorOp_t' - - ctypedef int ReduceTensorOp 'miopenReduceTensorOp_t' - ctypedef int ReduceTensorIndices 'miopenReduceTensorIndices_t' - ctypedef int IndicesType 'miopenIndicesType_t' - ctypedef int ErrQueryMode 'cudnnErrQueryMode_t' - ctypedef int FusedOps 'cudnnFusedOps_t' - ctypedef int FusedOpsConstParamLabel 'cudnnFusedOpsConstParamLabel_t' - ctypedef int FusedOpsPointerPlaceHolder 'cudnnFusedOpsPointerPlaceHolder_t' - ctypedef int FusedOpsVariantParamLabel 'cudnnFusedOpsVariantParamLabel_t' - ctypedef struct RuntimeTag 'cudnnRuntimeTag_t' - - ctypedef void* ActivationDescriptor 'miopenActivationDescriptor_t' - ctypedef void* ConvolutionDescriptor 'miopenConvolutionDescriptor_t' - ctypedef void* DropoutDescriptor 'miopenDropoutDescriptor_t' - ctypedef void* FilterDescriptor 'cudnnFilterDescriptor_t' - ctypedef void* Handle 'miopenHandle_t' - #cdef void* Handle 'miopenHandle_t' - ctypedef void* PoolingDescriptor 'miopenPoolingDescriptor_t' - ctypedef void* CTCLossDescriptor 
'miopenCTCLossDescriptor_t' - ctypedef void* RNNDescriptor 'miopenRNNDescriptor_t' - ctypedef void* RNNDataDescriptor 'miopenRNNDataDescriptor_t' - ctypedef void* PersistentRNNPlan 'cudnnPersistentRNNPlan_t' - ctypedef void* TensorDescriptor 'miopenTensorDescriptor_t' - ctypedef void* OpTensorDescriptor 'miopenTensorDescriptor_t' - ctypedef void* ReduceTensorDescriptor 'miopenReduceTensorDescriptor_t' - ctypedef void* SpatialTransformerDescriptor \ - 'cudnnSpatialTransformerDescriptor_t' - ctypedef void* SamplerType 'cudnnSamplerType_t' - ctypedef void* FusedOpsConstParamPack 'cudnnFusedOpsConstParamPack_t' - ctypedef void* FusedOpsVariantParamPack 'cudnnFusedOpsVariantParamPack_t' - ctypedef void* FusedOpsPlan 'cudnnFusedOpsPlan_t' - # Error handling - const char* miopenGetErrorString(Status status) - # Version - size_t miopenGetVersion() - # Runtime error checking - int cudnnQueryRuntimeError(Handle handle, Status *rstatus, - ErrQueryMode mode, RuntimeTag *tag) - # Initialization and CUDA cooperation - int miopenCreate(Handle* handle) - int miopenDestroy(Handle handle) - + from cupy_backends.cuda.libs.cusolver_hip import _get_cuda_build_version + from cupy_backends.cuda.libs.cusolver_hip import _getVersion ELSE: ############################################################################### # Extern @@ -853,113 +741,1805 @@ ELSE: # Constants double _CUDNN_BN_MIN_EPSILON 'CUDNN_BN_MIN_EPSILON' - - -cdef class CuDNNAlgoPerf: - - def __init__(self, algo, status, time, memory, determinism, mathType): - self.algo = algo - self.status = status - self.time = time - self.memory = memory - self.determinism = determinism - self.mathType = mathType - - -############################################################################### -# Error handling -############################################################################### - -class CuDNNError(RuntimeError): - - def __init__(self, int status): - self.status = status - if runtime._is_hip: - msg = 
miopenGetErrorString(status) - else: + + + cdef class CuDNNAlgoPerf: + + def __init__(self, algo, status, time, memory, determinism, mathType): + self.algo = algo + self.status = status + self.time = time + self.memory = memory + self.determinism = determinism + self.mathType = mathType + + + ############################################################################### + # Error handling + ############################################################################### + + class CuDNNError(RuntimeError): + + def __init__(self, int status): + self.status = status msg = cudnnGetErrorString(status) - super(CuDNNError, self).__init__( - 'cuDNN Error: {}'.format(msg.decode())) - self._infos = [] - - def add_info(self, info): - assert isinstance(info, str) - self._infos.append(info) - - def add_infos(self, infos): - assert isinstance(infos, list) - self._infos.extend(infos) - - def __str__(self): - base = super(CuDNNError, self).__str__() - return base + ''.join( - '\n ' + info for info in self._infos) - - def __reduce__(self): - return (type(self), (self.status,)) - - -@cython.profile(False) -cpdef inline check_status(int status): - if status != 0: - raise CuDNNError(status) - - -############################################################################### -# Build-time version -############################################################################### - -def get_build_version(): - return CUDNN_VERSION - - -############################################################################### -# Version -############################################################################### - -cpdef size_t getVersion() except? 
0: - if runtime._is_hip: - return miopenGetVersion() - else: + super(CuDNNError, self).__init__( + 'cuDNN Error: {}'.format(msg.decode())) + self._infos = [] + + def add_info(self, info): + assert isinstance(info, str) + self._infos.append(info) + + def add_infos(self, infos): + assert isinstance(infos, list) + self._infos.extend(infos) + + def __str__(self): + base = super(CuDNNError, self).__str__() + return base + ''.join( + '\n ' + info for info in self._infos) + + def __reduce__(self): + return (type(self), (self.status,)) + + + @cython.profile(False) + cpdef inline check_status(int status): + if status != 0: + raise CuDNNError(status) + + + ############################################################################### + # Build-time version + ############################################################################### + + def get_build_version(): + return CUDNN_VERSION + + + ############################################################################### + # Version + ############################################################################### + + cpdef size_t getVersion() except? 0: return cudnnGetVersion() - - -############################################################################### -# Runtime error checking -############################################################################### - -cpdef queryRuntimeError(intptr_t handle, int mode): - cdef Status rstatus - with nogil: - status = cudnnQueryRuntimeError(handle, &rstatus, - mode, 0) - check_status(status) - return rstatus - - -############################################################################### -# Initialization and CUDA cooperation -############################################################################### - -cpdef intptr_t create() except? 
0: - cdef Handle handle - with nogil: - if runtime._is_hip: - status = miopenCreate(&handle) - else: + + + ############################################################################### + # Runtime error checking + ############################################################################### + + cpdef queryRuntimeError(intptr_t handle, int mode): + cdef Status rstatus + with nogil: + status = cudnnQueryRuntimeError(handle, &rstatus, + mode, 0) + check_status(status) + return rstatus + + + ############################################################################### + # Initialization and CUDA cooperation + ############################################################################### + + cpdef intptr_t create() except? 0: + cdef Handle handle + with nogil: status = cudnnCreate(&handle) - check_status(status) - return handle - - -cpdef destroy(intptr_t handle): - with nogil: - if runtime._is_hip: - status = miopenDestroy(handle) - else: + check_status(status) + return handle + + + cpdef destroy(intptr_t handle): + with nogil: status = cudnnDestroy(handle) - check_status(status) - - - - + check_status(status) + + + cpdef setStream(intptr_t handle, size_t stream): + # TODO(leofang): The support of stream capture is not mentioned at all in + # the cuDNN docs (as of CUDA 11.5), so we disable this functionality. + if not runtime._is_hip_environment and runtime.streamIsCapturing(stream): + raise NotImplementedError( + 'calling cuDNN API during stream capture is currently ' + 'unsupported') + + status = cudnnSetStream(handle, stream) + check_status(status) + + + cpdef size_t getStream(intptr_t handle) except? 
0: + cdef driver.Stream stream + status = cudnnGetStream(handle, &stream) + check_status(status) + return stream + + + cdef _setStream(intptr_t handle): + """Set current stream""" + setStream(handle, stream_module.get_current_stream_ptr()) + + ############################################################################### + # Tensor manipulation + ############################################################################### + + cpdef size_t createTensorDescriptor() except? 0: + cdef TensorDescriptor descriptor + status = cudnnCreateTensorDescriptor(&descriptor) + check_status(status) + return descriptor + + + cpdef setTensor4dDescriptor(size_t tensorDesc, int format, int dataType, + int n, int c, int h, int w): + status = cudnnSetTensor4dDescriptor( + tensorDesc, format, + dataType, n, c, h, w) + check_status(status) + + + cpdef setTensor4dDescriptorEx(size_t tensorDesc, int dataType, + int n, int c, int h, int w, int nStride, + int cStride, int hStride, int wStride): + status = cudnnSetTensor4dDescriptorEx( + tensorDesc, dataType, n, c, h, w, + nStride, cStride, hStride, wStride) + check_status(status) + + + cpdef tuple getTensor4dDescriptor(size_t tensorDesc): + cdef DataType dataType + cdef int n, c, h, w, nStride, cStride, hStride, wStride + status = cudnnGetTensor4dDescriptor( + tensorDesc, &dataType, + &n, &c, &h, &w, &nStride, &cStride, &hStride, &wStride) + check_status(status) + return dataType, n, c, h, w, nStride, cStride, hStride, wStride + + + cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, + size_t dimA, size_t strideA): + status = cudnnSetTensorNdDescriptor( + tensorDesc, dataType, nbDims, + dimA, strideA) + check_status(status) + + + cpdef destroyTensorDescriptor(size_t tensorDesc): + status = cudnnDestroyTensorDescriptor(tensorDesc) + check_status(status) + + + cpdef addTensor_v3(intptr_t handle, size_t alpha, size_t bDesc, + size_t b, size_t beta, size_t yDesc, size_t y): + _setStream(handle) + with nogil: + status = 
cudnnAddTensor_v3( + handle, alpha, bDesc, + b, beta, yDesc, y) + check_status(status) + + + ############################################################################### + # Tensor operations + ############################################################################### + + cpdef size_t createOpTensorDescriptor() except? 0: + cdef OpTensorDescriptor opTensorDesc + status = cudnnCreateOpTensorDescriptor(&opTensorDesc) + check_status(status) + return opTensorDesc + + + cpdef setOpTensorDescriptor(size_t opTensorDesc, int opTensorOp, + int opTensorCompType, int opTensorNanOpt): + status = cudnnSetOpTensorDescriptor( + opTensorDesc, opTensorOp, + opTensorCompType, opTensorNanOpt) + check_status(status) + + + cpdef getOpTensorDescriptor(size_t opTensorDesc): + cdef OpTensorOp opTensorOp + cdef DataType opTensorCompType + cdef NanPropagation opTensorNanOpt + status = cudnnGetOpTensorDescriptor( + opTensorDesc, &opTensorOp, &opTensorCompType, + &opTensorNanOpt) + check_status(status) + return opTensorOp, opTensorCompType, opTensorNanOpt + + + cpdef destroyOpTensorDescriptor(size_t opTensorDesc): + status = cudnnDestroyOpTensorDescriptor(opTensorDesc) + check_status(status) + + + cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, + size_t aDesc, size_t A, size_t alpha2, size_t bDesc, + size_t B, size_t beta, size_t cDesc, size_t C): + _setStream(handle) + with nogil: + status = cudnnOpTensor( + handle, opTensorDesc, alpha1, + aDesc, A, alpha2, + bDesc, B, beta, + cDesc, C) + check_status(status) + + + ############################################################################### + # Tensor reductions + ############################################################################### + + cpdef size_t createReduceTensorDescriptor() except? 
0: + cdef ReduceTensorDescriptor reduceTensorDesc + status = cudnnCreateReduceTensorDescriptor(&reduceTensorDesc) + check_status(status) + return reduceTensorDesc + + cpdef setReduceTensorDescriptor( + size_t reduceTensorDesc, int reduceTensorOp, int reduceTensorCompType, + int reduceTensorNanOpt, int reduceTensorIndices, + int reduceTensorIndicesType): + status = cudnnSetReduceTensorDescriptor( + reduceTensorDesc, + reduceTensorOp, + reduceTensorCompType, reduceTensorNanOpt, + reduceTensorIndices, + reduceTensorIndicesType) + check_status(status) + + + cpdef getReduceTensorDescriptor(size_t reduceTensorDesc): + cdef ReduceTensorOp redOp + cdef DataType redCompType + cdef NanPropagation redNanOpt + cdef ReduceTensorIndices redIndices + cdef IndicesType redIndicesType + status = cudnnGetReduceTensorDescriptor( + reduceTensorDesc, &redOp, + &redCompType, &redNanOpt, &redIndices, &redIndicesType) + check_status(status) + return redOp, redCompType, redNanOpt, redIndices, redIndicesType + + + cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc): + status = cudnnDestroyReduceTensorDescriptor( + reduceTensorDesc) + check_status(status) + + + cpdef size_t getReductionIndicesSize(intptr_t handle, size_t reduceTensorDesc, + size_t aDesc, size_t cDesc) except? 0: + cdef size_t sizeInBytes + status = cudnnGetReductionIndicesSize( + handle, reduceTensorDesc, + aDesc, cDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef size_t getReductionWorkspaceSize(intptr_t handle, + size_t reduceTensorDesc, + size_t aDesc, size_t cDesc) except? 
0: + cdef size_t sizeInBytes + status = cudnnGetReductionWorkspaceSize( + handle, reduceTensorDesc, + aDesc, cDesc, + &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef reduceTensor(intptr_t handle, size_t reduceTensorDesc, size_t indices, + size_t indicesSizeInBytes, size_t workspace, + size_t workspaceSizeInBytes, size_t alpha, size_t aDesc, + size_t A, size_t beta, size_t cDesc, size_t C): + _setStream(handle) + with nogil: + status = cudnnReduceTensor( + handle, reduceTensorDesc, + indices, indicesSizeInBytes, workspace, + workspaceSizeInBytes, alpha, aDesc, + A, beta, cDesc, C) + check_status(status) + + + cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr): + _setStream(handle) + with nogil: + status = cudnnSetTensor( + handle, yDesc, y, + valuePtr) + check_status(status) + + + cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha): + _setStream(handle) + with nogil: + status = cudnnScaleTensor( + handle, yDesc, y, + alpha) + check_status(status) + + + ############################################################################### + # Filter manipulation + ############################################################################### + + cpdef size_t createFilterDescriptor() except? 
0: + cdef FilterDescriptor desc + status = cudnnCreateFilterDescriptor(&desc) + check_status(status) + return desc + + + cpdef setFilter4dDescriptor_v4( + size_t filterDesc, int dataType, + int format, int k, int c, int h, int w): + status = cudnnSetFilter4dDescriptor_v4( + filterDesc, dataType, + format, k, c, h, w) + check_status(status) + + + cpdef setFilterNdDescriptor_v4( + size_t filterDesc, int dataType, + int format, int nbDims, size_t filterDimA): + status = cudnnSetFilterNdDescriptor_v4( + filterDesc, dataType, + format, nbDims, filterDimA) + check_status(status) + + + cpdef getFilterNdDescriptor(size_t wDesc, int nbDimsRequested): + cdef DataType dataType + cdef TensorFormat format + cdef int nbDims + cdef vector.vector[int] filterDimA + filterDimA.resize(nbDimsRequested) + + status = cudnnGetFilterNdDescriptor_v4( + wDesc, nbDimsRequested, &dataType, + &format, &nbDims, filterDimA.data()) + check_status(status) + return dataType, format, nbDims, tuple(filterDimA) + + + cpdef destroyFilterDescriptor(size_t filterDesc): + status = cudnnDestroyFilterDescriptor(filterDesc) + check_status(status) + + + ############################################################################### + # Convolution + ############################################################################### + + cpdef size_t createConvolutionDescriptor() except? 0: + cdef ConvolutionDescriptor desc + status = cudnnCreateConvolutionDescriptor(&desc) + check_status(status) + return desc + + + cpdef setConvolutionMathType(size_t convDesc, size_t mathType): + status = cudnnSetConvolutionMathType( + convDesc, mathType) + check_status(status) + + + cpdef size_t getConvolutionMathType(size_t convDesc) except? 
0: + cdef MathType mathType + status = cudnnGetConvolutionMathType( + convDesc, &mathType) + check_status(status) + return mathType + + + cpdef setConvolutionGroupCount(size_t convDesc, int groupCount): + status = cudnnSetConvolutionGroupCount( + convDesc, groupCount) + check_status(status) + + + cpdef int getConvolutionGroupCount(size_t convDesc) except? -1: + cdef int groupCount + status = cudnnGetConvolutionGroupCount( + convDesc, &groupCount) + check_status(status) + return groupCount + + + cpdef setConvolution2dDescriptor_v4( + size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, + int dilation_w, int mode): + status = cudnnSetConvolution2dDescriptor_v4( + convDesc, pad_h, pad_w, u, v, dilation_h, + dilation_w, mode) + check_status(status) + + + cpdef setConvolution2dDescriptor_v5( + size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, + int dilation_w, int mode, size_t computeType): + status = cudnnSetConvolution2dDescriptor_v5( + convDesc, pad_h, pad_w, u, v, dilation_h, + dilation_w, mode, computeType) + check_status(status) + + + cpdef setConvolutionNdDescriptor_v3( + size_t convDesc, int arrayLength, size_t padA, size_t filterStrideA, + size_t dilationA, int mode, int dataType): + status = cudnnSetConvolutionNdDescriptor_v3( + convDesc, arrayLength, padA, + filterStrideA, dilationA, mode, + dataType) + check_status(status) + + + cpdef destroyConvolutionDescriptor(size_t convDesc): + status = cudnnDestroyConvolutionDescriptor( + convDesc) + check_status(status) + + + cpdef findConvolutionForwardAlgorithm( + intptr_t handle, size_t xDesc, size_t wDesc, size_t convDesc, + size_t yDesc, int requestedAlgoCount): + cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionForwardAlgorithm( + handle, xDesc, wDesc, + convDesc, yDesc, + requestedAlgoCount, &returnedAlgoCount, perfResults.data()) + check_status(status) + 
perfResults.resize(returnedAlgoCount) + return perfResults + + + cpdef list findConvolutionForwardAlgorithmEx( + intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, + size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes): + cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionForwardAlgorithmEx( + handle, xDesc, x, + wDesc, w, convDesc, + yDesc, y, requestedAlgoCount, + &returnedAlgoCount, perfResults.data(), workSpace, + workSpaceSizeInBytes) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) + for p in perfResults] + + + cpdef list findConvolutionForwardAlgorithmEx_v7( + intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, + size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes): + cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionForwardAlgorithmEx_v7( + handle, xDesc, x, + wDesc, w, convDesc, + yDesc, y, requestedAlgoCount, + &returnedAlgoCount, perfResults.data(), workSpace, + workSpaceSizeInBytes) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, + p.determinism, p.mathType) + for p in perfResults] + + + cpdef int getConvolutionForwardAlgorithm_v6( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int preference, size_t memoryLimitInbytes) except? 
-1: + cdef ConvolutionFwdAlgo algo + status = cudnnGetConvolutionForwardAlgorithm_v6( + handle, srcDesc, + filterDesc, convDesc, + destDesc, preference, + memoryLimitInbytes, &algo) + check_status(status) + return algo + + + cpdef list getConvolutionForwardAlgorithm_v7( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int requestedAlgoCount): + cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnGetConvolutionForwardAlgorithm_v7( + handle, srcDesc, + filterDesc, convDesc, + destDesc, requestedAlgoCount, + &returnedAlgoCount, perfResults.data()) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, + p.determinism, p.mathType) + for p in perfResults] + + + cpdef Py_ssize_t getConvolutionForwardWorkspaceSize( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int algo) except? 
-1: + cdef size_t sizeInBytes + status = cudnnGetConvolutionForwardWorkspaceSize( + handle, srcDesc, + filterDesc, convDesc, + destDesc, algo, &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef convolutionForward( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t filterDesc, size_t filterData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t destDesc, size_t destData): + _setStream(handle) + with nogil: + status = cudnnConvolutionForward( + handle, alpha, + srcDesc, srcData, + filterDesc, filterData, + convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, + destDesc, destData) + check_status(status) + + + cpdef convolutionBackwardBias( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t beta, size_t destDesc, size_t destData): + _setStream(handle) + with nogil: + status = cudnnConvolutionBackwardBias( + handle, alpha, + srcDesc, srcData, beta, + destDesc, destData) + check_status(status) + + + cpdef findConvolutionBackwardFilterAlgorithm( + intptr_t handle, size_t xDesc, size_t dyDesc, size_t convDesc, + size_t dwDesc, int requestedAlgoCount): + cdef vector.vector[ConvolutionBwdFilterAlgoPerf] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionBackwardFilterAlgorithm( + handle, xDesc, dyDesc, + convDesc, dwDesc, + requestedAlgoCount, &returnedAlgoCount, perfResults.data()) + check_status(status) + perfResults.resize(returnedAlgoCount) + return perfResults + + + cpdef list findConvolutionBackwardFilterAlgorithmEx( + intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, + size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes): + cdef vector.vector[ConvolutionBwdFilterAlgoPerf] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionBackwardFilterAlgorithmEx( + 
handle, xDesc, x, + dyDesc, dy, convDesc, + dwDesc, dw, + requestedAlgoCount, &returnedAlgoCount, perfResults.data(), + workSpace, workSpaceSizeInBytes) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) + for p in perfResults] + + + cpdef list findConvolutionBackwardFilterAlgorithmEx_v7( + intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, + size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes): + cdef vector.vector[ConvolutionBwdFilterAlgoPerf_v7] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionBackwardFilterAlgorithmEx_v7( + handle, xDesc, x, + dyDesc, dy, convDesc, + dwDesc, dw, + requestedAlgoCount, &returnedAlgoCount, perfResults.data(), + workSpace, workSpaceSizeInBytes) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, + p.determinism, p.mathType) + for p in perfResults] + + + cpdef int getConvolutionBackwardFilterAlgorithm_v6( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t filterDesc, int preference, + size_t memoryLimitInbytes) except? 
-1: + cdef ConvolutionBwdFilterAlgo algo + status = cudnnGetConvolutionBackwardFilterAlgorithm_v6( + handle, srcDesc, + diffDesc, convDesc, + filterDesc, + preference, + memoryLimitInbytes, &algo) + check_status(status) + return algo + + + cpdef list getConvolutionBackwardFilterAlgorithm_v7( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int requestedAlgoCount): + cdef vector.vector[ConvolutionBwdFilterAlgoPerf_v7] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnGetConvolutionBackwardFilterAlgorithm_v7( + handle, srcDesc, diffDesc, + convDesc, gradDesc, + requestedAlgoCount, &returnedAlgoCount, perfResults.data()) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, + p.determinism, p.mathType) + for p in perfResults] + + + cpdef Py_ssize_t getConvolutionBackwardFilterWorkspaceSize( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t filterDesc, int algo) except? 
-1: + cdef size_t sizeInBytes + status = cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle, srcDesc, + diffDesc, convDesc, + filterDesc, algo, + &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef convolutionBackwardFilter_v3( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t diffDesc, size_t diffData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t gradDesc, size_t gradData): + _setStream(handle) + with nogil: + status = cudnnConvolutionBackwardFilter_v3( + handle, alpha, + srcDesc, srcData, + diffDesc, diffData, + convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, + gradDesc, gradData) + check_status(status) + + + cpdef findConvolutionBackwardDataAlgorithm( + intptr_t handle, size_t wDesc, size_t dyDesc, size_t convDesc, + size_t dxDesc, int requestedAlgoCount): + cdef vector.vector[ConvolutionBwdDataAlgoPerf] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionBackwardDataAlgorithm( + handle, wDesc, dyDesc, + convDesc, dxDesc, + requestedAlgoCount, &returnedAlgoCount, perfResults.data()) + check_status(status) + perfResults.resize(returnedAlgoCount) + return perfResults + + + cpdef list findConvolutionBackwardDataAlgorithmEx( + intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, + size_t convDesc, size_t dxDesc, size_t dx, + int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes): + cdef vector.vector[ConvolutionBwdDataAlgoPerf] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionBackwardDataAlgorithmEx( + handle, wDesc, w, + dyDesc, dy, convDesc, + dxDesc, dx, + requestedAlgoCount, &returnedAlgoCount, perfResults.data(), + workSpace, workSpaceSizeInBytes) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) + for p in perfResults] + 
+ + cpdef list findConvolutionBackwardDataAlgorithmEx_v7( + intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, + size_t convDesc, size_t dxDesc, size_t dx, + int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes): + cdef vector.vector[ConvolutionBwdDataAlgoPerf_v7] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionBackwardDataAlgorithmEx_v7( + handle, wDesc, w, + dyDesc, dy, convDesc, + dxDesc, dx, + requestedAlgoCount, &returnedAlgoCount, perfResults.data(), + workSpace, workSpaceSizeInBytes) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, + p.determinism, p.mathType) + for p in perfResults] + + + cpdef int getConvolutionBackwardDataAlgorithm_v6( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, size_t preference, + size_t memoryLimitInbytes) except? -1: + cdef ConvolutionBwdDataAlgo algo + status = cudnnGetConvolutionBackwardDataAlgorithm_v6( + handle, filterDesc, + diffDesc, convDesc, + gradDesc, preference, + memoryLimitInbytes, &algo) + check_status(status) + return algo + + + cpdef list getConvolutionBackwardDataAlgorithm_v7( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int requestedAlgoCount): + cdef vector.vector[ConvolutionBwdDataAlgoPerf_v7] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnGetConvolutionBackwardDataAlgorithm_v7( + handle, filterDesc, + diffDesc, convDesc, + gradDesc, requestedAlgoCount, + &returnedAlgoCount, perfResults.data()) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, + p.determinism, p.mathType) + for p in perfResults] + + + cpdef Py_ssize_t getConvolutionBackwardDataWorkspaceSize( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + 
size_t gradDesc, int algo) except? -1: + cdef size_t sizeInBytes + status = cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, filterDesc, + diffDesc, + convDesc, gradDesc, + algo, &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef convolutionBackwardData_v3( + intptr_t handle, size_t alpha, size_t filterDesc, size_t filterData, + size_t diffDesc, size_t diffData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t gradDesc, size_t gradData): + _setStream(handle) + with nogil: + status = cudnnConvolutionBackwardData_v3( + handle, alpha, + filterDesc, filterData, + diffDesc, diffData, + convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, + gradDesc, gradData) + check_status(status) + + ############################################################################### + # Pooling + ############################################################################### + + cpdef size_t createPoolingDescriptor() except? 0: + cdef PoolingDescriptor desc + status = cudnnCreatePoolingDescriptor(&desc) + check_status(status) + return desc + + + cpdef setPooling2dDescriptor_v4( + size_t poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, + int windowWidth, int verticalPadding, int horizontalPadding, + int verticalStride, int horizontalStride): + status = cudnnSetPooling2dDescriptor_v4( + poolingDesc, mode, + maxpoolingNanOpt, windowHeight, windowWidth, + verticalPadding, horizontalPadding, verticalStride, horizontalStride) + check_status(status) + + + cpdef setPoolingNdDescriptor_v4( + size_t poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, + size_t windowDimA, size_t paddingA, size_t strideA): + status = cudnnSetPoolingNdDescriptor_v4( + poolingDesc, mode, + maxpoolingNanOpt, nbDims, + windowDimA, paddingA, strideA) + check_status(status) + + + cpdef destroyPoolingDescriptor(size_t poolingDesc): + status = cudnnDestroyPoolingDescriptor(poolingDesc) + check_status(status) + + + cpdef 
poolingForward( + intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData): + _setStream(handle) + with nogil: + status = cudnnPoolingForward( + handle, poolingDesc, alpha, + srcDesc, srcData, beta, + dstDesc, dstData) + check_status(status) + + + cpdef poolingBackward( + intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, + size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, + size_t destDiffData): + _setStream(handle) + with nogil: + status = cudnnPoolingBackward( + handle, poolingDesc, alpha, + srcDesc, srcData, + srcDiffDesc, srcDiffData, + destDesc, destData, beta, + destDiffDesc, destDiffData) + check_status(status) + + ############################################################################### + # Batch Normalization + ############################################################################### + + CUDNN_BN_MIN_EPSILON = _CUDNN_BN_MIN_EPSILON + + cpdef deriveBNTensorDescriptor( + size_t derivedBnDesc, size_t xDesc, int mode): + status = cudnnDeriveBNTensorDescriptor( + derivedBnDesc, xDesc, + mode) + check_status(status) + + + cpdef batchNormalizationForwardTraining( + intptr_t handle, int mode, + size_t alpha, size_t beta, size_t xDesc, + size_t x, size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, size_t bnScale, + size_t bnBias, double exponentialAverageFactor, + size_t resultRunningMean, size_t resultRunningVariance, + double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance): + _setStream(handle) + with nogil: + status = cudnnBatchNormalizationForwardTraining( + handle, mode, + alpha, beta, xDesc, + x, yDesc, y, + bnScaleBiasMeanVarDesc, bnScale, + bnBias, exponentialAverageFactor, + resultRunningMean, resultRunningVariance, + epsilon, resultSaveMean, resultSaveInvVariance) + check_status(status) + + + cpdef batchNormalizationForwardInference( + intptr_t handle, int 
mode, + size_t alpha, size_t beta, size_t xDesc, + size_t x, size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, size_t bnScale, + size_t bnBias, size_t estimatedMean, size_t estimatedVariance, + double epsilon): + _setStream(handle) + with nogil: + status = cudnnBatchNormalizationForwardInference( + handle, mode, + alpha, beta, xDesc, + x, yDesc, y, + bnScaleBiasMeanVarDesc, bnScale, + bnBias, estimatedMean, estimatedVariance, + epsilon) + check_status(status) + + + cpdef batchNormalizationBackward( + intptr_t handle, int mode, + size_t alphaDataDiff, size_t betaDataDiff, + size_t alphaParamDiff, size_t betaParamDiff, + size_t xDesc, size_t x, size_t dyDesc, + size_t dy, size_t dxDesc, size_t dx, + size_t dBnScaleBiasDesc, size_t bnScale, + size_t dBnScaleResult, size_t dBnBiasResult, + double epsilon, size_t savedMean, size_t savedInvVariance): + _setStream(handle) + with nogil: + status = cudnnBatchNormalizationBackward( + handle, mode, + alphaDataDiff, betaDataDiff, + alphaParamDiff, betaParamDiff, + xDesc, x, + dyDesc, dy, + dxDesc, dx, + dBnScaleBiasDesc, bnScale, + dBnScaleResult, dBnBiasResult, + epsilon, savedMean, savedInvVariance) + check_status(status) + + + cpdef batchNormalizationForwardTrainingEx( + intptr_t handle, int mode, int bnOps, + size_t alpha, size_t beta, + size_t xDesc, size_t x, + size_t zDesc, size_t z, + size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, + size_t bnScale, size_t bnBias, + double exponentialAverageFactor, + size_t resultRunningMean, size_t resultRunningVariance, + double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance, + size_t activationDesc, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnBatchNormalizationForwardTrainingEx( + handle, mode, bnOps, + alpha, beta, + xDesc, x, + zDesc, z, + yDesc, y, + bnScaleBiasMeanVarDesc, + bnScale, bnBias, + exponentialAverageFactor, + resultRunningMean, 
resultRunningVariance, + epsilon, resultSaveMean, resultSaveInvVariance, + activationDesc, + workSpace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef size_t getBatchNormalizationForwardTrainingExWorkspaceSize( + intptr_t handle, int mode, int bnOps, + size_t xDesc, + size_t zDesc, + size_t yDesc, + size_t bnScaleBiasMeanVarDesc, + size_t activationDesc) except? 0: + cdef size_t sizeInBytes + status = cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + handle, + mode, bnOps, + xDesc, + zDesc, + yDesc, + bnScaleBiasMeanVarDesc, + activationDesc, + &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef batchNormalizationBackwardEx( + intptr_t handle, int mode, int bnops, + size_t alphaDataDiff, size_t betaDataDiff, + size_t alphaParamDiff, size_t betaParamDiff, + size_t xDesc, size_t x, + size_t yDesc, size_t y, + size_t dyDesc, size_t dy, + size_t dzDesc, size_t dz, + size_t dxDesc, size_t dx, + size_t dBnScaleBiasDesc, + size_t bnScaleData, size_t bnBiasData, + size_t dBnScaleData, size_t dBnBiasData, + double epsilon, + size_t savedMean, size_t savedInvVariance, + size_t activationDesc, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnBatchNormalizationBackwardEx( + handle, + mode, bnops, + alphaDataDiff, betaDataDiff, + alphaParamDiff, betaParamDiff, + xDesc, x, + yDesc, y, + dyDesc, dy, + dzDesc, dz, + dxDesc, dx, + dBnScaleBiasDesc, + bnScaleData, bnBiasData, + dBnScaleData, dBnBiasData, + epsilon, + savedMean, savedInvVariance, + activationDesc, + workSpace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef size_t getBatchNormalizationBackwardExWorkspaceSize( + intptr_t handle, int mode, int bnOps, + size_t xDesc, + size_t yDesc, + size_t dyDesc, + size_t dzDesc, + size_t dxDesc, + size_t dBnScaleBiasDesc, + size_t activationDesc) 
except? 0: + cdef size_t sizeInBytes + status = cudnnGetBatchNormalizationBackwardExWorkspaceSize( + handle, + mode, + bnOps, + xDesc, + yDesc, + dyDesc, + dzDesc, + dxDesc, + dBnScaleBiasDesc, + activationDesc, + &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( + intptr_t handle, int mode, int bnOps, + size_t activationDesc, + size_t xDesc) except? 0: + cdef size_t sizeInBytes + status = cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + handle, + mode, + bnOps, + activationDesc, + xDesc, + &sizeInBytes) + check_status(status) + return sizeInBytes + + + ############################################################################### + # Activation + ############################################################################### + + cpdef size_t createActivationDescriptor() except? 0: + cdef ActivationDescriptor activationDesc + status = cudnnCreateActivationDescriptor(&activationDesc) + check_status(status) + return activationDesc + + + cpdef setActivationDescriptor( + size_t activationDesc, int mode, int reluNanOpt, double reluCeiling): + status = cudnnSetActivationDescriptor( + activationDesc, mode, + reluNanOpt, reluCeiling) + check_status(status) + + + cpdef destroyActivationDescriptor(size_t activationDesc): + status = cudnnDestroyActivationDescriptor( + activationDesc) + check_status(status) + + + cpdef softmaxForward( + intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData): + _setStream(handle) + with nogil: + status = cudnnSoftmaxForward( + handle, algorithm, mode, + alpha, srcDesc, srcData, + beta, dstDesc, dstData) + check_status(status) + + + cpdef softmaxBackward( + intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, size_t beta, + size_t destDiffDesc, size_t destDiffData): + _setStream(handle) + with nogil: + status 
= cudnnSoftmaxBackward( + handle, algorithm, mode, + alpha, srcDesc, srcData, + srcDiffDesc, srcDiffData, beta, + destDiffDesc, destDiffData) + check_status(status) + + + cpdef activationForward_v4( + intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData): + _setStream(handle) + with nogil: + status = cudnnActivationForward_v4( + handle, activationDesc, alpha, + srcDesc, srcData, beta, + dstDesc, dstData) + check_status(status) + + + cpdef activationBackward_v4( + intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, + size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, + size_t destDiffData): + _setStream(handle) + with nogil: + status = cudnnActivationBackward_v4( + handle, activationDesc, alpha, + srcDesc, srcData, + srcDiffDesc, srcDiffData, + destDesc, destData, beta, + destDiffDesc, destDiffData) + check_status(status) + + + ############################################################################### + # Dropout + ############################################################################### + + cpdef size_t createDropoutDescriptor() except? 0: + cdef DropoutDescriptor desc + status = cudnnCreateDropoutDescriptor(&desc) + check_status(status) + return desc + + + cpdef destroyDropoutDescriptor(size_t dropoutDesc): + status = cudnnDestroyDropoutDescriptor(dropoutDesc) + check_status(status) + + + cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? 
-1: + cdef size_t sizeInBytes + status = cudnnDropoutGetStatesSize( + handle, &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef setDropoutDescriptor( + size_t dropoutDesc, intptr_t handle, float dropout, + size_t states, size_t stateSizeInBytes, unsigned long long seed): + status = cudnnSetDropoutDescriptor( + dropoutDesc, handle, dropout, + states, stateSizeInBytes, seed) + check_status(status) + + + cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 0: + cdef size_t sizeInBytes + status = cudnnDropoutGetReserveSpaceSize( + xDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef dropoutForward( + intptr_t handle, size_t dropoutDesc, + size_t srcDesc, size_t srcData, + size_t dstDesc, size_t dstData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnDropoutForward( + handle, dropoutDesc, + srcDesc, srcData, + dstDesc, dstData, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef dropoutBackward( + intptr_t handle, size_t dropoutDesc, + size_t dyDesc, size_t dyData, + size_t dxDesc, size_t dxData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnDropoutBackward( + handle, dropoutDesc, + dyDesc, dyData, + dxDesc, dxData, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + ############################################################################### + # CTC + ############################################################################### + cpdef size_t createCTCLossDescriptor() except? 
0: + cdef CTCLossDescriptor desc + status = cudnnCreateCTCLossDescriptor(&desc) + check_status(status) + return desc + + cpdef destroyCTCLossDescriptor(size_t ctcLossDesc): + status = cudnnDestroyCTCLossDescriptor(ctcLossDesc) + check_status(status) + + cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType): + status = cudnnSetCTCLossDescriptor( + ctcLossDesc, dataType) + check_status(status) + + cpdef getCTCLossDescriptor(size_t ctcLossDesc): + cdef DataType compType + status = cudnnGetCTCLossDescriptor( + ctcLossDesc, &compType) + check_status(status) + return compType + + cpdef size_t getCTCLossWorkspaceSize( + intptr_t handle, size_t probsDesc, size_t gradientsDesc, + size_t labels, size_t labelLengths, size_t inputLengths, + int algo, size_t ctcLossDesc) except? 0: + cdef size_t sizeInBytes + status = cudnnGetCTCLossWorkspaceSize( + handle, probsDesc, + gradientsDesc, + labels, labelLengths, inputLengths, + algo, ctcLossDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + + cpdef CTCLoss( + intptr_t handle, size_t probsDesc, + size_t probs, size_t labels, size_t labelLengths, size_t inputLengths, + size_t costs, size_t gradientsDesc, size_t gradients, + int algo, size_t ctcLossDesc, + size_t workspace, size_t workSpaceSizeInBytes): + status = cudnnCTCLoss( + handle, probsDesc, probs, + labels, labelLengths, inputLengths, + costs, gradientsDesc, gradients, + algo, ctcLossDesc, + workspace, workSpaceSizeInBytes) + check_status(status) + + + ############################################################################### + # RNN + ############################################################################### + + cpdef size_t createRNNDescriptor() except? 
0: + cdef RNNDescriptor desc + status = cudnnCreateRNNDescriptor(&desc) + check_status(status) + return desc + + + cpdef destroyRNNDescriptor(size_t rnnDesc): + status = cudnnDestroyRNNDescriptor(rnnDesc) + check_status(status) + + + cpdef size_t createPersistentRNNPlan(size_t rnnDesc, int minibatch, + int dataType) except? 0: + cdef PersistentRNNPlan plan + status = cudnnCreatePersistentRNNPlan( + rnnDesc, + minibatch, dataType, &plan) + check_status(status) + return plan + + + cpdef setPersistentRNNPlan(size_t rnnDesc, size_t plan): + status = cudnnSetPersistentRNNPlan( + rnnDesc, plan) + check_status(status) + + + cpdef destroyPersistentRNNPlan(size_t plan): + status = cudnnDestroyPersistentRNNPlan(plan) + check_status(status) + + + cpdef setRNNDescriptor_v5( + size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int dataType): + status = cudnnSetRNNDescriptor_v5( + rnnDesc, hiddenSize, numLayers, + dropoutDesc, inputMode, + direction, mode, dataType) + check_status(status) + + + cpdef setRNNDescriptor_v6( + intptr_t handle, size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int algo, int dataType): + status = cudnnSetRNNDescriptor_v6( + handle, rnnDesc, hiddenSize, numLayers, + dropoutDesc, inputMode, + direction, mode, algo, + dataType) + check_status(status) + + + cpdef setRNNPaddingMode( + size_t rnnDesc, int paddingMode): + status = cudnnSetRNNPaddingMode( + rnnDesc, paddingMode) + check_status(status) + + + cpdef getRNNPaddingMode(size_t rnnDesc): + cdef RNNPaddingMode paddingMode + status = cudnnGetRNNPaddingMode( + rnnDesc, &paddingMode) + check_status(status) + return paddingMode + + + cpdef size_t createRNNDataDescriptor() except? 
0: + cdef RNNDataDescriptor desc + status = cudnnCreateRNNDataDescriptor(&desc) + check_status(status) + return desc + + + cpdef destroyRNNDataDescriptor(size_t RNNDataDesc): + status = cudnnDestroyRNNDataDescriptor(RNNDataDesc) + check_status(status) + + + cpdef setRNNDataDescriptor( + size_t RNNDataDesc, int dataType, size_t layout, + int maxSeqLength, int batchSize, int vectorSize, + size_t seqLengthArray, size_t paddingFill): + status = cudnnSetRNNDataDescriptor( + RNNDataDesc, dataType, + layout, maxSeqLength, batchSize, vectorSize, + seqLengthArray, paddingFill) + check_status(status) + + + cpdef getRNNDataDescriptor( + size_t RNNDataDesc, size_t dataType, + size_t layout, size_t maxSeqLength, size_t batchSize, + size_t vectorSize, int arrayLengthRequested, size_t seqLengthArray, + size_t paddingFill): + status = cudnnGetRNNDataDescriptor( + RNNDataDesc, dataType, + layout, maxSeqLength, batchSize, + vectorSize, arrayLengthRequested, seqLengthArray, + paddingFill) + check_status(status) + + + cpdef getRNNWorkspaceSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): + cdef size_t sizeInBytes + status = cudnnGetRNNWorkspaceSize( + handle, rnnDesc, seqLength, + xDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef getRNNTrainingReserveSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): + cdef size_t sizeInBytes + status = cudnnGetRNNTrainingReserveSize( + handle, rnnDesc, seqLength, + xDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef getRNNParamsSize( + intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType): + cdef size_t sizeInBytes + status = cudnnGetRNNParamsSize( + handle, rnnDesc, xDesc, + &sizeInBytes, dataType) + check_status(status) + return sizeInBytes + + + cpdef getRNNLinLayerMatrixParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t linLayerMatDesc, size_t linLayerMat): + status = 
cudnnGetRNNLinLayerMatrixParams( + handle, rnnDesc, layer, + xDesc, wDesc, w, + linLayerID, linLayerMatDesc, linLayerMat) + check_status(status) + + + cpdef getRNNLinLayerBiasParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t linLayerBiasDesc, + size_t linLayerBias): + status = cudnnGetRNNLinLayerBiasParams( + handle, rnnDesc, layer, + xDesc, wDesc, w, + linLayerID, linLayerBiasDesc, linLayerBias) + check_status(status) + + + cpdef RNNForwardInference( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, + size_t x, size_t hxDesc, size_t hx, size_t cxDesc, + size_t cx, size_t wDesc, size_t w, size_t yDesc, + size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t workspace, size_t workSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNForwardInference( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + workspace, workSpaceSizeInBytes) + check_status(status) + + + cpdef RNNForwardTraining( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t wDesc, size_t w, size_t yDesc, size_t y, + size_t hyDesc, size_t hy, size_t cyDesc, size_t cy, + size_t workspace, size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNForwardTraining( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + workspace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef RNNBackwardData( + intptr_t handle, size_t rnnDesc, int seqLength, size_t yDesc, size_t y, + size_t dyDesc, size_t dy, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t 
dxDesc, size_t dx, size_t dhxDesc, size_t dhx, + size_t dcxDesc, size_t dcx, size_t workspace, + size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardData( + handle, rnnDesc, seqLength, + yDesc, y, + dyDesc, dy, + dhyDesc, dhy, + dcyDesc, dcy, + wDesc, w, + hxDesc, hx, + cxDesc, cx, + dxDesc, dx, + dhxDesc, dhx, + dcxDesc, dcx, + workspace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef RNNBackwardWeights( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workspace, size_t workSpaceSizeInBytes, size_t dwDesc, + size_t dw, size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardWeights( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + yDesc, y, + workspace, workSpaceSizeInBytes, + dwDesc, dw, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef RNNForwardInferenceEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNForwardInferenceEx( + handle, rnnDesc, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + kDesc, keys, + cDesc, cAttn, + iDesc, iAttn, + qDesc, queries, + workSpace, workSpaceSizeInBytes) + check_status(status) + + + cpdef RNNForwardTrainingEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t 
hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNForwardTrainingEx( + handle, rnnDesc, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + kDesc, keys, + cDesc, cAttn, + iDesc, iAttn, + qDesc, queries, + workSpace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef RNNBackwardDataEx( + intptr_t handle, size_t rnnDesc, size_t yDesc, size_t y, size_t dyDesc, + size_t dy, size_t dcDesc, size_t dcAttn, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t dxDesc, size_t dx, + size_t dhxDesc, size_t dhx, size_t dcxDesc, size_t dcx, + size_t dkDesc, size_t dkeys, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardDataEx( + handle, rnnDesc, + yDesc, y, + dyDesc, dy, + dcDesc, dcAttn, + dhyDesc, dhy, + dcyDesc, dcy, + wDesc, w, + hxDesc, hx, + cxDesc, cx, + dxDesc, dx, + dhxDesc, dhx, + dcxDesc, dcx, + dkDesc, dkeys, + workSpace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef RNNBackwardWeightsEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t dwDesc, size_t dw, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardWeightsEx( + handle, rnnDesc, + xDesc, x, + hxDesc, hx, + yDesc, y, + workSpace, workSpaceSizeInBytes, + dwDesc, dw, + reserveSpace, reserveSpaceSizeInBytes) + 
check_status(status) + + + ############################################################################### + # Spatial Transformer + ############################################################################### + + cpdef size_t createSpatialTransformerDescriptor() except? 0: + cdef SpatialTransformerDescriptor stDesc + status = cudnnCreateSpatialTransformerDescriptor(&stDesc) + check_status(status) + return stDesc + + + cpdef destroySpatialTransformerDescriptor(size_t stDesc): + status = cudnnDestroySpatialTransformerDescriptor( + stDesc) + check_status(status) + + + cpdef setSpatialTransformerDescriptor( + size_t stDesc, size_t samplerType, int dataType, + int nbDims, size_t dimA): + status = cudnnSetSpatialTransformerNdDescriptor( + stDesc, samplerType, + dataType, nbDims, dimA) + check_status(status) + + + cpdef spatialTfGridGeneratorForward( + intptr_t handle, size_t stDesc, size_t theta, size_t grid): + _setStream(handle) + with nogil: + status = cudnnSpatialTfGridGeneratorForward( + handle, stDesc, + theta, grid) + check_status(status) + + + cpdef spatialTfGridGeneratorBackward( + intptr_t handle, size_t stDesc, size_t dgrid, size_t dtheta): + _setStream(handle) + with nogil: + status = cudnnSpatialTfGridGeneratorBackward( + handle, stDesc, + dgrid, dtheta) + check_status(status) + + + cpdef spatialTfSamplerForward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t grid, size_t beta, size_t yDesc, size_t y): + _setStream(handle) + with nogil: + status = cudnnSpatialTfSamplerForward( + handle, stDesc, + alpha, xDesc, x, grid, + beta, yDesc, y) + check_status(status) + + + cpdef spatialTfSamplerBackward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t beta, size_t dxDesc, size_t dx, size_t alphaDgrid, + size_t dyDesc, size_t dy, size_t grid, size_t betaDgrid, size_t dgrid): + _setStream(handle) + with nogil: + status = cudnnSpatialTfSamplerBackward( + handle, stDesc, + alpha, xDesc, x, beta, + 
dxDesc, dx, alphaDgrid, + dyDesc, dy, grid, + betaDgrid, dgrid) + check_status(status) + + ############################################################################### + # Fused Ops + ############################################################################### + + cpdef createFusedOpsConstParamPack(int ops): + cdef FusedOpsConstParamPack constPack + with nogil: + status = cudnnCreateFusedOpsConstParamPack(&constPack, ops) + check_status(status) + return constPack + + cpdef destroyFusedOpsConstParamPack(size_t constPack): + with nogil: + status = cudnnDestroyFusedOpsConstParamPack( + constPack) + check_status(status) + + cpdef setFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param): + with nogil: + status = cudnnSetFusedOpsConstParamPackAttribute( + constPack, + paramLabel, param) + check_status(status) + + cpdef getFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param): + cdef int isNULL = 0 + with nogil: + status = cudnnGetFusedOpsConstParamPackAttribute( + constPack, + paramLabel, param, &isNULL) + check_status(status) + return isNULL + + cpdef createFusedOpsVariantParamPack(int ops): + cdef FusedOpsVariantParamPack varPack + with nogil: + status = cudnnCreateFusedOpsVariantParamPack(&varPack, ops) + check_status(status) + return varPack + + cpdef destroyFusedOpsVariantParamPack(size_t varPack): + with nogil: + status = cudnnDestroyFusedOpsVariantParamPack( + varPack) + check_status(status) + + cpdef setFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr): + with nogil: + status = cudnnSetFusedOpsVariantParamPackAttribute( + varPack, + paramLabel, ptr) + check_status(status) + + cpdef getFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr): + with nogil: + status = cudnnGetFusedOpsVariantParamPackAttribute( + varPack, + paramLabel, ptr) + check_status(status) + + cpdef createFusedOpsPlan(int ops): + cdef FusedOpsPlan plan + with nogil: + status = 
cudnnCreateFusedOpsPlan(&plan, ops) + check_status(status) + return plan + + cpdef destroyFusedOpsPlan(size_t plan): + with nogil: + status = cudnnDestroyFusedOpsPlan(plan) + check_status(status) + + cpdef makeFusedOpsPlan(intptr_t handle, size_t plan, size_t constPack): + cdef size_t workspaceSizeInBytes + _setStream(handle) + with nogil: + status = cudnnMakeFusedOpsPlan(handle, plan, + constPack, + &workspaceSizeInBytes) + check_status(status) + return workspaceSizeInBytes + + cpdef fusedOpsExecute(intptr_t handle, size_t plan, size_t varPack): + _setStream(handle) + with nogil: + status = cudnnFusedOpsExecute(handle, plan, + varPack) + check_status(status) diff --git a/cupy_backends/cuda/libs/miopen.pyx b/cupy_backends/cuda/libs/miopen.pyx index ab4b9b9b693..c8ed491f84a 100644 --- a/cupy_backends/cuda/libs/miopen.pyx +++ b/cupy_backends/cuda/libs/miopen.pyx @@ -54,7 +54,7 @@ cdef extern from '../../cupy_miopen.h' nogil: const char* miopenGetErrorString(Status status) # Version - size_t miopenGetVersion() + size_t miopenGetVersion(size_t major,size_t minor, size_t patch) # Runtime error checking #int cudnnQueryRuntimeError(Handle handle, Status *rstatus, @@ -300,15 +300,15 @@ cpdef inline check_status(int status): ############################################################################### def get_build_version(): - return HIP_VERSION + return CUPY_HIP_VERSION ############################################################################### # Version ############################################################################### -cpdef size_t getVersion() except? 0: - return miopenGetVersion() +cpdef size_t miopen_getVersion(size_t major, size_t minor, size_t patch) except? 
0: + return miopenGetVersion(major, minor, patch) ############################################################################### diff --git a/cupy_backends/cupy_miopen.h b/cupy_backends/cupy_miopen.h index 9b6e5f74d79..15247b50530 100644 --- a/cupy_backends/cupy_miopen.h +++ b/cupy_backends/cupy_miopen.h @@ -5,7 +5,7 @@ #define INCLUDE_GUARD_CUPY_CUDNN_H #if CUPY_USE_HIP -#include "miopen/miopen.h" +#include #elif !defined(CUPY_NO_CUDA) @@ -18,3 +18,4 @@ #endif // #ifdef CUPY_NO_CUDA +#endif // #ifndef INCLUDE_GUARD_CUPY_CUDNN_H diff --git a/install/cupy_builder/_features.py b/install/cupy_builder/_features.py index 2a26f842f7f..d8a29eb9924 100644 --- a/install/cupy_builder/_features.py +++ b/install/cupy_builder/_features.py @@ -161,11 +161,11 @@ def get_features(ctx: Context) -> Dict[str, Feature]: 'cupy_backends.cuda.libs.nvtx', 'cupy_backends.cuda.libs.cusolver', 'cupy_backends.cuda.libs.cusolver_hip', - #'cupyx.cusolver', + 'cupyx.cusolver', 'cupy_backends.cuda.libs.curand_hip', 'cupy_backends.cuda.libs.nvrtc_hip', 'cupy_backends.cuda.libs.miopen', - #'cupy_backends.cuda.libs.cudnn', + 'cupy_backends.cuda.libs.cudnn', ], 'include': [ 'hip/hip_runtime_api.h', From c712949d923fc292a04da064bbc1a999e07823a3 Mon Sep 17 00:00:00 2001 From: bmedishe Date: Mon, 4 Dec 2023 19:00:46 +0000 Subject: [PATCH 18/26] size_t to size_t* --- cupy_backends/cuda/libs/miopen.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cupy_backends/cuda/libs/miopen.pyx b/cupy_backends/cuda/libs/miopen.pyx index c8ed491f84a..99030a2a175 100644 --- a/cupy_backends/cuda/libs/miopen.pyx +++ b/cupy_backends/cuda/libs/miopen.pyx @@ -54,7 +54,7 @@ cdef extern from '../../cupy_miopen.h' nogil: const char* miopenGetErrorString(Status status) # Version - size_t miopenGetVersion(size_t major,size_t minor, size_t patch) + size_t miopenGetVersion(size_t* major,size_t* minor, size_t* patch) # Runtime error checking #int cudnnQueryRuntimeError(Handle handle, Status *rstatus, @@ 
-307,7 +307,7 @@ def get_build_version(): # Version ############################################################################### -cpdef size_t miopen_getVersion(size_t major, size_t minor, size_t patch) except? 0: +cpdef size_t miopen_getVersion(size_t* major, size_t* minor, size_t* patch) except? 0: return miopenGetVersion(major, minor, patch) From 8541e90a09ca6d4ca4b71cf105a04502a08f55db Mon Sep 17 00:00:00 2001 From: bmedishe Date: Mon, 4 Dec 2023 19:26:59 +0000 Subject: [PATCH 19/26] update cupy_backends/cuda/libs/miopen.pyx --- cupy_backends/cuda/libs/miopen.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cupy_backends/cuda/libs/miopen.pyx b/cupy_backends/cuda/libs/miopen.pyx index 99030a2a175..7222d8d1211 100644 --- a/cupy_backends/cuda/libs/miopen.pyx +++ b/cupy_backends/cuda/libs/miopen.pyx @@ -54,7 +54,7 @@ cdef extern from '../../cupy_miopen.h' nogil: const char* miopenGetErrorString(Status status) # Version - size_t miopenGetVersion(size_t* major,size_t* minor, size_t* patch) + #size_t miopenGetVersion() # Runtime error checking #int cudnnQueryRuntimeError(Handle handle, Status *rstatus, @@ -307,8 +307,8 @@ def get_build_version(): # Version ############################################################################### -cpdef size_t miopen_getVersion(size_t* major, size_t* minor, size_t* patch) except? 0: - return miopenGetVersion(major, minor, patch) +cpdef size_t miopen_getVersion() except? 
0: + return CUPY_HIP_VERSION ############################################################################### From 028a4753953f30c6fca121e18c3d9f3d9a1ec64f Mon Sep 17 00:00:00 2001 From: bmedishe Date: Tue, 5 Dec 2023 16:01:43 +0000 Subject: [PATCH 20/26] update cudnn.pyx --- cupyx/cudnn.pyx | 10 +++++++--- install/cupy_builder/_features.py | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/cupyx/cudnn.pyx b/cupyx/cudnn.pyx index fcfb9e98c10..8cb09d0170f 100644 --- a/cupyx/cudnn.pyx +++ b/cupyx/cudnn.pyx @@ -15,7 +15,11 @@ from cupy._core.core cimport _ndarray_base from cupy._core cimport internal from cupy.cuda cimport device from cupy.cuda cimport memory as _memory -from cupy_backends.cuda.libs cimport cudnn +IF CUPY_HIP_VERSION != 0: + from cupy_backends.cuda.libs import miopen as cudnn + #from cupy_backends.cuda.libs.cudnn import * +ELSE: + from cupy_backends.cuda.libs cimport cudnn from cupy._core._ufuncs import elementwise_copy as _elementwise_copy from cupy import _util @@ -1356,7 +1360,7 @@ cpdef _warn_algorithm_fwd( .format(x.shape, W.shape, y.shape, conv_param[0], conv_param[1]), _util.PerformanceWarning) - +""" cpdef _Algorithm _find_algorithm_fwd( _ndarray_base x, _ndarray_base W, _ndarray_base y, tuple conv_param, size_t handle, size_t x_desc, size_t filter_desc, size_t conv_desc, @@ -1639,7 +1643,7 @@ cpdef _Algorithm _get_algorithm_bwd_data( _get_algorithm_bwd_data_cache[key] = algo return algo - +""" cpdef bint _should_use_tensor_core( tensor_core_mode, object dtype) except *: if tensor_core_mode == 'auto': diff --git a/install/cupy_builder/_features.py b/install/cupy_builder/_features.py index d8a29eb9924..25a9d1b5eb4 100644 --- a/install/cupy_builder/_features.py +++ b/install/cupy_builder/_features.py @@ -166,6 +166,7 @@ def get_features(ctx: Context) -> Dict[str, Feature]: 'cupy_backends.cuda.libs.nvrtc_hip', 'cupy_backends.cuda.libs.miopen', 'cupy_backends.cuda.libs.cudnn', + 'cupyx.cudnn', ], 'include': [ 
'hip/hip_runtime_api.h', From e09ddbd5f7f6aff6fdf42a36f5feebb3266c60c3 Mon Sep 17 00:00:00 2001 From: bmedishe Date: Tue, 5 Dec 2023 16:39:47 +0000 Subject: [PATCH 21/26] comment out miopen unsupported apis --- cupyx/cudnn.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cupyx/cudnn.pyx b/cupyx/cudnn.pyx index 8cb09d0170f..5b39220574e 100644 --- a/cupyx/cudnn.pyx +++ b/cupyx/cudnn.pyx @@ -1643,7 +1643,6 @@ cpdef _Algorithm _get_algorithm_bwd_data( _get_algorithm_bwd_data_cache[key] = algo return algo -""" cpdef bint _should_use_tensor_core( tensor_core_mode, object dtype) except *: if tensor_core_mode == 'auto': @@ -1974,7 +1973,7 @@ def convolution_backward_data( cudnn.destroyFilterDescriptor(filter_desc) cudnn.destroyConvolutionDescriptor(conv_desc) - +""" def pooling_forward( _ndarray_base x, _ndarray_base y, tuple ksize, tuple stride, tuple pad, int mode): From 93d5906bac90b4b3f8ad9869e0f5320b1bc447ff Mon Sep 17 00:00:00 2001 From: bmedishe Date: Tue, 5 Dec 2023 16:41:53 +0000 Subject: [PATCH 22/26] add dropout apis --- cupy_backends/cuda/libs/miopen.pyx | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/cupy_backends/cuda/libs/miopen.pyx b/cupy_backends/cuda/libs/miopen.pyx index 7222d8d1211..965503fa934 100644 --- a/cupy_backends/cuda/libs/miopen.pyx +++ b/cupy_backends/cuda/libs/miopen.pyx @@ -342,4 +342,27 @@ cpdef destroy(intptr_t handle): check_status(status) +cpdef size_t createDropoutDescriptor() except? 0: + cdef DropoutDescriptor desc + status = miopen.miopenCreateDropoutDescriptor(&desc) + check_status(status) + return desc + +cpdef destroyDropoutDescriptor(size_t dropoutDesc): + status = miopen.miopenDestroyDropoutDescriptor(dropoutDesc) + check_status(status) + +cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? 
-1: + cdef size_t sizeInBytes + status = miopen.miopenDropoutGetStatesSize( + handle, &sizeInBytes) + check_status(status) + return sizeInBytes + +cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 0: + cdef size_t sizeInBytes + status = miopen.miopenDropoutGetReserveSpaceSize( + xDesc, &sizeInBytes) + check_status(status) + return sizeInBytes From 5ab3ece4f7c24667488681ba06e2bf45d9a47a61 Mon Sep 17 00:00:00 2001 From: bmedishe Date: Thu, 7 Dec 2023 20:55:59 +0000 Subject: [PATCH 23/26] update enum constants miopen cudnn --- cupy_backends/cuda/libs/cudnn.pxd | 405 +++++--------------- cupy_backends/cuda/libs/miopen.pxd | 577 +++++------------------------ cupy_backends/cuda/libs/miopen.pyx | 21 +- 3 files changed, 201 insertions(+), 802 deletions(-) diff --git a/cupy_backends/cuda/libs/cudnn.pxd b/cupy_backends/cuda/libs/cudnn.pxd index 8fcd754470f..c1f233a3e06 100644 --- a/cupy_backends/cuda/libs/cudnn.pxd +++ b/cupy_backends/cuda/libs/cudnn.pxd @@ -4,245 +4,90 @@ from libc.stdint cimport intptr_t ############################################################################### # Enum ############################################################################### -IF CUPY_HIP_VERSION != 0: + +cpdef enum: + + CUDNN_NOT_PROPAGATE_NAN = 0 + CUDNN_PROPAGATE_NAN = 1 + + CUDNN_OP_TENSOR_ADD = 0 + CUDNN_OP_TENSOR_MUL = 1 + CUDNN_OP_TENSOR_MIN = 2 + CUDNN_OP_TENSOR_MAX = 3 + + CUDNN_REDUCE_TENSOR_ADD = 0 + CUDNN_REDUCE_TENSOR_MUL = 1 + CUDNN_REDUCE_TENSOR_MIN = 2 + CUDNN_REDUCE_TENSOR_MAX = 3 + CUDNN_REDUCE_TENSOR_AMAX = 4 + CUDNN_REDUCE_TENSOR_AVG = 5 + CUDNN_REDUCE_TENSOR_NORM1 = 6 + CUDNN_REDUCE_TENSOR_NORM2 = 7 + + CUDNN_REDUCE_TENSOR_NO_INDICES = 0 + CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1 + + CUDNN_32BIT_INDICES = 0 + CUDNN_64BIT_INDICES = 1 + CUDNN_16BIT_INDICES = 2 + CUDNN_8BIT_INDICES = 3 + + # TODO Confirm from miopen team + CUDNN_CONVOLUTION = 0 + CUDNN_CROSS_CORRELATION = 1 + + CUDNN_SOFTMAX_FAST = 0 + CUDNN_SOFTMAX_ACCURATE = 1 + CUDNN_SOFTMAX_LOG = 
2 + + CUDNN_SOFTMAX_MODE_INSTANCE = 0 + CUDNN_SOFTMAX_MODE_CHANNEL = 1 + + CUDNN_BATCHNORM_PER_ACTIVATION = 0 + CUDNN_BATCHNORM_SPATIAL = 1 + + CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0 + + CUDNN_RNN_RELU = 0 + CUDNN_RNN_TANH = 1 + CUDNN_LSTM = 2 + CUDNN_GRU = 3 + + CUDNN_UNIDIRECTIONAL = 0 + CUDNN_BIDIRECTIONAL = 1 + + CUDNN_RNN_PADDED_IO_DISABLED = 0 + CUDNN_RNN_PADDED_IO_ENABLED = 1 + + CUDNN_LINEAR_INPUT = 0 + CUDNN_SKIP_INPUT = 1 + + CUDNN_STATUS_SUCCESS = 0 +IF CUPY_HIP_VERSION > 0: cpdef enum: - CUDNN_DATA_FLOAT = 200 - CUDNN_DATA_DOUBLE = 201 - CUDNN_DATA_HALF = 202 - - CUDNN_DEFAULT_MATH = 210 - CUDNN_TENSOR_OP_MATH = 211 - - CUDNN_NOT_PROPAGATE_NAN = 220 - CUDNN_PROPAGATE_NAN = 221 - - CUDNN_NON_DETERMINISTIC = 230 - CUDNN_DETERMINISTIC = 231 - - CUDNN_TENSOR_NCHW = 240 - CUDNN_TENSOR_NHWC = 241 - - CUDNN_OP_TENSOR_ADD = 250 - CUDNN_OP_TENSOR_MUL = 251 - CUDNN_OP_TENSOR_MIN = 252 - CUDNN_OP_TENSOR_MAX = 253 - CUDNN_OP_TENSOR_SQRT = 254 - CUDNN_OP_TENSOR_NOT = 255 - - CUDNN_REDUCE_TENSOR_ADD = 260 - CUDNN_REDUCE_TENSOR_MUL = 261 - CUDNN_REDUCE_TENSOR_MIN = 262 - CUDNN_REDUCE_TENSOR_MAX = 263 - CUDNN_REDUCE_TENSOR_AMAX = 264 - CUDNN_REDUCE_TENSOR_AVG = 265 - CUDNN_REDUCE_TENSOR_NORM1 = 266 - CUDNN_REDUCE_TENSOR_NORM2 = 267 - CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 268 - - CUDNN_REDUCE_TENSOR_NO_INDICES = 270 - CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 271 - - CUDNN_32BIT_INDICES = 280 - CUDNN_64BIT_INDICES = 281 - CUDNN_16BIT_INDICES = 282 - CUDNN_8BIT_INDICES = 283 - - CUDNN_ADD_IMAGE = 290 - CUDNN_ADD_SAME_HW = 290 - CUDNN_ADD_FEATURE_MAP = 291 - CUDNN_ADD_SAME_CHW = 291 - CUDNN_ADD_SAME_C = 292 - CUDNN_ADD_FULL_TENSOR = 293 - - CUDNN_CONVOLUTION = 300 - CUDNN_CROSS_CORRELATION = 301 - - CUDNN_CONVOLUTION_FWD_NO_WORKSPACE = 310 - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST = 311 - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT = 312 - - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 320 - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 321 - CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 
322 - CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 323 - CUDNN_CONVOLUTION_FWD_ALGO_FFT = 324 - CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 325 - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 326 - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 327 - - CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE = 330 - CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST = 331 - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT = 332 - - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 340 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 341 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 342 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 343 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 344 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 345 - - CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE = 350 - CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST = 351 - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = 352 - - CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 360 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 361 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 362 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 363 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 364 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 365 - - CUDNN_SOFTMAX_FAST = 370 - CUDNN_SOFTMAX_ACCURATE = 371 - CUDNN_SOFTMAX_LOG = 372 - - CUDNN_SOFTMAX_MODE_INSTANCE = 380 - CUDNN_SOFTMAX_MODE_CHANNEL = 381 - - CUDNN_POOLING_MAX = 390 - CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 391 - CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 392 - CUDNN_POOLING_MAX_DETERMINISTIC = 393 - - CUDNN_ACTIVATION_SIGMOID = 400 - CUDNN_ACTIVATION_RELU = 401 - CUDNN_ACTIVATION_TANH = 402 - CUDNN_ACTIVATION_CLIPPED_RELU = 403 - CUDNN_ACTIVATION_ELU = 404 - CUDNN_ACTIVATION_IDENTITY = 405 - - CUDNN_LRN_CROSS_CHANNEL_DIM1 = 410 - - CUDNN_DIVNORM_PRECOMPUTED_MEANS = 420 - - CUDNN_BATCHNORM_PER_ACTIVATION = 430 - CUDNN_BATCHNORM_SPATIAL = 431 - CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 432 - - CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 440 - CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 441 - - CUDNN_BATCHNORM_OPS_BN = 450 - 
CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 451 - CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 452 - - CUDNN_RNN_RELU = 460 - CUDNN_RNN_TANH = 461 - CUDNN_LSTM = 462 - CUDNN_GRU = 463 - - CUDNN_UNIDIRECTIONAL = 470 - CUDNN_BIDIRECTIONAL = 471 - - CUDNN_RNN_ALGO_STANDARD = 480 - CUDNN_RNN_ALGO_PERSIST_STATIC = 481 - CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 482 - - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 490 - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 491 - CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 492 - - CUDNN_RNN_PADDED_IO_DISABLED = 500 - CUDNN_RNN_PADDED_IO_ENABLED = 501 - - CUDNN_LINEAR_INPUT = 510 - CUDNN_SKIP_INPUT = 511 - - CUDNN_SAMPLER_BILINEAR = 520 - - CUDNN_STATUS_SUCCESS = 530 - CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 541 - CUDNN_STATUS_RUNTIME_IN_PROGRESS = 542 - CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 543 - - CUDNN_ERRQUERY_RAWCODE = 550 - CUDNN_ERRQUERY_NONBLOCKING = 551 - CUDNN_ERRQUERY_BLOCKING = 552 - - # cudnnFusedOps_t - CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 560 - CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 561 - CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 562 - CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 563 - CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 564 - CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 565 - CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 566 - - # cudnnFusedOpsConstParamLabel_t - CUDNN_PARAM_XDESC = 570 - CUDNN_PARAM_XDATA_PLACEHOLDER = 571 - CUDNN_PARAM_BN_MODE = 572 - CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 573 - CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 574 - CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 575 - CUDNN_PARAM_ACTIVATION_DESC = 576 - CUDNN_PARAM_CONV_DESC = 577 - CUDNN_PARAM_WDESC = 578 - CUDNN_PARAM_WDATA_PLACEHOLDER = 579 - CUDNN_PARAM_DWDESC = 580 - CUDNN_PARAM_DWDATA_PLACEHOLDER = 581 - CUDNN_PARAM_YDESC = 582 - CUDNN_PARAM_YDATA_PLACEHOLDER = 583 - CUDNN_PARAM_DYDESC = 584 - CUDNN_PARAM_DYDATA_PLACEHOLDER = 585 - CUDNN_PARAM_YSTATS_DESC = 586 - CUDNN_PARAM_YSUM_PLACEHOLDER = 587 - CUDNN_PARAM_YSQSUM_PLACEHOLDER = 588 - 
CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 589 - CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 590 - CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 591 - CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 592 - CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 593 - CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 594 - CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 595 - CUDNN_PARAM_ZDESC = 596 - CUDNN_PARAM_ZDATA_PLACEHOLDER = 597 - CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 598 - CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 599 - CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 600 - CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 601 - CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 602 - CUDNN_PARAM_DXDESC = 603 - CUDNN_PARAM_DXDATA_PLACEHOLDER = 604 - CUDNN_PARAM_DZDESC = 605 - CUDNN_PARAM_DZDATA_PLACEHOLDER = 606 - CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 607 - CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 608 - - # cudnnFusedOpsPointerPlaceHolder_t - CUDNN_PTR_NULL = 610 - CUDNN_PTR_ELEM_ALIGNED = 611 - CUDNN_PTR_16B_ALIGNED = 612 - - # cudnnFusedOpsVariantParamLabel_t - CUDNN_PTR_XDATA = 620 - CUDNN_PTR_BN_EQSCALE = 621 - CUDNN_PTR_BN_EQBIAS = 622 - CUDNN_PTR_WDATA = 623 - CUDNN_PTR_DWDATA = 624 - CUDNN_PTR_YDATA = 625 - CUDNN_PTR_DYDATA = 626 - CUDNN_PTR_YSUM = 627 - CUDNN_PTR_YSQSUM = 628 - CUDNN_PTR_WORKSPACE = 629 - CUDNN_PTR_BN_SCALE = 630 - CUDNN_PTR_BN_BIAS = 631 - CUDNN_PTR_BN_SAVED_MEAN = 632 - CUDNN_PTR_BN_SAVED_INVSTD = 633 - CUDNN_PTR_BN_RUNNING_MEAN = 634 - CUDNN_PTR_BN_RUNNING_VAR = 635 - CUDNN_PTR_ZDATA = 636 - CUDNN_PTR_BN_Z_EQSCALE = 637 - CUDNN_PTR_BN_Z_EQBIAS = 638 - CUDNN_PTR_ACTIVATION_BITMASK = 639 - CUDNN_PTR_DXDATA = 640 - CUDNN_PTR_DZDATA = 641 - CUDNN_PTR_BN_DSCALE = 642 - CUDNN_PTR_BN_DBIAS = 643 - CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 720 - CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 721 - CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 722 - CUDNN_SCALAR_DOUBLE_BN_EPSILON = 723 + CUDNN_DATA_FLOAT = 1 + CUDNN_DATA_DOUBLE = 6 + CUDNN_DATA_HALF = 0 + + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 5 + CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2 + 
CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 1 + CUDNN_CONVOLUTION_FWD_ALGO_FFT = 2 + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 3 + + CUDNN_POOLING_MAX = 0 + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 2 + CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 1 + + CUDNN_ACTIVATION_RELU = 3 + CUDNN_ACTIVATION_TANH = 2 + CUDNN_ACTIVATION_CLIPPED_RELU = 7 + CUDNN_ACTIVATION_ELU = 9 + CUDNN_ACTIVATION_IDENTITY = 0 + + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 1 + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 2 + CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 3 ELSE: cpdef enum: CUDNN_DATA_FLOAT = 0 @@ -252,40 +97,17 @@ ELSE: CUDNN_DEFAULT_MATH = 0 CUDNN_TENSOR_OP_MATH = 1 - CUDNN_NOT_PROPAGATE_NAN = 0 - CUDNN_PROPAGATE_NAN = 1 - CUDNN_NON_DETERMINISTIC = 0 CUDNN_DETERMINISTIC = 1 CUDNN_TENSOR_NCHW = 0 CUDNN_TENSOR_NHWC = 1 - CUDNN_OP_TENSOR_ADD = 0 - CUDNN_OP_TENSOR_MUL = 1 - CUDNN_OP_TENSOR_MIN = 2 - CUDNN_OP_TENSOR_MAX = 3 CUDNN_OP_TENSOR_SQRT = 4 CUDNN_OP_TENSOR_NOT = 5 - CUDNN_REDUCE_TENSOR_ADD = 0 - CUDNN_REDUCE_TENSOR_MUL = 1 - CUDNN_REDUCE_TENSOR_MIN = 2 - CUDNN_REDUCE_TENSOR_MAX = 3 - CUDNN_REDUCE_TENSOR_AMAX = 4 - CUDNN_REDUCE_TENSOR_AVG = 5 - CUDNN_REDUCE_TENSOR_NORM1 = 6 - CUDNN_REDUCE_TENSOR_NORM2 = 7 CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8 - - CUDNN_REDUCE_TENSOR_NO_INDICES = 0 - CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1 - - CUDNN_32BIT_INDICES = 0 - CUDNN_64BIT_INDICES = 1 - CUDNN_16BIT_INDICES = 2 - CUDNN_8BIT_INDICES = 3 - + CUDNN_ADD_IMAGE = 0 CUDNN_ADD_SAME_HW = 0 CUDNN_ADD_FEATURE_MAP = 1 @@ -331,13 +153,6 @@ ELSE: CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4 CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5 - CUDNN_SOFTMAX_FAST = 0 - CUDNN_SOFTMAX_ACCURATE = 1 - CUDNN_SOFTMAX_LOG = 2 - - CUDNN_SOFTMAX_MODE_INSTANCE = 0 - CUDNN_SOFTMAX_MODE_CHANNEL = 1 - CUDNN_POOLING_MAX = 0 CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1 CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2 @@ -354,25 +169,14 @@ ELSE: CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0 - CUDNN_BATCHNORM_PER_ACTIVATION = 0 - 
CUDNN_BATCHNORM_SPATIAL = 1 CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2 - - CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0 + CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 CUDNN_BATCHNORM_OPS_BN = 0 CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1 CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2 - - CUDNN_RNN_RELU = 0 - CUDNN_RNN_TANH = 1 - CUDNN_LSTM = 2 - CUDNN_GRU = 3 - - CUDNN_UNIDIRECTIONAL = 0 - CUDNN_BIDIRECTIONAL = 1 - + CUDNN_RNN_ALGO_STANDARD = 0 CUDNN_RNN_ALGO_PERSIST_STATIC = 1 CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2 @@ -381,15 +185,8 @@ ELSE: CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1 CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2 - CUDNN_RNN_PADDED_IO_DISABLED = 0 - CUDNN_RNN_PADDED_IO_ENABLED = 1 - - CUDNN_LINEAR_INPUT = 0 - CUDNN_SKIP_INPUT = 1 - CUDNN_SAMPLER_BILINEAR = 0 - CUDNN_STATUS_SUCCESS = 0 CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11 CUDNN_STATUS_RUNTIME_IN_PROGRESS = 12 CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 13 @@ -483,22 +280,21 @@ ELSE: CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102 CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103 - - -############################################################################### -# Class -############################################################################### - -cdef class CuDNNAlgoPerf: - cdef: - int algo - int status - float time - size_t memory - int determinism - int mathType - IF CUPY_HIP_VERSION == 0: + ############################################################################### + # Class + ############################################################################### + + cdef class CuDNNAlgoPerf: + cdef: + int algo + int status + float time + size_t memory + int determinism + int mathType + + ############################################################################### # Version ############################################################################### @@ -1016,4 +812,3 @@ IF CUPY_HIP_VERSION == 0: cpdef destroyFusedOpsPlan(size_t plan) cpdef makeFusedOpsPlan(intptr_t handle, size_t plan, size_t constPack) cpdef 
fusedOpsExecute(intptr_t handle, size_t plan, size_t varPack) - diff --git a/cupy_backends/cuda/libs/miopen.pxd b/cupy_backends/cuda/libs/miopen.pxd index 8fcd754470f..30a93cec1eb 100644 --- a/cupy_backends/cuda/libs/miopen.pxd +++ b/cupy_backends/cuda/libs/miopen.pxd @@ -6,500 +6,101 @@ from libc.stdint cimport intptr_t ############################################################################### IF CUPY_HIP_VERSION != 0: cpdef enum: - CUDNN_DATA_FLOAT = 200 - CUDNN_DATA_DOUBLE = 201 - CUDNN_DATA_HALF = 202 - - CUDNN_DEFAULT_MATH = 210 - CUDNN_TENSOR_OP_MATH = 211 - - CUDNN_NOT_PROPAGATE_NAN = 220 - CUDNN_PROPAGATE_NAN = 221 - - CUDNN_NON_DETERMINISTIC = 230 - CUDNN_DETERMINISTIC = 231 - - CUDNN_TENSOR_NCHW = 240 - CUDNN_TENSOR_NHWC = 241 - - CUDNN_OP_TENSOR_ADD = 250 - CUDNN_OP_TENSOR_MUL = 251 - CUDNN_OP_TENSOR_MIN = 252 - CUDNN_OP_TENSOR_MAX = 253 - CUDNN_OP_TENSOR_SQRT = 254 - CUDNN_OP_TENSOR_NOT = 255 - - CUDNN_REDUCE_TENSOR_ADD = 260 - CUDNN_REDUCE_TENSOR_MUL = 261 - CUDNN_REDUCE_TENSOR_MIN = 262 - CUDNN_REDUCE_TENSOR_MAX = 263 - CUDNN_REDUCE_TENSOR_AMAX = 264 - CUDNN_REDUCE_TENSOR_AVG = 265 - CUDNN_REDUCE_TENSOR_NORM1 = 266 - CUDNN_REDUCE_TENSOR_NORM2 = 267 - CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 268 - - CUDNN_REDUCE_TENSOR_NO_INDICES = 270 - CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 271 - - CUDNN_32BIT_INDICES = 280 - CUDNN_64BIT_INDICES = 281 - CUDNN_16BIT_INDICES = 282 - CUDNN_8BIT_INDICES = 283 - - CUDNN_ADD_IMAGE = 290 - CUDNN_ADD_SAME_HW = 290 - CUDNN_ADD_FEATURE_MAP = 291 - CUDNN_ADD_SAME_CHW = 291 - CUDNN_ADD_SAME_C = 292 - CUDNN_ADD_FULL_TENSOR = 293 - - CUDNN_CONVOLUTION = 300 - CUDNN_CROSS_CORRELATION = 301 - - CUDNN_CONVOLUTION_FWD_NO_WORKSPACE = 310 - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST = 311 - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT = 312 - - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 320 - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 321 - CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 322 - CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 323 - 
CUDNN_CONVOLUTION_FWD_ALGO_FFT = 324 - CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 325 - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 326 - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 327 - - CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE = 330 - CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST = 331 - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT = 332 - - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 340 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 341 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 342 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 343 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 344 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 345 - - CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE = 350 - CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST = 351 - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = 352 - - CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 360 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 361 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 362 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 363 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 364 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 365 - - CUDNN_SOFTMAX_FAST = 370 - CUDNN_SOFTMAX_ACCURATE = 371 - CUDNN_SOFTMAX_LOG = 372 - - CUDNN_SOFTMAX_MODE_INSTANCE = 380 - CUDNN_SOFTMAX_MODE_CHANNEL = 381 - - CUDNN_POOLING_MAX = 390 - CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 391 - CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 392 - CUDNN_POOLING_MAX_DETERMINISTIC = 393 - - CUDNN_ACTIVATION_SIGMOID = 400 - CUDNN_ACTIVATION_RELU = 401 - CUDNN_ACTIVATION_TANH = 402 - CUDNN_ACTIVATION_CLIPPED_RELU = 403 - CUDNN_ACTIVATION_ELU = 404 - CUDNN_ACTIVATION_IDENTITY = 405 - - CUDNN_LRN_CROSS_CHANNEL_DIM1 = 410 - - CUDNN_DIVNORM_PRECOMPUTED_MEANS = 420 - - CUDNN_BATCHNORM_PER_ACTIVATION = 430 - CUDNN_BATCHNORM_SPATIAL = 431 - CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 432 - - CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 440 - CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 441 - - CUDNN_BATCHNORM_OPS_BN = 450 - CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 451 - 
CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 452 - - CUDNN_RNN_RELU = 460 - CUDNN_RNN_TANH = 461 - CUDNN_LSTM = 462 - CUDNN_GRU = 463 - - CUDNN_UNIDIRECTIONAL = 470 - CUDNN_BIDIRECTIONAL = 471 - - CUDNN_RNN_ALGO_STANDARD = 480 - CUDNN_RNN_ALGO_PERSIST_STATIC = 481 - CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 482 - - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 490 - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 491 - CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 492 - - CUDNN_RNN_PADDED_IO_DISABLED = 500 - CUDNN_RNN_PADDED_IO_ENABLED = 501 - - CUDNN_LINEAR_INPUT = 510 - CUDNN_SKIP_INPUT = 511 - - CUDNN_SAMPLER_BILINEAR = 520 - - CUDNN_STATUS_SUCCESS = 530 - CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 541 - CUDNN_STATUS_RUNTIME_IN_PROGRESS = 542 - CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 543 - - CUDNN_ERRQUERY_RAWCODE = 550 - CUDNN_ERRQUERY_NONBLOCKING = 551 - CUDNN_ERRQUERY_BLOCKING = 552 - - # cudnnFusedOps_t - CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 560 - CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 561 - CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 562 - CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 563 - CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 564 - CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 565 - CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 566 - - # cudnnFusedOpsConstParamLabel_t - CUDNN_PARAM_XDESC = 570 - CUDNN_PARAM_XDATA_PLACEHOLDER = 571 - CUDNN_PARAM_BN_MODE = 572 - CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 573 - CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 574 - CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 575 - CUDNN_PARAM_ACTIVATION_DESC = 576 - CUDNN_PARAM_CONV_DESC = 577 - CUDNN_PARAM_WDESC = 578 - CUDNN_PARAM_WDATA_PLACEHOLDER = 579 - CUDNN_PARAM_DWDESC = 580 - CUDNN_PARAM_DWDATA_PLACEHOLDER = 581 - CUDNN_PARAM_YDESC = 582 - CUDNN_PARAM_YDATA_PLACEHOLDER = 583 - CUDNN_PARAM_DYDESC = 584 - CUDNN_PARAM_DYDATA_PLACEHOLDER = 585 - CUDNN_PARAM_YSTATS_DESC = 586 - CUDNN_PARAM_YSUM_PLACEHOLDER = 587 - CUDNN_PARAM_YSQSUM_PLACEHOLDER = 588 - CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 589 - 
CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 590 - CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 591 - CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 592 - CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 593 - CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 594 - CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 595 - CUDNN_PARAM_ZDESC = 596 - CUDNN_PARAM_ZDATA_PLACEHOLDER = 597 - CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 598 - CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 599 - CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 600 - CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 601 - CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 602 - CUDNN_PARAM_DXDESC = 603 - CUDNN_PARAM_DXDATA_PLACEHOLDER = 604 - CUDNN_PARAM_DZDESC = 605 - CUDNN_PARAM_DZDATA_PLACEHOLDER = 606 - CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 607 - CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 608 - - # cudnnFusedOpsPointerPlaceHolder_t - CUDNN_PTR_NULL = 610 - CUDNN_PTR_ELEM_ALIGNED = 611 - CUDNN_PTR_16B_ALIGNED = 612 - - # cudnnFusedOpsVariantParamLabel_t - CUDNN_PTR_XDATA = 620 - CUDNN_PTR_BN_EQSCALE = 621 - CUDNN_PTR_BN_EQBIAS = 622 - CUDNN_PTR_WDATA = 623 - CUDNN_PTR_DWDATA = 624 - CUDNN_PTR_YDATA = 625 - CUDNN_PTR_DYDATA = 626 - CUDNN_PTR_YSUM = 627 - CUDNN_PTR_YSQSUM = 628 - CUDNN_PTR_WORKSPACE = 629 - CUDNN_PTR_BN_SCALE = 630 - CUDNN_PTR_BN_BIAS = 631 - CUDNN_PTR_BN_SAVED_MEAN = 632 - CUDNN_PTR_BN_SAVED_INVSTD = 633 - CUDNN_PTR_BN_RUNNING_MEAN = 634 - CUDNN_PTR_BN_RUNNING_VAR = 635 - CUDNN_PTR_ZDATA = 636 - CUDNN_PTR_BN_Z_EQSCALE = 637 - CUDNN_PTR_BN_Z_EQBIAS = 638 - CUDNN_PTR_ACTIVATION_BITMASK = 639 - CUDNN_PTR_DXDATA = 640 - CUDNN_PTR_DZDATA = 641 - CUDNN_PTR_BN_DSCALE = 642 - CUDNN_PTR_BN_DBIAS = 643 - CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 720 - CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 721 - CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 722 - CUDNN_SCALAR_DOUBLE_BN_EPSILON = 723 -ELSE: cpdef enum: - CUDNN_DATA_FLOAT = 0 - CUDNN_DATA_DOUBLE = 1 - CUDNN_DATA_HALF = 2 - - CUDNN_DEFAULT_MATH = 0 - CUDNN_TENSOR_OP_MATH = 1 - - CUDNN_NOT_PROPAGATE_NAN = 0 - CUDNN_PROPAGATE_NAN = 1 - - 
CUDNN_NON_DETERMINISTIC = 0 - CUDNN_DETERMINISTIC = 1 - - CUDNN_TENSOR_NCHW = 0 - CUDNN_TENSOR_NHWC = 1 - - CUDNN_OP_TENSOR_ADD = 0 - CUDNN_OP_TENSOR_MUL = 1 - CUDNN_OP_TENSOR_MIN = 2 - CUDNN_OP_TENSOR_MAX = 3 - CUDNN_OP_TENSOR_SQRT = 4 - CUDNN_OP_TENSOR_NOT = 5 - - CUDNN_REDUCE_TENSOR_ADD = 0 - CUDNN_REDUCE_TENSOR_MUL = 1 - CUDNN_REDUCE_TENSOR_MIN = 2 - CUDNN_REDUCE_TENSOR_MAX = 3 - CUDNN_REDUCE_TENSOR_AMAX = 4 - CUDNN_REDUCE_TENSOR_AVG = 5 - CUDNN_REDUCE_TENSOR_NORM1 = 6 - CUDNN_REDUCE_TENSOR_NORM2 = 7 - CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8 - - CUDNN_REDUCE_TENSOR_NO_INDICES = 0 - CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1 - - CUDNN_32BIT_INDICES = 0 - CUDNN_64BIT_INDICES = 1 - CUDNN_16BIT_INDICES = 2 - CUDNN_8BIT_INDICES = 3 - - CUDNN_ADD_IMAGE = 0 - CUDNN_ADD_SAME_HW = 0 - CUDNN_ADD_FEATURE_MAP = 1 - CUDNN_ADD_SAME_CHW = 1 - CUDNN_ADD_SAME_C = 2 - CUDNN_ADD_FULL_TENSOR = 3 - - CUDNN_CONVOLUTION = 0 - CUDNN_CROSS_CORRELATION = 1 - - CUDNN_CONVOLUTION_FWD_NO_WORKSPACE = 0 - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST = 1 - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT = 2 - - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0 - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1 - CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2 - CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3 - CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4 - CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5 - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6 - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7 - - CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE = 0 - CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST = 1 - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT = 2 - - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5 - - CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE = 0 - CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST = 1 - 
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = 2 - - CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5 - - CUDNN_SOFTMAX_FAST = 0 - CUDNN_SOFTMAX_ACCURATE = 1 - CUDNN_SOFTMAX_LOG = 2 - - CUDNN_SOFTMAX_MODE_INSTANCE = 0 - CUDNN_SOFTMAX_MODE_CHANNEL = 1 - - CUDNN_POOLING_MAX = 0 - CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1 - CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2 - CUDNN_POOLING_MAX_DETERMINISTIC = 3 - - CUDNN_ACTIVATION_SIGMOID = 0 - CUDNN_ACTIVATION_RELU = 1 - CUDNN_ACTIVATION_TANH = 2 - CUDNN_ACTIVATION_CLIPPED_RELU = 3 - CUDNN_ACTIVATION_ELU = 4 - CUDNN_ACTIVATION_IDENTITY = 5 - - CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0 - - CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0 - - CUDNN_BATCHNORM_PER_ACTIVATION = 0 - CUDNN_BATCHNORM_SPATIAL = 1 - CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2 - - CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0 - CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 - - CUDNN_BATCHNORM_OPS_BN = 0 - CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1 - CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2 - - CUDNN_RNN_RELU = 0 - CUDNN_RNN_TANH = 1 - CUDNN_LSTM = 2 - CUDNN_GRU = 3 - - CUDNN_UNIDIRECTIONAL = 0 - CUDNN_BIDIRECTIONAL = 1 - - CUDNN_RNN_ALGO_STANDARD = 0 - CUDNN_RNN_ALGO_PERSIST_STATIC = 1 - CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2 - - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0 - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1 - CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2 - - CUDNN_RNN_PADDED_IO_DISABLED = 0 - CUDNN_RNN_PADDED_IO_ENABLED = 1 - - CUDNN_LINEAR_INPUT = 0 - CUDNN_SKIP_INPUT = 1 - - CUDNN_SAMPLER_BILINEAR = 0 - - CUDNN_STATUS_SUCCESS = 0 - CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11 - CUDNN_STATUS_RUNTIME_IN_PROGRESS = 12 - CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 13 - - CUDNN_ERRQUERY_RAWCODE = 0 - CUDNN_ERRQUERY_NONBLOCKING = 1 - CUDNN_ERRQUERY_BLOCKING = 2 
- - # cudnnFusedOps_t - CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0 - CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1 - CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2 - CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3 - CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4 - CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5 - CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6 - - # cudnnFusedOpsConstParamLabel_t - CUDNN_PARAM_XDESC = 0 - CUDNN_PARAM_XDATA_PLACEHOLDER = 1 - CUDNN_PARAM_BN_MODE = 2 - CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3 - CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4 - CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5 - CUDNN_PARAM_ACTIVATION_DESC = 6 - CUDNN_PARAM_CONV_DESC = 7 - CUDNN_PARAM_WDESC = 8 - CUDNN_PARAM_WDATA_PLACEHOLDER = 9 - CUDNN_PARAM_DWDESC = 10 - CUDNN_PARAM_DWDATA_PLACEHOLDER = 11 - CUDNN_PARAM_YDESC = 12 - CUDNN_PARAM_YDATA_PLACEHOLDER = 13 - CUDNN_PARAM_DYDESC = 14 - CUDNN_PARAM_DYDATA_PLACEHOLDER = 15 - CUDNN_PARAM_YSTATS_DESC = 16 - CUDNN_PARAM_YSUM_PLACEHOLDER = 17 - CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18 - CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19 - CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20 - CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21 - CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22 - CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23 - CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24 - CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25 - CUDNN_PARAM_ZDESC = 26 - CUDNN_PARAM_ZDATA_PLACEHOLDER = 27 - CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28 - CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29 - CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30 - CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31 - CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32 - CUDNN_PARAM_DXDESC = 33 - CUDNN_PARAM_DXDATA_PLACEHOLDER = 34 - CUDNN_PARAM_DZDESC = 35 - CUDNN_PARAM_DZDATA_PLACEHOLDER = 36 - CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37 - CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38 + miopenFloat = 1 + miopenDouble = 6 + miopenHalf = 0 + + miopenConvolutionFwdAlgoGEMM = 0 + miopenConvolutionFwdAlgoDirect = 1 + miopenConvolutionFwdAlgoFFT = 2 + 
miopenConvolutionFwdAlgoWinograd = 3 + miopenConvolutionFwdAlgoImplicitGEMM = 5 - # cudnnFusedOpsPointerPlaceHolder_t - CUDNN_PTR_NULL = 0 - CUDNN_PTR_ELEM_ALIGNED = 1 - CUDNN_PTR_16B_ALIGNED = 2 - - # cudnnFusedOpsVariantParamLabel_t - CUDNN_PTR_XDATA = 0 - CUDNN_PTR_BN_EQSCALE = 1 - CUDNN_PTR_BN_EQBIAS = 2 - CUDNN_PTR_WDATA = 3 - CUDNN_PTR_DWDATA = 4 - CUDNN_PTR_YDATA = 5 - CUDNN_PTR_DYDATA = 6 - CUDNN_PTR_YSUM = 7 - CUDNN_PTR_YSQSUM = 8 - CUDNN_PTR_WORKSPACE = 9 - CUDNN_PTR_BN_SCALE = 10 - CUDNN_PTR_BN_BIAS = 11 - CUDNN_PTR_BN_SAVED_MEAN = 12 - CUDNN_PTR_BN_SAVED_INVSTD = 13 - CUDNN_PTR_BN_RUNNING_MEAN = 14 - CUDNN_PTR_BN_RUNNING_VAR = 15 - CUDNN_PTR_ZDATA = 16 - CUDNN_PTR_BN_Z_EQSCALE = 17 - CUDNN_PTR_BN_Z_EQBIAS = 18 - CUDNN_PTR_ACTIVATION_BITMASK = 19 - CUDNN_PTR_DXDATA = 20 - CUDNN_PTR_DZDATA = 21 - CUDNN_PTR_BN_DSCALE = 22 - CUDNN_PTR_BN_DBIAS = 23 - CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100 - CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101 - CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102 - CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103 - - - -############################################################################### -# Class -############################################################################### + miopenPoolingMax = 0 + miopenPoolingAverage = 1 + miopenPoolingAverageInclusive = 2 -cdef class CuDNNAlgoPerf: - cdef: - int algo - int status - float time - size_t memory - int determinism - int mathType + miopenActivationPASTHRU = 0 + miopenActivationTANH = 2 + miopenActivationRELU = 3 + miopenActivationCLIPPEDRELU = 7 + miopenActivationELU = 9 + + miopenRNNDataSeqMajorNotPadded = 1 + miopenRNNDataSeqMajorPadded = 2 + miopenRNNDataBatchMajorPadded = 3 + + MIOPEN_NOT_PROPAGATE_NAN = 0 + MIOPEN_PROPAGATE_NAN = 1 + + miopenTensorOpAdd = 0 + miopenTensorOpMul = 1 + miopenTensorOpMin = 2 + miopenTensorOpMax = 3 + + MIOPEN_REDUCE_TENSOR_ADD = 0 + MIOPEN_REDUCE_TENSOR_MUL = 1 + MIOPEN_REDUCE_TENSOR_MIN = 2 + MIOPEN_REDUCE_TENSOR_MAX = 3 + 
MIOPEN_REDUCE_TENSOR_AMAX = 4 + MIOPEN_REDUCE_TENSOR_AVG = 5 + MIOPEN_REDUCE_TENSOR_NORM1 = 6 + MIOPEN_REDUCE_TENSOR_NORM2 = 7 + + MIOPEN_REDUCE_TENSOR_NO_INDICES = 0 + MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES = 1 + + MIOPEN_32BIT_INDICES = 0 + MIOPEN_64BIT_INDICES = 1 + MIOPEN_16BIT_INDICES = 2 + MIOPEN_8BIT_INDICES = 3 + + miopenConvolution = 0 + miopenTranspose = 1 + + MIOPEN_SOFTMAX_FAST = 0 + MIOPEN_SOFTMAX_ACCURATE = 1 + MIOPEN_SOFTMAX_LOG = 2 + + MIOPEN_SOFTMAX_MODE_INSTANCE = 0 + MIOPEN_SOFTMAX_MODE_CHANNEL = 1 + + miopenBNPerActivation = 0 + miopenBNSpatial = 1 + + MIOPEN_CTC_LOSS_ALGO_DETERMINISTIC = 0 + + miopenRNNRELU = 0 + miopenRNNTANH = 1 + miopenLSTM = 2 + miopenGRU = 3 + + miopenRNNunidirection = 0 + miopenRNNbidirection = 1 + + miopenRNNIONotPadded = 0 + miopenRNNIOWithPadding = 1 + + miopenRNNlinear = 0 + miopenRNNskip = 1 + + miopenStatusSuccess = 0 IF CUPY_HIP_VERSION == 0: ############################################################################### + # Class + ############################################################################### + + cdef class CuDNNAlgoPerf: + cdef: + int algo + int status + float time + size_t memory + int determinism + int mathType + ############################################################################### # Version ############################################################################### diff --git a/cupy_backends/cuda/libs/miopen.pyx b/cupy_backends/cuda/libs/miopen.pyx index 965503fa934..aae4d56854c 100644 --- a/cupy_backends/cuda/libs/miopen.pyx +++ b/cupy_backends/cuda/libs/miopen.pyx @@ -17,9 +17,9 @@ cdef extern from '../../cupy_miopen.h' nogil: # Types ctypedef int ActivationMode 'miopenActivationMode_t' ctypedef int BatchNormMode 'miopenBatchNormMode_t' - ctypedef int ConvolutionBwdDataAlgo 'miopenBwdDataAlgorithm_t' + ctypedef int ConvolutionBwdDataAlgo 'miopenConvBwdDataAlgorithm_t' ctypedef int ConvolutionBwdFilterAlgo 'miopenConvBwdWeightsAlgorithm_t' - ctypedef int ConvolutionFwdAlgo 
'miopenConvolutionFwdAlgorithm_t' + ctypedef int ConvolutionFwdAlgo 'miopenConvFwdAlgorithm_t' ctypedef int ConvolutionMode 'miopenConvolutionMode_t' ctypedef int DataType 'miopenDataType_t' ctypedef int DirectionMode 'miopenRNNDirectionMode_t' @@ -29,6 +29,9 @@ cdef extern from '../../cupy_miopen.h' nogil: ctypedef int CTCLossAlgo 'miopenCTCLossAlgo_t' ctypedef int RNNMode 'miopenRNNMode_t' ctypedef int RNNAlgo 'miopenRNNAlgo_t' + #ctypedef int RNNDataLayout 'miopenRNNBaseLayout_t' + ctypedef int RNNPaddingMode 'miopenRNNPaddingMode_t' + ctypedef int RNNInputMode 'miopenRNNInputMode_t' ctypedef int SoftmaxAlgorithm 'miopenSoftmaxAlgorithm_t' ctypedef int SoftmaxMode 'miopenSoftmaxMode_t' ctypedef int Status 'miopenStatus_t' @@ -247,7 +250,7 @@ cdef extern from '../../cupy_miopen.h' nogil: # Constants double _EPSILON 'EPSILON' - +""" cdef class CuDNNAlgoPerf: def __init__(self, algo, status, time, memory, determinism, mathType): @@ -257,7 +260,7 @@ cdef class CuDNNAlgoPerf: self.memory = memory self.determinism = determinism self.mathType = mathType - +""" ############################################################################### # Error handling @@ -307,7 +310,7 @@ def get_build_version(): # Version ############################################################################### -cpdef size_t miopen_getVersion() except? 0: +cpdef size_t getVersion() except? 0: return CUPY_HIP_VERSION @@ -344,25 +347,25 @@ cpdef destroy(intptr_t handle): cpdef size_t createDropoutDescriptor() except? 0: cdef DropoutDescriptor desc - status = miopen.miopenCreateDropoutDescriptor(&desc) + status = miopenCreateDropoutDescriptor(&desc) check_status(status) return desc cpdef destroyDropoutDescriptor(size_t dropoutDesc): - status = miopen.miopenDestroyDropoutDescriptor(dropoutDesc) + status = miopenDestroyDropoutDescriptor(dropoutDesc) check_status(status) cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? 
-1: cdef size_t sizeInBytes - status = miopen.miopenDropoutGetStatesSize( + status = miopenDropoutGetStatesSize( handle, &sizeInBytes) check_status(status) return sizeInBytes cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 0: cdef size_t sizeInBytes - status = miopen.miopenDropoutGetReserveSpaceSize( + status = miopenDropoutGetReserveSpaceSize( xDesc, &sizeInBytes) check_status(status) return sizeInBytes From 30c13aec50898a802ad2d0c0363e4852e593f373 Mon Sep 17 00:00:00 2001 From: bmedishe Date: Fri, 8 Dec 2023 21:25:19 +0000 Subject: [PATCH 24/26] from cupyx import cudnn does not throw error --- cupy_backends/cuda/libs/cudnn.pxd | 14 +++++++------- cupy_backends/cuda/libs/cudnn.pyx | 30 ++++++++++++++---------------- cupy_backends/cuda/libs/miopen.pxd | 6 ++++-- cupy_backends/cuda/libs/miopen.pyx | 4 ++-- cupyx/cudnn.pyx | 30 +++++++++++++++--------------- tests/cupyx_tests/test_cudnn.py | 1 - 6 files changed, 42 insertions(+), 43 deletions(-) diff --git a/cupy_backends/cuda/libs/cudnn.pxd b/cupy_backends/cuda/libs/cudnn.pxd index c1f233a3e06..aa2bb622746 100644 --- a/cupy_backends/cuda/libs/cudnn.pxd +++ b/cupy_backends/cuda/libs/cudnn.pxd @@ -65,10 +65,13 @@ cpdef enum: CUDNN_STATUS_SUCCESS = 0 IF CUPY_HIP_VERSION > 0: cpdef enum: - CUDNN_DATA_FLOAT = 1 - CUDNN_DATA_DOUBLE = 6 - CUDNN_DATA_HALF = 0 - + CUDNN_DATA_FLOAT = 1 + CUDNN_DATA_DOUBLE = 6 + CUDNN_DATA_HALF = 0 + + CUDNN_TENSOR_NCHW = 0 + CUDNN_TENSOR_NHWC = 1 + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 5 CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2 CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 1 @@ -100,9 +103,6 @@ ELSE: CUDNN_NON_DETERMINISTIC = 0 CUDNN_DETERMINISTIC = 1 - CUDNN_TENSOR_NCHW = 0 - CUDNN_TENSOR_NHWC = 1 - CUDNN_OP_TENSOR_SQRT = 4 CUDNN_OP_TENSOR_NOT = 5 diff --git a/cupy_backends/cuda/libs/cudnn.pyx b/cupy_backends/cuda/libs/cudnn.pyx index 9e6a28d84da..2af12623dc6 100644 --- a/cupy_backends/cuda/libs/cudnn.pyx +++ b/cupy_backends/cuda/libs/cudnn.pyx @@ -10,8 +10,6 @@ from 
cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module IF CUPY_USE_GEN_HIP_CODE: from cupy_backends.cuda.libs.miopen import * - from cupy_backends.cuda.libs.cusolver_hip import _get_cuda_build_version - from cupy_backends.cuda.libs.cusolver_hip import _getVersion ELSE: ############################################################################### # Extern @@ -742,7 +740,7 @@ ELSE: # Constants double _CUDNN_BN_MIN_EPSILON 'CUDNN_BN_MIN_EPSILON' - + """ cdef class CuDNNAlgoPerf: def __init__(self, algo, status, time, memory, determinism, mathType): @@ -752,7 +750,7 @@ ELSE: self.memory = memory self.determinism = determinism self.mathType = mathType - + """ ############################################################################### # Error handling @@ -1200,7 +1198,7 @@ ELSE: perfResults.resize(returnedAlgoCount) return perfResults - + """ cpdef list findConvolutionForwardAlgorithmEx( intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, @@ -1238,7 +1236,7 @@ ELSE: return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, p.determinism, p.mathType) for p in perfResults] - + """ cpdef int getConvolutionForwardAlgorithm_v6( intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, @@ -1252,7 +1250,7 @@ ELSE: check_status(status) return algo - + """ cpdef list getConvolutionForwardAlgorithm_v7( intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, size_t destDesc, int requestedAlgoCount): @@ -1269,7 +1267,7 @@ ELSE: return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, p.determinism, p.mathType) for p in perfResults] - + """ cpdef Py_ssize_t getConvolutionForwardWorkspaceSize( intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, @@ -1326,7 +1324,7 @@ ELSE: perfResults.resize(returnedAlgoCount) return perfResults - + """ cpdef list findConvolutionBackwardFilterAlgorithmEx( intptr_t handle, size_t xDesc, 
size_t x, size_t dyDesc, size_t dy, size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, @@ -1364,7 +1362,7 @@ ELSE: return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, p.determinism, p.mathType) for p in perfResults] - + """ cpdef int getConvolutionBackwardFilterAlgorithm_v6( intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, @@ -1380,7 +1378,7 @@ ELSE: check_status(status) return algo - + """ cpdef list getConvolutionBackwardFilterAlgorithm_v7( intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, size_t gradDesc, int requestedAlgoCount): @@ -1396,7 +1394,7 @@ ELSE: return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, p.determinism, p.mathType) for p in perfResults] - + """ cpdef Py_ssize_t getConvolutionBackwardFilterWorkspaceSize( intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, @@ -1442,7 +1440,7 @@ ELSE: perfResults.resize(returnedAlgoCount) return perfResults - + """ cpdef list findConvolutionBackwardDataAlgorithmEx( intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, size_t convDesc, size_t dxDesc, size_t dx, @@ -1480,7 +1478,7 @@ ELSE: return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, p.determinism, p.mathType) for p in perfResults] - + """ cpdef int getConvolutionBackwardDataAlgorithm_v6( intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, @@ -1495,7 +1493,7 @@ ELSE: check_status(status) return algo - + """ cpdef list getConvolutionBackwardDataAlgorithm_v7( intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, size_t gradDesc, int requestedAlgoCount): @@ -1512,7 +1510,7 @@ ELSE: return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, p.determinism, p.mathType) for p in perfResults] - + """ cpdef Py_ssize_t getConvolutionBackwardDataWorkspaceSize( intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, diff --git a/cupy_backends/cuda/libs/miopen.pxd b/cupy_backends/cuda/libs/miopen.pxd index 
30a93cec1eb..cab8d425237 100644 --- a/cupy_backends/cuda/libs/miopen.pxd +++ b/cupy_backends/cuda/libs/miopen.pxd @@ -5,7 +5,6 @@ from libc.stdint cimport intptr_t # Enum ############################################################################### IF CUPY_HIP_VERSION != 0: - cpdef enum: cpdef enum: miopenFloat = 1 miopenDouble = 6 @@ -33,7 +32,10 @@ IF CUPY_HIP_VERSION != 0: MIOPEN_NOT_PROPAGATE_NAN = 0 MIOPEN_PROPAGATE_NAN = 1 - + + miopenTensorNCHW = 0 + miopenTensorNHWC = 1 + miopenTensorOpAdd = 0 miopenTensorOpMul = 1 miopenTensorOpMin = 2 diff --git a/cupy_backends/cuda/libs/miopen.pyx b/cupy_backends/cuda/libs/miopen.pyx index aae4d56854c..cfdfac48a03 100644 --- a/cupy_backends/cuda/libs/miopen.pyx +++ b/cupy_backends/cuda/libs/miopen.pyx @@ -29,12 +29,12 @@ cdef extern from '../../cupy_miopen.h' nogil: ctypedef int CTCLossAlgo 'miopenCTCLossAlgo_t' ctypedef int RNNMode 'miopenRNNMode_t' ctypedef int RNNAlgo 'miopenRNNAlgo_t' - #ctypedef int RNNDataLayout 'miopenRNNBaseLayout_t' + ctypedef int RNNDataLayout 'miopenRNNBaseLayout_t' ctypedef int RNNPaddingMode 'miopenRNNPaddingMode_t' - ctypedef int RNNInputMode 'miopenRNNInputMode_t' ctypedef int SoftmaxAlgorithm 'miopenSoftmaxAlgorithm_t' ctypedef int SoftmaxMode 'miopenSoftmaxMode_t' ctypedef int Status 'miopenStatus_t' + ctypedef int TensorFormat 'miopenTensorLayout_t' ctypedef int OpTensorOp 'miopenTensorOp_t' ctypedef int ReduceTensorOp 'miopenReduceTensorOp_t' diff --git a/cupyx/cudnn.pyx b/cupyx/cudnn.pyx index 5b39220574e..75cac8dd5af 100644 --- a/cupyx/cudnn.pyx +++ b/cupyx/cudnn.pyx @@ -17,7 +17,7 @@ from cupy.cuda cimport device from cupy.cuda cimport memory as _memory IF CUPY_HIP_VERSION != 0: from cupy_backends.cuda.libs import miopen as cudnn - #from cupy_backends.cuda.libs.cudnn import * + from cupy_backends.cuda.libs.cudnn import * ELSE: from cupy_backends.cuda.libs cimport cudnn @@ -25,6 +25,7 @@ from cupy._core._ufuncs import elementwise_copy as _elementwise_copy from cupy import _util 
from cupy.cuda import cudnn as _py_cudnn +from cupy_backends.cuda.libs import cudnn as _cudnn cdef int _cudnn_version = -1 @@ -158,8 +159,7 @@ cpdef _create_tensor_nd_descriptor( c_strides.data()) -cpdef _create_tensor_descriptor(size_t desc, _ndarray_base arr, - int format=cudnn.CUDNN_TENSOR_NCHW): +cpdef _create_tensor_descriptor(size_t desc, _ndarray_base arr,int format=_cudnn.CUDNN_TENSOR_NCHW): if not arr._c_contiguous: raise ValueError('cupyx.cudnn supports c-contiguous arrays only') if arr._shape.size() == 4: @@ -189,7 +189,7 @@ cpdef _create_tensor_descriptor_as4darray(size_t desc, cpdef _create_filter_descriptor( - size_t desc, _ndarray_base arr, int format=cudnn.CUDNN_TENSOR_NCHW): + size_t desc, _ndarray_base arr, int format=_cudnn.CUDNN_TENSOR_NCHW): cdef vector.vector[int] c_shape cdef Py_ssize_t s, ndim = arr._shape.size() data_type = get_data_type(arr.dtype) @@ -272,7 +272,7 @@ cpdef _ndarray_base _ascontiguousarray_normalized_strides(_ndarray_base a): return newarray -def create_tensor_descriptor(arr, format=cudnn.CUDNN_TENSOR_NCHW): +def create_tensor_descriptor(arr, format=_cudnn.CUDNN_TENSOR_NCHW): desc = Descriptor(cudnn.createTensorDescriptor(), _py_cudnn.destroyTensorDescriptor) _create_tensor_descriptor(desc.value, arr, format) @@ -310,7 +310,7 @@ def create_tensor_nd_descriptor(_ndarray_base arr): return desc -def create_filter_descriptor(arr, format=cudnn.CUDNN_TENSOR_NCHW): +def create_filter_descriptor(arr, format=_cudnn.CUDNN_TENSOR_NCHW): desc = Descriptor(cudnn.createFilterDescriptor(), _py_cudnn.destroyFilterDescriptor) _create_filter_descriptor(desc.value, arr, format) @@ -318,7 +318,7 @@ def create_filter_descriptor(arr, format=cudnn.CUDNN_TENSOR_NCHW): def create_convolution_descriptor(pad, stride, dtype, - mode=cudnn.CUDNN_CROSS_CORRELATION, + mode=_cudnn.CUDNN_CROSS_CORRELATION, dilation=None, use_tensor_core=False, groups=1): @@ -620,7 +620,7 @@ def rnn_backward_weights_ex( return dw -def create_activation_descriptor(mode, 
nan_prop_mode=cudnn.CUDNN_PROPAGATE_NAN, +def create_activation_descriptor(mode, nan_prop_mode=_cudnn.CUDNN_PROPAGATE_NAN, coef=0.0): desc = Descriptor(cudnn.createActivationDescriptor(), _py_cudnn.destroyActivationDescriptor) @@ -2047,7 +2047,7 @@ def pooling_backward( cdef _create_tensor_descriptor_for_bn( size_t desc, _ndarray_base arr, bint is_for_conv2d, - int format=cudnn.CUDNN_TENSOR_NCHW): + int format=_cudnn.CUDNN_TENSOR_NCHW): assert arr._c_contiguous if is_for_conv2d: _create_tensor_descriptor(desc, arr, format) @@ -2080,7 +2080,7 @@ def batch_normalization_forward_training( _ndarray_base running_mean, _ndarray_base running_var, mean, inv_std, double eps, double decay, bint is_for_conv2d, int cudnn_mode, bint debug, - int d_layout=cudnn.CUDNN_TENSOR_NCHW): + int d_layout=_cudnn.CUDNN_TENSOR_NCHW): reserve_space, y, save_mean, save_inv_std = ( _batch_normalization_forward_training( @@ -2109,7 +2109,7 @@ def batch_normalization_forward_training_ex( _ndarray_base running_mean, _ndarray_base running_var, mean, inv_std, double eps, double decay, bint is_for_conv2d, int cudnn_mode, bint debug, - int d_layout=cudnn.CUDNN_TENSOR_NCHW): + int d_layout=_cudnn.CUDNN_TENSOR_NCHW): reserve_space, y, save_mean, save_inv_std = ( _batch_normalization_forward_training( @@ -2132,7 +2132,7 @@ cdef _batch_normalization_forward_training( _ndarray_base running_mean, _ndarray_base running_var, mean, inv_std, double eps, double decay, bint is_for_conv2d, int cudnn_mode, bint debug, - int d_layout=cudnn.CUDNN_TENSOR_NCHW): + int d_layout=_cudnn.CUDNN_TENSOR_NCHW): cdef _memory.MemoryPointer workspace = None cdef _memory.MemoryPointer reserve_space = None @@ -2285,7 +2285,7 @@ def batch_normalization_forward_inference( _ndarray_base x, _ndarray_base gamma, _ndarray_base beta, _ndarray_base mean, _ndarray_base var, double eps, bint is_for_conv2d, int cudnn_mode, - int d_layout=cudnn.CUDNN_TENSOR_NCHW): + int d_layout=_cudnn.CUDNN_TENSOR_NCHW): x = 
core._internal_ascontiguousarray(x) dtype = x.dtype y = _core.ndarray(x._shape, dtype) @@ -2330,7 +2330,7 @@ def batch_normalization_backward( _ndarray_base x, _ndarray_base gamma, _ndarray_base gy, _ndarray_base mean, _ndarray_base inv_std, double eps, bint is_for_conv2d, int cudnn_mode, bint debug, - int d_layout=cudnn.CUDNN_TENSOR_NCHW, + int d_layout=_cudnn.CUDNN_TENSOR_NCHW, *, _memory.MemoryPointer reserve_space=None, ): @@ -2443,7 +2443,7 @@ def batch_normalization_backward( return gx, ggamma, gbeta -def create_activation_descriptor(mode, relu_nan_opt=cudnn.CUDNN_PROPAGATE_NAN, +def create_activation_descriptor(mode, relu_nan_opt=_cudnn.CUDNN_PROPAGATE_NAN, coef=0.0): desc = Descriptor(cudnn.createActivationDescriptor(), _py_cudnn.destroyActivationDescriptor) diff --git a/tests/cupyx_tests/test_cudnn.py b/tests/cupyx_tests/test_cudnn.py index 0087a1c661b..940c6375f97 100644 --- a/tests/cupyx_tests/test_cudnn.py +++ b/tests/cupyx_tests/test_cudnn.py @@ -13,7 +13,6 @@ if cudnn_enabled: modes = [ - libcudnn.CUDNN_ACTIVATION_SIGMOID, libcudnn.CUDNN_ACTIVATION_RELU, libcudnn.CUDNN_ACTIVATION_TANH, ] From 1b0b89222e9718a54cb5c20d6d9df309873260be Mon Sep 17 00:00:00 2001 From: bmedishe Date: Wed, 13 Dec 2023 18:20:24 +0000 Subject: [PATCH 25/26] adding miopen apis --- cupy_backends/cuda/libs/cudnn.pxd | 6 +-- cupy_backends/cuda/libs/miopen.pxd | 2 + cupy_backends/cuda/libs/miopen.pyx | 76 ++++++++++++++++++++++++++++-- 3 files changed, 76 insertions(+), 8 deletions(-) diff --git a/cupy_backends/cuda/libs/cudnn.pxd b/cupy_backends/cuda/libs/cudnn.pxd index aa2bb622746..530f1ff215c 100644 --- a/cupy_backends/cuda/libs/cudnn.pxd +++ b/cupy_backends/cuda/libs/cudnn.pxd @@ -10,6 +10,9 @@ cpdef enum: CUDNN_NOT_PROPAGATE_NAN = 0 CUDNN_PROPAGATE_NAN = 1 + CUDNN_TENSOR_NCHW = 0 + CUDNN_TENSOR_NHWC = 1 + CUDNN_OP_TENSOR_ADD = 0 CUDNN_OP_TENSOR_MUL = 1 CUDNN_OP_TENSOR_MIN = 2 @@ -69,9 +72,6 @@ IF CUPY_HIP_VERSION > 0: CUDNN_DATA_DOUBLE = 6 CUDNN_DATA_HALF = 0 - 
CUDNN_TENSOR_NCHW = 0 - CUDNN_TENSOR_NHWC = 1 - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 5 CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2 CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 1 diff --git a/cupy_backends/cuda/libs/miopen.pxd b/cupy_backends/cuda/libs/miopen.pxd index cab8d425237..5d1baf02526 100644 --- a/cupy_backends/cuda/libs/miopen.pxd +++ b/cupy_backends/cuda/libs/miopen.pxd @@ -89,6 +89,8 @@ IF CUPY_HIP_VERSION != 0: miopenStatusSuccess = 0 + MIOPEN_RNG_PSEUDO_XORWOW = 0 + IF CUPY_HIP_VERSION == 0: ############################################################################### # Class diff --git a/cupy_backends/cuda/libs/miopen.pyx b/cupy_backends/cuda/libs/miopen.pyx index cfdfac48a03..a621de15bce 100644 --- a/cupy_backends/cuda/libs/miopen.pyx +++ b/cupy_backends/cuda/libs/miopen.pyx @@ -4,11 +4,11 @@ # NOTE: This wrapper does not cover all APIs of cuDNN v4. cimport cython # NOQA from libcpp cimport vector - +from libcpp cimport bool from cupy_backends.cuda.api cimport driver from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module - +from cupy_backends.cuda.libs cimport miopen ############################################################################### # Extern ############################################################################### @@ -36,7 +36,7 @@ cdef extern from '../../cupy_miopen.h' nogil: ctypedef int Status 'miopenStatus_t' ctypedef int TensorFormat 'miopenTensorLayout_t' ctypedef int OpTensorOp 'miopenTensorOp_t' - + ctypedef int RNGType_t 'miopenRNGType_t' ctypedef int ReduceTensorOp 'miopenReduceTensorOp_t' ctypedef int ReduceTensorIndices 'miopenReduceTensorIndices_t' ctypedef int IndicesType 'miopenIndicesType_t' @@ -52,7 +52,7 @@ cdef extern from '../../cupy_miopen.h' nogil: ctypedef void* FilterDescriptor 'miopenTensorDescriptor_t' ctypedef void* OpTensorDescriptor 'miopenTensorDescriptor_t' ctypedef void* ReduceTensorDescriptor 'miopenReduceTensorDescriptor_t' - + ctypedef void* Stream 
'miopenAcceleratorQueue_t' # Error handling const char* miopenGetErrorString(Status status) @@ -202,7 +202,15 @@ cdef extern from '../../cupy_miopen.h' nogil: int miopenDropoutGetStatesSize(Handle handle, size_t* sizeInBytes) int miopenDropoutGetReserveSpaceSize( TensorDescriptor xDesc, size_t* sizeInBytes) - + int miopenSetDropoutDescriptor( + DropoutDescriptor dropoutDesc, Handle handle, float dropout, + void* states, size_t stateSizeInBytes, unsigned long long seed, + bool use_mask, bool state_evo, RNGType_t rng_mode) + int miopenDropoutForward( + Handle handle, DropoutDescriptor dropoutDesc, TensorDescriptor noise_shape, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor dstDesc, void* dstData, + void* reserveSpace, size_t reserveSpaceSizeInBytes) # CTC int miopenCreateCTCLossDescriptor(CTCLossDescriptor* ctcLossDesc) int miopenDestroyCTCLossDescriptor(CTCLossDescriptor ctcLossDesc) @@ -345,6 +353,39 @@ cpdef destroy(intptr_t handle): check_status(status) +cpdef setStream(intptr_t handle, size_t stream): + # TODO(leofang): The support of stream capture is not mentioned at all in + # the cuDNN docs (as of CUDA 11.5), so we disable this functionality. + if not runtime._is_hip_environment and runtime.streamIsCapturing(stream): + raise NotImplementedError( + 'calling cuDNN API during stream capture is currently ' + 'unsupported') + status = miopenSetStream(handle, stream) + check_status(status) + + +cpdef size_t getStream(intptr_t handle) except? 0: + cdef Stream stream + status = miopenGetStream(handle, &stream) + check_status(status) + return stream + + +cdef _setStream(intptr_t handle): + """Set current stream""" + setStream(handle, stream_module.get_current_stream_ptr()) + +cpdef size_t createTensorDescriptor() except? 
0: + cdef TensorDescriptor descriptor + status = miopenCreateTensorDescriptor(&descriptor) + check_status(status) + return descriptor + +cpdef destroyTensorDescriptor(size_t tensorDesc): + status = miopenDestroyTensorDescriptor(tensorDesc) + check_status(status) + + cpdef size_t createDropoutDescriptor() except? 0: cdef DropoutDescriptor desc status = miopenCreateDropoutDescriptor(&desc) @@ -369,3 +410,28 @@ cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 0: xDesc, &sizeInBytes) check_status(status) return sizeInBytes + +cpdef setDropoutDescriptor( + size_t dropoutDesc, intptr_t handle, float dropout, + size_t states, size_t stateSizeInBytes, unsigned long long seed): + #cdef miopen.RNGType_t rngtype = miopen.MIOPEN_RNG_PSEUDO_XORWOW + cdef int rngtype = miopen.MIOPEN_RNG_PSEUDO_XORWOW + status = miopenSetDropoutDescriptor( + dropoutDesc, handle, dropout, + states, stateSizeInBytes, seed, False, False, rngtype) + check_status(status) + +cpdef dropoutForward( + intptr_t handle, size_t dropoutDesc, + size_t srcDesc, size_t srcData, + size_t dstDesc, size_t dstData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = miopenDropoutForward( + handle, dropoutDesc, srcDesc, + srcDesc, srcData, + dstDesc, dstData, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + From d76814e6e3c29d4497021313fee35418e879e5a4 Mon Sep 17 00:00:00 2001 From: bmedishe Date: Tue, 19 Dec 2023 16:49:06 +0000 Subject: [PATCH 26/26] activation tests working --- cupy_backends/cuda/libs/miopen.pyx | 93 +++++++++++++++++++++++++++--- cupyx/cudnn.pyx | 17 +++--- 2 files changed, 94 insertions(+), 16 deletions(-) diff --git a/cupy_backends/cuda/libs/miopen.pyx b/cupy_backends/cuda/libs/miopen.pyx index a621de15bce..912f2884001 100644 --- a/cupy_backends/cuda/libs/miopen.pyx +++ b/cupy_backends/cuda/libs/miopen.pyx @@ -181,9 +181,10 @@ cdef extern from '../../cupy_miopen.h' nogil: # Activation int 
miopenCreateActivationDescriptor( ActivationDescriptor* activationDesc) - int cudnnSetActivationDescriptor( - ActivationDescriptor activationDesc, ActivationMode mode, - NanPropagation reluNanOpt, double reluCeiling) + int miopenSetActivationDescriptor( + ActivationDescriptor activationDesc, ActivationMode mode, double activAlpha, + double activBeta, + double activGamma) int miopenDestroyActivationDescriptor( ActivationDescriptor activationDesc) int miopenSoftmaxForward( @@ -195,6 +196,17 @@ cdef extern from '../../cupy_miopen.h' nogil: void* alpha, TensorDescriptor srcDesc, void* srcData, TensorDescriptor srcDiffDesc, void* srcDiffData, void* beta, TensorDescriptor destDiffDesc, void* destDiffData) + int miopenActivationForward( + Handle handle, ActivationDescriptor activationDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, void* beta, + TensorDescriptor dstDesc, void* dstData) + int miopenActivationBackward( + Handle handle, ActivationDescriptor activationDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor srcDiffDesc, void* srcDiffData, + TensorDescriptor destDesc, void* destData, void* beta, + TensorDescriptor destDiffDesc, void* destDiffData) + # Dropout int miopenCreateDropoutDescriptor(DropoutDescriptor* desc) @@ -375,12 +387,23 @@ cdef _setStream(intptr_t handle): """Set current stream""" setStream(handle, stream_module.get_current_stream_ptr()) +############################################################################### +# Tensor manipulation +############################################################################### + cpdef size_t createTensorDescriptor() except? 
0: cdef TensorDescriptor descriptor status = miopenCreateTensorDescriptor(&descriptor) check_status(status) return descriptor +cpdef setTensor4dDescriptor(size_t tensorDesc, int format, int dataType, + int n, int c, int h, int w): + status = miopenSet4dTensorDescriptor( + tensorDesc, + dataType, n, c, h, w) + check_status(status) + cpdef destroyTensorDescriptor(size_t tensorDesc): status = miopenDestroyTensorDescriptor(tensorDesc) check_status(status) @@ -413,12 +436,10 @@ cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 0: cpdef setDropoutDescriptor( size_t dropoutDesc, intptr_t handle, float dropout, - size_t states, size_t stateSizeInBytes, unsigned long long seed): - #cdef miopen.RNGType_t rngtype = miopen.MIOPEN_RNG_PSEUDO_XORWOW - cdef int rngtype = miopen.MIOPEN_RNG_PSEUDO_XORWOW + size_t states, size_t stateSizeInBytes, unsigned long long seed, bool use_mask, bool state_evo, int rngtype): status = miopenSetDropoutDescriptor( dropoutDesc, handle, dropout, - states, stateSizeInBytes, seed, False, False, rngtype) + states, stateSizeInBytes, seed, use_mask, state_evo, rngtype) check_status(status) cpdef dropoutForward( @@ -435,3 +456,61 @@ cpdef dropoutForward( reserveSpace, reserveSpaceSizeInBytes) check_status(status) +############################################################################### +# Activation +############################################################################### + +cpdef size_t createActivationDescriptor() except? 
0: + cdef ActivationDescriptor activationDesc + status = miopenCreateActivationDescriptor(&activationDesc) + check_status(status) + return activationDesc + + +cpdef setActivationDescriptor( + size_t activationDesc, int mode, int reluNanOpt, double reluCeiling): + status = miopenSetActivationDescriptor( + activationDesc, mode, 1.0, 0.0, 0.0) + check_status(status) + + +cpdef destroyActivationDescriptor(size_t activationDesc): + status = miopenDestroyActivationDescriptor( + activationDesc) + check_status(status) + + +cpdef softmaxForward( + intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData): + _setStream(handle) + with nogil: + status = miopenSoftmaxForward( + handle, alpha, srcDesc, srcData, + beta, dstDesc, dstData) + check_status(status) + + +cpdef softmaxBackward( + intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, size_t beta, + size_t destDiffDesc, size_t destDiffData): + _setStream(handle) + with nogil: + status = miopenSoftmaxBackward( + handle, alpha, srcDesc, srcData, + srcDiffDesc, srcDiffData, beta, + destDiffDesc, destDiffData) + check_status(status) + + +cpdef activationForward_v4( + intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData): + _setStream(handle) + with nogil: + status = miopenActivationForward( + handle, activationDesc, alpha, + srcDesc, srcData, beta, + dstDesc, dstData) + check_status(status) diff --git a/cupyx/cudnn.pyx b/cupyx/cudnn.pyx index 75cac8dd5af..5038a7d8192 100644 --- a/cupyx/cudnn.pyx +++ b/cupyx/cudnn.pyx @@ -114,11 +114,11 @@ cdef class Descriptor: cpdef int get_data_type(dtype) except? 
-1: cdef char t = ord(dtype.char) if t == b'f': - return cudnn.CUDNN_DATA_FLOAT + return _cudnn.CUDNN_DATA_FLOAT elif t == b'd': - return cudnn.CUDNN_DATA_DOUBLE + return _cudnn.CUDNN_DATA_DOUBLE elif t == b'e': - return cudnn.CUDNN_DATA_HALF + return _cudnn.CUDNN_DATA_HALF else: raise TypeError('Dtype {} is not supported in cuDNN'.format(dtype)) @@ -158,8 +158,7 @@ cpdef _create_tensor_nd_descriptor( desc, data_type, arr._shape.size(), c_shape.data(), c_strides.data()) - -cpdef _create_tensor_descriptor(size_t desc, _ndarray_base arr,int format=_cudnn.CUDNN_TENSOR_NCHW): +cpdef _create_tensor_descriptor(size_t desc, _ndarray_base arr,int format=cudnn.miopenTensorNCHW): if not arr._c_contiguous: raise ValueError('cupyx.cudnn supports c-contiguous arrays only') if arr._shape.size() == 4: @@ -184,7 +183,7 @@ cpdef _create_tensor_descriptor_as4darray(size_t desc, if arr._shape.size() > 0: dim1 = arr._shape[0] dim2 = arr.size // dim1 - cudnn.setTensor4dDescriptor(desc, cudnn.CUDNN_TENSOR_NCHW, data_type, + cudnn.setTensor4dDescriptor(desc, _cudnn.CUDNN_TENSOR_NCHW, data_type, dim1, dim2, 1, 1) @@ -649,7 +648,7 @@ def activation_forward(_ndarray_base x, int mode, double coef=0.0): try: _create_tensor_descriptor_as4darray(desc, x) cudnn.setActivationDescriptor( - act_desc, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, coef) + act_desc, mode, _cudnn.CUDNN_NOT_PROPAGATE_NAN, coef) cudnn.activationForward_v4( handle, act_desc, one, desc, x.data.ptr, zero, desc, y.data.ptr) @@ -777,13 +776,13 @@ def create_dropout_descriptor( desc = Descriptor(cudnn.createDropoutDescriptor(), _py_cudnn.destroyDropoutDescriptor) cudnn.setDropoutDescriptor(desc.value, handle, dropout, - states, state_size_in_bytes, seed) + states, state_size_in_bytes, seed, False, False, 0) return desc def set_dropout_descriptor(desc, handle, dropout): # When the fourth argument is NULL, random state is not updated. 
- cudnn.setDropoutDescriptor(desc.value, handle, dropout, 0, 0, 0) + cudnn.setDropoutDescriptor(desc.value, handle, dropout, 0, 0, 0, False, False, 0) def _create_ctc_loss_descriptor(data_type):