-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathXTensor.cuh
More file actions
255 lines (205 loc) · 9.63 KB
/
XTensor.cuh
File metadata and controls
255 lines (205 loc) · 9.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
/* NiuTrans - an open-source MT toolkit
* Copyright (C) 2017, Natural Language Processing Lab. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
/*
* $Id:
* XTensor; XTensor.h
implementation of tensors used in this work. It is the basis of XMatrix and
* XVector
*
* $Version:
* 0.1.0
*
* $Created by:
* XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2017-07-31
* I'm working while most of the students are enjoying their holidays :(
* $Update by:
* LI Yinqiao (email: 1023907632@qq.com) 2017-11-18 bug fixes
*
*/
#ifndef __XTENSOR_CUH__
#define __XTENSOR_CUH__
#ifdef USE_CUDA
#include "XTensor.h"
/**************************************/
/* copy all elements from a source tensor to a target tensor (optionally async on a given stream) */
extern "C"
bool CudaCopyValues(XTensor * s, XTensor * t, XStream * stream = NULL);
/**************************************/
/* flush a list of XTensor to GPU memory */
void CudaCPUToGPUFlush(List * mList, XMem * GPUMem);
/* copy the data from GPU memory to CPU memory */
void CudaGPUToCPUFlush(XTensor * tensor);
/**************************************/
/* set the cells to the ascending order along a given dimension */
extern "C"
void CudaSetAscendingOrder(XTensor * a, int dim);
/**************************************/
/* set each entry to its negative value (CUDA Kernel) */
__global__
void KernelNegate(DTYPE * d, int size);
/* set each entry to its negative value (CUDA Kernel) with float16 data type */
__global__
void KernelNegate(__half * d, int size);
/* set each entry to its negative value */
extern "C"
void CudaNegate(XTensor * a);
/**************************************/
/* set each entry to its square root (CUDA Kernel) */
__global__
void KernelSqrtV2(DTYPE * d, int size);
/* set each entry to its square root (CUDA Kernel) with float16 data type */
__global__
void KernelSqrtV2(__half * d, int size);
/* get the power of the entries: a = a^p */
extern "C"
void CudaPower(XTensor * a, DTYPE p);
/**************************************/
/* scale and shift all matrix entries p = p * scale + shift (CUDA Kernel) */
__global__
void KernelScaleAndShift(DTYPE * d, int size, DTYPE scale, DTYPE shift);
/* scale and shift all matrix entries p = p * scale + shift (CUDA Kernel) with float16 data type */
__global__
void KernelScaleAndShift(__half * d, int size, __half scale, __half shift);
/* scale and shift all tensor entries */
extern "C"
void CudaScaleAndShift(XTensor * a, DTYPE scale, DTYPE shift);
/**************************************/
/* copy a number of blocks to target positions (CUDA Kernel) */
__global__
void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks);
/* copy a number of blocks to target positions (cuda version) */
extern "C"
void CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
/**************************************/
/* copy a number of blocks from source positions to target positions (CUDA Kernel) */
__global__
void KernelCopyBlocksSelected(DTYPE * source, int blockSize, int * sourceBlocks, int blockNum, DTYPE * target, int * targetBlocks);
/* copy a number of blocks from source positions to target positions (cuda version) */
extern "C"
void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem);
/**************************************/
/* set target data block index for the data movement in split */
extern "C"
void CudaMakeSplitBlockIndex(int devID, int * blockIndex, int splitNum, int blockSplitSize, int blockNum);
/**************************************/
/* copy a number of blocks (of different sizes) to target positions (CUDA Kernel) */
__global__
void KernelCopyBlockLists(DTYPE ** sourceList, int * sourceBlockSizes, int sourceBlockNum, DTYPE ** targetList);
/* merge data by blocks (cuda version) */
extern "C"
void CudaMergeBlockLists(List * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
/**************************************/
/* set target data block index for the data movement in merge
   (NOTE(review): original comment said "split", but the function name indicates merge) */
extern "C"
void CudaMakeMergeBlockIndex(int devID,
int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum);
/**************************************/
/* duplicate the data along a given dimension */
extern "C"
void CudaUnsqueeze(XTensor * a, XTensor * b, int dim, int dSize);
/**************************************/
/* sort the tensor along a given dimension (k = -1 presumably means sorting all items; verify in the .cu file) */
void CudaSortBig(XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, int dim, int k = -1);
/**************************************/
/* get the top-k items along a given dimension */
void CudaTopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k);
/**************************************/
/**************************************/
/* get the top-k items along a given dimension, using the radix-select algorithm */
void CudaTopKRadixSelect(XTensor * a, XTensor * b, XTensor * index, int dim, int k);
/**************************************/
/* summation of data arrays (CUDA Kernel): c = a + b * \beta */
extern "C" __global__
void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta (cuda version) */
extern "C"
void CudaSum(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta (cuda version) with an input cuBLAS handle */
extern "C"
void CudaSumWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
/**************************************/
/* summation of a tensor and a vector (column vector) */
extern "C"
void CudaSumByColumnTV(XTensor * a, XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
/**************************************/
/* summation of a vector (column vector) and a tensor */
extern "C"
void CudaSumByColumnVT(XTensor * a, XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
/**************************************/
/*
multiplication of a dense matrix with a sparse vector
c = a * b * \alpha
*/
extern "C" __global__
void KernelMatrixMulDenseMSparseMV2(DTYPE * a, MATRIX_TRANS_TYPE transposedA, int aColSize, int aRowSize,
void * b, MATRIX_TRANS_TYPE transposedB, int bNonZeroNum,int bColSize, int bRowSize,
DTYPE * c, int cColSize, int cRowSize, DTYPE alpha);
/*
matrix multiplication (for 2d tensors) (cuda version)
c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired
*/
extern "C"
void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);
/**************************************/
/* multiplication of two tensors in an element-wise manner c(i) = a(i)*b(i) */
extern "C" __global__
void KernelMulElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size);
/* multiplication of two tensors in an element-wise manner c(i) = a(i)*b(i) + \alpha*c(i) */
extern "C" __global__
void KernelMulElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha);
/* multiplication of two tensors in an element-wise manner c(i) = a(i)*b(i) + \alpha*c(i)
   (template parameter selects at compile time whether the \alpha term is applied) */
template<int nonZeroAlpha>__global__
void KernelMulElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE alpha, int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum);
/* element-wise product of two tensors */
extern "C"
void CudaMultiplyElementWise(XTensor * a, XTensor * b, XTensor * c, int leadingDim, DTYPE alpha);
/**************************************/
/*
sum the items along a dimension of the tensor (cuda version)
For a 1-dimensional data array a,
sum = \sum_i ((a_i + shift)^power) if isExp == false
sum = \sum_i exp((a_i + shift)^power) if isExp == true
*/
extern "C"
void CudaReduceSumXT(XTensor * input, XTensor * output, int dim, XTensor * shift, DTYPE power, bool isExp);
/**************************************/
/* get the max-valued items along a dimension of the tensor (cuda version) */
extern "C"
void CudaReduceMaxXT(XTensor * input, XTensor * output, int dim);
/**************************************/
/* normalize the data with normal distribution (Kernel code). For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
*/
__global__
void KernelNormalize(DTYPE * input, DTYPE * output, DTYPE * mean, DTYPE * var,
DTYPE * a, DTYPE * b, DTYPE epsilon,
int stride, int strideNum, int blockNum);
/* normalize the data with normal distribution. For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
*/
extern "C"
void CudaNormalize(XTensor * input, XTensor * output, int dim,
XTensor * mean, XTensor * var,
XTensor * a, XTensor * b, DTYPE epsilon);
#endif // USE_CUDA
#endif // __XTENSOR_CUH__