/* TerminalPoolingLayer.cu
   Forked from btgraham/SparseConvNet-archived.
   (Original file: 138 lines / 4.94 KB.) */
//Average everything that makes it to the final layer
#define TERMINAL_POOLING_MAX_ACTIVE_SITES 1024
#include <iostream>
#include <cassert>
#include "utilities.h"
#include "cudaUtilities.h"
#include "TerminalPoolingLayer.h"
// Build the pooling rule list for one grid: the whole input collapses into a
// single output spatial site that averages every active input site.
//   inputGrid/outputGrid: sparse site maps for this batch item.
//   S: padded rule-slot count per output site.
//   nOutputSpatialSites: running site counter, incremented by one.
//   rules: appended with the active sites' columns, then padded with -1
//          sentinels (the kernels stop at the first negative rule).
void terminalGridPoolingRules
(SparseGrid &inputGrid,
SparseGrid &outputGrid,
int S,
int &nOutputSpatialSites,
std::vector<int>& rules) {
// The kernels stage at most this many rules in shared memory per block.
assert(inputGrid.mp.size()<=TERMINAL_POOLING_MAX_ACTIVE_SITES);
if (inputGrid.mp.empty()) {
// No active sites at all: fall back to the background column so the
// output stays defined (spatial information is entirely lost here).
rules.push_back(inputGrid.backgroundCol);
} else {
for (auto const &site : inputGrid.mp)
rules.push_back(site.second);
}
outputGrid.mp[0]=nOutputSpatialSites++;
// Pad this site's slice of the rule vector out to S entries with -1.
rules.resize(S*nOutputSpatialSites,-1);
}
// Forward terminal-pooling kernel.
// Launch: one block per output spatial site, KERNELBLOCKSIZE threads/block.
//   g1: input features; g2: output features; rules: ps2 slots per block,
//   each an input row index or -1 padding; nOut: feature count;
//   ps2: padded rule-list length (must be <= TERMINAL_POOLING_MAX_ACTIVE_SITES,
//   asserted by the host wrapper terminalPool).
__global__ void dTerminalPool(float* g1, float* g2, int* rules, int nOut, int ps2) {
// Stage this block's rules in shared memory, pre-scaled to row offsets.
__shared__ int r[TERMINAL_POOLING_MAX_ACTIVE_SITES];
int i=blockIdx.x*nOut;//for output g2
for (int p=threadIdx.x;p<ps2;p+=KERNELBLOCKSIZE)
r[p]=rules[blockIdx.x*ps2+p]*nOut; //for input g1
__syncthreads(); // all rules staged before any thread reads r[]
for (int j=threadIdx.x;j<nOut;j+=KERNELBLOCKSIZE) { //nOut is a multiple of KERNELBLOCKSIZE!!!
float t=0;
int p=0;
// Sum feature j over active sites; the -1 padding (still negative after
// scaling by nOut) terminates the scan. p ends as the active-site count.
for (;p<ps2 and r[p]>=0;p++) {
t+=g1[r[p]+j];
}
// Average over active sites. NOTE(review): relies on p >= 1, i.e.
// terminalGridPoolingRules always emitting at least one rule per site
// (it pushes backgroundCol even for an empty grid) — confirm if changed.
g2[i+j]=t/p;
}
}
// Host wrapper for dTerminalPool: launches one block per output spatial
// site, slicing the launch because the grid dimension is capped at 32768.
//   count: number of output spatial sites; ps2: padded rule-list length;
//   nOut: feature count. Asynchronous on cnnMemStream; errors checked after.
void terminalPool(float* g1, float* g2, int* rules, int count, int ps2, int nOut) {
// Shared-memory rule buffer in the kernel is fixed-size; if poolSize ever
// exceeds it, enlarge dTerminalPool/dTerminalPoolBackProp's buffers.
assert(ps2<=TERMINAL_POOLING_MAX_ACTIVE_SITES);
for (int done=0; done<count; ) {
int slice=min(32768,count-done);
dTerminalPool<<<slice,KERNELBLOCKSIZE,0,cnnMemStream->stream>>>
(g1, g2+done*nOut, rules+done*ps2, nOut, ps2);
done+=slice;
}
cudaCheckError();
}
// Backward terminal-pooling kernel: distributes each output gradient evenly
// over the active input sites that were averaged in the forward pass.
// Launch mirrors dTerminalPool: one block per output site, KERNELBLOCKSIZE
// threads/block. d1: input gradients (written); d2: output gradients (read).
__global__ void dTerminalPoolBackProp(float* d1, float* d2, int* rules, int nOut, int ps2) {
__shared__ int r[TERMINAL_POOLING_MAX_ACTIVE_SITES]; //Allocate at least size ps2 !!!!!!!!!!!
int i=blockIdx.x*nOut;//for input d2
// Stage the block's rules, pre-scaled to row offsets.
for (int p=threadIdx.x;p<ps2;p+=KERNELBLOCKSIZE) {
r[p]=rules[blockIdx.x*ps2+p]*nOut; //for output d1
}
__syncthreads(); // rules fully staged before any thread reads r[]
// Every thread redundantly counts the active (non-padding) rules; cheap,
// and avoids another shared variable + barrier.
int maxP=0;
while (maxP<ps2 and r[maxP]>=0)
++maxP;
__syncthreads(); //delete line?? NOTE(review): r[] is only read (never written) past the first barrier, so this one looks redundant — confirm before removing
for (int j=threadIdx.x;j<nOut;j+=KERNELBLOCKSIZE) {
// Forward averaged over maxP sites, so each site receives grad/maxP.
float t=d2[i+j]/maxP;
for (int p=0;p<maxP;p++) {
d1[r[p]+j]=t;
}
}
}
// Host wrapper for dTerminalPoolBackProp: one block per output spatial
// site, sliced into launches of at most 32768 blocks (grid-dim cap).
//   count: number of output spatial sites; nOut: feature count;
//   ps2: padded rule-list length. Asynchronous on cnnMemStream.
void terminalPoolBackProp(float* d1, float* d2, int* rules, int count, int nOut, int ps2) {
for (int done=0; done<count; ) {
int slice=min(32768,count-done);
dTerminalPoolBackProp<<<slice,KERNELBLOCKSIZE,0,cnnMemStream->stream>>>
(d1, d2+done*nOut, rules+done*ps2, nOut, ps2);
done+=slice;
}
cudaCheckError();
}
// Terminal pooling layer: collapses the whole (poolSize x poolSize) input
// grid to a single 1x1 output site by averaging all active sites.
//   poolSize: input spatial size (output spatial size is fixed at 1).
//   S: padded rule-slot count per output site.
// NOTE(review): S presumably bounds the active-site count per grid —
// confirm against the network builder that instantiates this layer.
TerminalPoolingLayer::TerminalPoolingLayer(int poolSize, int S)
: inSpatialSize(poolSize), outSpatialSize(1), poolSize(poolSize), S(S) {
std::cout << "TerminalPooling " << poolSize << " " << S << std::endl;
}
// Prepare the output interface and build pooling rules for every batch item.
// Each call to terminalGridPoolingRules appends S rule slots and registers
// exactly one output spatial site in output.nSpatialSites.
void TerminalPoolingLayer::preprocess
(SpatiallySparseBatch &batch,
SpatiallySparseBatchInterface &input,
SpatiallySparseBatchInterface &output) {
// The incoming grid must be exactly the size this layer was built for.
assert(input.spatialSize==inSpatialSize);
output.spatialSize=outSpatialSize;
output.nSpatialSites=0;
output.nFeatures=input.nFeatures;
output.featuresPresent.hVector()=input.featuresPresent.hVector();
output.backpropErrors=input.backpropErrors;
output.grids.resize(batch.batchSize);
for (int item=0; item<batch.batchSize; ++item) {
terminalGridPoolingRules(input.grids[item],
output.grids[item],
S,
output.nSpatialSites,
output.rules.hVector());
}
}
// Forward pass: size the output buffers, then average-pool every active
// input site into the single output site per batch item on the GPU.
void TerminalPoolingLayer::forwards
(SpatiallySparseBatch &batch,
SpatiallySparseBatchInterface &input,
SpatiallySparseBatchInterface &output) {
auto nOut=output.featuresPresent.size(); // features per site this pass
output.sub->poolingChoices.resize(output.nSpatialSites*nOut);
output.sub->features.resize(output.nSpatialSites*nOut);
cudaCheckError();
terminalPool(input.sub->features.dPtr(),
output.sub->features.dPtr(),
output.rules.dPtr(),
output.nSpatialSites,
S,
nOut);
cudaCheckError();
}
// Backward pass: route output gradients back to the pooled input sites.
// This layer has no trainable weights, so learningRate and momentum are
// unused; work happens only when the input layer wants gradients.
void TerminalPoolingLayer::backwards
(SpatiallySparseBatch &batch,
SpatiallySparseBatchInterface &input,
SpatiallySparseBatchInterface &output,
float learningRate,
float momentum) {
if (!input.backpropErrors)
return;
input.sub->dfeatures.resize(input.nSpatialSites*input.featuresPresent.size());
// Sites that fed no output receive no gradient, so start from zero.
input.sub->dfeatures.setZero();
terminalPoolBackProp(input.sub->dfeatures.dPtr(),
output.sub->dfeatures.dPtr(),
output.rules.dPtr(),
output.nSpatialSites,
output.featuresPresent.size(),
S);
}
// Given the requested output size (always 1 for terminal pooling), report
// the input spatial size this layer needs, logging the mapping as it goes.
int TerminalPoolingLayer::calculateInputSpatialSize(int outputSpatialSize) {
// Terminal pooling only ever produces a 1x1 output.
assert(outputSpatialSize==1);
std::cout << "(" << outSpatialSize <<"TP" <<inSpatialSize << ") ";
return inSpatialSize;
}