diff --git a/sparseconvnet/SCN/Metadata/SubmanifoldConvolutionRules.h b/sparseconvnet/SCN/Metadata/SubmanifoldConvolutionRules.h index b63ca26..8514e60 100644 --- a/sparseconvnet/SCN/Metadata/SubmanifoldConvolutionRules.h +++ b/sparseconvnet/SCN/Metadata/SubmanifoldConvolutionRules.h @@ -7,6 +7,8 @@ #ifndef SUBMANIFOLDCONVOLUTIONRULES_H #define SUBMANIFOLDCONVOLUTIONRULES_H +#include + // Full input region for an output point template RectangularRegion @@ -22,7 +24,6 @@ InputRegionCalculator_Submanifold(const Point &output, long *size) { // Call for each convolutional / max-pooling layer, once for each batch item. // rules is used to carry out the "lowering" whilst carrying out the convolution - template double SubmanifoldConvolution_SgToRules(SparseGrid &grid, RuleBook &rules, long *size) { @@ -44,11 +45,63 @@ double SubmanifoldConvolution_SgToRules(SparseGrid &grid, return countActiveInputs; } +template +double SubmanifoldConvolution_SgToRules_par(SparseGrid &grid, + std::vector &rulebooks, + long *size, const Int threadCount) { + double countActiveInputs = 0; + std::vector threads; + std::vector activeInputs(threadCount, 0); + + auto func = [&](const int order) { + auto outputIter = grid.mp.begin(); + auto &rb = rulebooks[order]; + int rem = grid.mp.size(); + int aciveInputCount = 0; + + if (rem > order) { + std::advance(outputIter, order); + rem -= order; + + for (; outputIter != grid.mp.end(); + std::advance(outputIter, std::min(threadCount, rem)), + rem -= threadCount) { + auto inRegion = InputRegionCalculator_Submanifold( + outputIter->first, size); + Int rulesOffset = 0; + for (auto inputPoint : inRegion) { + auto inputIter = grid.mp.find(inputPoint); + if (inputIter != grid.mp.end()) { + aciveInputCount++; + rb[rulesOffset].push_back(inputIter->second + grid.ctr); + rb[rulesOffset].push_back(outputIter->second + grid.ctr); + } + rulesOffset++; + } + } + } + + activeInputs[order] = aciveInputCount; + }; + + for (Int t = 0; t < threadCount; ++t) { + threads.push_back(std::thread(func, t)); + } + + for (Int t = 0; t < threadCount; ++t) { + threads[t].join(); + countActiveInputs += activeInputs[t]; + } + + return countActiveInputs; +} + template Int SubmanifoldConvolution_SgsToRules(SparseGrids &SGs, RuleBook &rules, long *size) { Int sd = volume(size); Int countActiveInputs = 0; + rules.clear(); rules.resize(sd); for (Int i = 0; i < (Int)SGs.size(); i++) @@ -56,21 +109,31 @@ Int SubmanifoldConvolution_SgsToRules(SparseGrids &SGs, SubmanifoldConvolution_SgToRules(SGs[i], rules, size); return countActiveInputs; } + template Int SubmanifoldConvolution_SgsToRules_OMP(SparseGrids &SGs, RuleBook &rules, long *size) { - std::vector rbs(SGs.size()); + std::vector> rbs(SGs.size()); std::vector countActiveInputs(SGs.size()); rules.clear(); Int sd = volume(size); rules.resize(sd); + const Int threadCount = 4; + + for (Int i = 0; i < SGs.size(); ++i) { + std::vector rulebooks; + for (Int t = 0; t < threadCount; ++t) { + rulebooks.push_back(RuleBook(sd)); + } + rbs.push_back(rulebooks); + } + { Int i; #pragma omp parallel for private(i) for (i = 0; i < (Int)SGs.size(); i++) { - rbs[i].resize(sd); - countActiveInputs[i] = - SubmanifoldConvolution_SgToRules(SGs[i], rbs[i], size); + countActiveInputs[i] = SubmanifoldConvolution_SgToRules_par( + SGs[i], rbs[i], size, threadCount); } } { @@ -78,7 +141,8 @@ Int SubmanifoldConvolution_SgsToRules_OMP(SparseGrids &SGs, #pragma omp parallel for private(i) for (i = 0; i < sd; i++) for (auto const &rb : rbs) - rules[i].insert(rules[i].end(), rb[i].begin(), rb[i].end()); + for (Int t = 0; t < threadCount; ++t) + rules[i].insert(rules[i].end(), rb[i][t].begin(), rb[i][t].end()); } Int countActiveInputs_ = 0; for (auto &i : countActiveInputs)