From 48789766bbc74f1f9c3ec32841557dd519979b85 Mon Sep 17 00:00:00 2001
From: Michal Pandy <michalpandy@fb.com>
Date: Thu, 29 Aug 2019 08:12:52 -0700
Subject: [PATCH 1/2] Multithreaded rulebook

---
 .../Metadata/SubmanifoldConvolutionRules.h    | 62 +++++++++++++++----
 1 file changed, 51 insertions(+), 11 deletions(-)
diff --git a/sparseconvnet/SCN/Metadata/SubmanifoldConvolutionRules.h b/sparseconvnet/SCN/Metadata/SubmanifoldConvolutionRules.h
index b63ca26..b8ae665 100644
--- a/sparseconvnet/SCN/Metadata/SubmanifoldConvolutionRules.h
+++ b/sparseconvnet/SCN/Metadata/SubmanifoldConvolutionRules.h
@@ -7,6 +7,8 @@
 #ifndef SUBMANIFOLDCONVOLUTIONRULES_H
 #define SUBMANIFOLDCONVOLUTIONRULES_H
 
+#include <algorithm>
+
 // Full input region for an output point
 template <Int dimension>
 RectangularRegion<dimension>
@@ -27,20 +29,58 @@ template <Int dimension>
 double SubmanifoldConvolution_SgToRules(SparseGrid<dimension> &grid,
                                         RuleBook &rules, long *size) {
   double countActiveInputs = 0;
-  for (auto const &outputIter : grid.mp) {
-    auto inRegion =
-        InputRegionCalculator_Submanifold<dimension>(outputIter.first, size);
-    Int rulesOffset = 0;
-    for (auto inputPoint : inRegion) {
-      auto inputIter = grid.mp.find(inputPoint);
-      if (inputIter != grid.mp.end()) {
-        rules[rulesOffset].push_back(inputIter->second + grid.ctr);
-        rules[rulesOffset].push_back(outputIter.second + grid.ctr);
-        countActiveInputs++;
+  const Int threadCount = 4;
+  std::vector<std::thread> threads;
+  std::array<int, threadCount> activeInputs = {};
+  std::vector<RuleBook> rulebooks;
+  for (Int t = 0; t < threadCount; ++t) {
+    rulebooks.push_back(RuleBook(rules.size()));
+  }
+
+  auto func = [&](const int order) {
+    auto outputIter = grid.mp.begin();
+    auto &rb = rulebooks[order];
+    Int rem = static_cast<Int>(grid.mp.size()); // Int, so std::min deduces
+    int activeInputCount = 0;
+    // Worker `order` takes every threadCount-th entry; rem = entries left.
+    if (rem > order) {
+      std::advance(outputIter, order);
+      rem -= order;
+
+      for (; outputIter != grid.mp.end();
+           std::advance(outputIter, std::min(threadCount, rem)),
+           rem -= threadCount) {
+        auto inRegion = InputRegionCalculator_Submanifold<dimension>(
+            outputIter->first, size);
+        Int rulesOffset = 0;
+        for (auto inputPoint : inRegion) {
+          auto inputIter = grid.mp.find(inputPoint);
+          if (inputIter != grid.mp.end()) {
+            activeInputCount++;
+            rb[rulesOffset].push_back(inputIter->second + grid.ctr);
+            rb[rulesOffset].push_back(outputIter->second + grid.ctr);
+          }
+          rulesOffset++;
+        }
       }
-      rulesOffset++;
     }
+
+    activeInputs[order] = activeInputCount; // read by the caller after join()
+  };
+
+  for (Int t = 0; t < threadCount; ++t) {
+    threads.push_back(std::thread(func, t));
   }
+
+  for (Int t = 0; t < threadCount; ++t) {
+    threads[t].join();
+    countActiveInputs += activeInputs[t];
+    for (std::size_t i = 0; i < rulebooks[t].size(); ++i) {
+      rules[i].insert(rules[i].end(), rulebooks[t][i].begin(),
+                      rulebooks[t][i].end());
+    }
+  }
+
   return countActiveInputs;
 }
 

From 39a194c8611658c4ac73aa3c01ddf23a75f485d1 Mon Sep 17 00:00:00 2001
From: Michal Pandy <michalpandy@fb.com>
Date: Wed, 4 Sep 2019 02:26:38 -0700
Subject: [PATCH 2/2] Amalgamate rulebooks after all threads done processing

---
 .../Metadata/SubmanifoldConvolutionRules.h    | 56 +++++++++++++------
 1 file changed, 40 insertions(+), 16 deletions(-)

diff --git a/sparseconvnet/SCN/Metadata/SubmanifoldConvolutionRules.h b/sparseconvnet/SCN/Metadata/SubmanifoldConvolutionRules.h
index b8ae665..8514e60 100644
--- a/sparseconvnet/SCN/Metadata/SubmanifoldConvolutionRules.h
+++ b/sparseconvnet/SCN/Metadata/SubmanifoldConvolutionRules.h
@@ -24,18 +24,34 @@ InputRegionCalculator_Submanifold(const Point<dimension> &output, long *size) {
 
 // Call for each convolutional / max-pooling layer, once for each batch item.
 // rules is used to carry out the "lowering" whilst carrying out the convolution
-
 template <Int dimension>
 double SubmanifoldConvolution_SgToRules(SparseGrid<dimension> &grid,
                                         RuleBook &rules, long *size) {
   double countActiveInputs = 0;
-  const Int threadCount = 4;
-  std::vector<std::thread> threads;
-  std::array<int, threadCount> activeInputs = {};
-  std::vector<RuleBook> rulebooks;
-  for (Int t = 0; t < threadCount; ++t) {
-    rulebooks.push_back(RuleBook(rules.size()));
+  for (auto const &outputIter : grid.mp) {
+    auto inRegion =
+        InputRegionCalculator_Submanifold<dimension>(outputIter.first, size);
+    Int rulesOffset = 0; // position of inputPoint within inRegion
+    for (auto inputPoint : inRegion) {
+      auto inputIter = grid.mp.find(inputPoint);
+      if (inputIter != grid.mp.end()) { // inputPoint is present in grid.mp
+        rules[rulesOffset].push_back(inputIter->second + grid.ctr);
+        rules[rulesOffset].push_back(outputIter.second + grid.ctr);
+        countActiveInputs++;
+      }
+      rulesOffset++;
+    }
   }
+  return countActiveInputs;
+}
+
+template <Int dimension>
+double SubmanifoldConvolution_SgToRules_par(SparseGrid<dimension> &grid,
+                                            std::vector<RuleBook> &rulebooks,
+                                            long *size, const Int threadCount) {
+  double countActiveInputs = 0; // sum of the per-worker tallies, returned
+  std::vector<std::thread> threads; // worker t writes only to rulebooks[t]
+  std::vector<int> activeInputs(threadCount, 0); // one tally per worker
 
   auto func = [&](const int order) {
     auto outputIter = grid.mp.begin();
@@ -75,10 +91,6 @@ double SubmanifoldConvolution_SgToRules(SparseGrid<dimension> &grid,
   for (Int t = 0; t < threadCount; ++t) {
     threads[t].join();
     countActiveInputs += activeInputs[t];
-    for (std::size_t i = 0; i < rulebooks[t].size(); ++i) {
-      rules[i].insert(rules[i].end(), rulebooks[t][i].begin(),
-                      rulebooks[t][i].end());
-    }
   }
 
   return countActiveInputs;
@@ -89,6 +101,7 @@ Int SubmanifoldConvolution_SgsToRules(SparseGrids<dimension> &SGs,
                                       RuleBook &rules, long *size) {
   Int sd = volume<dimension>(size);
   Int countActiveInputs = 0;
+
   rules.clear();
   rules.resize(sd);
   for (Int i = 0; i < (Int)SGs.size(); i++)
@@ -96,21 +109,31 @@ Int SubmanifoldConvolution_SgsToRules(SparseGrids<dimension> &SGs,
         SubmanifoldConvolution_SgToRules<dimension>(SGs[i], rules, size);
   return countActiveInputs;
 }
+
 template <Int dimension>
 Int SubmanifoldConvolution_SgsToRules_OMP(SparseGrids<dimension> &SGs,
                                           RuleBook &rules, long *size) {
-  std::vector<RuleBook> rbs(SGs.size());
+  std::vector<std::vector<RuleBook>> rbs(SGs.size());
   std::vector<double> countActiveInputs(SGs.size());
   rules.clear();
   Int sd = volume<dimension>(size);
   rules.resize(sd);
+  const Int threadCount = 4;
+
+  // rbs already holds one (empty) slot per grid, so fill each slot in place
+  // with threadCount private RuleBooks of sd entries. Appending with
+  // push_back would leave rbs[i] empty for the workers below, and each
+  // worker indexes its own RuleBook as rbs[i][thread].
+  for (Int i = 0; i < (Int)SGs.size(); ++i) {
+    rbs[i].assign(threadCount, RuleBook(sd));
+  }
+
   {
     Int i;
 #pragma omp parallel for private(i)
     for (i = 0; i < (Int)SGs.size(); i++) {
-      rbs[i].resize(sd);
-      countActiveInputs[i] =
-          SubmanifoldConvolution_SgToRules<dimension>(SGs[i], rbs[i], size);
+      countActiveInputs[i] = SubmanifoldConvolution_SgToRules_par<dimension>(
+          SGs[i], rbs[i], size, threadCount);
     }
   }
   {
@@ -118,7 +141,8 @@ Int SubmanifoldConvolution_SgsToRules_OMP(SparseGrids<dimension> &SGs,
 #pragma omp parallel for private(i)
     for (i = 0; i < sd; i++)
       for (auto const &rb : rbs)
-        rules[i].insert(rules[i].end(), rb[i].begin(), rb[i].end());
+        for (auto const &part : rb) // rb holds one RuleBook per worker thread
+          rules[i].insert(rules[i].end(), part[i].begin(), part[i].end());
   }
   Int countActiveInputs_ = 0;
   for (auto &i : countActiveInputs)