From 981c442b7e917b7b0417d4231fbad2c6ac03b17c Mon Sep 17 00:00:00 2001 From: george Date: Fri, 26 Dec 2025 03:45:54 +0000 Subject: [PATCH 1/9] Changed exponent from u32 to u64. Exponents over 2^32-1 don't work yet -- debugging is needed. --- src/Args.cpp | 8 ++-- src/Args.h | 6 +-- src/FFTConfig.cpp | 10 ++--- src/FFTConfig.h | 6 +-- src/Gpu.cpp | 93 ++++++++++++++++++++++++----------------------- src/Gpu.h | 18 ++++----- src/PRPState.cpp | 3 +- src/PRPState.h | 10 ++--- src/Primes.cpp | 16 ++++---- src/Primes.h | 12 +++--- src/Proof.cpp | 60 ++++++++++++++++-------------- src/Proof.h | 46 +++++++++++------------ src/Saver.cpp | 45 ++++++++++++----------- src/Saver.h | 14 +++---- src/Task.cpp | 12 +++--- src/Task.h | 2 +- src/TuneEntry.cpp | 13 ++++--- src/Worktodo.cpp | 4 +- src/common.h | 6 +-- src/shared.h | 4 +- src/state.cpp | 4 +- src/state.h | 12 +++--- src/tune.cpp | 56 ++++++++++++++-------------- 23 files changed, 236 insertions(+), 224 deletions(-) diff --git a/src/Args.cpp b/src/Args.cpp index 041202ad..5556e6a5 100644 --- a/src/Args.cpp +++ b/src/Args.cpp @@ -86,7 +86,7 @@ void Args::readConfig(const fs::path& path) { } } -u32 Args::getProofPow(u32 exponent) const { +u32 Args::getProofPow(u64 exponent) const { if (proofPow == -1) { return ProofSet::bestPower(exponent); } assert(proofPow >= 1); return proofPow; @@ -310,9 +310,9 @@ void Args::parse(const string& line) { } else if (key == "-tune") { doTune = true; if (!s.empty()) { tune = s; } - } else if (key == "-ctune") { - doCtune = true; - if (!s.empty()) { ctune.push_back(s); } +// } else if (key == "-ctune") { +// doCtune = true; +// if (!s.empty()) { ctune.push_back(s); } } else if (key == "-ztune") { doZtune = true; } else if (key == "-carryTune") { diff --git a/src/Args.h b/src/Args.h index 795cd99c..d7273afe 100644 --- a/src/Args.h +++ b/src/Args.h @@ -30,7 +30,7 @@ class Args { bool uses(const std::string& key) const { return flags.find(key) != flags.end(); } int value(const std::string& key, 
int valNotFound = -1) const; void readConfig(const fs::path& path); - u32 getProofPow(u32 exponent) const; + u32 getProofPow(u64 exponent) const; string tailDir() const; bool hasFlag(const string& key) const; @@ -78,8 +78,8 @@ class Args { u32 logStep = 20000; string fftSpec; - u32 prpExp = 0; - u32 llExp = 0; + u64 prpExp = 0; + u64 llExp = 0; size_t maxAlloc = 0; diff --git a/src/FFTConfig.cpp b/src/FFTConfig.cpp index 2308a037..c21607e1 100644 --- a/src/FFTConfig.cpp +++ b/src/FFTConfig.cpp @@ -182,7 +182,7 @@ if (18.35 + 0.5 * (log2(13 * 1024 * 512) - log2(size())) > 19.0) return 19.0; return 18.35 + 0.5 * (log2(13 * 1024 * 512) - log2(size())); } -bool FFTShape::needsLargeCarry(u32 E) const { +bool FFTShape::needsLargeCarry(u64 E) const { return E / double(size()) > carry32BPW(); } @@ -271,12 +271,12 @@ float FFTConfig::maxBpw() const { return (carry == CARRY_32 && (shape.fft_type == FFT64 || shape.fft_type == FFT3231)) ? std::min(shape.carry32BPW(), b) : b; } -FFTConfig FFTConfig::bestFit(const Args& args, u32 E, const string& spec) { +FFTConfig FFTConfig::bestFit(const Args& args, u64 E, const string& spec) { // A FFT-spec was given, simply take the first FFT from the spec that can handle E if (!spec.empty()) { FFTConfig fft{spec}; if (fft.maxExp() * args.fftOverdrive < E) { - log("Warning: %s (max %" PRIu64 ") may be too small for %u\n", fft.spec().c_str(), fft.maxExp(), E); + log("Warning: %s (max %" PRIu64 ") may be too small for %" PRIu64 "\n", fft.spec().c_str(), fft.maxExp(), E); } return fft; } @@ -288,7 +288,7 @@ FFTConfig FFTConfig::bestFit(const Args& args, u32 E, const string& spec) { if (E <= e.fft.maxExp() * args.fftOverdrive) { return e.fft; } } - log("No FFTs found in tune.txt that can handle %u. Consider tuning with -tune\n", E); + log("No FFTs found in tune.txt that can handle %" PRIu64 ". 
Consider tuning with -tune\n", E); // Take the first FFT that can handle E for (const FFTShape& shape : FFTShape::allShapes()) { @@ -297,7 +297,7 @@ FFTConfig FFTConfig::bestFit(const Args& args, u32 E, const string& spec) { } } - log("No FFT found for %u\n", E); + log("No FFT found for %" PRIu64 "\n", E); throw "No FFT"; } diff --git a/src/FFTConfig.h b/src/FFTConfig.h index c873eb8c..a9ad052c 100644 --- a/src/FFTConfig.h +++ b/src/FFTConfig.h @@ -27,7 +27,7 @@ class FFTShape { public: static std::vector allShapes(u32 from=0, u32 to = -1); - static tuple getChainLengths(u32 fftSize, u32 exponent, u32 middle); + static tuple getChainLengths(u32 fftSize, u64 exponent, u32 middle); static vector multiSpec(const string& spec); @@ -51,7 +51,7 @@ class FFTShape { std::string spec() const { return (fft_type ? to_string(fft_type) + ':' : "") + numberK(width) + ':' + numberK(middle) + ':' + numberK(height); } float carry32BPW() const; - bool needsLargeCarry(u32 E) const; + bool needsLargeCarry(u64 E) const; bool isFavoredShape() const; }; @@ -73,7 +73,7 @@ enum CARRY_KIND {CARRY_32=0, CARRY_64=1, CARRY_AUTO=2}; struct FFTConfig { public: - static FFTConfig bestFit(const Args& args, u32 E, const std::string& spec); + static FFTConfig bestFit(const Args& args, u64 E, const std::string& spec); // Which FP and NTT primes are involved in the FFT bool FFT_FP64; diff --git a/src/Gpu.cpp b/src/Gpu.cpp index 44656fa1..433ce0a7 100644 --- a/src/Gpu.cpp +++ b/src/Gpu.cpp @@ -43,43 +43,43 @@ namespace { u32 kAt(u32 H, u32 line, u32 col) { return (line + col * H) * 2; } -double weight(u32 N, u32 E, u32 H, u32 line, u32 col, u32 rep) { +double weight(u32 N, u64 E, u32 H, u32 line, u32 col, u32 rep) { return exp2l((long double)(extra(N, E, kAt(H, line, col) + rep)) / N); } -double invWeight(u32 N, u32 E, u32 H, u32 line, u32 col, u32 rep) { +double invWeight(u32 N, u64 E, u32 H, u32 line, u32 col, u32 rep) { return exp2l(-(long double)(extra(N, E, kAt(H, line, col) + rep)) / N); } 
-double weightM1(u32 N, u32 E, u32 H, u32 line, u32 col, u32 rep) { +double weightM1(u32 N, u64 E, u32 H, u32 line, u32 col, u32 rep) { return exp2l((long double)(extra(N, E, kAt(H, line, col) + rep)) / N) - 1; } -double invWeightM1(u32 N, u32 E, u32 H, u32 line, u32 col, u32 rep) { +double invWeightM1(u32 N, u64 E, u32 H, u32 line, u32 col, u32 rep) { return exp2l(- (long double)(extra(N, E, kAt(H, line, col) + rep)) / N) - 1; } double boundUnderOne(double x) { return std::min(x, nexttoward(1, 0)); } -float weight32(u32 N, u32 E, u32 H, u32 line, u32 col, u32 rep) { +float weight32(u32 N, u64 E, u32 H, u32 line, u32 col, u32 rep) { return exp2((double)(extra(N, E, kAt(H, line, col) + rep)) / N); } -float invWeight32(u32 N, u32 E, u32 H, u32 line, u32 col, u32 rep) { +float invWeight32(u32 N, u64 E, u32 H, u32 line, u32 col, u32 rep) { return exp2(-(double)(extra(N, E, kAt(H, line, col) + rep)) / N); } -float weightM132(u32 N, u32 E, u32 H, u32 line, u32 col, u32 rep) { +float weightM132(u32 N, u64 E, u32 H, u32 line, u32 col, u32 rep) { return exp2((double)(extra(N, E, kAt(H, line, col) + rep)) / N) - 1; } -float invWeightM132(u32 N, u32 E, u32 H, u32 line, u32 col, u32 rep) { +float invWeightM132(u32 N, u64 E, u32 H, u32 line, u32 col, u32 rep) { return exp2(- (double)(extra(N, E, kAt(H, line, col) + rep)) / N) - 1; } float boundUnderOne(float x) { return std::min(x, nexttowardf(1, 0)); } -Weights genWeights(FFTConfig fft, u32 E, u32 W, u32 H, u32 nW, bool AmdGpu) { +Weights genWeights(FFTConfig fft, u64 E, u32 W, u32 H, u32 nW, bool AmdGpu) { u32 N = 2u * W * H; u32 groupWidth = W / nW; @@ -227,7 +227,7 @@ constexpr bool isInList(const string& s, initializer_list list) { return false; } -string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector& extraConf, u32 E, bool doLog, +string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector& extraConf, u64 E, bool doLog, bool &tail_single_wide, bool &tail_single_kernel, u32 
&in_place, u32 &pad_size) { map config; @@ -457,15 +457,15 @@ RoeInfo roeStat(const vector& roe) { class IterationTimer { Timer timer; - u32 kStart; + u64 kStart; public: - explicit IterationTimer(u32 kStart) : kStart(kStart) { } + explicit IterationTimer(u64 kStart) : kStart(kStart) { } - float reset(u32 k) { + float reset(u64 k) { float secs = timer.reset(); - u32 its = max(1u, k - kStart); + u64 its = max(u64(1), k - kStart); kStart = k; return secs / its; } @@ -506,7 +506,7 @@ string toHex(const vector& v) { // -------- -unique_ptr Gpu::make(Queue* q, u32 E, GpuCommon shared, FFTConfig fftConfig, const vector& extraConf, bool logFftSize) { +unique_ptr Gpu::make(Queue* q, u64 E, GpuCommon shared, FFTConfig fftConfig, const vector& extraConf, bool logFftSize) { return make_unique(q, shared, fftConfig, E, extraConf, logFftSize); } @@ -518,7 +518,7 @@ Gpu::~Gpu() { #define ROE_SIZE 100000 #define CARRY_SIZE 100000 -Gpu::Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u32 E, const vector& extraConf, bool logFftSize) : +Gpu::Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u64 E, const vector& extraConf, bool logFftSize) : queue(q), background{shared.background}, args{*shared.args}, @@ -649,7 +649,7 @@ Gpu::Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u32 E, const vector& // Sometimes we do want to run a FFT beyond a reasonable BPW (e.g. 
during -ztune), and these situations // coincide with logFftSize == false if (fft.maxExp() < E) { - log("Warning: %s (max %" PRIu64 ") may be too small for %u\n", fft.spec().c_str(), fft.maxExp(), E); + log("Warning: %s (max %" PRIu64 ") may be too small for %" PRIu64 "\n", fft.spec().c_str(), fft.maxExp(), E); } } @@ -992,7 +992,7 @@ void Gpu::modMul(Buffer& ioA, Buffer& inB, enum LEAD_TYPE leadInB, b mul(ioA, buf1, buf2, buf3, mul3); }; -void Gpu::writeState(u32 k, const vector& check, u32 blockSize) { +void Gpu::writeState(u64 k, const vector& check, u32 blockSize) { assert(blockSize > 0); writeIn(bufCheck, check); @@ -1022,7 +1022,7 @@ void Gpu::writeState(u32 k, const vector& check, u32 blockSize) { } modMul(bufData, bufAux, true); } - + bool Gpu::doCheck(u32 blockSize) { squareLoop(bufAux, bufCheck, 0, blockSize, true); modMul(bufCheck, bufData); @@ -1278,10 +1278,10 @@ void Gpu::square(Buffer& out, Buffer& in, enum LEAD_TYPE leadIn, enu } } -u32 Gpu::squareLoop(Buffer& out, Buffer& in, u32 from, u32 to, bool doTailMul3) { +u32 Gpu::squareLoop(Buffer& out, Buffer& in, u64 from, u64 to, bool doTailMul3) { assert(from < to); enum LEAD_TYPE leadIn = LEAD_NONE; - for (u32 k = from; k < to; ++k) { + for (u64 k = from; k < to; ++k) { enum LEAD_TYPE leadOut = useLongCarry || (k == to - 1) ? LEAD_NONE : LEAD_WIDTH; square(out, (k==from) ? 
in : out, leadIn, leadOut, doTailMul3 && (k == to - 1)); leadIn = leadOut; @@ -1350,16 +1350,16 @@ string RoeInfo::toString() const { return buf; } -static string makeLogStr(const string& status, u32 k, u64 res, float secsPerIt, u32 nIters) { +static string makeLogStr(const string& status, u64 k, u64 res, float secsPerIt, u64 nIters) { char buf[256]; - snprintf(buf, sizeof(buf), "%2s %9u %016" PRIx64 " %4.0f ETA %s; ", + snprintf(buf, sizeof(buf), "%2s %9" PRIu64 " %016" PRIx64 " %4.0f ETA %s; ", status.c_str(), k, res, /* k / float(nIters) * 100, */ secsPerIt * 1'000'000, getETA(k, nIters, secsPerIt).c_str()); return buf; } -void Gpu::doBigLog(u32 k, u64 res, bool checkOK, float secsPerIt, u32 nIters, u32 nErrors) { +void Gpu::doBigLog(u64 k, u64 res, bool checkOK, float secsPerIt, u64 nIters, u32 nErrors) { auto [roeSq, roeMul] = readROE(); double z = roeSq.z(); zAvg.update(z, roeSq.N); @@ -1480,7 +1480,7 @@ static u32 mod3(const std::vector &words) { return r % 3; } -static void doDiv3(u32 E, Words& words) { +static void doDiv3(u64 E, Words& words) { u32 r = (3 - mod3(words)) % 3; assert(r < 3); int topBits = E % 32; @@ -1497,7 +1497,7 @@ static void doDiv3(u32 E, Words& words) { } } -void Gpu::doDiv9(u32 E, Words& words) { +void Gpu::doDiv9(u64 E, Words& words) { doDiv3(E, words); doDiv3(E, words); } @@ -1532,12 +1532,12 @@ PRPState Gpu::loadPRP(Saver& saver) { u64 res = dataResidue(); if (res == state.res64) { - log("OK %9u on-load: blockSize %d, %016" PRIx64 "\n", state.k, state.blockSize, res); + log("OK %9" PRIu64 " on-load: blockSize %d, %016" PRIx64 "\n", state.k, state.blockSize, res); return state; // return {loaded.k, loaded.blockSize, loaded.nErrors}; } - log("EE %9u on-load: %016" PRIx64 " vs. %016" PRIx64 "\n", state.k, res, state.res64); + log("EE %9" PRIu64 " on-load: %016" PRIx64 " vs. 
%016" PRIx64 "\n", state.k, res, state.res64); if (!state.k) { break; } // We failed on PRP start } @@ -1545,7 +1545,7 @@ PRPState Gpu::loadPRP(Saver& saver) { throw "Error on load"; } -u32 Gpu::getProofPower(u32 k) { +u32 Gpu::getProofPower(u64 k) { u32 power = ProofSet::effectivePower(E, args.getProofPow(E), k); if (power != args.getProofPow(E)) { @@ -1785,7 +1785,8 @@ PRPResult Gpu::isPrimePRP(const Task& task) { reload: elapsedTimer.reset(); - u32 blockSize{}, k{}; + u32 blockSize{}; + u64 k{}; double elapsedBefore = 0; { @@ -1814,28 +1815,28 @@ PRPResult Gpu::isPrimePRP(const Task& task) { // For M=2^E-1, residue "type-3" == 3^(M+1), and residue "type-1" == type-3 / 9, // See http://www.mersenneforum.org/showpost.php?p=468378&postcount=209 // For both type-1 and type-3 we need to do E squarings (as M+1==2^E). - const u32 kEnd = E; + const u64 kEnd = E; assert(k < kEnd); // We continue beyound kEnd: to the next multiple of blockSize, to do a check there - u32 kEndEnd = roundUp(kEnd, blockSize); + u64 kEndEnd = roundUp(kEnd, blockSize); bool skipNextCheckUpdate = false; - u32 persistK = proofSet.next(k); + u64 persistK = proofSet.next(k); enum LEAD_TYPE leadIn = LEAD_NONE; assert(k % blockSize == 0); assert(checkStep % blockSize == 0); - const u32 startK = k; + const u64 startK = k; IterationTimer iterationTimer{k}; wantROE = 0; // skip the initial iterations while (true) { assert(k < kEndEnd); - + if (!wantROE && k - startK > 30) { wantROE = args.logROE ? ROE_SIZE : 2'000; } if (skipNextCheckUpdate) { @@ -1876,7 +1877,7 @@ PRPResult Gpu::isPrimePRP(const Task& task) { res2048.clear(); assert(words.size() >= 64); res2048.insert(res2048.end(), words.begin(), std::next(words.begin(), 64)); - log("%s %8d / %d, %s\n", isPrime ? "PP" : "CC", kEnd, E, hex(finalRes64).c_str()); + log("%s %8" PRIu64 " / %" PRIu64 ", %s\n", isPrime ? 
"PP" : "CC", kEnd, E, hex(finalRes64).c_str()); } if (!doCheck && !doLog) continue; @@ -1888,7 +1889,7 @@ PRPResult Gpu::isPrimePRP(const Task& task) { vector rawCheck = readChecked(bufCheck); if (rawCheck.empty()) { ++nErrors; - log("%9u %016" PRIx64 " read NULL check\n", k, res); + log("%9" PRIu64 " %016" PRIx64 " read NULL check\n", k, res); if (++nSeqErrors > 2) { throw "sequential errors"; } goto reload; } @@ -1899,7 +1900,7 @@ PRPResult Gpu::isPrimePRP(const Task& task) { elapsedBefore + elapsedTimer.at()}); }); - log(" %9u %016" PRIx64 " %4.0f\n", k, res, /*k / float(kEndEnd) * 100*,*/ secsPerIt * 1'000'000); + log(" %9" PRIu64 " %016" PRIx64 " %4.0f\n", k, res, /*k / float(kEndEnd) * 100*,*/ secsPerIt * 1'000'000); RoeInfo carryStats = readCarryStats(); if (carryStats.N) { u32 m = ldexp(carryStats.max, 32); @@ -1965,7 +1966,7 @@ LLResult Gpu::isPrimeLL(const Task& task) { reload: elapsedTimer.reset(); - u32 startK = 0; + u64 startK = 0; double elapsedBefore = 0; { LLState state = saver.load(); @@ -1977,13 +1978,13 @@ LLResult Gpu::isPrimeLL(const Task& task) { u64 res = dataResidue(); if (res != expectedRes) { throw "Invalid savefile (res64)"; } assert(res == expectedRes); - log("LL loaded @ %u : %016" PRIx64 "\n", startK, res); + log("LL loaded @ %" PRIu64 " : %016" PRIx64 "\n", startK, res); } IterationTimer iterationTimer{startK}; - u32 k = startK; - u32 kEnd = E - 2; + u64 k = startK; + u64 kEnd = E - 2; enum LEAD_TYPE leadIn = LEAD_NONE; while (true) { @@ -2009,7 +2010,7 @@ LLResult Gpu::isPrimeLL(const Task& task) { if (isAllZero) { if (k < kEnd) { - log("Error: early ZERO @ %u\n", k); + log("Error: early ZERO @ %" PRIu64 "\n", k); if (doStop) { throw "stop requested"; } else { @@ -2025,7 +2026,7 @@ LLResult Gpu::isPrimeLL(const Task& task) { float secsPerIt = iterationTimer.reset(k); queue->setSquareTime((int) (secsPerIt * 1'000'000)); - log("%9u %016" PRIx64 " %4.0f\n", k, res64, secsPerIt * 1'000'000); + log("%9" PRIu64 " %016" PRIx64 " %4.0f\n", 
k, res64, secsPerIt * 1'000'000); if (k >= kEnd) { return {isAllZero, res64}; } @@ -2039,13 +2040,13 @@ array Gpu::isCERT(const Task& task) { // Get CERT start value char fname[32]; - sprintf(fname, "M%u.cert", E); + sprintf(fname, "M%" PRIu64 ".cert", E); // Autoprimenet.py does not add the cert entry to worktodo.txt until it has successfully downloaded the .cert file. { // Enclosing this code in braces ensures the file will be closed by the File destructor. The later file deletion requires the file be closed in Windows. File fi = File::openReadThrow(fname); - u32 nBytes = (E - 1) / 8 + 1; + u32 nBytes = u32((E - 1) / 8 + 1); Words B = fi.readBytesLE(nBytes); writeIn(bufData, std::move(B)); } diff --git a/src/Gpu.h b/src/Gpu.h index fc5166f3..ad859eac 100644 --- a/src/Gpu.h +++ b/src/Gpu.h @@ -93,7 +93,7 @@ class Gpu { private: std::unique_ptr> saver; - u32 E; + u64 E; u32 N; FFTConfig fft; @@ -250,8 +250,8 @@ class Gpu { void squareCERT(Buffer& io, enum LEAD_TYPE leadIn, enum LEAD_TYPE leadOut) { square(io, io, leadIn, leadOut, false, false); } void squareLL(Buffer& io, enum LEAD_TYPE leadIn, enum LEAD_TYPE leadOut) { square(io, io, leadIn, leadOut, false, true); } - u32 squareLoop(Buffer& out, Buffer& in, u32 from, u32 to, bool doTailMul3); - u32 squareLoop(Buffer& io, u32 from, u32 to) { return squareLoop(io, io, from, to, false); } + u32 squareLoop(Buffer& out, Buffer& in, u64 from, u64 to, bool doTailMul3); + u32 squareLoop(Buffer& io, u64 from, u64 to) { return squareLoop(io, io, from, to, false); } bool isEqual(Buffer& bufCheck, Buffer& bufAux); u64 bufResidue(Buffer& buf); @@ -260,7 +260,7 @@ class Gpu { void exponentiate(Buffer& bufInOut, u64 exp, Buffer& buf1, Buffer& buf2, Buffer& buf3); - void writeState(u32 k, const vector& check, u32 blockSize); + void writeState(u64 k, const vector& check, u32 blockSize); // does either carrryFused() or the expanded version depending on useLongCarry void doCarry(Buffer& out, Buffer& in, Buffer& tmp); @@ -283,13 
+283,13 @@ class Gpu { // void measureTransferSpeed(); - static void doDiv9(u32 E, Words& words); + static void doDiv9(u64 E, Words& words); static bool equals9(const Words& words); void selftestTrig(); public: - Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u32 E, const vector& extraConf, bool logFftSize); - static unique_ptr make(Queue* q, u32 E, GpuCommon shared, FFTConfig fft, + Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u64 E, const vector& extraConf, bool logFftSize); + static unique_ptr make(Queue* q, u64 E, GpuCommon shared, FFTConfig fft, const vector& extraConf = {}, bool logFftSize = true); ~Gpu(); @@ -337,8 +337,8 @@ class Gpu { void clear(bool isPRP); private: - u32 getProofPower(u32 k); - void doBigLog(u32 k, u64 res, bool checkOK, float secsPerIt, u32 nIters, u32 nErrors); + u32 getProofPower(u64 k); + void doBigLog(u64 k, u64 res, bool checkOK, float secsPerIt, u64 nIters, u32 nErrors); }; // Compute the size of an FFT/NTT data buffer depending on the FFT/NTT float/prime. Size is returned in units of sizeof(double). 
diff --git a/src/PRPState.cpp b/src/PRPState.cpp index bdc2c9c8..02b9ae68 100644 --- a/src/PRPState.cpp +++ b/src/PRPState.cpp @@ -8,7 +8,8 @@ PRPState::PRPState(File&& fi) { string header = fi.readLine(); - u32 fileE, fileK, blockSize, nErrors, crc; + u64 fileE, fileK; + u32 blockSize, nErrors, crc; u64 res64; vector check; u32 b1, nBits, start, nextK; diff --git a/src/PRPState.h b/src/PRPState.h index edf0ccf9..1eab5085 100644 --- a/src/PRPState.h +++ b/src/PRPState.h @@ -10,23 +10,23 @@ class File; class PRPState { // E, k, block-size, res64, nErrors - static constexpr const char *PRP_v10 = "OWL PRP 10 %u %u %u %016" SCNx64 " %u\n"; + static constexpr const char *PRP_v10 = "OWL PRP 10 %" PRIu64 " %" PRIu64 " %u %016" SCNx64 " %u\n"; // Exponent, iteration, block-size, res64, nErrors // B1, nBits, start, nextK, crc - static constexpr const char *PRP_v11 = "OWL PRP 11 %u %u %u %016" SCNx64 " %u %u %u %u %u %u\n"; + static constexpr const char *PRP_v11 = "OWL PRP 11 %" PRIu64 " %" PRIu64 " %u %016" SCNx64 " %u %u %u %u %u %u\n"; // E, k, block-size, res64, nErrors, CRC - static constexpr const char *PRP_v12 = "OWL PRP 12 %u %u %u %016" SCNx64 " %u %u\n"; + static constexpr const char *PRP_v12 = "OWL PRP 12 %" PRIu64 " %" PRIu64 " %u %016" SCNx64 " %u %u\n"; public: - u32 k{}; + u64 k{}; u32 blockSize{}; u64 res64{}; vector check; u32 nErrors{}; - // PRPState(u32 k, u32 blockSize, u64 res64, vector<) + // PRPState(u64 k, u32 blockSize, u64 res64, vector<) PRPState(File&& f); void saveTo(const File& f); }; diff --git a/src/Primes.cpp b/src/Primes.cpp index 865cfaec..6e3ceb36 100644 --- a/src/Primes.cpp +++ b/src/Primes.cpp @@ -15,14 +15,14 @@ Primes::Primes() { } } -bool Primes::isPrimeOdd(u32 n) const { +bool Primes::isPrimeOdd(u64 n) const { assert(n % 2); // must be odd to call here if (n < 3) { return false; } for (u32 k = 0; k < sieve.size(); ++k) { if (sieve[k]) { u32 p = k * 2 + 3; - if (p * p > n) { return true; } + if (u64(p) * u64(p) > n) { return true; } 
if (n % p == 0) { return false; } } } @@ -30,11 +30,11 @@ bool Primes::isPrimeOdd(u32 n) const { return false; } -bool Primes::isPrime(u32 n) const { +bool Primes::isPrime(u64 n) const { return (n%2 && isPrimeOdd(n)) || (n == 2); } -u32 Primes::prevPrime(u32 n) const { +u64 Primes::prevPrime(u64 n) const { --n; if (n % 2 == 0) { --n; } @@ -43,7 +43,7 @@ return 0; } -u32 Primes::nextPrime(u32 n) const { +u64 Primes::nextPrime(u64 n) const { ++n; if (n % 2 == 0) { ++n; } for (; ; n += 2) { if (isPrimeOdd(n)) { return n; }} @@ -51,10 +51,10 @@ return 0; } -u32 Primes::nearestPrime(u32 n) const { +u64 Primes::nearestPrime(u64 n) const { if (isPrime(n)) { return n; } - u32 a = prevPrime(n); - u32 b = nextPrime(n); + u64 a = prevPrime(n); + u64 b = nextPrime(n); assert(a < n && n < b); return n-a < b-n ? a : b; } diff --git a/src/Primes.h b/src/Primes.h index b951c02d..7f1f16fb 100644 --- a/src/Primes.h +++ b/src/Primes.h @@ -6,14 +6,14 @@ #include "common.h" class Primes { - std::bitset<50000> sieve; - bool isPrimeOdd(u32 n) const; + std::bitset<50000> sieve; // Allows for testing primes up to 10 billion + bool isPrimeOdd(u64 n) const; public: Primes(); - bool isPrime(u32 n) const; - u32 prevPrime(u32 n) const; - u32 nextPrime(u32 n) const; - u32 nearestPrime(u32 n) const; + bool isPrime(u64 n) const; + u64 prevPrime(u64 n) const; + u64 nextPrime(u64 n) const; + u64 nearestPrime(u64 n) const; }; diff --git a/src/Proof.cpp b/src/Proof.cpp index 31c488f8..bd8d4be6 100644 --- a/src/Proof.cpp +++ b/src/Proof.cpp @@ -19,11 +19,11 @@ namespace proof { -array hashWords(u32 E, const Words& words) { +array hashWords(u64 E, const Words& words) { return std::move(SHA3{}.update(words.data(), (E-1)/8+1)).finish(); } -array hashWords(u32 E, array prefix, const Words& words) { +array hashWords(u64 E, array prefix, const Words& words) { return std::move(SHA3{}.update(prefix).update(words.data(),
(E-1)/8+1)).finish(); } @@ -39,7 +39,8 @@ string fileHash(const fs::path& filePath) { ProofInfo getInfo(const fs::path& proofFile) { string hash = proof::fileHash(proofFile); File fi = File::openReadThrow(proofFile); - u32 E = 0, power = 0; + u64 E = 0; + u32 power = 0; char c = 0; if (fi.scanf(Proof::HEADER_v2, &power, &E, &c) != 3 || c != '\n') { log("Proof file '%s' has invalid header\n", proofFile.string().c_str()); @@ -68,7 +69,8 @@ void Proof::save(const fs::path& proofFile) const { Proof Proof::load(const fs::path& path) { File fi = File::openReadThrow(path); - u32 E = 0, power = 0; + u64 E = 0; + u32 power = 0; char c = 0; if (fi.scanf(HEADER_v2, &power, &E, &c) != 3 || c != '\n') { log("Proof file '%s' has invalid header\n", path.string().c_str()); @@ -84,7 +86,7 @@ Proof Proof::load(const fs::path& path) { bool Proof::verify(Gpu *gpu, const vector& hashes) const { // log("B %016" PRIx64 "\n", res64(B)); // for (u32 i = 0; i < middles.size(); ++i) { log("Middle[%u] %016" PRIx64 "\n", i, res64(middles[i])); } - + u32 power = middles.size(); assert(power > 0); @@ -92,10 +94,10 @@ bool Proof::verify(Gpu *gpu, const vector& hashes) const { Words A{makeWords(E, 3)}; Words B{this->B}; - + auto hash = proof::hashWords(E, B); - u32 span = E; + u64 span = E; for (u32 i = 0; i < power; ++i, span = (span + 1) / 2) { const Words& M = middles[i]; hash = proof::hashWords(E, hash, M); @@ -113,12 +115,12 @@ bool Proof::verify(Gpu *gpu, const vector& hashes) const { if (gpu->args.verbose) { log("proof [%u] : A %016" PRIx64 ", B %016" PRIx64 ", h %016" PRIx64 "\n", i, res64(A), res64(B), h); } } - log("proof verification: doing %d iterations\n", span); + log("proof verification: doing %" PRIu64 " iterations\n", span); A = gpu->expExp2(A, span); bool ok = (A == B); if (ok) { - log("proof: %u proved %s\n", E, isPrime ? "probable prime" : "composite"); + log("proof: %" PRIu64 " proved %s\n", E, isPrime ? 
"probable prime" : "composite"); } else { log("proof: invalid (%016" PRIx64 " expected %016" PRIx64 ")\n", res64(A), res64(B)); } @@ -127,9 +129,9 @@ bool Proof::verify(Gpu *gpu, const vector& hashes) const { // ---- ProofSet ---- -ProofSet::ProofSet(u32 E, u32 power) +ProofSet::ProofSet(u64 E, u32 power) : E{E}, power{power} { - + assert(E & 1); // E is supposed to be prime if (power <= 0 || power > 12) { log("Invalid proof power: %u\n", power); @@ -138,11 +140,13 @@ ProofSet::ProofSet(u32 E, u32 power) fs::create_directories(proofPath(E)); - vector spans; - for (u32 span = (E + 1) / 2; spans.size() < power; span = (span + 1) / 2) { spans.push_back(span); } + vector spans; + for (u64 span = (E + 1) / 2; spans.size() < power; span = (span + 1) / 2) { spans.push_back(span); } points.push_back(0); - for (u32 p = 0, span = (E + 1) / 2; p < power; ++p, span = (span + 1) / 2) { + u32 p; + u64 span; + for (p = 0, span = (E + 1) / 2; p < power; ++p, span = (span + 1) / 2) { for (u32 i = 0, end = points.size(); i < end; ++i) { points.push_back(points[i] + span); } @@ -160,15 +164,17 @@ ProofSet::ProofSet(u32 E, u32 power) points.push_back(u32(-1)); // guard element cacheIt = points.begin(); - for ([[maybe_unused]] u32 p : points) { + for ([[maybe_unused]] u64 p : points) { assert(p > E || isInPoints(E, power, p)); } } -bool ProofSet::isInPoints(u32 E, u32 power, u32 k) { +bool ProofSet::isInPoints(u64 E, u32 power, u64 k) { if (k == E) { return true; } // special-case E - u32 start = 0; - for (u32 p = 0, span = (E + 1) / 2; p < power; ++p, span = (span + 1) / 2) { + u64 start = 0; + u32 p; + u64 span; + for (p = 0, span = (E + 1) / 2; p < power; ++p, span = (span + 1) / 2) { assert(k >= start); if (k > start + span) { start += span; @@ -179,12 +185,12 @@ bool ProofSet::isInPoints(u32 E, u32 power, u32 k) { return false; } -bool ProofSet::canDo(u32 E, u32 power, u32 currentK) { +bool ProofSet::canDo(u64 E, u32 power, u64 currentK) { assert(power > 0 && power <= 12); return 
ProofSet{E, power}.isValidTo(currentK); } -u32 ProofSet::bestPower(u32 E) { +u32 ProofSet::bestPower(u64 E) { // Best proof powers assuming no disk space concern. // We increment power by 1 for each fourfold increase of the exponent. // The values below produce power=10 at wavefront, and power=11 at 100Mdigits: @@ -197,7 +203,7 @@ u32 ProofSet::bestPower(u32 E) { return power; } -double ProofSet::diskUsageGB(u32 E, u32 power) { +double ProofSet::diskUsageGB(u64 E, u32 power) { // -3 because convert exponent bits to bytes // -30 because convert bytes to GB // +power because needs 2^power residues for proof generation @@ -205,7 +211,7 @@ double ProofSet::diskUsageGB(u32 E, u32 power) { return power ? ldexp(E, -33 + int(power)) * 1.05 : 0.0; } -u32 ProofSet::effectivePower(u32 E, u32 power, u32 currentK) { +u32 ProofSet::effectivePower(u64 E, u32 power, u64 currentK) { for (u32 p = power; p > 0; --p) { // log("validating proof residues for power %u\n", p); if (canDo(E, p, currentK)) { return p; } @@ -213,11 +219,11 @@ u32 ProofSet::effectivePower(u32 E, u32 power, u32 currentK) { return 0; } -bool ProofSet::fileExists(u32 k) const { +bool ProofSet::fileExists(u64 k) const { return File::size(proofPath(E) / to_string(k)) == i64(E / 32 + 2) * 4; } -bool ProofSet::isValidTo(u32 limitK) const { +bool ProofSet::isValidTo(u64 limitK) const { auto it = upper_bound(points.begin(), points.end(), limitK); if (it == points.begin()) { @@ -238,14 +244,14 @@ bool ProofSet::isValidTo(u32 limitK) const { return true; } -u32 ProofSet::next(u32 k) const { +u64 ProofSet::next(u64 k) const { if (*cacheIt <= k || (cacheIt > points.begin() && *prev(cacheIt) > k)) { cacheIt = upper_bound(points.begin(), points.end(), k); } return *cacheIt; } -void ProofSet::save(u32 E, u32 power, u32 k, const Words& words) { +void ProofSet::save(u64 E, u32 power, u64 k, const Words& words) { assert(k && k <= E); assert(isInPoints(E, power, k)); @@ -253,7 +259,7 @@ void ProofSet::save(u32 E, u32 power, u32 
k, const Words& words) { assert(load(E, power, k) == words); } -Words ProofSet::load(u32 E, u32 power, u32 k) { +Words ProofSet::load(u64 E, u32 power, u64 k) { assert(k && k <= E); assert(isInPoints(E, power, k)); return File::openReadThrow(proofPath(E) / to_string(k)).readChecked(E/32 + 1); diff --git a/src/Proof.h b/src/Proof.h index 97e9056d..c83d8ace 100644 --- a/src/Proof.h +++ b/src/Proof.h @@ -4,6 +4,7 @@ #include "File.h" #include "common.h" +#include namespace fs = std::filesystem; @@ -11,15 +12,15 @@ class Gpu; struct ProofInfo { u32 power; - u32 exp; + u64 exp; string md5; }; namespace proof { -array hashWords(u32 E, const Words& words); +array hashWords(u64 E, const Words& words); -array hashWords(u32 E, array prefix, const Words& words); +array hashWords(u64 E, array prefix, const Words& words); string fileHash(const fs::path& filePath); @@ -29,7 +30,7 @@ ProofInfo getInfo(const fs::path& proofFile); class Proof { public: - const u32 E; + const u64 E; const Words B; const vector middles; @@ -40,7 +41,7 @@ class Proof { POWER=8\n NUMBER=M216091\n */ - static const constexpr char* HEADER_v2 = "PRP PROOF\nVERSION=2\nHASHSIZE=64\nPOWER=%u\nNUMBER=M%u%c"; + static const constexpr char* HEADER_v2 = "PRP PROOF\nVERSION=2\nHASHSIZE=64\nPOWER=%u\nNUMBER=M%" PRIu64 "%c"; static Proof load(const fs::path& path); @@ -53,38 +54,37 @@ class Proof { class ProofSet { public: - const u32 E; + const u64 E; const u32 power; private: - vector points; + vector points; - bool isValidTo(u32 limitK) const; + bool isValidTo(u64 limitK) const; - static bool canDo(u32 E, u32 power, u32 currentK); + static bool canDo(u64 E, u32 power, u64 currentK); mutable decltype(points)::const_iterator cacheIt{}; - bool fileExists(u32 k) const; + bool fileExists(u64 k) const; - static fs::path proofPath(u32 E) { return fs::path(to_string(E)) / "proof"; } + static fs::path proofPath(u64 E) { return fs::path(to_string(E)) / "proof"; } public: - static u32 bestPower(u32 E); - static u32 
effectivePower(u32 E, u32 power, u32 currentK); - static double diskUsageGB(u32 E, u32 power); - static bool isInPoints(u32 E, u32 power, u32 k); + static u32 bestPower(u64 E); + static u32 effectivePower(u64 E, u32 power, u64 currentK); + static double diskUsageGB(u64 E, u32 power); + static bool isInPoints(u64 E, u32 power, u64 k); - ProofSet(u32 E, u32 power); - - u32 next(u32 k) const; + ProofSet(u64 E, u32 power); - static void save(u32 E, u32 power, u32 k, const Words& words); - static Words load(u32 E, u32 power, u32 k); - - void save(u32 k, const Words& words) const { return save(E, power, k, words); } - Words load(u32 k) const { return load(E, power, k); } + u64 next(u64 k) const; + static void save(u64 E, u32 power, u64 k, const Words& words); + static Words load(u64 E, u32 power, u64 k); + + void save(u64 k, const Words& words) const { return save(E, power, k, words); } + Words load(u64 k) const { return load(E, power, k); } std::pair> computeProof(Gpu *gpu) const; }; diff --git a/src/Saver.cpp b/src/Saver.cpp index 118d1eae..68caba41 100644 --- a/src/Saver.cpp +++ b/src/Saver.cpp @@ -15,19 +15,19 @@ namespace { // E, k, block-size, res64, nErrors, CRC -static constexpr const char *PRP_v12 = "OWL PRP 12 %u %u %u %016" SCNx64 " %u %u\n"; +static constexpr const char *PRP_v12 = "OWL PRP 12 %" PRIu64 " %" PRIu64 " %u %016" SCNx64 " %u %u\n"; // Anticipated next version of the header. // Has general number form N=k*b^E+c, and labels for values. 
-static constexpr const char *PRP_v13 = "OWL PRP 13 N=1*2^%u-1 k=%u block=%u res64=%016" SCNx64 " err=%u time=%lf\n"; +static constexpr const char *PRP_v13 = "OWL PRP 13 N=1*2^%" PRIu64 "-1 k=%" PRIu64 " block=%u res64=%016" SCNx64 " err=%u time=%lf\n"; // static constexpr const char *PRP_v13_PRI = "OWL PRP 13 N=1*2^%u-1 k=%u block=%u res64=%016" PRIx64 " err=%u time=%.0lf\n"; // E, k, CRC -static constexpr const char *LL_v1 = "OWL LL 1 E=%u k=%u CRC=%u\n"; +static constexpr const char *LL_v1 = "OWL LL 1 E=%" PRIu64 " k=%" PRIu64 " CRC=%u\n"; // Anticipated next version. // Push version number to sync it with PRP. -static constexpr const char *LL_v13 = "OWL LL 13 N=1*2^%u-1 k=%u time=%lf\n"; +static constexpr const char *LL_v13 = "OWL LL 13 N=1*2^%" PRIu64 "-1 k=%" PRIu64 " time=%lf\n"; struct BadHeaderError { string name; }; @@ -35,8 +35,8 @@ bool startsWith(const string& s, const string& prefix) { return s.rfind(prefix, 0) == 0; } -vector savefiles(fs::path dir, const string& prefix, const string& kind) { - vector v; +vector savefiles(fs::path dir, const string& prefix, const string& kind) { + vector v; for (const auto& entry: fs::directory_iterator(dir)) { if (entry.is_regular_file()) { string filename = entry.path().filename().string(); @@ -45,7 +45,7 @@ vector savefiles(fs::path dir, const string& prefix, const string& kind) { assert(dot > prefix.size()); string id = filename.substr(prefix.size(), dot - prefix.size()); if (id == "unverified") { continue; } - u32 k = 0; + u64 k = 0; const char* first = id.data(); const char* end = first + id.size(); auto res = from_chars(first, end, k); @@ -61,13 +61,13 @@ vector savefiles(fs::path dir, const string& prefix, const string& kind) { return v; } -string str9(u32 k) { +string str9(u64 k) { char buf[32]; - snprintf(buf, sizeof(buf), "%09u", k); + snprintf(buf, sizeof(buf), "%09" PRIu64, k); return buf; } -fs::path pathFor(fs::path base, const string& prefix, const string& kind, u32 k) { +fs::path pathFor(fs::path 
base, const string& prefix, const string& kind, u64 k) { return base / (prefix + str9(k) + '.' + kind); } @@ -79,16 +79,17 @@ fs::path pathUnverified(fs::path base, const string& prefix) { // . // e.g.: 125784077-010000000.prp fs::path findLast(fs::path dir, const string& prefix, const string& kind) { - vector v = savefiles(dir, prefix, kind); + vector v = savefiles(dir, prefix, kind); if (v.empty()) { return {}; } - u32 lastK = v.back(); + u64 lastK = v.back(); fs::path path = pathFor(dir, prefix, kind, lastK); assert(is_regular_file(path)); return path; } PRPState readState(const PRPState& dummy, File fi) { - u32 exponent{}, k{}, blockSize{}, nErrors{}; + u64 exponent{}, k{}; + u32 blockSize{}, nErrors{}; u64 res64{}; double elapsed{}; @@ -108,7 +109,7 @@ PRPState readState(const PRPState& dummy, File fi) { } LLState readState(const LLState& dummy, File fi) { - u32 exponent{}, k{}; + u64 exponent{}, k{}; double elapsed{}; string header = fi.readLine(); @@ -142,7 +143,7 @@ void writeState(const File& fo, const LLState& state) { fo.writeChecked(state.data); } -double roundNumberScore(u32 x) { +double roundNumberScore(u64 x) { if (x == 0) { return 1; } double score = 0; @@ -169,7 +170,7 @@ template<> LLState Saver::initState() { // ---- Saver ---- template -Saver::Saver(u32 exponent, u32 blockSize, u32 nSavefiles) : +Saver::Saver(u64 exponent, u32 blockSize, u32 nSavefiles) : exponent{exponent}, blockSize{blockSize}, prefix{to_string(exponent) + '-'}, @@ -188,7 +189,7 @@ template Saver::~Saver() = default; template -void Saver::clear(u32 exponent) { +void Saver::clear(u64 exponent) { error_code dummy; fs::path base = std::is_same_v ? 
fs::current_path() / to_string(exponent) @@ -244,16 +245,16 @@ State Saver::load() { template void Saver::trimFiles() { - vector v = savefiles(base, prefix, State::KIND); + vector v = savefiles(base, prefix, State::KIND); assert(nSavefiles > 0); while (v.size() > nSavefiles) { int bestIdx = -1; double bestSpan = 1e20; - u32 prevK = 0; + u64 prevK = 0; for (u32 i = 0; i < v.size() - 1; ++i) { - u32 k = v[i]; + u64 k = v[i]; double niceBias = std::min(1.0, roundNumberScore(k) - 4); double span = (v[i + 1] - prevK) * niceBias; prevK = k; @@ -263,8 +264,8 @@ void Saver::trimFiles() { } } assert(bestIdx >= 0); - u32 k = v[bestIdx]; - // log("Deleting savefile %u\n", k); + u64 k = v[bestIdx]; + // log("Deleting savefile %" PRIu64 "\n", k); fs::path path = pathFor(base, prefix, State::KIND, k); fs::remove(path); v.erase(v.begin() + bestIdx); diff --git a/src/Saver.h b/src/Saver.h index 3bf8e6e7..5a3a7ba6 100644 --- a/src/Saver.h +++ b/src/Saver.h @@ -13,8 +13,8 @@ class SaveMan; struct PRPState { static const constexpr char* KIND = "prp"; - u32 exponent; - u32 k; + u64 exponent; + u64 k; u32 blockSize; u64 res64; vector check; @@ -25,15 +25,15 @@ struct PRPState { struct LLState { static const constexpr char* KIND = "ll"; - u32 exponent; - u32 k; + u64 exponent; + u64 k; vector data; double elapsed{}; }; template class Saver { - u32 exponent; + u64 exponent; u32 blockSize; fs::path base; string prefix; @@ -45,7 +45,7 @@ class Saver { fs::path mostRecentSavefile(); public: - Saver(u32 exponent, u32 blockSize, u32 nSavefiles); + Saver(u64 exponent, u32 blockSize, u32 nSavefiles); ~Saver(); State load(); @@ -53,7 +53,7 @@ class Saver { void dropMostRecent(); - static void clear(u32 exponent); + static void clear(u64 exponent); // For PRP, we can save a verified save (see save() above) or an unverified save. 
void saveUnverified(const PRPState& s) const; diff --git a/src/Task.cpp b/src/Task.cpp index d72f4db2..9a49654f 100644 --- a/src/Task.cpp +++ b/src/Task.cpp @@ -103,7 +103,9 @@ string json(const vector& v) { } string json(const string& s) { return '"' + s + '"'; } +string json(int x) { return to_string(x); } string json(u32 x) { return to_string(x); } +string json(u64 x) { return to_string(x); } template string json(const string& key, const T& value) { return json(key) + ':' + json(value); } @@ -112,7 +114,7 @@ string maybe(const string& key, const string& value) { return value.empty() ? "" template void operator+=(vector& a, const vector& b) { a.insert(a.end(), b.begin(), b.end()); } -vector commonFields(u32 E, const char *worktype, const string &status) { +vector commonFields(u64 E, const char *worktype, const string &status) { return { json("status", status), json("exponent", E), @@ -140,7 +142,7 @@ vector tailFields(const std::string &AID, const Args &args) { }; } -void writeResult(u32 instance, u32 E, const char *workType, const string &status, const std::string &AID, const Args &args, +void writeResult(u32 instance, u64 E, const char *workType, const string &status, const std::string &AID, const Args &args, const vector& extras) { fs::path resultsFile = "results-" + to_string(instance) + ".txt"; vector fields = commonFields(E, workType, status); @@ -220,8 +222,8 @@ void Task::execute(GpuCommon shared, Queue *q, u32 instance) { { Primes primes; if (!primes.isPrime(exponent)) { - u32 new_exponent = primes.prevPrime(exponent); - log("Warning: Exponent %u is not prime. Using exponent %u instead.\n", exponent, new_exponent); + u64 new_exponent = primes.prevPrime(exponent); + log("Warning: Exponent %" PRIu64 " is not prime. 
Using exponent %" PRIu64 " instead.\n", exponent, new_exponent); exponent = new_exponent; } } @@ -253,7 +255,7 @@ void Task::execute(GpuCommon shared, Queue *q, u32 instance) { Worktodo::deleteTask(*this, instance); if (isPrime) { - log("%u is PRIME!\n", exponent); + log("%" PRIu64 " is PRIME!\n", exponent); } else if (shared.args->clean) { gpu->clear(kind == PRP); } diff --git a/src/Task.h b/src/Task.h index 95f08024..4c130446 100644 --- a/src/Task.h +++ b/src/Task.h @@ -20,7 +20,7 @@ class Task { enum Kind {PRP, VERIFY, LL, CERT}; Kind kind; - u32 exponent; + u64 exponent; string AID; // Assignment ID string line; // the verbatim worktodo line, used in deleteTask(). u32 squarings; // For CERTs diff --git a/src/TuneEntry.cpp b/src/TuneEntry.cpp index c3288d24..68b6915d 100644 --- a/src/TuneEntry.cpp +++ b/src/TuneEntry.cpp @@ -3,10 +3,11 @@ #include "CycleFile.h" #include +#include // Returns whether *results* was updated. bool TuneEntry::update(vector& results) const { - u32 maxExp = fft.maxExp(); + u64 maxExp = fft.maxExp(); [[maybe_unused]] bool didErase = false; int i{}; @@ -28,7 +29,7 @@ bool TuneEntry::update(vector& results) const { // Returns whether entry *e* represents an improvement over *results* (i.e. would update the results). 
bool TuneEntry::willUpdate(const vector& results) const { - u32 maxExp = fft.maxExp(); + u64 maxExp = fft.maxExp(); for (const auto& r : results) { if (r.cost > cost) { break; @@ -51,7 +52,7 @@ vector TuneEntry::readTuneFile(const Args& args) { File fi = File::openRead(tuneFile); if (!fi) { return {}; } - [[maybe_unused]] u32 prevMaxExp{}; + [[maybe_unused]] u64 prevMaxExp{}; [[maybe_unused]] double prevCost{}; for (const string& line : fi) { @@ -71,14 +72,14 @@ vector TuneEntry::readTuneFile(const Args& args) { } void TuneEntry::writeTuneFile(const vector& results) { - [[maybe_unused]] u32 prevMaxExp{}; + [[maybe_unused]] u64 prevMaxExp{}; [[maybe_unused]] double prevCost{}; CycleFile tune{"tune.txt"}; for (const TuneEntry& r : results) { - u32 maxExp = r.fft.maxExp(); + u64 maxExp = r.fft.maxExp(); assert(r.cost >= prevCost && maxExp > prevMaxExp); prevCost = r.cost; prevMaxExp = maxExp; - tune->printf("%6.1f %14s # %u\n", r.cost, r.fft.spec().c_str(), maxExp); + tune->printf("%6.1f %14s # %" PRIu64 "\n", r.cost, r.fft.spec().c_str(), maxExp); } } diff --git a/src/Worktodo.cpp b/src/Worktodo.cpp index 0a981a39..80afdc51 100644 --- a/src/Worktodo.cpp +++ b/src/Worktodo.cpp @@ -167,11 +167,11 @@ optional getWork(Args& args, i32 instance) { std::optional Worktodo::getTask(Args &args, i32 instance) { if (instance == 0) { if (args.prpExp) { - u32 exp = args.prpExp; + u64 exp = args.prpExp; args.prpExp = 0; return Task{Task::PRP, exp}; } else if (args.llExp) { - u32 exp = args.llExp; + u64 exp = args.llExp; args.llExp = 0; return Task{Task::LL, exp}; } else if (!args.verifyPath.empty()) { diff --git a/src/common.h b/src/common.h index 516b099d..530795bb 100644 --- a/src/common.h +++ b/src/common.h @@ -46,15 +46,15 @@ using Words = vector; inline u64 res64(const Words& words) { return words.empty() ? 
0 : ((u64(words[1]) << 32) | words[0]); } -inline u32 nWords(u32 E) { return (E - 1) / 32 + 1; } +inline u32 nWords(u64 E) { return u32((E - 1) / 32 + 1); } -inline Words makeWords(u32 E, u32 value) { +inline Words makeWords(u64 E, u32 value) { Words ret(nWords(E)); ret[0] = value; return ret; } -inline u32 roundUp(u32 x, u32 multiple) { return ((x - 1) / multiple + 1) * multiple; } +inline u64 roundUp(u64 x, u32 multiple) { return ((x - 1) / multiple + 1) * multiple; } u32 crc32(const void* data, size_t size); diff --git a/src/shared.h b/src/shared.h index c2d90dbc..6405d9c1 100644 --- a/src/shared.h +++ b/src/shared.h @@ -1,4 +1,4 @@ // included from both C++ and OpenCL. -u32 bitposToWord(u32 E, u32 N, u32 offset) { return offset * ((u64) N) / E; } -u32 wordToBitpos(u32 E, u32 N, u32 word) { return (word * ((u64) E) + (N - 1)) / N; } +u32 bitposToWord(u64 E, u32 N, u32 offset) { return offset * ((u64) N) / E; } +u32 wordToBitpos(u64 E, u32 N, u32 word) { return (word * ((u64) E) + (N - 1)) / N; } diff --git a/src/state.cpp b/src/state.cpp index 4ac159d8..77ebff8f 100644 --- a/src/state.cpp +++ b/src/state.cpp @@ -10,7 +10,7 @@ static i64 lowBits(i64 u, int bits) { return (u << (64 - bits)) >> (64 - bits); } -std::vector compactBits(const vector &dataVect, u32 E) { +std::vector compactBits(const vector &dataVect, u64 E) { if (dataVect.empty()) { return {}; } // Indicating all zero u32 N = dataVect.size(); @@ -87,7 +87,7 @@ struct BitBucket { } }; -vector expandBits(const vector &compactBits, u32 N, u32 E) { +vector expandBits(const vector &compactBits, u32 N, u64 E) { assert(E % 32 != 0); std::vector out(N); diff --git a/src/state.h b/src/state.h index 9b37c0fd..02fab3fb 100644 --- a/src/state.h +++ b/src/state.h @@ -8,10 +8,10 @@ #include #include -vector compactBits(const vector &dataVect, u32 E); -vector expandBits(const vector &compactBits, u32 N, u32 E); +vector compactBits(const vector &dataVect, u64 E); +vector expandBits(const vector &compactBits, u32 N, 
u64 E); -constexpr u32 step(u32 N, u32 E) { return N - (E % N); } -constexpr u32 extra(u32 N, u32 E, u32 k) { return u64(step(N, E)) * k % N; } -constexpr bool isBigWord(u32 N, u32 E, u32 k) { return extra(N, E, k) + step(N, E) < N; } -constexpr u32 bitlen(u32 N, u32 E, u32 k) { return E / N + isBigWord(N, E, k); } +constexpr u32 step(u32 N, u64 E) { return N - (E % N); } +constexpr u32 extra(u32 N, u64 E, u32 k) { return u64(step(N, E)) * k % N; } +constexpr bool isBigWord(u32 N, u64 E, u32 k) { return extra(N, E, k) + step(N, E) < N; } +constexpr u32 bitlen(u32 N, u64 E, u32 k) { return E / N + isBigWord(N, E, k); } diff --git a/src/tune.cpp b/src/tune.cpp index 3b773437..a3a12321 100644 --- a/src/tune.cpp +++ b/src/tune.cpp @@ -173,7 +173,7 @@ printf ("Reguess bpw for %s is %.2f first Z22 is %.2f\n", fft.spec().c_str(), bp } float Tune::zForBpw(float bpw, FFTConfig fft, u32 count) { - u32 exponent = (count == 1) ? primes.prevPrime(fft.size() * bpw) : primes.nextPrime(fft.size() * bpw); + u64 exponent = (count == 1) ? 
primes.prevPrime(fft.size() * bpw) : primes.nextPrime(fft.size() * bpw); float total_z = 0.0; for (u32 i = 0; i < count; i++, exponent = primes.nextPrime (exponent + 1)) { auto [ok, res, roeSq, roeMul] = Gpu::make(q, exponent, shared, fft, {}, false)->measureROE(true); @@ -249,7 +249,7 @@ void Tune::carryTune() { double m = 0; const float mid = fft.shape.carry32BPW(); for (float bpw : {mid - 0.05, mid + 0.05}) { - u32 exponent = primes.nearestPrime(fft.size() * bpw); + u64 exponent = primes.nearestPrime(fft.size() * bpw); auto [ok, carry] = Gpu::make(q, exponent, shared, fft, {}, false)->measureCarry(); m = carry.max; if (!ok) { log("Error %s at %f\n", fft.spec().c_str(), bpw); } @@ -257,7 +257,7 @@ void Tune::carryTune() { } float avg = (zv[0] + zv[1]) / 2; - u32 exponent = fft.shape.carry32BPW() * fft.size(); + u64 exponent = fft.shape.carry32BPW() * fft.size(); double pErr100 = -expm1(-exp(-avg) * exponent * 100); log("%14s %.3f : %.3f (%.3f %.3f) %f %.0f%%\n", fft.spec().c_str(), mid, avg, zv[0], zv[1], m, pErr100 * 100); fo.printf("%f %f\n", log2(fft.size()), avg); @@ -292,8 +292,8 @@ void Tune::ctune() { for (FFTShape shape : shapes) { FFTConfig fft{shape, 101, CARRY_32}; - u32 exponent = primes.prevPrime(fft.maxExp()); - // log("tuning %10s with exponent %u\n", fft.shape.spec().c_str(), exponent); + u64 exponent = primes.prevPrime(fft.maxExp()); + // log("tuning %10s with exponent %" PRIu64 "\n", fft.shape.spec().c_str(), exponent); vector bestPos(configsVect.size()); Entry best{{1, 1, 1}, {}, 1e9}; @@ -448,7 +448,7 @@ void Tune::tune() { // Find best IN_WG,IN_SIZEX,OUT_WG,OUT_SIZEX settings if (1/*option to time IN/OUT settings*/) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_in_wg = 0; u32 best_in_sizex = 0; u32 current_in_wg = args->value("IN_WG", 128); @@ -497,7 +497,7 @@ void Tune::tune() { // Find best PAD setting. 
Default is 256 bytes for AMD, 0 for all others. if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_pad = 0; u32 current_pad = args->value("PAD", AMDGPU ? 256 : 0); double best_cost = -1.0; @@ -517,7 +517,7 @@ void Tune::tune() { // Find best MIDDLE_IN_LDS_TRANSPOSE setting if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_middle_in_lds_transpose = 0; u32 current_middle_in_lds_transpose = args->value("MIDDLE_IN_LDS_TRANSPOSE", 1); double best_cost = -1.0; @@ -537,7 +537,7 @@ void Tune::tune() { // Find best MIDDLE_OUT_LDS_TRANSPOSE setting if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_middle_out_lds_transpose = 0; u32 current_middle_out_lds_transpose = args->value("MIDDLE_OUT_LDS_TRANSPOSE", 1); double best_cost = -1.0; @@ -557,7 +557,7 @@ void Tune::tune() { // Find best INPLACE setting if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_inplace = 0; double best_cost = -1.0; double current_cost = -1.0; @@ -576,7 +576,7 @@ void Tune::tune() { // Find best NONTEMPORAL setting if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_nontemporal = 0; u32 current_nontemporal = args->value("NONTEMPORAL", 0); double best_cost = -1.0; @@ -596,7 +596,7 @@ void Tune::tune() { // Find best FAST_BARRIER setting if (AMDGPU) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_fast_barrier = 0; u32 
current_fast_barrier = args->value("FAST_BARRIER", 0); double best_cost = -1.0; @@ -616,7 +616,7 @@ void Tune::tune() { // Find best TAIL_KERNELS setting if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_tail_kernels = 0; u32 current_tail_kernels = args->value("TAIL_KERNELS", 2); double best_cost = -1.0; @@ -639,7 +639,7 @@ void Tune::tune() { // Find best TAIL_TRIGS setting if (time_FFTs) { FFTConfig fft{defaultFFTShape, 101, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_tail_trigs = 0; u32 current_tail_trigs = args->value("TAIL_TRIGS", 2); double best_cost = -1.0; @@ -660,7 +660,7 @@ void Tune::tune() { if (time_NTTs) { FFTConfig fft{defaultNTTShape, 202, CARRY_AUTO}; if (!fft.NTT_GF31) fft = FFTConfig(FFTShape(FFT3161, 512, 8, 512), 202, CARRY_AUTO); - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_tail_trigs = 0; u32 current_tail_trigs = args->value("TAIL_TRIGS31", 0); double best_cost = -1.0; @@ -681,7 +681,7 @@ void Tune::tune() { if (time_NTTs && time_FP32) { FFTConfig fft{defaultNTTShape, 202, CARRY_AUTO}; if (!fft.FFT_FP32) fft = FFTConfig(FFTShape(FFT3261, 512, 8, 512), 202, CARRY_AUTO); - u32 exponent = primes.prevPrime(fft.maxBpw() * 0.95 * fft.shape.size()); // Back off the maxExp as different settings will have different maxBpw + u64 exponent = primes.prevPrime(fft.maxBpw() * 0.95 * fft.shape.size()); // Back off the maxExp as different settings will have different maxBpw u32 best_tail_trigs = 0; u32 current_tail_trigs = args->value("TAIL_TRIGS32", 2); double best_cost = -1.0; @@ -702,7 +702,7 @@ void Tune::tune() { if (time_NTTs) { FFTConfig fft{defaultNTTShape, 202, CARRY_AUTO}; if (!fft.NTT_GF61) fft = FFTConfig(FFTShape(FFT3161, 512, 8, 512), 202, CARRY_AUTO); - u32 exponent = 
primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_tail_trigs = 0; u32 current_tail_trigs = args->value("TAIL_TRIGS61", 0); double best_cost = -1.0; @@ -722,7 +722,7 @@ void Tune::tune() { // Find best TABMUL_CHAIN setting if (time_FFTs) { FFTConfig fft{defaultFFTShape, 101, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_tabmul_chain = 0; u32 current_tabmul_chain = args->value("TABMUL_CHAIN", 0); double best_cost = -1.0; @@ -743,7 +743,7 @@ void Tune::tune() { if (time_NTTs) { FFTConfig fft{defaultNTTShape, 202, CARRY_AUTO}; if (!fft.NTT_GF31) fft = FFTConfig(FFTShape(FFT3161, 512, 8, 512), 202, CARRY_AUTO); - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_tabmul_chain = 0; u32 current_tabmul_chain = args->value("TABMUL_CHAIN31", 0); double best_cost = -1.0; @@ -764,7 +764,7 @@ void Tune::tune() { if (time_NTTs && time_FP32) { FFTConfig fft{defaultNTTShape, 202, CARRY_AUTO}; if (!fft.FFT_FP32) fft = FFTConfig(FFTShape(FFT3261, 512, 8, 512), 202, CARRY_AUTO); - u32 exponent = primes.prevPrime(fft.maxBpw() * 0.95 * fft.shape.size()); // Back off the maxExp as different settings will have different maxBpw + u64 exponent = primes.prevPrime(fft.maxBpw() * 0.95 * fft.shape.size()); // Back off the maxExp as different settings will have different maxBpw u32 best_tabmul_chain = 0; u32 current_tabmul_chain = args->value("TABMUL_CHAIN32", 0); double best_cost = -1.0; @@ -785,7 +785,7 @@ void Tune::tune() { if (time_NTTs) { FFTConfig fft{defaultNTTShape, 202, CARRY_AUTO}; if (!fft.NTT_GF61) fft = FFTConfig(FFTShape(FFT3161, 512, 8, 512), 202, CARRY_AUTO); - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_tabmul_chain = 0; u32 current_tabmul_chain = args->value("TABMUL_CHAIN61", 0); double best_cost = -1.0; @@ -806,7 +806,7 @@ void Tune::tune() { if 
(time_NTTs) { FFTConfig fft{defaultNTTShape, 202, CARRY_AUTO}; if (!fft.NTT_GF31) fft = FFTConfig(FFTShape(FFT3161, 512, 8, 512), 202, CARRY_AUTO); - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_modm31 = 0; u32 current_modm31 = args->value("MODM31", 0); double best_cost = -1.0; @@ -826,7 +826,7 @@ void Tune::tune() { // Find best UNROLL_W setting if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_unroll_w = 0; u32 current_unroll_w = args->value("UNROLL_W", AMDGPU ? 0 : 1); double best_cost = -1.0; @@ -846,7 +846,7 @@ void Tune::tune() { // Find best UNROLL_H setting if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_unroll_h = 0; u32 current_unroll_h = args->value("UNROLL_H", AMDGPU && defaultShape->height >= 1024 ? 
0 : 1); double best_cost = -1.0; @@ -866,7 +866,7 @@ void Tune::tune() { // Find best ZEROHACK_W setting if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_zerohack_w = 0; u32 current_zerohack_w = args->value("ZEROHACK_W", 1); double best_cost = -1.0; @@ -886,7 +886,7 @@ void Tune::tune() { // Find best ZEROHACK_H setting if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_zerohack_h = 0; u32 current_zerohack_h = args->value("ZEROHACK_H", 1); double best_cost = -1.0; @@ -906,7 +906,7 @@ void Tune::tune() { // Find best BIGLIT setting if (time_FFTs) { FFTConfig fft{*defaultShape, 101, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_biglit = 0; u32 current_biglit = args->value("BIGLIT", 1); double best_cost = -1.0; @@ -987,7 +987,7 @@ skip_1K_256 = 0; if ((shape.fft_type == FFT3261 || shape.fft_type == FFT323161 || shape.fft_type == FFT3231 || shape.fft_type == FFT32) && !time_FP32) continue; // Time an exponent that's good for all variants and carry-config. - u32 exponent = primes.prevPrime(FFTConfig{shape, shape.width <= 1024 ? 0u : 100u, CARRY_32}.maxExp()); + u64 exponent = primes.prevPrime(FFTConfig{shape, shape.width <= 1024 ? 0u : 100u, CARRY_32}.maxExp()); u32 adjusted_quick = (exponent < 50000000) ? quick - 1 : (exponent < 170000000) ? quick : (exponent < 350000000) ? 
quick + 1 : quick + 2; if (adjusted_quick < 1) adjusted_quick = 1; if (adjusted_quick > 10) adjusted_quick = 10; From 205754df4610dfccef54f314dee11fe1fde413e5 Mon Sep 17 00:00:00 2001 From: george Date: Fri, 26 Dec 2025 03:58:25 +0000 Subject: [PATCH 2/9] Merged in previous inplace changes to tune.cpp --- src/tune.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/tune.cpp b/src/tune.cpp index a3a12321..4d613d0f 100644 --- a/src/tune.cpp +++ b/src/tune.cpp @@ -348,6 +348,7 @@ void Tune::tune() { bool time_FFTs = 0; bool time_NTTs = 0; bool time_FP32 = 1; + bool time_inplace_only = 0; int quick = 7; // Run config from slowest (quick=1) to fastest (quick=10) u64 min_exponent = 75000000; u64 max_exponent = 350000000; @@ -360,6 +361,7 @@ void Tune::tune() { if (s == "fp64") time_FFTs = 1; if (s == "ntt") time_NTTs = 1; if (s == "nofp32") time_FP32 = 0; + if (s == "inplace") time_inplace_only = 1; auto keyVal = split(s, '='); if (keyVal.size() == 2) { if (keyVal.front() == "quick") quick = stod(keyVal.back()); @@ -446,7 +448,7 @@ void Tune::tune() { args->flags["INPLACE"] = to_string(0); // Find best IN_WG,IN_SIZEX,OUT_WG,OUT_SIZEX settings - if (1/*option to time IN/OUT settings*/) { + if (!time_inplace_only) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_in_wg = 0; @@ -495,7 +497,7 @@ void Tune::tune() { } // Find best PAD setting. Default is 256 bytes for AMD, 0 for all others. 
- if (1) { + if (!time_inplace_only) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_pad = 0; @@ -515,7 +517,7 @@ void Tune::tune() { } // Find best MIDDLE_IN_LDS_TRANSPOSE setting - if (1) { + if (!time_inplace_only) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_middle_in_lds_transpose = 0; @@ -535,7 +537,7 @@ void Tune::tune() { } // Find best MIDDLE_OUT_LDS_TRANSPOSE setting - if (1) { + if (!time_inplace_only) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_middle_out_lds_transpose = 0; @@ -554,8 +556,13 @@ void Tune::tune() { args->flags["MIDDLE_OUT_LDS_TRANSPOSE"] = to_string(best_middle_out_lds_transpose); } + // If only timing INPLACE=1 options, then set INPLACE + if (time_inplace_only) { + args->flags["INPLACE"] = to_string(1); + newConfigKeyVals.push_back({"INPLACE", 1}); + } // Find best INPLACE setting - if (1) { + else { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_inplace = 0; @@ -905,7 +912,7 @@ void Tune::tune() { // Find best BIGLIT setting if (time_FFTs) { - FFTConfig fft{*defaultShape, 101, CARRY_AUTO}; + FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_biglit = 0; u32 current_biglit = args->value("BIGLIT", 1); From a3719b01449edf18a961cc5b44acbced44deca15 Mon Sep 17 00:00:00 2001 From: george Date: Fri, 26 Dec 2025 04:10:47 +0000 Subject: [PATCH 3/9] Fixed carry propagation bug when BPW was very low (lower than one would see in normal usage). The optimization that generated 32-bit FFT data words requiring long carries was modified. 
--- src/cl/carryutil.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cl/carryutil.cl b/src/cl/carryutil.cl index 6cdff372..4e04812b 100644 --- a/src/cl/carryutil.cl +++ b/src/cl/carryutil.cl @@ -681,7 +681,7 @@ Word OVERLOAD carryStepSignedSloppy(i96 x, i64 *outCarry, bool isBigWord) { // i64 xhi = i96_hi64(x) + xmid_topbit; // *outCarry = xhi >> (nBits - 32); // return as_long((int2)(i96_lo32(x), whi)); -#elif EXP / NWORDS == 31 || SLOPPY_MAXBPW >= 3200 // nBits = 31 or 32, bigwordBits = 32 (or allowed to create 32-bit word for better performance) +#elif EXP / NWORDS == 31 || (SLOPPY_MAXBPW >= 3200 && EXP / NWORDS >= 22) // nBits = 31 or 32, bigwordBits = 32 (or allowed to create 32-bit word for better performance) i32 w = i96_lo32(x); // lowBits(x, bigwordBits = 32); *outCarry = (i96_hi64(x) + (w < 0)) << (32 - nBits); return w; From eceaad6173232be7af9980f8cd961ad9792ea9ac Mon Sep 17 00:00:00 2001 From: george Date: Tue, 30 Dec 2025 01:34:09 +0000 Subject: [PATCH 4/9] Output num squarings a cert will perform as well as an ETA. --- src/Gpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Gpu.cpp b/src/Gpu.cpp index 433ce0a7..e9d20522 100644 --- a/src/Gpu.cpp +++ b/src/Gpu.cpp @@ -2086,7 +2086,7 @@ array Gpu::isCERT(const Task& task) { float secsPerIt = iterationTimer.reset(k); queue->setSquareTime((int) (secsPerIt * 1'000'000)); - log("%9u %016" PRIx64 " %4.0f\n", k, res64, secsPerIt * 1'000'000); + log("%7u / %7u %016" PRIx64 " %4.0f ETA %s\n", k, kEnd, res64, secsPerIt * 1'000'000, getETA(k, kEnd, secsPerIt).c_str()); if (k >= kEnd) { fs::remove (fname); From 5cfbe0b3d0adb42ede81fdea15a3d17a3c59fceb Mon Sep 17 00:00:00 2001 From: george Date: Tue, 30 Dec 2025 01:43:17 +0000 Subject: [PATCH 5/9] Output error message when worktodo-N.txt is empty. Helpful for novice users. 
--- src/Worktodo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Worktodo.cpp b/src/Worktodo.cpp index 80afdc51..1628a391 100644 --- a/src/Worktodo.cpp +++ b/src/Worktodo.cpp @@ -114,7 +114,7 @@ optional getWork(Args& args, i32 instance) { // Try to get a task from the local worktodo- file. if (optional task = bestTask(localWork, args.smallest)) { return task; } - if (args.masterDir.empty()) { return {}; } + if (args.masterDir.empty()) { log("No work to do found. Add work to %s.\n", localWork.c_str()); return {}; } fs::path worktodo = args.masterDir / "worktodo.txt"; From 3a7a76ddb33a3c69999065b2274fe2d0e4e7788b Mon Sep 17 00:00:00 2001 From: Teal Dulcet Date: Sun, 14 Dec 2025 03:29:45 -0800 Subject: [PATCH 6/9] Fixed GitHub Actions CI and replaced macOS 13. --- .github/workflows/ci.yml | 47 +++++++++++++++++++++++++--------------- Makefile | 8 +++---- genbundle.sh | 17 ++++++++------- src/Args.cpp | 2 +- src/common.h | 2 +- src/tune.cpp | 2 +- 6 files changed, 45 insertions(+), 33 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3947c982..6d43b725 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,13 +13,16 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-22.04, ubuntu-24.04] + os: [ubuntu-22.04, ubuntu-24.04, ubuntu-22.04-arm, ubuntu-24.04-arm] cxx: [g++, clang++] + exclude: + - os: ubuntu-22.04-arm + cxx: clang++ fail-fast: false env: CXX: ${{ matrix.cxx }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Install run: | sudo apt-get update -y @@ -27,14 +30,14 @@ jobs: $CXX --version - name: Script run: | - make prpll -O -j "$(nproc)" + make -O -j "$(nproc)" cd build-release rm -f -- *.o ./prpll -h - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v7 if: always() with: - name: ${{ matrix.os }}_${{ matrix.cxx }}_prpll + name: ${{ matrix.os }}_${{ endsWith(matrix.os, '-arm') && 'arm' || 'x86' }}_${{ matrix.cxx 
}}_prpll path: ${{ github.workspace }} - name: Cppcheck run: cppcheck --enable=all --force . @@ -49,15 +52,17 @@ jobs: Windows: name: Windows - runs-on: windows-latest + runs-on: ${{ matrix.os }} strategy: matrix: + os: [windows-latest] # windows-11-arm cxx: [g++, clang++] fail-fast: false env: CXX: ${{ matrix.cxx }} + PACKAGE_PREFIX: mingw-w64-${{ endsWith(matrix.os, '-arm') && 'clang-aarch64' || 'x86_64' }}- steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Before Install run: | echo "C:\msys64\mingw64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append @@ -66,7 +71,7 @@ jobs: echo "LIBPATH=-LC:\msys64\mingw64\lib" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append - name: Install run: | - pacman -S --noconfirm mingw-w64-x86_64-gmp mingw-w64-x86_64-opencl-icd + pacman -S --noconfirm "${env:PACKAGE_PREFIX}opencl-icd" & $env:CXX --version - name: Install Clang if: ${{ matrix.cxx == 'clang++' }} @@ -74,34 +79,40 @@ jobs: pacman -S --noconfirm mingw-w64-x86_64-clang & $env:CXX --version - name: Script - run: | # Cannot use `make exe`, as the OpenCL ICD Loader does not support static linking - make prpll -O -j $env:NUMBER_OF_PROCESSORS + run: | + make -O -j $env:NUMBER_OF_PROCESSORS cd build-release rm *.o .\prpll.exe -h - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v7 if: always() with: - name: win_${{ matrix.cxx }}_prpll + name: win_${{ endsWith(matrix.os, '-arm') && 'arm' || 'x86' }}_${{ matrix.cxx }}_prpll path: ${{ github.workspace }} macOS: name: macOS - runs-on: macos-13 + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [macos-15-intel, macos-latest] + fail-fast: false + env: + CXX: g++-15 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Install run: | - brew install gcc@14 + $CXX --version - name: Script run: | - make prpll -j "$(sysctl -n hw.ncpu)" + make -j "$(sysctl -n hw.ncpu)" cd build-release rm -f -- *.o ./prpll -h - - uses: 
actions/upload-artifact@v4 + - uses: actions/upload-artifact@v7 if: always() with: - name: macos_prpll + name: macos_${{ endsWith(matrix.os, '-intel') && 'x86' || 'arm' }}_prpll path: ${{ github.workspace }} diff --git a/Makefile b/Makefile index 26bffeda..b1c021e0 100644 --- a/Makefile +++ b/Makefile @@ -14,9 +14,9 @@ HOST_OS = $(shell uname -s) ifeq ($(HOST_OS), Darwin) # Real GCC (not clang), needed for 128-bit floats and std::filesystem::path -CXX = g++-14 +CXX ?= g++-15 else -CXX = g++ +CXX ?= g++ endif ifneq ($(findstring MINGW, $(HOST_OS)), MINGW) @@ -45,7 +45,7 @@ else BIN=build-release -CXXFLAGS = -O2 -DNDEBUG $(COMMON_FLAGS) +CXXFLAGS = -O3 -DNDEBUG $(COMMON_FLAGS) STRIP=-s endif @@ -90,7 +90,7 @@ $(BIN)/%.o : src/%.cpp $(DEPDIR)/%.d # src/bundle.cpp is just a wrapping of the OpenCL sources (*.cl) as a C string. src/bundle.cpp: genbundle.sh src/cl/*.cl - ./genbundle.sh $^ > src/bundle.cpp + bash genbundle.sh $^ > src/bundle.cpp $(DEPDIR)/%.d: ; .PRECIOUS: $(DEPDIR)/%.d diff --git a/genbundle.sh b/genbundle.sh index ec042bb2..9d176062 100755 --- a/genbundle.sh +++ b/genbundle.sh @@ -1,3 +1,4 @@ +#!/bin/bash cat < CL_FILE_NAMES\{${names}\}\; +echo "static const std::vector CL_FILE_NAMES{${names}};" cat <& getClFileNames() { return CL_FILE_NAMES; } diff --git a/src/Args.cpp b/src/Args.cpp index 041202ad..b18a95ba 100644 --- a/src/Args.cpp +++ b/src/Args.cpp @@ -118,7 +118,7 @@ and should be able to run. PRPLL keeps the active tasks in per-worker files worktodo-0.txt, worktodo-1.txt etc in the local directory. These per-worker files are supplied from the global worktodo.txt file if -pool is used. 
In turn the global worktodo.txt can be supplied through the primenet.py script, -either the one located at gpuowl/tools/primenet.py or https://download.mersenne.ca/primenet.py +either the one located at gpuowl/tools/primenet.py or https://download.mersenne.ca/AutoPrimeNet It is also possible to manually add exponents by adding lines of the form "PRP=118063003" to worktodo-.txt diff --git a/src/common.h b/src/common.h index 516b099d..6303e539 100644 --- a/src/common.h +++ b/src/common.h @@ -13,7 +13,7 @@ using i64 = int64_t; using u64 = uint64_t; using i128 = __int128; using u128 = unsigned __int128; -using f128 = __float128; +// using f128 = __float128; static_assert(sizeof(u8) == 1, "size u8"); static_assert(sizeof(u32) == 4, "size u32"); diff --git a/src/tune.cpp b/src/tune.cpp index 3b773437..b9cf1f71 100644 --- a/src/tune.cpp +++ b/src/tune.cpp @@ -947,7 +947,7 @@ void Tune::tune() { config.write("\n -log 1000000\n"); } if (args->workers < 2) { - config.write("\n# Running two workers sometimes gives better throughput. Autoprimenet will need to create up a second worktodo file."); + config.write("\n# Running two workers sometimes gives better throughput. AutoPrimeNet will need to create up a second worktodo file (use --num-workers 2)."); config.write("\n# -workers 2\n"); config.write("\n# Changing TAIL_KERNELS to 3 when running two workers may be better."); config.write("\n# -use TAIL_KERNELS=3\n"); From 89be60a188253b74593e4f36cb14b189d08a7c3c Mon Sep 17 00:00:00 2001 From: george Date: Wed, 4 Mar 2026 19:09:21 +0000 Subject: [PATCH 7/9] Changed type 3 4M FFT max bpw. LL test of 100028317 failed under old limit. 
--- src/fftbpw.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fftbpw.h b/src/fftbpw.h index c62db530..f3af7da7 100644 --- a/src/fftbpw.h +++ b/src/fftbpw.h @@ -138,9 +138,9 @@ {"3:256:16:256", {24.15, 24.15, 24.15, 24.15, 24.15, 24.15}}, { "3:512:8:256", {24.15, 24.15, 24.15, 24.15, 24.15, 24.15}}, { "3:512:4:512", {24.15, 24.15, 24.15, 24.15, 24.15, 24.15}}, -{ "3:1K:8:256", {23.94, 23.94, 23.94, 23.94, 23.94, 23.94}}, -{"3:512:16:256", {23.94, 23.94, 23.94, 23.94, 23.94, 23.94}}, -{ "3:512:8:512", {23.94, 23.94, 23.94, 23.94, 23.94, 23.94}}, +{ "3:1K:8:256", {23.84, 23.84, 23.84, 23.84, 23.84, 23.84}}, // LL of 100028317 failed (ROEmax=0.294, ROEavg=0.247). Lowering bpw from 23.94 to 23.84. +{"3:512:16:256", {23.84, 23.84, 23.84, 23.84, 23.84, 23.84}}, +{ "3:512:8:512", {23.84, 23.84, 23.84, 23.84, 23.84, 23.84}}, { "3:1K:16:256", {23.65, 23.65, 23.65, 23.65, 23.65, 23.65}}, { "3:1K:8:512", {23.65, 23.65, 23.65, 23.65, 23.65, 23.65}}, {"3:512:16:512", {23.65, 23.65, 23.65, 23.65, 23.65, 23.65}}, From 6b39a5f660ba21f103a0dc3f4bb36867f6301af8 Mon Sep 17 00:00:00 2001 From: george Date: Wed, 11 Mar 2026 22:42:11 +0000 Subject: [PATCH 8/9] CarryFused can now process multiple lines at the same time -- 1% performance increase on TitanV. Standardized LDS memory layout and bar() strategy. Made a cleaner, common shufl routine to handle multiple lines using new constants SHUFL_BYTES_W and SHUFL_BYTES_H. Reverse line routines overhauled to use LDS memory layout and bar() strategy. Added L2STORE and LULOAD routines for nVidia. Need to study which GPUs might benefit. Deprecated BIGLIT=0. 
--- src/Gpu.cpp | 52 +- src/Gpu.h | 2 +- src/cl/base.cl | 63 +- src/cl/carryfused.cl | 1349 +++++++++++++++++++++--------------------- src/cl/fftbase.cl | 412 +++++-------- src/cl/fftheight.cl | 214 ++----- src/cl/ffthin.cl | 25 +- src/cl/fftp.cl | 90 +-- src/cl/fftw.cl | 28 +- src/cl/fftwidth.cl | 171 +++--- src/cl/math.cl | 6 +- src/cl/middle.cl | 62 +- src/cl/tailmul.cl | 104 ++-- src/cl/tailsquare.cl | 222 +++---- src/cl/tailutil.cl | 639 ++++++++------------ src/tune.cpp | 4 +- 16 files changed, 1513 insertions(+), 1930 deletions(-) diff --git a/src/Gpu.cpp b/src/Gpu.cpp index e9d20522..84cc4434 100644 --- a/src/Gpu.cpp +++ b/src/Gpu.cpp @@ -85,7 +85,6 @@ Weights genWeights(FFTConfig fft, u64 E, u32 W, u32 H, u32 nW, bool AmdGpu) { vector weightsConstIF; vector weightsIF; - vector bits; if (fft.FFT_FP64) { // Inverse + Forward @@ -141,24 +140,7 @@ Weights genWeights(FFTConfig fft, u64 E, u32 W, u32 H, u32 nW, bool AmdGpu) { memcpy((double *) weightsIF.data(), weightsIF32.data(), weightsIF32.size() * sizeof(float)); } - if (fft.FFT_FP64 || fft.FFT_FP32) { - for (u32 line = 0; line < H; ++line) { - for (u32 thread = 0; thread < groupWidth; ) { - std::bitset<32> b; - for (u32 bitoffset = 0; bitoffset < 32; bitoffset += nW*2, ++thread) { - for (u32 block = 0; block < nW; ++block) { - for (u32 rep = 0; rep < 2; ++rep) { - if (isBigWord(N, E, kAt(H, line, block * groupWidth + thread) + rep)) { b.set(bitoffset + block * 2 + rep); } - } - } - } - bits.push_back(b.to_ulong()); - } - } - assert(bits.size() == N / 32); - } - - return Weights{weightsConstIF, weightsIF, bits}; + return Weights{weightsConstIF, weightsIF}; } string toLiteral(i32 value) { return to_string(value); } @@ -228,7 +210,7 @@ constexpr bool isInList(const string& s, initializer_list list) { } string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector& extraConf, u64 E, bool doLog, - bool &tail_single_wide, bool &tail_single_kernel, u32 &in_place, u32 &pad_size) { + bool 
&tail_single_wide, bool &tail_single_kernel, u32 &in_place, u32 &pad_size, u32 &wmul) { map config; // Highest priority is the requested "extra" conf @@ -246,6 +228,7 @@ string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector< // Default value for -use options that must also be parsed in C++ code tail_single_wide = 0, tail_single_kernel = 1; // Default tailSquare is double-wide in one kernel in_place = 0; // Default is not in-place + wmul = 1; // Default is carryFused processes one workgroup at a time pad_size = isAmdGpu(id) ? 256 : 0; // Default is 256 bytes for AMD, 0 for others // Validate -use options @@ -264,7 +247,7 @@ string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector< "NO_ASM", "DEBUG", "CARRY64", - "BIGLIT", + "BIGLIT", // Deprecated "NONTEMPORAL", "INPLACE", "PAD", @@ -279,7 +262,8 @@ string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector< "TABMUL_CHAIN31", "TABMUL_CHAIN32", "TABMUL_CHAIN61", - "MODM31" + "MODM31", + "WMUL" }); if (!isValid) { log("Warning: unrecognized -use key '%s'\n", k.c_str()); @@ -293,6 +277,7 @@ string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector< if (atoi(v.c_str()) == 3) tail_single_wide = 0, tail_single_kernel = 0; } if (k == "INPLACE") in_place = atoi(v.c_str()); + if (k == "WMUL") wmul = atoi(v.c_str()); if (k == "PAD") pad_size = atoi(v.c_str()); } @@ -532,7 +517,7 @@ Gpu::Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u64 E, const vector& nW(fft.shape.nW()), nH(fft.shape.nH()), useLongCarry{args.carry == Args::CARRY_LONG}, - compiler{args, queue->context, clDefines(args, queue->context->deviceId(), fft, extraConf, E, logFftSize, tail_single_wide, tail_single_kernel, in_place, pad_size)}, + compiler{args, queue->context, clDefines(args, queue->context->deviceId(), fft, extraConf, E, logFftSize, tail_single_wide, tail_single_kernel, in_place, pad_size, wmul)}, #define K(name, ...) 
name(#name, &compiler, profile.make(#name), queue, __VA_ARGS__) @@ -581,11 +566,11 @@ Gpu::Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u64 E, const vector& K(kCarryM, "carry.cl", "carry", hN / CARRY_LEN, "-DMUL3=1"), K(kCarryMROE, "carry.cl", "carry", hN / CARRY_LEN, "-DMUL3=1 -DROE=1"), K(kCarryLL, "carry.cl", "carry", hN / CARRY_LEN, "-DLL=1"), - K(kCarryFused, "carryfused.cl", "carryFused", WIDTH * (BIG_H + 1) / nW), - K(kCarryFusedROE, "carryfused.cl", "carryFused", WIDTH * (BIG_H + 1) / nW, "-DROE=1"), - K(kCarryFusedMul, "carryfused.cl", "carryFused", WIDTH * (BIG_H + 1) / nW, "-DMUL3=1"), - K(kCarryFusedMulROE, "carryfused.cl", "carryFused", WIDTH * (BIG_H + 1) / nW, "-DMUL3=1 -DROE=1"), - K(kCarryFusedLL, "carryfused.cl", "carryFused", WIDTH * (BIG_H + 1) / nW, "-DLL=1"), + K(kCarryFused, "carryfused.cl", "carryFused", WIDTH * (BIG_H + wmul) / nW), + K(kCarryFusedROE, "carryfused.cl", "carryFused", WIDTH * (BIG_H + wmul) / nW, "-DROE=1"), + K(kCarryFusedMul, "carryfused.cl", "carryFused", WIDTH * (BIG_H + wmul) / nW, "-DMUL3=1"), + K(kCarryFusedMulROE, "carryfused.cl", "carryFused", WIDTH * (BIG_H + wmul) / nW, "-DMUL3=1 -DROE=1"), + K(kCarryFusedLL, "carryfused.cl", "carryFused", WIDTH * (BIG_H + wmul) / nW, "-DLL=1"), K(carryB, "carryb.cl", "carryB", hN / CARRY_LEN), @@ -615,7 +600,6 @@ Gpu::Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u64 E, const vector& weights{genWeights(fft, E, WIDTH, BIG_H, nW, isAmdGpu(q->context->deviceId()))}, bufConstWeights{q->context, std::move(weights.weightsConstIF)}, bufWeights{q->context, std::move(weights.weightsIF)}, - bufBits{q->context, std::move(weights.bitsCF)}, #define BUF(name, ...) 
name{profile.make(#name), queue, __VA_ARGS__} @@ -695,16 +679,16 @@ Gpu::Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u64 E, const vector& kfftWGF61.setFixedArgs(2, bufTrigW); } - if (fft.FFT_FP64 || fft.FFT_FP32) { // The FP versions take bufWeight arguments (and bufBits which may be deleted) + if (fft.FFT_FP64 || fft.FFT_FP32) { // The FP versions take bufWeight arguments kfftP.setFixedArgs(2, bufTrigW, bufWeights); for (Kernel* k : {&kCarryA, &kCarryAROE, &kCarryM, &kCarryMROE, &kCarryLL}) { k->setFixedArgs(3, bufCarry, bufWeights); } for (Kernel* k : {&kCarryA, &kCarryM, &kCarryLL}) { k->setFixedArgs(5, bufStatsCarry); } for (Kernel* k : {&kCarryAROE, &kCarryMROE}) { k->setFixedArgs(5, bufROE); } for (Kernel* k : {&kCarryFused, &kCarryFusedROE, &kCarryFusedMul, &kCarryFusedMulROE, &kCarryFusedLL}) { - k->setFixedArgs(3, bufCarry, bufReady, bufTrigW, bufBits, bufConstWeights, bufWeights); + k->setFixedArgs(3, bufCarry, bufReady, bufTrigW, bufConstWeights, bufWeights); } - for (Kernel* k : {&kCarryFusedROE, &kCarryFusedMulROE}) { k->setFixedArgs(9, bufROE); } - for (Kernel* k : {&kCarryFused, &kCarryFusedMul, &kCarryFusedLL}) { k->setFixedArgs(9, bufStatsCarry); } + for (Kernel* k : {&kCarryFusedROE, &kCarryFusedMulROE}) { k->setFixedArgs(8, bufROE); } + for (Kernel* k : {&kCarryFused, &kCarryFusedMul, &kCarryFusedLL}) { k->setFixedArgs(8, bufStatsCarry); } } else { kfftP.setFixedArgs(2, bufTrigW); for (Kernel* k : {&kCarryA, &kCarryAROE, &kCarryM, &kCarryMROE, &kCarryLL}) { k->setFixedArgs(3, bufCarry); } @@ -2042,7 +2026,7 @@ array Gpu::isCERT(const Task& task) { char fname[32]; sprintf(fname, "M%" PRIu64 ".cert", E); -// Autoprimenet.py does not add the cert entry to worktodo.txt until it has successfully downloaded the .cert file. +// AutoPrimenet.py does not add the cert entry to worktodo.txt until it has successfully downloaded the .cert file. { // Enclosing this code in braces ensures the file will be closed by the File destructor. 
The later file deletion requires the file be closed in Windows. File fi = File::openReadThrow(fname); diff --git a/src/Gpu.h b/src/Gpu.h index ad859eac..b184c4d6 100644 --- a/src/Gpu.h +++ b/src/Gpu.h @@ -172,6 +172,7 @@ class Gpu { bool tail_single_wide; // TailSquare processes one line at a time bool tail_single_kernel; // TailSquare does not use a separate kernel for line zero u32 in_place; // Should GPU perform transform in-place. 1 = nVidia friendly memory layout, 2 = AMD friendly. + u32 wmul; // Number of workgroups carryFused kernel should process ("width multiplier"). u32 pad_size; // Pad size in bytes as specified on the command line or config.txt. Maximum value is 512. // Twiddles: trigonometry constant buffers, used in FFTs. @@ -185,7 +186,6 @@ class Gpu { Weights weights; Buffer bufConstWeights; Buffer bufWeights; - Buffer bufBits; // bigWord bits aligned for CarryFused/fftP // "integer word" buffers. These are "small buffers": N x int. Buffer bufData; // Main int buffer with the words. diff --git a/src/cl/base.cl b/src/cl/base.cl index df1ef02b..f252cfe1 100644 --- a/src/cl/base.cl +++ b/src/cl/base.cl @@ -58,6 +58,13 @@ G_H "group height" == SMALL_HEIGHT / NH //__builtin_assume(condition) #endif // DEBUG +#ifndef AMDGPU +#define AMDGPU 0 +#endif +#ifndef NVIDIAGPU +#define NVIDIAGPU 0 +#endif + #if NO_ASM #define HAS_ASM 0 #define HAS_PTX 0 @@ -128,8 +135,14 @@ G_H "group height" == SMALL_HEIGHT / NH #endif #endif -#if !defined(BIGLIT) -#define BIGLIT 1 +// Shufl width in bytes (can be 4, 8, or 16). See fftbase.cl. Allow different shufl widths for fft_width and fft_height. +// Default is 8 bytes (one double). Historically best for Radeon VII and TitanV. This setting will affect how much LDS +// memory is needed which in turn may affect occupancy and thus performance. 
+#if !defined(SHUFL_BYTES_W) +#define SHUFL_BYTES_W 8 +#endif +#if !defined(SHUFL_BYTES_H) +#define SHUFL_BYTES_H 8 #endif #if !defined(TABMUL_CHAIN) @@ -259,6 +272,42 @@ ulong2 OVERLOAD U2(ulong a, ulong b) { return (ulong2) (a, b); } #define NTSTORE(mem,val) (mem) = val #endif +// Routines for storing to L2 cache bypassing L1 cache. +void OVERLOAD L2STORE(i64 *mem, i64 val) { +#if ENABLE_L2STORE && HAS_PTX >= 200 // Cache hints requires sm_20 support or higher + __asm("st.global.cg.b64 [%0], %1;" : : "l"(mem), "l"(val)); +#else + *mem = val; +#endif +} +void OVERLOAD L2STORE(i32 *mem, i32 val) { +#if ENABLE_L2STORE && HAS_PTX >= 200 // Cache hints requires sm_20 support or higher + __asm("st.global.cg.b32 [%0], %1;" : : "l"(mem), "r"(val)); +#else + *mem = val; +#endif +} + +// Routines for loading a value and marking it for "last use". +i64 OVERLOAD LULOAD(i64 *mem) { +#if ENABLE_LULOAD && HAS_PTX >= 200 // Cache hints requires sm_20 support or higher + i64 retval; + __asm("ld.global.lu.b64 %0, [%1];" : "=l"(retval) : "l"(mem)); + return retval; +#else + return *mem; +#endif +} +i32 OVERLOAD LULOAD(i32 *mem) { +#if ENABLE_LULOAD && HAS_PTX >= 200 // Cache hints requires sm_20 support or higher + i32 retval; + __asm("ld.global.lu.b32 %0, [%1];" : "=r"(retval) : "l"(mem)); + return retval; +#else + return *mem; +#endif +} + // Prefetch macros. Unused at present, I tried using them in fftMiddleInGF61 on a 5080 with no benefit. void PREFETCHL1(const __global void *addr) { #if HAS_PTX >= 200 // Prefetch instruction requires sm_20 support or higher @@ -371,7 +420,15 @@ void OVERLOAD bar(void) { #endif } -void OVERLOAD bar(u32 WG) { if (WG > WAVEFRONT) { bar(); } } +void OVERLOAD bar(const u32 WG) { + if (WG > WAVEFRONT) { +#if ENABLE_BARSYNC && HAS_PTX >= 200 // bar.sync with thread count requires sm_20 support or higher. Slower on TitanV, need to try on later nVidia GPUs. 
+ __asm("bar.sync %0, %1;" : : "r"(get_local_id(0) / WG + 1), "n"(WG)); +#else + bar(); +#endif + } +} // A half-barrier is only needed when half-a-workgroup needs a barrier. // This is used e.g. by the double-wide tailSquare, where LDS is split between the halves. diff --git a/src/cl/carryfused.cl b/src/cl/carryfused.cl index 05e4ca4c..ba48c8bb 100644 --- a/src/cl/carryfused.cl +++ b/src/cl/carryfused.cl @@ -16,18 +16,101 @@ void spin() { #endif } +// Increasing WMUL to 2 will reduce carryShuttle activity. This led to a 1% speedup on Titan V. Testing on other GPUs is needed. +#ifndef WMUL +#define WMUL 1 +#endif + +#if AMDGPU +#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions +//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 +#else +#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better +#endif + +// The last WMUL workgroup's carries have been written to global memory. Now we shuffle WMUL-1 workgroups carries up using local memory. +void OVERLOAD shufl_carries_up(local void *lds2, i64 *carry, u32 me, u32 lowMe) { + // If WMUL is one, there is no shuffling of carries + if (WMUL == 1) return; + + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes used by shufl for each WMUL workgroup + const u32 lds_i64s = lds_bytes / sizeof(i64); // Number of i64s in LDS used by shufl for each WMUL workgroup + local i64 *lds = (local i64 *) lds2; + + // Handle nasty case where we are writing 8-byte quantities but SHUFL_BYTES_W is only 4 bytes + if (SHUFL_BYTES_W == 4) { + if (WMUL == 2) { + // Full barrier needed as we are using the entire LDS buffer. + bar(); + // Write the carries. This will use the entire LDS buffer. 
+ if (me < G_W) for (i32 i = 0; i < NW; ++i) lds[i * G_W + lowMe] = carry[i]; + // Read carries from previous WMUL workgroup + bar(); + if (me >= G_W) for (i32 i = 0; i < NW; ++i) carry[i] = lds[i * G_W + lowMe]; + // Full barrier needed as one workgroup just read data from two workgroups LDS buffer. Not compatible with shufl(). + bar(); + } + + // The really nasty case where all the carries will not fit in LDS memory + else { + lds += (me / G_W) * lds_i64s + lowMe; // This WMUL workgroup's LDS area + // Write half the carries to next WMUL's workgroup LDS area + bar(); + if (me < (WMUL-1) * G_W) for (i32 i = 0; i < NW/2; ++i) lds[lds_i64s + i * G_W] = carry[i]; + // Read carries from our WMUL workgroup LDS area + bar(); + if (me >= G_W) for (i32 i = 0; i < NW/2; ++i) carry[i] = lds[i * G_W]; + // Write the other half of the carries + bar(); + if (me < (WMUL-1) * G_W) for (i32 i = 0; i < NW/2; ++i) lds[lds_i64s + i * G_W] = carry[i + NW/2]; + // Read carries from our WMUL workgroup LDS area. Compatible with shufl, no trailing bar() needed. + bar(); + if (me >= G_W) for (i32 i = 0; i < NW/2; ++i) carry[i + NW/2] = lds[i * G_W]; + } + } + + // Easy case. Write carries to local memory (except last WMUL workgroup which was written to global memory). + else { + lds += (me / G_W) * lds_i64s + lowMe; // This WMUL workgroup's LDS area + // Full barrier needed as we are moving data to next WMUL workgroup's LDS area + bar(); + if (me < (WMUL-1) * G_W) for (i32 i = 0; i < NW; ++i) lds[lds_i64s + i * G_W] = carry[i]; + // Full barrier needed as we just moved data from one WMUL workgroup LDS area to the another WMUL workgroup's LDS area + bar(); + // Read carries from our WMUL workgroup's LDS area. This is compatible with shufl and no trailing bar() is required. + if (me >= G_W) for (i32 i = 0; i < NW; ++i) carry[i] = lds[i * G_W]; + } +} + +// The last WMUL workgroup's carries have been written to global memory. Now we shuffle WMUL-1 workgroup carries up using local memory. 
+void OVERLOAD shufl_carries_up(local void *lds2, i32 *carry, u32 me, u32 lowMe) { + // If WMUL is one, there is no shuffling of carries + if (WMUL == 1) return; + + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes used by shufl for each WMUL workgroup + const u32 lds_i32s = lds_bytes / sizeof(i32); // Number of i32s in LDS used by shufl for each WMUL workgroup + local i32 *lds = (local i32 *) lds2; + lds += (me / G_W) * lds_i32s + lowMe; // This WMUL workgroup's LDS area + + // Write carries to local memory (except last WMUL workgroup which was written to global memory) + // Full barrier needed as we are moving data to next WMUL workgroup's LDS area + bar(); + if (me < (WMUL-1) * G_W) for (i32 i = 0; i < NW; ++i) lds[lds_i32s + i * G_W] = carry[i]; + // Full barrier needed as we just moved data from one WMUL workgroup LDS area to the another WMUL workgroup's LDS area + bar(); + // Read carries from our WMUL workgroup's LDS area. This is compatible with shufl and no trailing bar() is required. + if (me >= G_W) for (i32 i = 0; i < NW; ++i) carry[i] = lds[i * G_W]; +} + + #if FFT_TYPE == FFT64 // The "carryFused" is equivalent to the sequence: fftW, carryA, carryB, fftPremul. 
// It uses "stairway forwarding" (forwarding carry data from one workgroup to the next) -KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, - CP(u32) bits, ConstBigTab CONST_THREAD_WEIGHTS, BigTab THREAD_WEIGHTS, P(uint) bufROE) { - -#if 0 // fft_WIDTH uses shufl_int instead of shufl - local T2 lds[WIDTH / 4]; -#else - local T2 lds[WIDTH / 2]; -#endif +KERNEL(G_W * WMUL) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, + ConstBigTab CONST_THREAD_WEIGHTS, BigTab THREAD_WEIGHTS, P(uint) bufROE) { + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes needed for each WMUL workgroup + local T2 lds[WMUL * lds_bytes / sizeof(T2)]; T2 u[NW]; @@ -35,36 +118,31 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( u32 me = get_local_id(0); u32 H = BIG_HEIGHT; +#if WMUL == 1 + u32 lowMe = me; u32 line = gr % H; +#else + u32 lowMe = me % G_W; // lane-id in one of the WMUL sub-workgroups. + u32 line = (gr * WMUL + me / G_W) % H; +#endif #if HAS_ASM __asm("s_setprio 3"); #endif - readCarryFusedLine(in, u, line); - -// Split 32 bits into NW groups of 2 bits. See later for different way to do this. -#if !BIGLIT -#define GPW (16 / NW) - u32 b = NTLOAD(bits[(G_W * line + me) / GPW]) >> (me % GPW * (2 * NW)); -#undef GPW -#endif + readCarryFusedLine(in, u, line, lowMe); // Try this weird FFT_width call that adds a "hidden zero" when unrolling. This prevents the compiler from finding // common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs // which causes a terrible reduction in occupancy. 
-#if ZEROHACK_W - u32 zerohack = get_group_id(0) / 131072; - new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack); -#else - new_fft_WIDTH1(lds, u, smallTrig); -#endif + u32 zerohack = ZEROHACK_W * (u32) get_group_id(0) / 131072; + new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack, WMUL, SHUFL_BYTES_W, lowMe); Word2 wu[NW]; #if AMDGPU - T2 weights = fancyMul(THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); + T2 weights = fancyMul(THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); #else - T2 weights = fancyMul(CONST_THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights + T2 weights = fancyMul(CONST_THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights #endif #if MUL3 @@ -75,24 +153,13 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( CFcarry carry[NW+1]; #endif -#if AMDGPU -#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions -//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 -#else -#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better -#endif - float roundMax = 0; float carryMax = 0; - // On Titan V it is faster to derive the big vs. little flags from the fractional number of bits in each FFT word rather than read the flags from memory. - // On Radeon VII this code is about the same speed. Not sure which is better on other GPUs. -#if BIGLIT // Calculate the most significant 32-bits of FRAC_BPW * the word index. Also add FRAC_BPW_HI to test first biglit flag. 
- u32 word_index = (me * H + line) * 2; + u32 word_index = (lowMe * H + line) * 2; u32 frac_bits = word_index * FRAC_BPW_HI + mad_hi (word_index, FRAC_BPW_LO, FRAC_BPW_HI); const u32 frac_bits_bigstep = ((G_W * H * 2) * FRAC_BPW_HI + (u32)(((u64)(G_W * H * 2) * FRAC_BPW_LO) >> 32)); -#endif // Apply the inverse weights and carry propagate pairs to generate the output carries @@ -103,13 +170,8 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( T invWeight2 = optionalDouble(fancyMul(invWeight1, IWEIGHT_STEP)); // Generate big-word/little-word flags -#if BIGLIT bool biglit0 = frac_bits + i * frac_bits_bigstep <= FRAC_BPW_HI; bool biglit1 = frac_bits + i * frac_bits_bigstep >= -FRAC_BPW_HI; // Same as frac_bits + i * frac_bits_bigstep + FRAC_BPW_HI <= FRAC_BPW_HI; -#else - bool biglit0 = test(b, 2 * i); - bool biglit1 = test(b, 2 * i + 1); -#endif // Apply the inverse weights, optionally compute roundoff error, and convert to integer. Also apply MUL3 here. // Then propagate carries through two words (the first carry does not have to be accurately calculated because it will @@ -126,28 +188,28 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( updateStats(bufROE, posROE, carryMax); #endif - // Write out our carries. Only groups 0 to H-1 need to write carries out. - // Group H is a duplicate of group 0 (producing the same results) so we don't care about group H writing out, + // Write out our carries for the last line in this group. Only groups 0 to H/WMUL-1 need to write carries out. + // Group H/WMUL is a duplicate of group 0 (producing the same results) so we don't care about that group writing out, // but it's fine either way. 
- if (gr < H) { for (i32 i = 0; i < NW; ++i) { carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(me, i)] = carry[i]; } } + if (gr < H / WMUL && me >= (WMUL-1) * G_W) { + for (i32 i = 0; i < NW; ++i) { L2STORE(&carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(lowMe, i)], carry[i]); } - // Tell next line that its carries are ready - if (gr < H) { + // Tell next group that its carries are ready #if OLD_FENCE // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); write_mem_fence(CLK_GLOBAL_MEM_FENCE); - bar(); - if (me == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } + bar(G_W); + if (lowMe == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } #else write_mem_fence(CLK_GLOBAL_MEM_FENCE); - if (me % WAVEFRONT == 0) { - u32 pos = gr * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (lowMe % WAVEFRONT == 0) { + u32 pos = gr * (G_W / WAVEFRONT) + lowMe / WAVEFRONT; atomic_store((atomic_uint *) &ready[pos], 1); } #endif } - // Line zero will be redone when gr == H + // Group zero will be redone when gr == H / WMUL if (gr == 0) { return; } // Do some work while our carries may not be ready @@ -163,68 +225,66 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( u[i] = U2(weight1, weight2); } + // Shuffle carries up + shufl_carries_up(lds, carry, me, lowMe); + // Wait until our carries are ready + if (me < G_W) { #if OLD_FENCE - if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } - // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); - bar(); - read_mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me == 0) ready[gr - 1] = 0; + if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } + // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); + bar(); + read_mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag 
for next iteration + if (me == 0) ready[gr - 1] = 0; #else - u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; - if (me % WAVEFRONT == 0) { - do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); - } - mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; + u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (me % WAVEFRONT == 0) { + do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); + } + mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; #endif #if HAS_ASM - __asm("s_setprio 1"); + __asm("s_setprio 1"); #endif - // Read from the carryShuttle carries produced by the previous WIDTH row. Rotate carries from the last WIDTH row. - // The new carry layout lets the compiler generate global_load_dwordx4 instructions. - if (gr < H) { - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]; - } - } else { + // Read from the carryShuttle carries produced by the previous WIDTH group. Rotate carries from the last WIDTH line. + // The new carry layout lets the AMD compiler generate global_load_dwordx4 instructions. + if (gr < H / WMUL) { + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]); + } + } else { #if !OLD_FENCE - // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. - bar(); + // For gr==H/WMUL we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. 
+ bar(); #endif - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]; - } + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]); + } - if (me == 0) { - carry[NW] = carry[NW-1]; - for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } - carry[0] = carry[NW]; + if (me == 0) { + carry[NW] = carry[NW-1]; + for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } + carry[0] = carry[NW]; + } } } // Apply each 32 or 64 bit carry to the 2 words for (i32 i = 0; i < NW; ++i) { -#if BIGLIT bool biglit0 = frac_bits + i * frac_bits_bigstep <= FRAC_BPW_HI; -#else - bool biglit0 = test(b, 2 * i); -#endif wu[i] = carryFinal(wu[i], carry[i], biglit0); u[i] = U2(u[i].x * wu[i].x, u[i].y * wu[i].y); } - bar(); + new_fft_WIDTH2(lds, u, smallTrig, WMUL, SHUFL_BYTES_W, lowMe); -// fft_WIDTH(lds, u, smallTrig); - new_fft_WIDTH2(lds, u, smallTrig); - - writeCarryFusedLine(u, out, line); + writeCarryFusedLine(u, out, line, lowMe); } @@ -236,14 +296,10 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( // The "carryFused" is equivalent to the sequence: fftW, carryA, carryB, fftPremul. 
// It uses "stairway forwarding" (forwarding carry data from one workgroup to the next) -KERNEL(G_W) carryFused(P(F2) out, CP(F2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, TrigFP32 smallTrig, - CP(u32) bits, ConstBigTabFP32 CONST_THREAD_WEIGHTS, BigTabFP32 THREAD_WEIGHTS, P(uint) bufROE) { - -#if 0 // fft_WIDTH uses shufl_int instead of shufl - local F2 lds[WIDTH / 4]; -#else - local F2 lds[WIDTH / 2]; -#endif +KERNEL(G_W * WMUL) carryFused(P(F2) out, CP(F2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, TrigFP32 smallTrig, + ConstBigTabFP32 CONST_THREAD_WEIGHTS, BigTabFP32 THREAD_WEIGHTS, P(uint) bufROE) { + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes needed for each WMUL workgroup + local F2 lds[WMUL * lds_bytes / sizeof(F2)]; F2 u[NW]; @@ -251,46 +307,41 @@ KERNEL(G_W) carryFused(P(F2) out, CP(F2) in, u32 posROE, P(i64) carryShuttle, P( u32 me = get_local_id(0); u32 H = BIG_HEIGHT; +#if WMUL == 1 + u32 lowMe = me; u32 line = gr % H; +#else + u32 lowMe = me % G_W; // lane-id in one of the WMUL sub-workgroups. + u32 line = (gr * WMUL + me / G_W) % H; +#endif #if HAS_ASM __asm("s_setprio 3"); #endif - readCarryFusedLine(in, u, line); + readCarryFusedLine(in, u, line, lowMe); // Try this weird FFT_width call that adds a "hidden zero" when unrolling. This prevents the compiler from finding // common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs // which causes a terrible reduction in occupancy. 
-#if ZEROHACK_W - u32 zerohack = get_group_id(0) / 131072; - new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack); -#else - new_fft_WIDTH1(lds, u, smallTrig); -#endif + u32 zerohack = ZEROHACK_W * (u32) get_group_id(0) / 131072; + new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack, WMUL, SHUFL_BYTES_W, lowMe); Word2 wu[NW]; #if AMDGPU - F2 weights = fancyMul(THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); + F2 weights = fancyMul(THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); #else - F2 weights = fancyMul(CONST_THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights + F2 weights = fancyMul(CONST_THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights #endif P(CFcarry) carryShuttlePtr = (P(CFcarry)) carryShuttle; CFcarry carry[NW+1]; -#if AMDGPU -#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions -//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 -#else -#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better -#endif - float roundMax = 0; float carryMax = 0; // Calculate the most significant 32-bits of FRAC_BPW * the word index. Also add FRAC_BPW_HI to test first biglit flag. - u32 word_index = (me * H + line) * 2; + u32 word_index = (lowMe * H + line) * 2; u32 frac_bits = word_index * FRAC_BPW_HI + mad_hi (word_index, FRAC_BPW_LO, FRAC_BPW_HI); const u32 frac_bits_bigstep = ((G_W * H * 2) * FRAC_BPW_HI + (u32)(((u64)(G_W * H * 2) * FRAC_BPW_LO) >> 32)); @@ -321,28 +372,28 @@ KERNEL(G_W) carryFused(P(F2) out, CP(F2) in, u32 posROE, P(i64) carryShuttle, P( updateStats(bufROE, posROE, carryMax); #endif - // Write out our carries. Only groups 0 to H-1 need to write carries out. 
- // Group H is a duplicate of group 0 (producing the same results) so we don't care about group H writing out, + // Write out our carries for the last line in this group. Only groups 0 to H/WMUL-1 need to write carries out. + // Group H/WMUL is a duplicate of group 0 (producing the same results) so we don't care about that group writing out, // but it's fine either way. - if (gr < H) { for (i32 i = 0; i < NW; ++i) { carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(me, i)] = carry[i]; } } + if (gr < H / WMUL && me >= (WMUL-1) * G_W) { + for (i32 i = 0; i < NW; ++i) { L2STORE(&carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(lowMe, i)], carry[i]); } - // Tell next line that its carries are ready - if (gr < H) { + // Tell next group that its carries are ready #if OLD_FENCE // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); write_mem_fence(CLK_GLOBAL_MEM_FENCE); - bar(); - if (me == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } + bar(G_W); + if (lowMe == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } #else write_mem_fence(CLK_GLOBAL_MEM_FENCE); - if (me % WAVEFRONT == 0) { - u32 pos = gr * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (lowMe % WAVEFRONT == 0) { + u32 pos = gr * (G_W / WAVEFRONT) + lowMe / WAVEFRONT; atomic_store((atomic_uint *) &ready[pos], 1); } #endif } - // Line zero will be redone when gr == H + // Group zero will be redone when gr == H / WMUL if (gr == 0) { return; } // Do some work while our carries may not be ready @@ -358,48 +409,53 @@ KERNEL(G_W) carryFused(P(F2) out, CP(F2) in, u32 posROE, P(i64) carryShuttle, P( u[i] = U2(weight1, weight2); } + // Shuffle carries up + shufl_carries_up(lds, carry, me, lowMe); + // Wait until our carries are ready + if (me < G_W) { #if OLD_FENCE - if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } - // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); - bar(); - 
read_mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me == 0) ready[gr - 1] = 0; + if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } + // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); + bar(); + read_mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me == 0) ready[gr - 1] = 0; #else - u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; - if (me % WAVEFRONT == 0) { - do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); - } - mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; + u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (me % WAVEFRONT == 0) { + do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); + } + mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; #endif #if HAS_ASM - __asm("s_setprio 1"); + __asm("s_setprio 1"); #endif - // Read from the carryShuttle carries produced by the previous WIDTH row. Rotate carries from the last WIDTH row. - // The new carry layout lets the compiler generate global_load_dwordx4 instructions. - if (gr < H) { - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]; - } - } else { + // Read from the carryShuttle carries produced by the previous WIDTH group. Rotate carries from the last WIDTH line. + // The new carry layout lets the AMD compiler generate global_load_dwordx4 instructions. 
+ if (gr < H / WMUL) { + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]); + } + } else { #if !OLD_FENCE - // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. - bar(); + // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. + bar(); #endif - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]; - } + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]); + } - if (me == 0) { - carry[NW] = carry[NW-1]; - for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } - carry[0] = carry[NW]; + if (me == 0) { + carry[NW] = carry[NW-1]; + for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } + carry[0] = carry[NW]; + } } } @@ -410,12 +466,9 @@ KERNEL(G_W) carryFused(P(F2) out, CP(F2) in, u32 posROE, P(i64) carryShuttle, P( u[i] = U2(u[i].x * wu[i].x, u[i].y * wu[i].y); } - bar(); - -// fft_WIDTH(lds, u, smallTrig); - new_fft_WIDTH2(lds, u, smallTrig); + new_fft_WIDTH2(lds, u, smallTrig, WMUL, SHUFL_BYTES_W, lowMe); - writeCarryFusedLine(u, out, line); + writeCarryFusedLine(u, out, line, lowMe); } @@ -427,13 +480,9 @@ KERNEL(G_W) carryFused(P(F2) out, CP(F2) in, u32 posROE, P(i64) carryShuttle, P( // The "carryFused" is equivalent to the sequence: fftW, carryA, carryB, fftPremul. 
// It uses "stairway forwarding" (forwarding carry data from one workgroup to the next) -KERNEL(G_W) carryFused(P(GF31) out, CP(GF31) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, TrigGF31 smallTrig, P(uint) bufROE) { - -#if 0 // fft_WIDTH uses shufl_int instead of shufl - local GF31 lds[WIDTH / 4]; -#else - local GF31 lds[WIDTH / 2]; -#endif +KERNEL(G_W * WMUL) carryFused(P(GF31) out, CP(GF31) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, TrigGF31 smallTrig, P(uint) bufROE) { + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes needed for each WMUL workgroup + local GF31 lds[WMUL * lds_bytes / sizeof(GF31)]; GF31 u[NW]; @@ -441,40 +490,35 @@ KERNEL(G_W) carryFused(P(GF31) out, CP(GF31) in, u32 posROE, P(i64) carryShuttle u32 me = get_local_id(0); u32 H = BIG_HEIGHT; +#if WMUL == 1 + u32 lowMe = me; u32 line = gr % H; +#else + u32 lowMe = me % G_W; // lane-id in one of the WMUL sub-workgroups. + u32 line = (gr * WMUL + me / G_W) % H; +#endif #if HAS_ASM __asm("s_setprio 3"); #endif - readCarryFusedLine(in, u, line); + readCarryFusedLine(in, u, line, lowMe); // Try this weird FFT_width call that adds a "hidden zero" when unrolling. This prevents the compiler from finding // common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs // which causes a terrible reduction in occupancy. 
-#if ZEROHACK_W - u32 zerohack = get_group_id(0) / 131072; - new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack); -#else - new_fft_WIDTH1(lds, u, smallTrig); -#endif + u32 zerohack = ZEROHACK_W * (u32) get_group_id(0) / 131072; + new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack, WMUL, SHUFL_BYTES_W, lowMe); Word2 wu[NW]; P(CFcarry) carryShuttlePtr = (P(CFcarry)) carryShuttle; CFcarry carry[NW+1]; -#if AMDGPU -#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions -//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 -#else -#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better -#endif - u32 roundMax = 0; float carryMax = 0; - u32 word_index = (me * H + line) * 2; + u32 word_index = (lowMe * H + line) * 2; // Weight is 2^[ceil(qj / n) - qj/n] where j is the word index, q is the Mersenne exponent, and n is the number of words. // Weights can be applied with shifts because 2 is the 60th root GF31. @@ -537,28 +581,28 @@ KERNEL(G_W) carryFused(P(GF31) out, CP(GF31) in, u32 posROE, P(i64) carryShuttle updateStats(bufROE, posROE, carryMax); #endif - // Write out our carries. Only groups 0 to H-1 need to write carries out. - // Group H is a duplicate of group 0 (producing the same results) so we don't care about group H writing out, + // Write out our carries for the last line in this group. Only groups 0 to H/WMUL-1 need to write carries out. + // Group H/WMUL is a duplicate of group 0 (producing the same results) so we don't care about that group writing out, // but it's fine either way. 
- if (gr < H) { for (i32 i = 0; i < NW; ++i) { carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(me, i)] = carry[i]; } } + if (gr < H / WMUL && me >= (WMUL-1) * G_W) { + for (i32 i = 0; i < NW; ++i) { L2STORE(&carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(lowMe, i)], carry[i]); } - // Tell next line that its carries are ready - if (gr < H) { + // Tell next group that its carries are ready #if OLD_FENCE // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); write_mem_fence(CLK_GLOBAL_MEM_FENCE); - bar(); - if (me == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } + bar(G_W); + if (lowMe == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } #else write_mem_fence(CLK_GLOBAL_MEM_FENCE); - if (me % WAVEFRONT == 0) { - u32 pos = gr * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (lowMe % WAVEFRONT == 0) { + u32 pos = gr * (G_W / WAVEFRONT) + lowMe / WAVEFRONT; atomic_store((atomic_uint *) &ready[pos], 1); } #endif } - // Line zero will be redone when gr == H + // Group zero will be redone when gr == H / WMUL if (gr == 0) { return; } // Do some work while our carries may not be ready @@ -566,48 +610,52 @@ KERNEL(G_W) carryFused(P(GF31) out, CP(GF31) in, u32 posROE, P(i64) carryShuttle __asm("s_setprio 0"); #endif + // Shuffle carries up + shufl_carries_up(lds, carry, me, lowMe); + // Wait until our carries are ready + if (me < G_W) { #if OLD_FENCE - if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } - // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); - bar(); - read_mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me == 0) ready[gr - 1] = 0; + if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } + // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); + bar(); + read_mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for 
next iteration + if (me == 0) ready[gr - 1] = 0; #else - u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; - if (me % WAVEFRONT == 0) { - do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); - } - mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; + u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (me % WAVEFRONT == 0) { + do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); + } + mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; #endif #if HAS_ASM - __asm("s_setprio 1"); + __asm("s_setprio 1"); #endif - // Read from the carryShuttle carries produced by the previous WIDTH row. Rotate carries from the last WIDTH row. - // The new carry layout lets the compiler generate global_load_dwordx4 instructions. - if (gr < H) { - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]; - } - } else { + // Read from the carryShuttle carries produced by the previous WIDTH group. Rotate carries from the last WIDTH line. + if (gr < H / WMUL) { + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]); + } + } else { #if !OLD_FENCE - // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. - bar(); + // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. 
+ bar(); #endif - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]; - } + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]); + } - if (me == 0) { - carry[NW] = carry[NW-1]; - for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } - carry[0] = carry[NW]; + if (me == 0) { + carry[NW] = carry[NW-1]; + for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } + carry[0] = carry[NW]; + } } } @@ -627,11 +675,9 @@ KERNEL(G_W) carryFused(P(GF31) out, CP(GF31) in, u32 posROE, P(i64) carryShuttle if (weight_shift > 31) weight_shift -= 31; } - bar(); - - new_fft_WIDTH2(lds, u, smallTrig); + new_fft_WIDTH2(lds, u, smallTrig, WMUL, SHUFL_BYTES_W, lowMe); - writeCarryFusedLine(u, out, line); + writeCarryFusedLine(u, out, line, lowMe); } @@ -643,13 +689,9 @@ KERNEL(G_W) carryFused(P(GF31) out, CP(GF31) in, u32 posROE, P(i64) carryShuttle // The "carryFused" is equivalent to the sequence: fftW, carryA, carryB, fftPremul. 
// It uses "stairway forwarding" (forwarding carry data from one workgroup to the next) -KERNEL(G_W) carryFused(P(GF61) out, CP(GF61) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, TrigGF61 smallTrig, P(uint) bufROE) { - -#if 0 // fft_WIDTH uses shufl_int instead of shufl - local GF61 lds[WIDTH / 4]; -#else - local GF61 lds[WIDTH / 2]; -#endif +KERNEL(G_W * WMUL) carryFused(P(GF61) out, CP(GF61) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, TrigGF61 smallTrig, P(uint) bufROE) { + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes needed for each WMUL workgroup + local GF61 lds[WMUL * lds_bytes / sizeof(GF61)]; GF61 u[NW]; @@ -657,23 +699,25 @@ KERNEL(G_W) carryFused(P(GF61) out, CP(GF61) in, u32 posROE, P(i64) carryShuttle u32 me = get_local_id(0); u32 H = BIG_HEIGHT; +#if WMUL == 1 + u32 lowMe = me; u32 line = gr % H; +#else + u32 lowMe = me % G_W; // lane-id in one of the WMUL sub-workgroups. + u32 line = (gr * WMUL + me / G_W) % H; +#endif #if HAS_ASM __asm("s_setprio 3"); #endif - readCarryFusedLine(in, u, line); + readCarryFusedLine(in, u, line, lowMe); // Try this weird FFT_width call that adds a "hidden zero" when unrolling. This prevents the compiler from finding // common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs // which causes a terrible reduction in occupancy. 
-#if ZEROHACK_W - u32 zerohack = get_group_id(0) / 131072; - new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack); -#else - new_fft_WIDTH1(lds, u, smallTrig); -#endif + u32 zerohack = ZEROHACK_W * (u32) get_group_id(0) / 131072; + new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack, WMUL, SHUFL_BYTES_W, lowMe); Word2 wu[NW]; @@ -685,17 +729,10 @@ KERNEL(G_W) carryFused(P(GF61) out, CP(GF61) in, u32 posROE, P(i64) carryShuttle CFcarry carry[NW+1]; #endif -#if AMDGPU -#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions -//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 -#else -#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better -#endif - u32 roundMax = 0; float carryMax = 0; - u32 word_index = (me * H + line) * 2; + u32 word_index = (lowMe * H + line) * 2; // Weight is 2^[ceil(qj / n) - qj/n] where j is the word index, q is the Mersenne exponent, and n is the number of words. // Weights can be applied with shifts because 2 is the 60th root GF61. @@ -758,28 +795,28 @@ KERNEL(G_W) carryFused(P(GF61) out, CP(GF61) in, u32 posROE, P(i64) carryShuttle updateStats(bufROE, posROE, carryMax); #endif - // Write out our carries. Only groups 0 to H-1 need to write carries out. - // Group H is a duplicate of group 0 (producing the same results) so we don't care about group H writing out, + // Write out our carries for the last line in this group. Only groups 0 to H/WMUL-1 need to write carries out. + // Group H/WMUL is a duplicate of group 0 (producing the same results) so we don't care about that group writing out, // but it's fine either way. 
- if (gr < H) { for (i32 i = 0; i < NW; ++i) { carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(me, i)] = carry[i]; } } + if (gr < H / WMUL && me >= (WMUL-1) * G_W) { + for (i32 i = 0; i < NW; ++i) { L2STORE(&carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(lowMe, i)], carry[i]); } - // Tell next line that its carries are ready - if (gr < H) { + // Tell next group that its carries are ready #if OLD_FENCE // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); write_mem_fence(CLK_GLOBAL_MEM_FENCE); - bar(); - if (me == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } + bar(G_W); + if (lowMe == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } #else write_mem_fence(CLK_GLOBAL_MEM_FENCE); - if (me % WAVEFRONT == 0) { - u32 pos = gr * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (lowMe % WAVEFRONT == 0) { + u32 pos = gr * (G_W / WAVEFRONT) + lowMe / WAVEFRONT; atomic_store((atomic_uint *) &ready[pos], 1); } #endif } - // Line zero will be redone when gr == H + // Group zero will be redone when gr == H / WMUL if (gr == 0) { return; } // Do some work while our carries may not be ready @@ -787,48 +824,53 @@ KERNEL(G_W) carryFused(P(GF61) out, CP(GF61) in, u32 posROE, P(i64) carryShuttle __asm("s_setprio 0"); #endif + // Shuffle carries up + shufl_carries_up(lds, carry, me, lowMe); + // Wait until our carries are ready + if (me < G_W) { #if OLD_FENCE - if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } - // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); - bar(); - read_mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me == 0) ready[gr - 1] = 0; + if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } + // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); + bar(); + read_mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for 
next iteration + if (me == 0) ready[gr - 1] = 0; #else - u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; - if (me % WAVEFRONT == 0) { - do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); - } - mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; + u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (me % WAVEFRONT == 0) { + do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); + } + mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; #endif #if HAS_ASM - __asm("s_setprio 1"); + __asm("s_setprio 1"); #endif - // Read from the carryShuttle carries produced by the previous WIDTH row. Rotate carries from the last WIDTH row. - // The new carry layout lets the compiler generate global_load_dwordx4 instructions. - if (gr < H) { - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]; - } - } else { + // Read from the carryShuttle carries produced by the previous WIDTH group. Rotate carries from the last WIDTH line. + // The new carry layout lets the AMD compiler generate global_load_dwordx4 instructions. + if (gr < H / WMUL) { + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]); + } + } else { #if !OLD_FENCE - // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. - bar(); + // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. 
+ bar(); #endif - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]; - } + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]); + } - if (me == 0) { - carry[NW] = carry[NW-1]; - for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } - carry[0] = carry[NW]; + if (me == 0) { + carry[NW] = carry[NW-1]; + for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } + carry[0] = carry[NW]; + } } } @@ -848,11 +890,9 @@ KERNEL(G_W) carryFused(P(GF61) out, CP(GF61) in, u32 posROE, P(i64) carryShuttle if (weight_shift > 61) weight_shift -= 61; } - bar(); - - new_fft_WIDTH2(lds, u, smallTrig); + new_fft_WIDTH2(lds, u, smallTrig, WMUL, SHUFL_BYTES_W, lowMe); - writeCarryFusedLine(u, out, line); + writeCarryFusedLine(u, out, line, lowMe); } @@ -864,10 +904,10 @@ KERNEL(G_W) carryFused(P(GF61) out, CP(GF61) in, u32 posROE, P(i64) carryShuttle // The "carryFused" is equivalent to the sequence: fftW, carryA, carryB, fftPremul. 
// It uses "stairway forwarding" (forwarding carry data from one workgroup to the next) -KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, - CP(u32) bits, ConstBigTab CONST_THREAD_WEIGHTS, BigTab THREAD_WEIGHTS, P(uint) bufROE) { - - local T2 lds[WIDTH / 2]; +KERNEL(G_W * WMUL) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, + ConstBigTab CONST_THREAD_WEIGHTS, BigTab THREAD_WEIGHTS, P(uint) bufROE) { + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes needed for each WMUL workgroup + local T2 lds[WMUL * lds_bytes / sizeof(T2)]; local GF31 *lds31 = (local GF31 *) lds; T2 u[NW]; @@ -877,7 +917,13 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( u32 me = get_local_id(0); u32 H = BIG_HEIGHT; +#if WMUL == 1 + u32 lowMe = me; u32 line = gr % H; +#else + u32 lowMe = me % G_W; // lane-id in one of the WMUL sub-workgroups. + u32 line = (gr * WMUL + me / G_W) % H; +#endif CP(GF31) in31 = (CP(GF31)) (in + DISTGF31); P(GF31) out31 = (P(GF31)) (out + DISTGF31); @@ -887,43 +933,30 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( __asm("s_setprio 3"); #endif - readCarryFusedLine(in, u, line); - readCarryFusedLine(in31, u31, line); + readCarryFusedLine(in, u, line, lowMe); + readCarryFusedLine(in31, u31, line, lowMe); // Try this weird FFT_width call that adds a "hidden zero" when unrolling. This prevents the compiler from finding // common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs // which causes a terrible reduction in occupancy. 
-#if ZEROHACK_W - u32 zerohack = get_group_id(0) / 131072; - new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack); - bar(); - new_fft_WIDTH1(lds31 + zerohack, u31, smallTrig31 + zerohack); -#else - new_fft_WIDTH1(lds, u, smallTrig); - bar(); - new_fft_WIDTH1(lds31, u31, smallTrig31); -#endif + u32 zerohack = ZEROHACK_W * (u32) get_group_id(0) / 131072; + new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack, WMUL, SHUFL_BYTES_W, lowMe); + new_fft_WIDTH1(lds31 + zerohack, u31, smallTrig31 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); Word2 wu[NW]; #if AMDGPU - T2 weights = fancyMul(THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); + T2 weights = fancyMul(THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); #else - T2 weights = fancyMul(CONST_THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights + T2 weights = fancyMul(CONST_THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights #endif + P(i64) carryShuttlePtr = (P(i64)) carryShuttle; i64 carry[NW+1]; -#if AMDGPU -#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions -//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 -#else -#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better -#endif - float roundMax = 0; float carryMax = 0; - u32 word_index = (me * H + line) * 2; + u32 word_index = (lowMe * H + line) * 2; // Weight is 2^[ceil(qj / n) - qj/n] where j is the word index, q is the Mersenne exponent, and n is the number of words. // Let s be the shift amount for word 1. The shift amount for word x is ceil(x * (s - 1) + num_big_words_less_than_x) % 31. 
@@ -987,28 +1020,28 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( updateStats(bufROE, posROE, carryMax); #endif - // Write out our carries. Only groups 0 to H-1 need to write carries out. - // Group H is a duplicate of group 0 (producing the same results) so we don't care about group H writing out, + // Write out our carries for the last line in this group. Only groups 0 to H/WMUL-1 need to write carries out. + // Group H/WMUL is a duplicate of group 0 (producing the same results) so we don't care about that group writing out, // but it's fine either way. - if (gr < H) { for (i32 i = 0; i < NW; ++i) { carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(me, i)] = carry[i]; } } + if (gr < H / WMUL && me >= (WMUL-1) * G_W) { + for (i32 i = 0; i < NW; ++i) { L2STORE(&carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(lowMe, i)], carry[i]); } - // Tell next line that its carries are ready - if (gr < H) { + // Tell next group that its carries are ready #if OLD_FENCE // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); write_mem_fence(CLK_GLOBAL_MEM_FENCE); - bar(); - if (me == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } + bar(G_W); + if (lowMe == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } #else write_mem_fence(CLK_GLOBAL_MEM_FENCE); - if (me % WAVEFRONT == 0) { - u32 pos = gr * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (lowMe % WAVEFRONT == 0) { + u32 pos = gr * (G_W / WAVEFRONT) + lowMe / WAVEFRONT; atomic_store((atomic_uint *) &ready[pos], 1); } #endif } - // Line zero will be redone when gr == H + // Group zero will be redone when gr == H / WMUL if (gr == 0) { return; } // Do some work while our carries may not be ready @@ -1024,48 +1057,53 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( u[i] = U2(weight1, weight2); } + // Shuffle carries up + shufl_carries_up(lds, carry, me, lowMe); + // Wait until our carries are ready + if (me < G_W) { #if OLD_FENCE - if (me == 0) { do { 
spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } - // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); - bar(); - read_mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me == 0) ready[gr - 1] = 0; + if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } + // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); + bar(); + read_mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me == 0) ready[gr - 1] = 0; #else - u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; - if (me % WAVEFRONT == 0) { - do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); - } - mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; + u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (me % WAVEFRONT == 0) { + do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); + } + mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; #endif #if HAS_ASM - __asm("s_setprio 1"); + __asm("s_setprio 1"); #endif - // Read from the carryShuttle carries produced by the previous WIDTH row. Rotate carries from the last WIDTH row. - // The new carry layout lets the compiler generate global_load_dwordx4 instructions. - if (gr < H) { - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]; - } - } else { + // Read from the carryShuttle carries produced by the previous WIDTH group. Rotate carries from the last WIDTH line. 
+ // The new carry layout lets the AMD compiler generate global_load_dwordx4 instructions. + if (gr < H / WMUL) { + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]); + } + } else { #if !OLD_FENCE - // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. - bar(); + // For gr==H/WMUL we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. + bar(); #endif - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]; - } + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]); + } - if (me == 0) { - carry[NW] = carry[NW-1]; - for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } - carry[0] = carry[NW]; + if (me == 0) { + carry[NW] = carry[NW-1]; + for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } + carry[0] = carry[NW]; + } } } @@ -1087,15 +1125,11 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( if (weight_shift > 31) weight_shift -= 31; } - bar(); - - new_fft_WIDTH2(lds, u, smallTrig); - writeCarryFusedLine(u, out, line); + new_fft_WIDTH2(lds, u, smallTrig, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(u, out, line, lowMe); - bar(); - - new_fft_WIDTH2(lds31, u31, smallTrig31); - writeCarryFusedLine(u31, out31, line); + new_fft_WIDTH2(lds31, u31, smallTrig31, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(u31, out31, line, lowMe); } @@ -1107,10 +1141,10 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( // The "carryFused" is equivalent to the sequence: fftW, carryA, carryB, fftPremul. 
// It uses "stairway forwarding" (forwarding carry data from one workgroup to the next) -KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, - CP(u32) bits, ConstBigTabFP32 CONST_THREAD_WEIGHTS, BigTabFP32 THREAD_WEIGHTS, P(uint) bufROE) { - - local F2 ldsF2[WIDTH / 2]; +KERNEL(G_W * WMUL) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, + ConstBigTabFP32 CONST_THREAD_WEIGHTS, BigTabFP32 THREAD_WEIGHTS, P(uint) bufROE) { + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes needed for each WMUL workgroup + local F2 ldsF2[WMUL * lds_bytes / sizeof(F2)]; local GF31 *lds31 = (local GF31 *) ldsF2; F2 uF2[NW]; @@ -1120,7 +1154,13 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( u32 me = get_local_id(0); u32 H = BIG_HEIGHT; +#if WMUL == 1 + u32 lowMe = me; u32 line = gr % H; +#else + u32 lowMe = me % G_W; // lane-id in one of the WMUL sub-workgroups. + u32 line = (gr * WMUL + me / G_W) % H; +#endif CP(F2) inF2 = (CP(F2)) in; P(F2) outF2 = (P(F2)) out; @@ -1133,43 +1173,30 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( __asm("s_setprio 3"); #endif - readCarryFusedLine(inF2, uF2, line); - readCarryFusedLine(in31, u31, line); + readCarryFusedLine(inF2, uF2, line, lowMe); + readCarryFusedLine(in31, u31, line, lowMe); // Try this weird FFT_width call that adds a "hidden zero" when unrolling. This prevents the compiler from finding // common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs // which causes a terrible reduction in occupancy. 
-#if ZEROHACK_W - u32 zerohack = get_group_id(0) / 131072; - new_fft_WIDTH1(ldsF2 + zerohack, uF2, smallTrigF2 + zerohack); - bar(); - new_fft_WIDTH1(lds31 + zerohack, u31, smallTrig31 + zerohack); -#else - new_fft_WIDTH1(ldsF2, uF2, smallTrigF2); - bar(); - new_fft_WIDTH1(lds31, u31, smallTrig31); -#endif + u32 zerohack = ZEROHACK_W * (u32) get_group_id(0) / 131072; + new_fft_WIDTH1(ldsF2 + zerohack, uF2, smallTrigF2 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); + new_fft_WIDTH1(lds31 + zerohack, u31, smallTrig31 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); Word2 wu[NW]; #if AMDGPU - F2 weights = fancyMul(THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); + F2 weights = fancyMul(THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); #else - F2 weights = fancyMul(CONST_THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights + F2 weights = fancyMul(CONST_THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights #endif + P(i32) carryShuttlePtr = (P(i32)) carryShuttle; i32 carry[NW+1]; -#if AMDGPU -#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions -//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 -#else -#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better -#endif - float roundMax = 0; float carryMax = 0; - u32 word_index = (me * H + line) * 2; + u32 word_index = (lowMe * H + line) * 2; // Weight is 2^[ceil(qj / n) - qj/n] where j is the word index, q is the Mersenne exponent, and n is the number of words. // Let s be the shift amount for word 1. The shift amount for word x is ceil(x * (s - 1) + num_big_words_less_than_x) % 31. 
@@ -1233,28 +1260,28 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( updateStats(bufROE, posROE, carryMax); #endif - // Write out our carries. Only groups 0 to H-1 need to write carries out. - // Group H is a duplicate of group 0 (producing the same results) so we don't care about group H writing out, + // Write out our carries for the last line in this group. Only groups 0 to H/WMUL-1 need to write carries out. + // Group H/WMUL is a duplicate of group 0 (producing the same results) so we don't care about that group writing out, // but it's fine either way. - if (gr < H) { for (i32 i = 0; i < NW; ++i) { carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(me, i)] = carry[i]; } } + if (gr < H / WMUL && me >= (WMUL-1) * G_W) { + for (i32 i = 0; i < NW; ++i) { L2STORE(&carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(lowMe, i)], carry[i]); } - // Tell next line that its carries are ready - if (gr < H) { + // Tell next group that its carries are ready #if OLD_FENCE // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); write_mem_fence(CLK_GLOBAL_MEM_FENCE); - bar(); - if (me == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } + bar(G_W); + if (lowMe == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } #else write_mem_fence(CLK_GLOBAL_MEM_FENCE); - if (me % WAVEFRONT == 0) { - u32 pos = gr * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (lowMe % WAVEFRONT == 0) { + u32 pos = gr * (G_W / WAVEFRONT) + lowMe / WAVEFRONT; atomic_store((atomic_uint *) &ready[pos], 1); } #endif } - // Line zero will be redone when gr == H + // Group zero will be redone when gr == H / WMUL if (gr == 0) { return; } // Do some work while our carries may not be ready @@ -1270,48 +1297,53 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( uF2[i] = U2(weight1, weight2); } + // Shuffle carries up + shufl_carries_up(ldsF2, carry, me, lowMe); + // Wait until our carries are ready + if (me < G_W) { #if OLD_FENCE - if (me == 0) { 
do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } - // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); - bar(); - read_mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me == 0) ready[gr - 1] = 0; + if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } + // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); + bar(); + read_mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me == 0) ready[gr - 1] = 0; #else - u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; - if (me % WAVEFRONT == 0) { - do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); - } - mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; + u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (me % WAVEFRONT == 0) { + do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); + } + mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; #endif #if HAS_ASM - __asm("s_setprio 1"); + __asm("s_setprio 1"); #endif - // Read from the carryShuttle carries produced by the previous WIDTH row. Rotate carries from the last WIDTH row. - // The new carry layout lets the compiler generate global_load_dwordx4 instructions. - if (gr < H) { - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]; - } - } else { + // Read from the carryShuttle carries produced by the previous WIDTH group. Rotate carries from the last WIDTH line. 
+ // The new carry layout lets the AMD compiler generate global_load_dwordx4 instructions. + if (gr < H / WMUL) { + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]); + } + } else { #if !OLD_FENCE - // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. - bar(); + // For gr==H/WMUL we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. + bar(); #endif - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]; - } + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]); + } - if (me == 0) { - carry[NW] = carry[NW-1]; - for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } - carry[0] = carry[NW]; + if (me == 0) { + carry[NW] = carry[NW-1]; + for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } + carry[0] = carry[NW]; + } } } @@ -1333,15 +1365,11 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( if (weight_shift > 31) weight_shift -= 31; } - bar(); - - new_fft_WIDTH2(ldsF2, uF2, smallTrigF2); - writeCarryFusedLine(uF2, outF2, line); + new_fft_WIDTH2(ldsF2, uF2, smallTrigF2, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(uF2, outF2, line, lowMe); - bar(); - - new_fft_WIDTH2(lds31, u31, smallTrig31); - writeCarryFusedLine(u31, out31, line); + new_fft_WIDTH2(lds31, u31, smallTrig31, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(u31, out31, line, lowMe); } @@ -1353,10 +1381,10 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( // The "carryFused" is equivalent to the sequence: fftW, carryA, carryB, fftPremul. 
// It uses "stairway forwarding" (forwarding carry data from one workgroup to the next) -KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, - CP(u32) bits, ConstBigTabFP32 CONST_THREAD_WEIGHTS, BigTabFP32 THREAD_WEIGHTS, P(uint) bufROE) { - - local GF61 lds61[WIDTH / 2]; +KERNEL(G_W * WMUL) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, + ConstBigTabFP32 CONST_THREAD_WEIGHTS, BigTabFP32 THREAD_WEIGHTS, P(uint) bufROE) { + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes needed for each WMUL workgroup + local GF61 lds61[WMUL * lds_bytes / sizeof(GF61)]; local F2 *ldsF2 = (local F2 *) lds61; F2 uF2[NW]; @@ -1366,7 +1394,13 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( u32 me = get_local_id(0); u32 H = BIG_HEIGHT; +#if WMUL == 1 + u32 lowMe = me; u32 line = gr % H; +#else + u32 lowMe = me % G_W; // lane-id in one of the WMUL sub-workgroups. + u32 line = (gr * WMUL + me / G_W) % H; +#endif CP(F2) inF2 = (CP(F2)) in; P(F2) outF2 = (P(F2)) out; @@ -1379,43 +1413,30 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( __asm("s_setprio 3"); #endif - readCarryFusedLine(inF2, uF2, line); - readCarryFusedLine(in61, u61, line); + readCarryFusedLine(inF2, uF2, line, lowMe); + readCarryFusedLine(in61, u61, line, lowMe); // Try this weird FFT_width call that adds a "hidden zero" when unrolling. This prevents the compiler from finding // common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs // which causes a terrible reduction in occupancy. 
-#if ZEROHACK_W - u32 zerohack = get_group_id(0) / 131072; - new_fft_WIDTH1(ldsF2 + zerohack, uF2, smallTrigF2 + zerohack); - bar(); - new_fft_WIDTH1(lds61 + zerohack, u61, smallTrig61 + zerohack); -#else - new_fft_WIDTH1(ldsF2, uF2, smallTrigF2); - bar(); - new_fft_WIDTH1(lds61, u61, smallTrig61); -#endif + u32 zerohack = ZEROHACK_W * (u32) get_group_id(0) / 131072; + new_fft_WIDTH1(ldsF2 + zerohack, uF2, smallTrigF2 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); + new_fft_WIDTH1(lds61 + zerohack, u61, smallTrig61 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); Word2 wu[NW]; #if AMDGPU - F2 weights = fancyMul(THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); + F2 weights = fancyMul(THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); #else - F2 weights = fancyMul(CONST_THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights + F2 weights = fancyMul(CONST_THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights #endif + P(i64) carryShuttlePtr = (P(i64)) carryShuttle; i64 carry[NW+1]; -#if AMDGPU -#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions -//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 -#else -#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better -#endif - float roundMax = 0; float carryMax = 0; - u32 word_index = (me * H + line) * 2; + u32 word_index = (lowMe * H + line) * 2; // Weight is 2^[ceil(qj / n) - qj/n] where j is the word index, q is the Mersenne exponent, and n is the number of words. // Let s be the shift amount for word 1. The shift amount for word x is ceil(x * (s - 1) + num_big_words_less_than_x) % 61. 
@@ -1479,28 +1500,28 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( updateStats(bufROE, posROE, carryMax); #endif - // Write out our carries. Only groups 0 to H-1 need to write carries out. - // Group H is a duplicate of group 0 (producing the same results) so we don't care about group H writing out, + // Write out our carries for the last line in this group. Only groups 0 to H/WMUL-1 need to write carries out. + // Group H/WMUL is a duplicate of group 0 (producing the same results) so we don't care about that group writing out, // but it's fine either way. - if (gr < H) { for (i32 i = 0; i < NW; ++i) { carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(me, i)] = carry[i]; } } + if (gr < H / WMUL && me >= (WMUL-1) * G_W) { + for (i32 i = 0; i < NW; ++i) { L2STORE(&carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(lowMe, i)], carry[i]); } - // Tell next line that its carries are ready - if (gr < H) { + // Tell next group that its carries are ready #if OLD_FENCE // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); write_mem_fence(CLK_GLOBAL_MEM_FENCE); - bar(); - if (me == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } + bar(G_W); + if (lowMe == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } #else write_mem_fence(CLK_GLOBAL_MEM_FENCE); - if (me % WAVEFRONT == 0) { - u32 pos = gr * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (lowMe % WAVEFRONT == 0) { + u32 pos = gr * (G_W / WAVEFRONT) + lowMe / WAVEFRONT; atomic_store((atomic_uint *) &ready[pos], 1); } #endif } - // Line zero will be redone when gr == H + // Group zero will be redone when gr == H / WMUL if (gr == 0) { return; } // Do some work while our carries may not be ready @@ -1516,48 +1537,53 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( uF2[i] = U2(weight1, weight2); } + // Shuffle carries up + shufl_carries_up(lds61, carry, me, lowMe); + // Wait until our carries are ready + if (me < G_W) { #if OLD_FENCE - if (me == 0) { 
do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } - // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); - bar(); - read_mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me == 0) ready[gr - 1] = 0; + if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } + // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); + bar(); + read_mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me == 0) ready[gr - 1] = 0; #else - u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; - if (me % WAVEFRONT == 0) { - do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); - } - mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; + u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (me % WAVEFRONT == 0) { + do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); + } + mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; #endif #if HAS_ASM - __asm("s_setprio 1"); + __asm("s_setprio 1"); #endif - // Read from the carryShuttle carries produced by the previous WIDTH row. Rotate carries from the last WIDTH row. - // The new carry layout lets the compiler generate global_load_dwordx4 instructions. - if (gr < H) { - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]; - } - } else { + // Read from the carryShuttle carries produced by the previous WIDTH group. Rotate carries from the last WIDTH line. 
+ // The new carry layout lets the AMD compiler generate global_load_dwordx4 instructions. + if (gr < H / WMUL) { + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]); + } + } else { #if !OLD_FENCE - // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. - bar(); + // For gr==H/WMUL we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. + bar(); #endif - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]; - } + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]); + } - if (me == 0) { - carry[NW] = carry[NW-1]; - for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } - carry[0] = carry[NW]; + if (me == 0) { + carry[NW] = carry[NW-1]; + for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } + carry[0] = carry[NW]; + } } } @@ -1579,15 +1605,11 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( if (weight_shift > 61) weight_shift -= 61; } - bar(); + new_fft_WIDTH2(ldsF2, uF2, smallTrigF2, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(uF2, outF2, line, lowMe); - new_fft_WIDTH2(ldsF2, uF2, smallTrigF2); - writeCarryFusedLine(uF2, outF2, line); - - bar(); - - new_fft_WIDTH2(lds61, u61, smallTrig61); - writeCarryFusedLine(u61, out61, line); + new_fft_WIDTH2(lds61, u61, smallTrig61, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(u61, out61, line, lowMe); } @@ -1599,13 +1621,9 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( // The "carryFused" is equivalent to the sequence: fftW, carryA, carryB, fftPremul. 
// It uses "stairway forwarding" (forwarding carry data from one workgroup to the next) -KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, P(uint) bufROE) { - -#if 0 // fft_WIDTH uses shufl_int instead of shufl - local GF61 lds61[WIDTH / 4]; -#else - local GF61 lds61[WIDTH / 2]; -#endif +KERNEL(G_W * WMUL) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, P(uint) bufROE) { + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes needed for each WMUL workgroup + local GF61 lds61[WMUL * lds_bytes / sizeof(GF61)]; local GF31 *lds31 = (local GF31 *) lds61; GF31 u31[NW]; @@ -1615,7 +1633,13 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( u32 me = get_local_id(0); u32 H = BIG_HEIGHT; +#if WMUL == 1 + u32 lowMe = me; u32 line = gr % H; +#else + u32 lowMe = me % G_W; // lane-id in one of the WMUL sub-workgroups. + u32 line = (gr * WMUL + me / G_W) % H; +#endif CP(GF31) in31 = (CP(GF31)) (in + DISTGF31); P(GF31) out31 = (P(GF31)) (out + DISTGF31); @@ -1628,38 +1652,25 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( __asm("s_setprio 3"); #endif - readCarryFusedLine(in31, u31, line); - readCarryFusedLine(in61, u61, line); + readCarryFusedLine(in31, u31, line, lowMe); + readCarryFusedLine(in61, u61, line, lowMe); // Try this weird FFT_width call that adds a "hidden zero" when unrolling. This prevents the compiler from finding // common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs // which causes a terrible reduction in occupancy. 
-#if ZEROHACK_W - u32 zerohack = get_group_id(0) / 131072; - new_fft_WIDTH1(lds31 + zerohack, u31, smallTrig31 + zerohack); - bar(); - new_fft_WIDTH1(lds61 + zerohack, u61, smallTrig61 + zerohack); -#else - new_fft_WIDTH1(lds31, u31, smallTrig31); - bar(); - new_fft_WIDTH1(lds61, u61, smallTrig61); -#endif + u32 zerohack = ZEROHACK_W * (u32) get_group_id(0) / 131072; + new_fft_WIDTH1(lds31 + zerohack, u31, smallTrig31 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); + new_fft_WIDTH1(lds61 + zerohack, u61, smallTrig61 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); Word2 wu[NW]; + P(i64) carryShuttlePtr = (P(i64)) carryShuttle; i64 carry[NW+1]; -#if AMDGPU -#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions -//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 -#else -#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better -#endif - u32 roundMax = 0; float carryMax = 0; - u32 word_index = (me * H + line) * 2; + u32 word_index = (lowMe * H + line) * 2; // Weight is 2^[ceil(qj / n) - qj/n] where j is the word index, q is the Mersenne exponent, and n is the number of words. // Let s be the shift amount for word 1. The shift amount for word x is ceil(x * (s - 1) + num_big_words_less_than_x) % 31. @@ -1738,28 +1749,28 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( updateStats(bufROE, posROE, carryMax); #endif - // Write out our carries. Only groups 0 to H-1 need to write carries out. - // Group H is a duplicate of group 0 (producing the same results) so we don't care about group H writing out, + // Write out our carries for the last line in this group. Only groups 0 to H/WMUL-1 need to write carries out. + // Group H/WMUL is a duplicate of group 0 (producing the same results) so we don't care about that group writing out, // but it's fine either way. 
- if (gr < H) { for (i32 i = 0; i < NW; ++i) { carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(me, i)] = carry[i]; } } + if (gr < H / WMUL && me >= (WMUL-1) * G_W) { + for (i32 i = 0; i < NW; ++i) { L2STORE(&carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(lowMe, i)], carry[i]); } - // Tell next line that its carries are ready - if (gr < H) { + // Tell next group that its carries are ready #if OLD_FENCE // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); write_mem_fence(CLK_GLOBAL_MEM_FENCE); - bar(); - if (me == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } + bar(G_W); + if (lowMe == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } #else write_mem_fence(CLK_GLOBAL_MEM_FENCE); - if (me % WAVEFRONT == 0) { - u32 pos = gr * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (lowMe % WAVEFRONT == 0) { + u32 pos = gr * (G_W / WAVEFRONT) + lowMe / WAVEFRONT; atomic_store((atomic_uint *) &ready[pos], 1); } #endif } - // Line zero will be redone when gr == H + // Group zero will be redone when gr == H / WMUL if (gr == 0) { return; } // Do some work while our carries may not be ready @@ -1767,48 +1778,53 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( __asm("s_setprio 0"); #endif + // Shuffle carries up + shufl_carries_up(lds61, carry, me, lowMe); + // Wait until our carries are ready + if (me < G_W) { #if OLD_FENCE - if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } - // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); - bar(); - read_mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me == 0) ready[gr - 1] = 0; + if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } + // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); + bar(); + read_mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag 
for next iteration + if (me == 0) ready[gr - 1] = 0; #else - u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; - if (me % WAVEFRONT == 0) { - do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); - } - mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; + u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (me % WAVEFRONT == 0) { + do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); + } + mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; #endif #if HAS_ASM - __asm("s_setprio 1"); + __asm("s_setprio 1"); #endif - // Read from the carryShuttle carries produced by the previous WIDTH row. Rotate carries from the last WIDTH row. - // The new carry layout lets the compiler generate global_load_dwordx4 instructions. - if (gr < H) { - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]; - } - } else { + // Read from the carryShuttle carries produced by the previous WIDTH group. Rotate carries from the last WIDTH line. + // The new carry layout lets the AMD compiler generate global_load_dwordx4 instructions. + if (gr < H / WMUL) { + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]); + } + } else { #if !OLD_FENCE - // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. - bar(); + // For gr==H/WMUL we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. 
+ bar(); #endif - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]; - } + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]); + } - if (me == 0) { - carry[NW] = carry[NW-1]; - for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } - carry[0] = carry[NW]; + if (me == 0) { + carry[NW] = carry[NW-1]; + for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } + carry[0] = carry[NW]; + } } } @@ -1836,15 +1852,11 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( m61_weight_shift = adjust_m61_weight_shift(m61_weight_shift); } - bar(); - - new_fft_WIDTH2(lds31, u31, smallTrig31); - writeCarryFusedLine(u31, out31, line); - - bar(); + new_fft_WIDTH2(lds31, u31, smallTrig31, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(u31, out31, line, lowMe); - new_fft_WIDTH2(lds61, u61, smallTrig61); - writeCarryFusedLine(u61, out61, line); + new_fft_WIDTH2(lds61, u61, smallTrig61, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(u61, out61, line, lowMe); } @@ -1856,14 +1868,10 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( // The "carryFused" is equivalent to the sequence: fftW, carryA, carryB, fftPremul. 
// It uses "stairway forwarding" (forwarding carry data from one workgroup to the next) -KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, - CP(u32) bits, ConstBigTabFP32 CONST_THREAD_WEIGHTS, BigTabFP32 THREAD_WEIGHTS, P(uint) bufROE) { - -#if 0 // fft_WIDTH uses shufl_int instead of shufl - local GF61 lds61[WIDTH / 4]; -#else - local GF61 lds61[WIDTH / 2]; -#endif +KERNEL(G_W * WMUL) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, + ConstBigTabFP32 CONST_THREAD_WEIGHTS, BigTabFP32 THREAD_WEIGHTS, P(uint) bufROE) { + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes needed for each WMUL workgroup + local GF61 lds61[WMUL * lds_bytes / sizeof(GF61)]; local F2 *ldsF2 = (local F2 *) lds61; local GF31 *lds31 = (local GF31 *) lds61; @@ -1875,7 +1883,13 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( u32 me = get_local_id(0); u32 H = BIG_HEIGHT; +#if WMUL == 1 + u32 lowMe = me; u32 line = gr % H; +#else + u32 lowMe = me % G_W; // lane-id in one of the WMUL sub-workgroups. + u32 line = (gr * WMUL + me / G_W) % H; +#endif CP(F2) inF2 = (CP(F2)) in; P(F2) outF2 = (P(F2)) out; @@ -1891,48 +1905,32 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( __asm("s_setprio 3"); #endif - readCarryFusedLine(inF2, uF2, line); - readCarryFusedLine(in31, u31, line); - readCarryFusedLine(in61, u61, line); + readCarryFusedLine(inF2, uF2, line, lowMe); + readCarryFusedLine(in31, u31, line, lowMe); + readCarryFusedLine(in61, u61, line, lowMe); // Try this weird FFT_width call that adds a "hidden zero" when unrolling. This prevents the compiler from finding // common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs // which causes a terrible reduction in occupancy. 
-#if ZEROHACK_W - u32 zerohack = get_group_id(0) / 131072; - new_fft_WIDTH1(ldsF2 + zerohack, uF2, smallTrigF2 + zerohack); - bar(); - new_fft_WIDTH1(lds31 + zerohack, u31, smallTrig31 + zerohack); - bar(); - new_fft_WIDTH1(lds61 + zerohack, u61, smallTrig61 + zerohack); -#else - new_fft_WIDTH1(ldsF2, uF2, smallTrigF2); - bar(); - new_fft_WIDTH1(lds31, u31, smallTrig31); - bar(); - new_fft_WIDTH1(lds61, u61, smallTrig61); -#endif + u32 zerohack = ZEROHACK_W * (u32) get_group_id(0) / 131072; + new_fft_WIDTH1(ldsF2 + zerohack, uF2, smallTrigF2 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); + new_fft_WIDTH1(lds31 + zerohack, u31, smallTrig31 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); + new_fft_WIDTH1(lds61 + zerohack, u61, smallTrig61 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); Word2 wu[NW]; #if AMDGPU - F2 weights = fancyMul(THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); + F2 weights = fancyMul(THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); #else - F2 weights = fancyMul(CONST_THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights + F2 weights = fancyMul(CONST_THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights #endif + P(i64) carryShuttlePtr = (P(i64)) carryShuttle; i64 carry[NW+1]; -#if AMDGPU -#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions -//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 -#else -#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better -#endif - float roundMax = 0; float carryMax = 0; - u32 word_index = (me * H + line) * 2; + u32 word_index = (lowMe * H + line) * 2; // Weight is 2^[ceil(qj / n) - qj/n] where j is the word index, q is the Mersenne exponent, and n is the number of words. // Let s be the shift amount for word 1. 
The shift amount for word x is ceil(x * (s - 1) + num_big_words_less_than_x) % 31. @@ -2013,28 +2011,28 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( updateStats(bufROE, posROE, carryMax); #endif - // Write out our carries. Only groups 0 to H-1 need to write carries out. - // Group H is a duplicate of group 0 (producing the same results) so we don't care about group H writing out, + // Write out our carries for the last line in this group. Only groups 0 to H/WMUL-1 need to write carries out. + // Group H/WMUL is a duplicate of group 0 (producing the same results) so we don't care about that group writing out, // but it's fine either way. - if (gr < H) { for (i32 i = 0; i < NW; ++i) { carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(me, i)] = carry[i]; } } + if (gr < H / WMUL && me >= (WMUL-1) * G_W) { + for (i32 i = 0; i < NW; ++i) { L2STORE(&carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(lowMe, i)], carry[i]); } - // Tell next line that its carries are ready - if (gr < H) { + // Tell next group that its carries are ready #if OLD_FENCE // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); write_mem_fence(CLK_GLOBAL_MEM_FENCE); - bar(); - if (me == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } + bar(G_W); + if (lowMe == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } #else write_mem_fence(CLK_GLOBAL_MEM_FENCE); - if (me % WAVEFRONT == 0) { - u32 pos = gr * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (lowMe % WAVEFRONT == 0) { + u32 pos = gr * (G_W / WAVEFRONT) + lowMe / WAVEFRONT; atomic_store((atomic_uint *) &ready[pos], 1); } #endif } - // Line zero will be redone when gr == H + // Group zero will be redone when gr == H / WMUL if (gr == 0) { return; } // Do some work while our carries may not be ready @@ -2050,48 +2048,53 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( uF2[i] = U2(weight1, weight2); } + // Shuffle carries up + shufl_carries_up(lds61, carry, me, lowMe); + 
// Wait until our carries are ready + if (me < G_W) { #if OLD_FENCE - if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } - // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); - bar(); - read_mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me == 0) ready[gr - 1] = 0; + if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } + // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); + bar(); + read_mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me == 0) ready[gr - 1] = 0; #else - u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; - if (me % WAVEFRONT == 0) { - do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); - } - mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; + u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (me % WAVEFRONT == 0) { + do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); + } + mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; #endif #if HAS_ASM - __asm("s_setprio 1"); + __asm("s_setprio 1"); #endif - // Read from the carryShuttle carries produced by the previous WIDTH row. Rotate carries from the last WIDTH row. - // The new carry layout lets the compiler generate global_load_dwordx4 instructions. 
- if (gr < H) { - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]; - } - } else { + // Read from the carryShuttle carries produced by the previous WIDTH group. Rotate carries from the last WIDTH line. + // The new carry layout lets the AMD compiler generate global_load_dwordx4 instructions. + if (gr < H / WMUL) { + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]); + } + } else { #if !OLD_FENCE - // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. - bar(); + // For gr==H/WMUL we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. + bar(); #endif - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]; - } + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]); + } - if (me == 0) { - carry[NW] = carry[NW-1]; - for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } - carry[0] = carry[NW]; + if (me == 0) { + carry[NW] = carry[NW-1]; + for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } + carry[0] = carry[NW]; + } } } @@ -2120,20 +2123,14 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( m61_weight_shift = adjust_m61_weight_shift(m61_weight_shift); } - bar(); - - new_fft_WIDTH2(ldsF2, uF2, smallTrigF2); - writeCarryFusedLine(uF2, outF2, line); + new_fft_WIDTH2(ldsF2, uF2, smallTrigF2, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(uF2, outF2, line, lowMe); - bar(); - - new_fft_WIDTH2(lds31, u31, smallTrig31); - writeCarryFusedLine(u31, out31, line); - - bar(); + new_fft_WIDTH2(lds31, u31, smallTrig31, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(u31, out31, line, lowMe); - 
new_fft_WIDTH2(lds61, u61, smallTrig61);
-  writeCarryFusedLine(u61, out61, line);
+  new_fft_WIDTH2(lds61, u61, smallTrig61, WMUL, SHUFL_BYTES_W, lowMe);
+  writeCarryFusedLine(u61, out61, line, lowMe);
 }
 
 
diff --git a/src/cl/fftbase.cl b/src/cl/fftbase.cl
index c955e803..9d2fde10 100644
--- a/src/cl/fftbase.cl
+++ b/src/cl/fftbase.cl
@@ -5,6 +5,119 @@
 #include "trig.cl"
 // #include "math.cl"
 
+
+#if FFT_FP64 | NTT_GF61
+
+// Shufl two or more fft_WIDTHs or FFT_HEIGHTs operating on 64-bit values. Each WG uses WG * sb bytes of LDS memory.
+// Care is taken that each simultaneous workgroup does not interfere with the LDS memory of other simultaneous workgroups --
+// even when operating on different sized data elements as can happen in an M31+M61 NTT.
+// WG = workgroup size of a single fft_WIDTH or fft_HEIGHT
+// n = sizeof array u (nW or nH). n * WG = WIDTH or HEIGHT
+// sb = The number of bytes to write to LDS memory at a time. SHUFL_BYTES_W or SHUFL_BYTES_H
+// numWG = number of fft_WIDTHs or fft_HEIGHTs being processed simultaneously
+// lowMe = me % WG
+// NOTE: shufl routines perform a bar(WG) at the start but not at the end. After calling shufl, a bar(WG) is required
+// before next LDS memory usage. All routines that use LDS memory MUST OBEY THIS PROTOCOL of bar() before LDS use and
+// only bar(WG) required before next use. ALSO NOTE: the first shufl call does not need to do bar(WG). A relatively
+// minor optimization would be to special case the first shufl call.
+void OVERLOAD shufl64(u32 WG, local T2 *lds2, T2 *u, u32 n, u32 f, u32 numWG, const u32 sb, u32 lowMe) { + + u32 mask = f - 1; + assert((mask & (mask + 1)) == 0); + + if (sb == 16) { + local T2* lds = ((local T2*) lds2); + if (numWG > 1) lds += ((u32) get_local_id(0) / WG) * n * WG * sb / sizeof(T2); + + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = u[i]; } + bar(WG); + for (u32 i = 0; i < n; ++i) { u[i] = lds[i * WG + lowMe]; } + } + + else if (sb == 8) { + // Accessing lds memory as doubles is faster than T2 accesses on Radeon VII (halving LDS memory requirements) + local T* lds = ((local T*) lds2); + if (numWG > 1) lds += ((u32) get_local_id(0) / WG) * n * WG * sb / sizeof(T); + + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = u[i].x; } + bar(WG); + for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + lowMe]; } + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = u[i].y; } + bar(WG); + for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + lowMe]; } + } + + else if (sb == 4) { + // Lower LDS requirements may let the optimizer use fewer VGPRs and increase occupancy for WIDTHs >= 1024. + // Alas, the increased occupancy does not offset extra code needed for shufl_int (the assembly + // code generated is not pretty). This might not be true for nVidia or future ROCm optimizers. 
+ local int* lds = (local int*) lds2; + if (numWG > 1) lds += ((u32) get_local_id(0) / WG) * n * WG * sb / sizeof(int); + + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = as_int4(u[i]).x; } + bar(WG); + for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.x = lds[i * WG + lowMe]; u[i] = as_double2(tmp); } + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = as_int4(u[i]).y; } + bar(WG); + for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.y = lds[i * WG + lowMe]; u[i] = as_double2(tmp); } + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = as_int4(u[i]).z; } + bar(WG); + for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.z = lds[i * WG + lowMe]; u[i] = as_double2(tmp); } + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = as_int4(u[i]).w; } + bar(WG); + for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.w = lds[i * WG + lowMe]; u[i] = as_double2(tmp); } + } +} + +#endif + + +#if FFT_FP32 | NTT_GF31 + +// Shufl two or more fft_WIDTHs or FFT_HEIGHTs using two 4-byte floats. +void OVERLOAD shufl32(u32 WG, local F2 *lds2, F2 *u, u32 n, u32 f, u32 numWG, const u32 sb, u32 lowMe) { + + u32 mask = f - 1; + assert((mask & (mask + 1)) == 0); + + //GW - would a 16 byte implementation be useful? 
+ + if (sb >= 8) { + local F2* lds = ((local F2*) lds2); + if (numWG > 1) lds += ((u32) get_local_id(0) / WG) * n * WG * sb / sizeof(F2); + + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = u[i]; } + bar(WG); + for (u32 i = 0; i < n; ++i) { u[i] = lds[i * WG + lowMe]; } + } + + else if (sb == 4) { + // Accessing lds memory as ints might be faster than F2 accesses (halving LDS memory requirements) + local F* lds = ((local F*) lds2); + if (numWG > 1) lds += ((u32) get_local_id(0) / WG) * n * WG * sb / sizeof(F); + + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = u[i].x; } + bar(WG); + for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + lowMe]; } + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = u[i].y; } + bar(WG); + for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + lowMe]; } + } +} + +#endif + + #if FFT_FP64 void OVERLOAD chainMul4(T2 *u, T2 w) { @@ -45,7 +158,7 @@ void OVERLOAD chainMul8(T2 *u, T2 w, u32 tailSquareBcast) { u[3] = cmulFancy(u[3], w3); w3.x += 1; - T2 base = cmulFancy (w3, w); + T2 base = cmulFancy(w3, w); for (int i = 4; i < 8; ++i) { u[i] = cmul(u[i], base); base = cmulFancy(base, w); @@ -106,83 +219,8 @@ T2 bcast(T2 src, u32 span) { #endif -void OVERLOAD shuflBigLDS(u32 WG, local T2 *lds, T2 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i]; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i] = lds[i * WG + me]; } -} - -void OVERLOAD shufl(u32 WG, local T2 *lds2, T2 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - local T* lds = (local T*) lds2; - - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].x; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; } - bar(); - for (u32 i 
= 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].y; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } -} - -// Same as shufl but use ints instead of doubles to reduce LDS memory requirements. -// Lower LDS requirements should let the optimizer use fewer VGPRs and increase occupancy for WIDTHs >= 1024. -// Alas, the increased occupancy does not offset extra code needed for shufl_int (the assembly -// code generated is not pretty). This might not be true for nVidia or future ROCm optimizers. -void OVERLOAD shufl_int(u32 WG, local T2 *lds2, T2 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - local int* lds = (local int*) lds2; - - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = as_int4(u[i]).x; } - bar(); - for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.x = lds[i * WG + me]; u[i] = as_double2(tmp); } - bar(); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = as_int4(u[i]).y; } - bar(); - for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.y = lds[i * WG + me]; u[i] = as_double2(tmp); } - bar(); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = as_int4(u[i]).z; } - bar(); - for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.z = lds[i * WG + me]; u[i] = as_double2(tmp); } - bar(); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = as_int4(u[i]).w; } - bar(); - for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.w = lds[i * WG + me]; u[i] = as_double2(tmp); } - bar(); // I'm not sure why this barrier call is needed -} - -// Shufl two simultaneous FFT_HEIGHTs. Needed for tailSquared where u and v are computed simultaneously in different threads. -// NOTE: It is very important for this routine to use lds memory in coordination with reverseLine2 and unreverseLine2. 
-// Failure to do so would result in the need for more bar() calls. Specifically, the u values are stored in the upper half -// of lds memory (first SMALL_HEIGHT T2 values). The v values are stored in the lower half of lds memory (next SMALL_HEIGHT T2 values). -void OVERLOAD shufl2(u32 WG, local T2 *lds2, T2 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - - // Partition lds memory into upper and lower halves - assert(WG == G_H); - - // Accessing lds memory as doubles is faster than T2 accesses - local T* lds = ((local T*) lds2) + (me / WG) * SMALL_HEIGHT; - - me = me % WG; - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].x; } - bar(WG); - for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; } - bar(WG); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].y; } - bar(WG); - for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } +void OVERLOAD shufl(u32 WG, local T2 *lds, T2 *u, u32 n, u32 f, u32 numWG, const u32 sb, u32 lowMe) { + shufl64(WG, lds, u, n, f, numWG, sb, lowMe); } void OVERLOAD tabMul(u32 WG, Trig trig, T2 *u, u32 n, u32 f, u32 me) { @@ -209,7 +247,7 @@ void OVERLOAD tabMul(u32 WG, Trig trig, T2 *u, u32 n, u32 f, u32 me) { if (TABMUL_CHAIN) { T2 w = trig[p]; - chainMul (n, u, w, 0); + chainMul(n, u, w, 0); return; } @@ -226,8 +264,7 @@ void OVERLOAD tabMul(u32 WG, Trig trig, T2 *u, u32 n, u32 f, u32 me) { } for (u32 i = 2; i < n; ++i) { - T2 base = trig[(i-1)*WG + p]; - u[i] = cmul(u[i], base); + u[i] = cmul(u[i], trig[(i-1)*WG + p]); } return; } @@ -250,7 +287,7 @@ T2 partial_cmul(T2 u, T sine_over_cosine) { #define X2_via_FMA(a, c, b) { T2 t = a; a = fma(c, b, t); b = fma(-c, b, t); } // Preload trig values for the first partial tabMul. We load the sine/cosine values early so that F64 ops can hide the read latency. 
-void preload_tabMul4_trig(u32 WG, Trig trig, T *preloads, u32 f, u32 me) { +void preload_tabMul4_trig(u32 WG, Trig trig, T *preloads, u32 f, u32 numWG, u32 me) { TrigSingle trig1 = (TrigSingle) trig; // Read 3 lines of sine/cosine values for the first fft4. Read two of the lines as a pair as AMD likes T2 global memory reads @@ -263,19 +300,20 @@ void preload_tabMul4_trig(u32 WG, Trig trig, T *preloads, u32 f, u32 me) { } // Do a partial tabMul. Save the mul-by-cosine for later FMA instructions. -void partial_tabMul4(u32 WG, local T2 *lds, Trig trig, T *preloads, T2 *u, u32 f, u32 me) { +void partial_tabMul4(u32 WG, local T2 *lds, Trig trig, T *preloads, T2 *u, u32 f, u32 numWG, u32 me) { local T *lds1 = (local T *) lds; TrigSingle trig1 = (TrigSingle) trig; trig1 += 4*WG; // Skip past sine_over_cosine values // Use LDS memory to distribute preloaded trig values. if (f > 1) { + bar(WG); lds1[me] = preloads[4]; // Preloaded sine/cosine values lds1[WG+me] = preloads[5]; // Preloaded cosine values - bar(WG); } // Apply sine/cosines + bar(WG); for (u32 i = 1; i < 4; ++i) { T sine_over_cosine; if (f == 1) sine_over_cosine = preloads[i-1]; @@ -299,13 +337,11 @@ void partial_tabMul4(u32 WG, local T2 *lds, Trig trig, T *preloads, T2 *u, u32 f preloads[2] = lds1[WG + ((me/f) & 3) * WG/4 + (2 * WG + me)/(4*f) * f/4]; preloads[3] = lds1[WG + ((me/f) & 3) * WG/4 + (3 * WG + me)/(4*f) * f/4]; preloads[1] = lds1[WG + ((me/f) & 3) * WG/4 + (1 * WG + me)/(4*f) * f/4]; - bar(WG); } } // Finish off a partial tabMul while doing next fft4 making more use of FMA. 
-void finish_tabMul4_fft4(u32 WG, local T2 *lds, Trig trig, T *preloads, T2 *u, u32 f, u32 me, u32 save_one_more_mul) { - local T *lds1 = (local T *) lds; +void finish_tabMul4_fft4(u32 WG, Trig trig, T *preloads, T2 *u, u32 f, u32 numWG, u32 me, u32 save_one_more_mul) { TrigSingle trig1 = (TrigSingle) trig; // @@ -338,7 +374,7 @@ void finish_tabMul4_fft4(u32 WG, local T2 *lds, Trig trig, T *preloads, T2 *u, u //************************************************************************************ // Preload trig values for the first partial tabMul. We load the sine/cosine values early so that F64 ops can hide the read latency. -void preload_tabMul8_trig(u32 WG, Trig trig, T *preloads, u32 f, u32 me) { +void preload_tabMul8_trig(u32 WG, Trig trig, T *preloads, u32 f, u32 numWG, u32 me) { TrigSingle trig1 = (TrigSingle) trig; // Read 7 lines of sine/cosine values for the first fft8. Read six of the lines as pairs as AMD likes T2 global memory reads @@ -353,19 +389,20 @@ void preload_tabMul8_trig(u32 WG, Trig trig, T *preloads, u32 f, u32 me) { } // Do a partial tabMul. Save the mul-by-cosine for later FMA instructions. -void partial_tabMul8(u32 WG, local T2 *lds, Trig trig, T *preloads, T2 *u, u32 f, u32 me) { +void partial_tabMul8(u32 WG, local T2 *lds, Trig trig, T *preloads, T2 *u, u32 f, u32 numWG, u32 me) { local T *lds1 = (local T *) lds; TrigSingle trig1 = (TrigSingle) trig; trig1 += 8*WG; // Skip past sine_over_cosine values // Use LDS memory to distribute preloaded trig values. 
if (f > 1) { + bar(WG); lds1[me] = preloads[8]; // Preloaded sine/cosine values lds1[WG+me] = preloads[9]; // Preloaded cosine values - bar(WG); } // Apply sine/cosines + bar(WG); for (u32 i = 1; i < 8; ++i) { T sine_over_cosine; if (f == 1) sine_over_cosine = preloads[i-1]; @@ -394,13 +431,11 @@ void partial_tabMul8(u32 WG, local T2 *lds, Trig trig, T *preloads, T2 *u, u32 f preloads[7] = lds1[WG + ((me/f) & 7) * WG/8 + (7 * WG + me)/(8*f) * f/8]; preloads[2] = lds1[WG + ((me/f) & 7) * WG/8 + (2 * WG + me)/(8*f) * f/8]; preloads[3] = lds1[WG + ((me/f) & 7) * WG/8 + (3 * WG + me)/(8*f) * f/8]; - bar(WG); } } // Finish off a partial tabMul while doing next fft8 making more use of FMA. -void finish_tabMul8_fft8(u32 WG, local T2 *lds, Trig trig, T *preloads, T2 *u, u32 f, u32 me, u32 save_one_more_mul) { - local T *lds1 = (local T *) lds; +void finish_tabMul8_fft8(u32 WG, Trig trig, T *preloads, T2 *u, u32 f, u32 numWG, u32 me, u32 save_one_more_mul) { TrigSingle trig1 = (TrigSingle) trig; // @@ -503,7 +538,7 @@ void OVERLOAD chainMul8(F2 *u, F2 w, u32 tailSquareBcast) { u[3] = cmulFancy(u[3], w3); w3.x += 1; - F2 base = cmulFancy (w3, w); + F2 base = cmulFancy(w3, w); for (int i = 4; i < 8; ++i) { u[i] = cmul(u[i], base); base = cmulFancy(base, w); @@ -517,55 +552,8 @@ void OVERLOAD chainMul(u32 len, F2 *u, F2 w, u32 tailSquareBcast) { if (len == 8) chainMul8(u, w, tailSquareBcast); } -void OVERLOAD shuflBigLDS(u32 WG, local F2 *lds, F2 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i]; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i] = lds[i * WG + me]; } -} - -void OVERLOAD shufl(u32 WG, local F2 *lds2, F2 *u, u32 n, u32 f) { //GWBUG - is shufl of int2 faster (BigLDS)? 
- u32 me = get_local_id(0); - local F* lds = (local F*) lds2; - - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].x; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; } - bar(); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].y; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } -} - -// Shufl two simultaneous FFT_HEIGHTs. Needed for tailSquared where u and v are computed simultaneously in different threads. -// NOTE: It is very important for this routine to use lds memory in coordination with reverseLine2 and unreverseLine2. -// Failure to do so would result in the need for more bar() calls. Specifically, the u values are stored in the upper half -// of lds memory (first SMALL_HEIGHT GF31 values). The v values are stored in the lower half of lds memory (next SMALL_HEIGHT GF31 values). -void OVERLOAD shufl2(u32 WG, local F2 *lds2, F2 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - - // Partition lds memory into upper and lower halves - assert(WG == G_H); - - // Accessing lds memory as F is faster than F2 accesses //GWBUG??? 
- local F* lds = ((local F*) lds2) + (me / WG) * SMALL_HEIGHT; - - me = me % WG; - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].x; } - bar(WG); - for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; } - bar(WG); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].y; } - bar(WG); - for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } +void OVERLOAD shufl(u32 WG, local F2 *lds, F2 *u, u32 n, u32 f, u32 numWG, const u32 sb, u32 lowMe) { + shufl32(WG, lds, u, n, f, numWG, sb, lowMe); } void OVERLOAD tabMul(u32 WG, TrigFP32 trig, F2 *u, u32 n, u32 f, u32 me) { @@ -574,7 +562,7 @@ void OVERLOAD tabMul(u32 WG, TrigFP32 trig, F2 *u, u32 n, u32 f, u32 me) { // This code uses chained complex multiplies which could be faster on GPUs with great mul throughput or poor memory bandwidth or caching. if (TABMUL_CHAIN32) { - chainMul (n, u, trig[p], 0); + chainMul(n, u, trig[p], 0); return; } @@ -632,55 +620,8 @@ void OVERLOAD chainMul(u32 len, GF31 *u, GF31 w) { if (len == 8) chainMul8(u, w); } -void OVERLOAD shuflBigLDS(u32 WG, local GF31 *lds, GF31 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i]; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i] = lds[i * WG + me]; } -} - -void OVERLOAD shufl(u32 WG, local GF31 *lds2, GF31 *u, u32 n, u32 f) { //GWBUG - is shufl of int2 faster (BigLDS)? 
- u32 me = get_local_id(0); - local Z31* lds = (local Z31*) lds2; - - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].x; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; } - bar(); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].y; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } -} - -// Shufl two simultaneous FFT_HEIGHTs. Needed for tailSquared where u and v are computed simultaneously in different threads. -// NOTE: It is very important for this routine to use lds memory in coordination with reverseLine2 and unreverseLine2. -// Failure to do so would result in the need for more bar() calls. Specifically, the u values are stored in the upper half -// of lds memory (first SMALL_HEIGHT GF31 values). The v values are stored in the lower half of lds memory (next SMALL_HEIGHT GF31 values). -void OVERLOAD shufl2(u32 WG, local GF31 *lds2, GF31 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - - // Partition lds memory into upper and lower halves - assert(WG == G_H); - - // Accessing lds memory as Z31s is faster than GF31 accesses //GWBUG??? 
- local Z31* lds = ((local Z31*) lds2) + (me / WG) * SMALL_HEIGHT; - - me = me % WG; - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].x; } - bar(WG); - for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; } - bar(WG); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].y; } - bar(WG); - for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } +void OVERLOAD shufl(u32 WG, local GF31 *lds, GF31 *u, u32 n, u32 f, u32 numWG, const u32 sb, u32 lowMe) { + shufl32(WG, (local F2 *) lds, (local F2 *) u, n, f, numWG, sb, lowMe); } void OVERLOAD tabMul(u32 WG, TrigGF31 trig, GF31 *u, u32 n, u32 f, u32 me) { @@ -689,7 +630,7 @@ void OVERLOAD tabMul(u32 WG, TrigGF31 trig, GF31 *u, u32 n, u32 f, u32 me) { // This code uses chained complex multiplies which could be faster on GPUs with great mul throughput or poor memory bandwidth or caching. if (TABMUL_CHAIN31) { - chainMul (n, u, trig[p]); + chainMul(n, u, trig[p]); return; } @@ -728,7 +669,7 @@ void OVERLOAD chainMul8(GF61 *u, GF61 w, u32 tailSquareBcast) { GF61 w2 = csq(w); u[2] = cmul(u[2], w2); - GF61 base = cmul (w2, w); //GWBUG - see FP64 version for many possible optimizations + GF61 base = cmul(w2, w); //GWBUG - see FP64 version for many possible optimizations for (int i = 3; i < 8; ++i) { u[i] = cmul(u[i], base); base = cmul(base, w); @@ -742,83 +683,8 @@ void OVERLOAD chainMul(u32 len, GF61 *u, GF61 w, u32 tailSquareBcast) { if (len == 8) chainMul8(u, w, tailSquareBcast); } -void OVERLOAD shuflBigLDS(u32 WG, local GF61 *lds, GF61 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i]; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i] = lds[i * WG + me]; } -} - -void OVERLOAD shufl(u32 WG, local GF61 *lds2, GF61 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - 
local Z61* lds = (local Z61*) lds2; - - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].x; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; } - bar(); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].y; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } -} - -// Same as shufl but use ints instead of Z61s to reduce LDS memory requirements. -// Lower LDS requirements should let the optimizer use fewer VGPRs and increase occupancy for WIDTHs >= 1024. -// Alas, the increased occupancy does not offset extra code needed for shufl_int (the assembly -// code generated is not pretty). This might not be true for nVidia or future ROCm optimizers. -void OVERLOAD shufl_int(u32 WG, local GF61 *lds2, GF61 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - local int* lds = (local int*) lds2; - - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = as_int4(u[i]).x; } - bar(); - for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.x = lds[i * WG + me]; u[i] = as_ulong2(tmp); } - bar(); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = as_int4(u[i]).y; } - bar(); - for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.y = lds[i * WG + me]; u[i] = as_ulong2(tmp); } - bar(); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = as_int4(u[i]).z; } - bar(); - for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.z = lds[i * WG + me]; u[i] = as_ulong2(tmp); } - bar(); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = as_int4(u[i]).w; } - bar(); - for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.w = lds[i * WG + me]; u[i] = as_ulong2(tmp); } - bar(); // I'm not sure why this barrier call is needed -} - -// Shufl two simultaneous FFT_HEIGHTs. 
Needed for tailSquared where u and v are computed simultaneously in different threads. -// NOTE: It is very important for this routine to use lds memory in coordination with reverseLine2 and unreverseLine2. -// Failure to do so would result in the need for more bar() calls. Specifically, the u values are stored in the upper half -// of lds memory (first SMALL_HEIGHT GF61 values). The v values are stored in the lower half of lds memory (next SMALL_HEIGHT GF61 values). -void OVERLOAD shufl2(u32 WG, local GF61 *lds2, GF61 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - - // Partition lds memory into upper and lower halves - assert(WG == G_H); - - // Accessing lds memory as Z61s is faster than GF61 accesses - local Z61* lds = ((local Z61*) lds2) + (me / WG) * SMALL_HEIGHT; - - me = me % WG; - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].x; } - bar(WG); - for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; } - bar(WG); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].y; } - bar(WG); - for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } +void OVERLOAD shufl(u32 WG, local GF61 *lds, GF61 *u, u32 n, u32 f, u32 numWG, const u32 sb, u32 lowMe) { + shufl64(WG, (local T2 *) lds, (T2 *) u, n, f, numWG, sb, lowMe); } void OVERLOAD tabMul(u32 WG, TrigGF61 trig, GF61 *u, u32 n, u32 f, u32 me) { @@ -827,7 +693,7 @@ void OVERLOAD tabMul(u32 WG, TrigGF61 trig, GF61 *u, u32 n, u32 f, u32 me) { // This code uses chained complex multiplies which could be faster on GPUs with great mul throughput or poor memory bandwidth or caching. 
if (TABMUL_CHAIN61) { - chainMul (n, u, trig[p], 0); + chainMul(n, u, trig[p], 0); return; } diff --git a/src/cl/fftheight.cl b/src/cl/fftheight.cl index 69fa75b8..49323782 100644 --- a/src/cl/fftheight.cl +++ b/src/cl/fftheight.cl @@ -35,76 +35,43 @@ void OVERLOAD fft_NH(T2 *u) { #error FFT_VARIANT_H == 0 only supported by AMD GPUs #endif -void OVERLOAD fft_HEIGHT(local T2 *lds, T2 *u, Trig trig, T2 w) { - for (u32 s = 1; s < SMALL_HEIGHT / NH; s *= NH) { - if (s > 1) { bar(); } - fft_NH(u); - w = bcast(w, s); - - chainMul(NH, u, w, 1); - - shufl(SMALL_HEIGHT / NH, lds, u, NH, s); - } - fft_NH(u); -} - -void OVERLOAD fft_HEIGHT2(local T2 *lds, T2 *u, Trig trig, T2 w) { +void OVERLOAD fft_HEIGHT(local T2 *lds, T2 *u, Trig trig, T2 w, u32 numWG, const u32 sb, u32 lowMe) { u32 WG = SMALL_HEIGHT / NH; - for (u32 s = 1; s < SMALL_HEIGHT / NH; s *= NH) { - if (s > 1) { bar(WG); } + for (u32 s = 1; s < WG; s *= NH) { fft_NH(u); w = bcast(w, s); chainMul(NH, u, w, 1); - shufl2(SMALL_HEIGHT / NH, lds, u, NH, s); + shufl(WG, lds, u, NH, s, numWG, sb, lowMe); } fft_NH(u); } #else -void OVERLOAD fft_HEIGHT(local T2 *lds, T2 *u, Trig trig, T2 w) { - u32 me = get_local_id(0); - -#if !UNROLL_H - __attribute__((opencl_unroll_hint(1))) -#endif - - for (u32 s = 1; s < SMALL_HEIGHT / NH; s *= NH) { - if (s > 1) { bar(); } - fft_NH(u); - tabMul(SMALL_HEIGHT / NH, trig, u, NH, s, me); - shufl(SMALL_HEIGHT / NH, lds, u, NH, s); - } - fft_NH(u); -} - -void OVERLOAD fft_HEIGHT2(local T2 *lds, T2 *u, Trig trig, T2 w) { - u32 me = get_local_id(0); +void OVERLOAD fft_HEIGHT(local T2 *lds, T2 *u, Trig trig, T2 w, u32 numWG, const u32 sb, u32 lowMe) { u32 WG = SMALL_HEIGHT / NH; #if !UNROLL_H __attribute__((opencl_unroll_hint(1))) #endif - for (u32 s = 1; s < WG; s *= NH) { - if (s > 1) { bar(WG); } fft_NH(u); - tabMul(WG, trig, u, NH, s, me % WG); - shufl2(WG, lds, u, NH, s); + tabMul(WG, trig, u, NH, s, lowMe); + shufl(WG, lds, u, NH, s, numWG, sb, lowMe); } fft_NH(u); } #endif -void 
OVERLOAD new_fft_HEIGHT2(local T2 *lds, T2 *u, Trig trig, T2 w, int callnum) { +void OVERLOAD new_fft_HEIGHT(local T2 *lds, T2 *u, Trig trig, T2 w, u32 numWG, const u32 sb, u32 lowMe, int callnum) { u32 WG = SMALL_HEIGHT / NH; - u32 me = get_local_id(0); - // This line mimics shufl2 -- partition lds into halves - local T2* partitioned_lds = lds + (me / WG) * SMALL_HEIGHT / 2; - me = me % WG; + + // This line mimics shufl -- partition lds + local T2* partitioned_lds = lds; + if (numWG > 1) partitioned_lds += ((u32) get_local_id(0) / WG) * SMALL_HEIGHT * sb / sizeof(T2); // Custom code for various SMALL_HEIGHT values @@ -116,27 +83,25 @@ void OVERLOAD new_fft_HEIGHT2(local T2 *lds, T2 *u, Trig trig, T2 w, int callnum trig += WG*4 + 2*WG*4; // Skip past old FFT_width trig values. Also skip past !save_one_more_mul trig values. // Preload trig values to hide global memory latencies. As the preloads are used, the next set of trig values are preloaded. - preload_tabMul4_trig(WG, trig, preloads, 1, me); + preload_tabMul4_trig(WG, trig, preloads, 1, numWG, lowMe); // Do first fft4, partial tabMul, and shufl. fft4(u); - partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 1, me); - shufl2(WG, lds, u, NH, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 1, numWG, lowMe); + shufl(WG, lds, u, NH, 1, numWG, sb, lowMe); // Finish the first tabMul and perform second fft4. Do second partial tabMul and shufl. - finish_tabMul4_fft4(WG, partitioned_lds, trig, preloads, u, 1, me, 1); - partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 4, me); - bar(WG); - shufl2(WG, lds, u, NH, 4); + finish_tabMul4_fft4(WG, trig, preloads, u, 1, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 4, numWG, lowMe); + shufl(WG, lds, u, NH, 4, numWG, sb, lowMe); // Finish the second tabMul and perform third fft4. Do third partial tabMul and shufl. 
- finish_tabMul4_fft4(WG, partitioned_lds, trig, preloads, u, 4, me, 1); - partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 16, me); - bar(WG); - shufl2(WG, lds, u, NH, 16); + finish_tabMul4_fft4(WG, trig, preloads, u, 4, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 16, numWG, lowMe); + shufl(WG, lds, u, NH, 16, numWG, sb, lowMe); // Finish third tabMul and perform final fft4. - finish_tabMul4_fft4(WG, partitioned_lds, trig, preloads, u, 16, me, 1); + finish_tabMul4_fft4(WG, trig, preloads, u, 16, numWG, lowMe, 1); #elif SMALL_HEIGHT == 512 && NH == 8 && FFT_VARIANT_H == 2 @@ -146,21 +111,20 @@ void OVERLOAD new_fft_HEIGHT2(local T2 *lds, T2 *u, Trig trig, T2 w, int callnum trig += WG*8 + 2*WG*8; // Skip past old FFT_width trig values. Also skip past !save_one_more_mul trig values. // Preload trig values to hide global memory latencies. As the preloads are used, the next set of trig values are preloaded. - preload_tabMul8_trig(WG, trig, preloads, 1, me); + preload_tabMul8_trig(WG, trig, preloads, 1, numWG, lowMe); // Do first fft8, partial tabMul, and shufl. fft8(u); - partial_tabMul8(WG, partitioned_lds, trig, preloads, u, 1, me); - shufl2(WG, lds, u, NH, 1); + partial_tabMul8(WG, partitioned_lds, trig, preloads, u, 1, numWG, lowMe); + shufl(WG, lds, u, NH, 1, numWG, sb, lowMe); // Finish the first tabMul and perform second fft8. Do second partial tabMul and shufl. - finish_tabMul8_fft8(WG, partitioned_lds, trig, preloads, u, 1, me, 1); - partial_tabMul8(WG, partitioned_lds, trig, preloads, u, 8, me); - bar(WG); - shufl2(WG, lds, u, NH, 8); + finish_tabMul8_fft8(WG, trig, preloads, u, 1, numWG, lowMe, 1); + partial_tabMul8(WG, partitioned_lds, trig, preloads, u, 8, numWG, lowMe); + shufl(WG, lds, u, NH, 8, numWG, sb, lowMe); // Finish second tabMul and perform final fft8. 
- finish_tabMul8_fft8(WG, partitioned_lds, trig, preloads, u, 8, me, 1); + finish_tabMul8_fft8(WG, trig, preloads, u, 8, numWG, lowMe, 1); #elif SMALL_HEIGHT == 1024 && NH == 4 && FFT_VARIANT_H == 2 @@ -170,44 +134,41 @@ void OVERLOAD new_fft_HEIGHT2(local T2 *lds, T2 *u, Trig trig, T2 w, int callnum trig += WG*4 + 2*WG*4; // Skip past old FFT_width trig values. Also skip past !save_one_more_mul trig values. // Preload trig values to hide global memory latencies. As the preloads are used, the next set of trig values are preloaded. - preload_tabMul4_trig(WG, trig, preloads, 1, me); + preload_tabMul4_trig(WG, trig, preloads, 1, numWG, lowMe); // Do first fft4, partial tabMul, and shufl. fft4(u); - partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 1, me); - shufl2(WG, lds, u, NH, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 1, numWG, lowMe); + shufl(WG, lds, u, NH, 1, numWG, sb, lowMe); // Finish the first tabMul and perform second fft4. Do second partial tabMul and shufl. - finish_tabMul4_fft4(WG, partitioned_lds, trig, preloads, u, 1, me, 1); - partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 4, me); - bar(WG); - shufl2(WG, lds, u, NH, 4); + finish_tabMul4_fft4(WG, trig, preloads, u, 1, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 4, numWG, lowMe); + shufl(WG, lds, u, NH, 4, numWG, sb, lowMe); // Finish the second tabMul and perform third fft4. Do third partial tabMul and shufl. - finish_tabMul4_fft4(WG, partitioned_lds, trig, preloads, u, 4, me, 1); - partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 16, me); - bar(WG); - shufl2(WG, lds, u, NH, 16); + finish_tabMul4_fft4(WG, trig, preloads, u, 4, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 16, numWG, lowMe); + shufl(WG, lds, u, NH, 16, numWG, sb, lowMe); // Finish the third tabMul and perform fourth fft4. Do fourth partial tabMul and shufl. 
- finish_tabMul4_fft4(WG, partitioned_lds, trig, preloads, u, 16, me, 1); - partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 64, me); - bar(WG); - shufl2(WG, lds, u, NH, 64); + finish_tabMul4_fft4(WG, trig, preloads, u, 16, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 64, numWG, lowMe); + shufl(WG, lds, u, NH, 64, numWG, sb, lowMe); // Finish fourth tabMul and perform final fft4. - finish_tabMul4_fft4(WG, partitioned_lds, trig, preloads, u, 64, me, 1); + finish_tabMul4_fft4(WG, trig, preloads, u, 64, numWG, lowMe, 1); #else // Old version - fft_HEIGHT2(lds, u, trig, w); + fft_HEIGHT(lds, u, trig, w, numWG, sb, lowMe); #endif } -void new_fft_HEIGHT2_1(local T2 *lds, T2 *u, Trig trig, T2 w) { new_fft_HEIGHT2(lds, u, trig, w, 1); } -void new_fft_HEIGHT2_2(local T2 *lds, T2 *u, Trig trig, T2 w) { new_fft_HEIGHT2(lds, u, trig, w, 2); } +void new_fft_HEIGHT1(local T2 *lds, T2 *u, Trig trig, T2 w, u32 numWG, const u32 sb, u32 lowMe) { new_fft_HEIGHT(lds, u, trig, w, numWG, sb, lowMe, 1); } +void new_fft_HEIGHT2(local T2 *lds, T2 *u, Trig trig, T2 w, u32 numWG, const u32 sb, u32 lowMe) { new_fft_HEIGHT(lds, u, trig, w, numWG, sb, lowMe, 2); } #endif @@ -228,41 +189,22 @@ void OVERLOAD fft_NH(F2 *u) { #endif } -void OVERLOAD fft_HEIGHT(local F2 *lds, F2 *u, TrigFP32 trig) { - u32 me = get_local_id(0); - -#if !UNROLL_H - __attribute__((opencl_unroll_hint(1))) -#endif - - for (u32 s = 1; s < SMALL_HEIGHT / NH; s *= NH) { - if (s > 1) { bar(); } - fft_NH(u); - tabMul(SMALL_HEIGHT / NH, trig, u, NH, s, me); - shufl(SMALL_HEIGHT / NH, lds, u, NH, s); - } - fft_NH(u); -} - -void OVERLOAD fft_HEIGHT2(local F2 *lds, F2 *u, TrigFP32 trig) { - u32 me = get_local_id(0); +void OVERLOAD fft_HEIGHT(local F2 *lds, F2 *u, TrigFP32 trig, u32 numWG, const u32 sb, u32 lowMe) { u32 WG = SMALL_HEIGHT / NH; #if !UNROLL_H __attribute__((opencl_unroll_hint(1))) #endif - for (u32 s = 1; s < WG; s *= NH) { - if (s > 1) { bar(WG); } fft_NH(u); - tabMul(WG, trig, 
u, NH, s, me % WG); - shufl2(WG, lds, u, NH, s); + tabMul(WG, trig, u, NH, s, lowMe); + shufl(WG, lds, u, NH, s, numWG, sb, lowMe); } fft_NH(u); } -void new_fft_HEIGHT2_1(local F2 *lds, F2 *u, TrigFP32 trig) { fft_HEIGHT2(lds, u, trig); } -void new_fft_HEIGHT2_2(local F2 *lds, F2 *u, TrigFP32 trig) { fft_HEIGHT2(lds, u, trig); } +void new_fft_HEIGHT1(local F2 *lds, F2 *u, TrigFP32 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_HEIGHT(lds, u, trig, numWG, sb, lowMe); } +void new_fft_HEIGHT2(local F2 *lds, F2 *u, TrigFP32 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_HEIGHT(lds, u, trig, numWG, sb, lowMe); } #endif @@ -283,41 +225,22 @@ void OVERLOAD fft_NH(GF31 *u) { #endif } -void OVERLOAD fft_HEIGHT(local GF31 *lds, GF31 *u, TrigGF31 trig) { - u32 me = get_local_id(0); - -#if !UNROLL_H - __attribute__((opencl_unroll_hint(1))) -#endif - - for (u32 s = 1; s < SMALL_HEIGHT / NH; s *= NH) { - if (s > 1) { bar(); } - fft_NH(u); - tabMul(SMALL_HEIGHT / NH, trig, u, NH, s, me); - shufl(SMALL_HEIGHT / NH, lds, u, NH, s); - } - fft_NH(u); -} - -void OVERLOAD fft_HEIGHT2(local GF31 *lds, GF31 *u, TrigGF31 trig) { - u32 me = get_local_id(0); +void OVERLOAD fft_HEIGHT(local GF31 *lds, GF31 *u, TrigGF31 trig, u32 numWG, const u32 sb, u32 lowMe) { u32 WG = SMALL_HEIGHT / NH; #if !UNROLL_H __attribute__((opencl_unroll_hint(1))) #endif - for (u32 s = 1; s < WG; s *= NH) { - if (s > 1) { bar(WG); } fft_NH(u); - tabMul(WG, trig, u, NH, s, me % WG); - shufl2(WG, lds, u, NH, s); + tabMul(WG, trig, u, NH, s, lowMe); + shufl(WG, lds, u, NH, s, numWG, sb, lowMe); } fft_NH(u); } -void OVERLOAD new_fft_HEIGHT2_1(local GF31 *lds, GF31 *u, TrigGF31 trig) { fft_HEIGHT2(lds, u, trig); } -void OVERLOAD new_fft_HEIGHT2_2(local GF31 *lds, GF31 *u, TrigGF31 trig) { fft_HEIGHT2(lds, u, trig); } +void OVERLOAD new_fft_HEIGHT1(local GF31 *lds, GF31 *u, TrigGF31 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_HEIGHT(lds, u, trig, numWG, sb, lowMe); } +void OVERLOAD new_fft_HEIGHT2(local GF31 *lds, 
GF31 *u, TrigGF31 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_HEIGHT(lds, u, trig, numWG, sb, lowMe); } #endif @@ -338,40 +261,21 @@ void OVERLOAD fft_NH(GF61 *u) { #endif } -void OVERLOAD fft_HEIGHT(local GF61 *lds, GF61 *u, TrigGF61 trig) { - u32 me = get_local_id(0); - -#if !UNROLL_H - __attribute__((opencl_unroll_hint(1))) -#endif - - for (u32 s = 1; s < SMALL_HEIGHT / NH; s *= NH) { - if (s > 1) { bar(); } - fft_NH(u); - tabMul(SMALL_HEIGHT / NH, trig, u, NH, s, me); - shufl(SMALL_HEIGHT / NH, lds, u, NH, s); - } - fft_NH(u); -} - -void OVERLOAD fft_HEIGHT2(local GF61 *lds, GF61 *u, TrigGF61 trig) { - u32 me = get_local_id(0); +void OVERLOAD fft_HEIGHT(local GF61 *lds, GF61 *u, TrigGF61 trig, u32 numWG, const u32 sb, u32 lowMe) { u32 WG = SMALL_HEIGHT / NH; #if !UNROLL_H __attribute__((opencl_unroll_hint(1))) #endif - for (u32 s = 1; s < WG; s *= NH) { - if (s > 1) { bar(WG); } fft_NH(u); - tabMul(WG, trig, u, NH, s, me % WG); - shufl2(WG, lds, u, NH, s); + tabMul(WG, trig, u, NH, s, lowMe); + shufl(WG, lds, u, NH, s, numWG, sb, lowMe); } fft_NH(u); } -void OVERLOAD new_fft_HEIGHT2_1(local GF61 *lds, GF61 *u, TrigGF61 trig) { fft_HEIGHT2(lds, u, trig); } -void OVERLOAD new_fft_HEIGHT2_2(local GF61 *lds, GF61 *u, TrigGF61 trig) { fft_HEIGHT2(lds, u, trig); } +void OVERLOAD new_fft_HEIGHT1(local GF61 *lds, GF61 *u, TrigGF61 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_HEIGHT(lds, u, trig, numWG, sb, lowMe); } +void OVERLOAD new_fft_HEIGHT2(local GF61 *lds, GF61 *u, TrigGF61 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_HEIGHT(lds, u, trig, numWG, sb, lowMe); } #endif diff --git a/src/cl/ffthin.cl b/src/cl/ffthin.cl index ae14ec53..82b3377d 100644 --- a/src/cl/ffthin.cl +++ b/src/cl/ffthin.cl @@ -8,11 +8,11 @@ // Do an FFT Height after an fftMiddleIn (which may not have fully transposed data, leading to non-sequential input) KERNEL(G_H) fftHin(P(T2) out, CP(T2) in, Trig smallTrig) { - local T2 lds[SMALL_HEIGHT / 2]; - + const u32 lds_bytes = SMALL_HEIGHT 
* SHUFL_BYTES_H; + local T2 lds[lds_bytes / sizeof(T2)]; + T2 u[NH]; u32 g = get_group_id(0); - u32 me = get_local_id(0); readTailFusedLine(in, u, g, me); @@ -23,7 +23,7 @@ KERNEL(G_H) fftHin(P(T2) out, CP(T2) in, Trig smallTrig) { T2 w = slowTrig_N(ND / SMALL_HEIGHT * me, ND / NH); #endif - fft_HEIGHT(lds, u, smallTrig, w); + fft_HEIGHT(lds, u, smallTrig, w, 1, SHUFL_BYTES_H, me); write(G_H, NH, u, out, SMALL_HEIGHT * transPos(g, MIDDLE, WIDTH)); } @@ -39,7 +39,8 @@ KERNEL(G_H) fftHin(P(T2) out, CP(T2) in, Trig smallTrig) { // Do an FFT Height after an fftMiddleIn (which may not have fully transposed data, leading to non-sequential input) KERNEL(G_H) fftHin(P(T2) out, CP(T2) in, Trig smallTrig) { - local F2 lds[SMALL_HEIGHT / 2]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local F2 lds[lds_bytes / sizeof(F2)]; CP(F2) inF2 = (CP(F2)) in; P(F2) outF2 = (P(F2)) out; @@ -57,7 +58,7 @@ KERNEL(G_H) fftHin(P(T2) out, CP(T2) in, Trig smallTrig) { F2 w = slowTrig_N(ND / SMALL_HEIGHT * me, ND / NH); #endif - fft_HEIGHT(lds, u, smallTrigF2); + fft_HEIGHT(lds, u, smallTrigF2, 1, SHUFL_BYTES_H, me); write(G_H, NH, u, outF2, SMALL_HEIGHT * transPos(g, MIDDLE, WIDTH)); } @@ -73,7 +74,8 @@ KERNEL(G_H) fftHin(P(T2) out, CP(T2) in, Trig smallTrig) { // Do an FFT Height after an fftMiddleIn (which may not have fully transposed data, leading to non-sequential input) KERNEL(G_H) fftHinGF31(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF31 lds[SMALL_HEIGHT / 2]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF31 lds[lds_bytes / sizeof(GF31)]; CP(GF31) in31 = (CP(GF31)) (in + DISTGF31); P(GF31) out31 = (P(GF31)) (out + DISTGF31); @@ -81,12 +83,11 @@ KERNEL(G_H) fftHinGF31(P(T2) out, CP(T2) in, Trig smallTrig) { GF31 u[NH]; u32 g = get_group_id(0); - u32 me = get_local_id(0); readTailFusedLine(in31, u, g, me); - fft_HEIGHT(lds, u, smallTrig31); + fft_HEIGHT(lds, u, smallTrig31, 1, SHUFL_BYTES_H, me); write(G_H, NH, u, out31, SMALL_HEIGHT * transPos(g, 
MIDDLE, WIDTH)); } @@ -102,7 +103,8 @@ KERNEL(G_H) fftHinGF31(P(T2) out, CP(T2) in, Trig smallTrig) { // Do an FFT Height after an fftMiddleIn (which may not have fully transposed data, leading to non-sequential input) KERNEL(G_H) fftHinGF61(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF61 lds[SMALL_HEIGHT / 2]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF61 lds[lds_bytes / sizeof(GF61)]; CP(GF61) in61 = (CP(GF61)) (in + DISTGF61); P(GF61) out61 = (P(GF61)) (out + DISTGF61); @@ -110,12 +112,11 @@ KERNEL(G_H) fftHinGF61(P(T2) out, CP(T2) in, Trig smallTrig) { GF61 u[NH]; u32 g = get_group_id(0); - u32 me = get_local_id(0); readTailFusedLine(in61, u, g, me); - fft_HEIGHT(lds, u, smallTrig61); + fft_HEIGHT(lds, u, smallTrig61, 1, SHUFL_BYTES_H, me); write(G_H, NH, u, out61, SMALL_HEIGHT * transPos(g, MIDDLE, WIDTH)); } diff --git a/src/cl/fftp.cl b/src/cl/fftp.cl index 2fe0d7d4..21cacf77 100644 --- a/src/cl/fftp.cl +++ b/src/cl/fftp.cl @@ -10,7 +10,7 @@ // fftPremul: weight words with IBDWT weights followed by FFT-width. KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTab THREAD_WEIGHTS) { - local T2 lds[WIDTH / 2]; + local T2 lds[WIDTH * SHUFL_BYTES_W / sizeof(T2)]; T2 u[NW]; u32 g = get_group_id(0); @@ -27,9 +27,9 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTab THREAD_WEIGHTS) u[i] = U2(in[p].x * w1, in[p].y * w2); } - fft_WIDTH(lds, u, smallTrig); + fft_WIDTH(lds, u, smallTrig, 1, SHUFL_BYTES_W, me); - writeCarryFusedLine(u, out, g); + writeCarryFusedLine(u, out, g, me); } @@ -41,7 +41,7 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTab THREAD_WEIGHTS) // fftPremul: weight words with IBDWT weights followed by FFT-width. 
KERNEL(G_W) fftP(P(F2) out, CP(Word2) in, TrigFP32 smallTrig, BigTabFP32 THREAD_WEIGHTS) { - local F2 lds[WIDTH / 2]; + local F2 lds[WIDTH * SHUFL_BYTES_W / sizeof(F2)]; F2 u[NW]; u32 g = get_group_id(0); @@ -58,9 +58,9 @@ KERNEL(G_W) fftP(P(F2) out, CP(Word2) in, TrigFP32 smallTrig, BigTabFP32 THREAD_ u[i] = U2(in[p].x * w1, in[p].y * w2); } - fft_WIDTH(lds, u, smallTrig); + fft_WIDTH(lds, u, smallTrig, 1, SHUFL_BYTES_W, me); - writeCarryFusedLine(u, out, g); + writeCarryFusedLine(u, out, g, me); } @@ -72,7 +72,7 @@ KERNEL(G_W) fftP(P(F2) out, CP(Word2) in, TrigFP32 smallTrig, BigTabFP32 THREAD_ // fftPremul: weight words with IBDWT weights followed by FFT-width. KERNEL(G_W) fftP(P(GF31) out, CP(Word2) in, TrigGF31 smallTrig) { - local GF31 lds[WIDTH / 2]; + local GF31 lds[WIDTH * SHUFL_BYTES_W / sizeof(GF31)]; GF31 u[NW]; u32 g = get_group_id(0); @@ -113,9 +113,9 @@ KERNEL(G_W) fftP(P(GF31) out, CP(Word2) in, TrigGF31 smallTrig) { if (weight_shift > 31) weight_shift -= 31; } - fft_WIDTH(lds, u, smallTrig); + fft_WIDTH(lds, u, smallTrig, 1, SHUFL_BYTES_W, me); - writeCarryFusedLine(u, out, g); + writeCarryFusedLine(u, out, g, me); } @@ -127,7 +127,7 @@ KERNEL(G_W) fftP(P(GF31) out, CP(Word2) in, TrigGF31 smallTrig) { // fftPremul: weight words with IBDWT weights followed by FFT-width. KERNEL(G_W) fftP(P(GF61) out, CP(Word2) in, TrigGF61 smallTrig) { - local GF61 lds[WIDTH / 2]; + local GF61 lds[WIDTH * SHUFL_BYTES_W / sizeof(GF61)]; GF61 u[NW]; u32 g = get_group_id(0); @@ -170,9 +170,9 @@ KERNEL(G_W) fftP(P(GF61) out, CP(Word2) in, TrigGF61 smallTrig) { if (weight_shift > 61) weight_shift -= 61; } - fft_WIDTH(lds, u, smallTrig); + fft_WIDTH(lds, u, smallTrig, 1, SHUFL_BYTES_W, me); - writeCarryFusedLine(u, out, g); + writeCarryFusedLine(u, out, g, me); } @@ -184,7 +184,7 @@ KERNEL(G_W) fftP(P(GF61) out, CP(Word2) in, TrigGF61 smallTrig) { // fftPremul: weight words with IBDWT weights followed by FFT-width. 
KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTab THREAD_WEIGHTS) { - local T2 lds[WIDTH / 2]; + local T2 lds[WIDTH * SHUFL_BYTES_W / sizeof(T2)]; local GF31 *lds31 = (local GF31 *) lds; T2 u[NW]; GF31 u31[NW]; @@ -236,11 +236,11 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTab THREAD_WEIGHTS) if (weight_shift > 31) weight_shift -= 31; } - fft_WIDTH(lds, u, smallTrig); - writeCarryFusedLine(u, out, g); - bar(); - fft_WIDTH(lds31, u31, smallTrig31); - writeCarryFusedLine(u31, out31, g); + fft_WIDTH(lds, u, smallTrig, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(u, out, g, me); + + fft_WIDTH(lds31, u31, smallTrig31, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(u31, out31, g, me); } @@ -252,7 +252,7 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTab THREAD_WEIGHTS) // fftPremul: weight words with IBDWT weights followed by FFT-width. KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTabFP32 THREAD_WEIGHTS) { - local F2 ldsF2[WIDTH / 2]; + local F2 ldsF2[WIDTH * SHUFL_BYTES_W / sizeof(F2)]; local GF31 *lds31 = (local GF31 *) ldsF2; F2 uF2[NW]; GF31 u31[NW]; @@ -306,11 +306,11 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTabFP32 THREAD_WEIG if (weight_shift > 31) weight_shift -= 31; } - fft_WIDTH(ldsF2, uF2, smallTrigF2); - writeCarryFusedLine(uF2, outF2, g); - bar(); - fft_WIDTH(lds31, u31, smallTrig31); - writeCarryFusedLine(u31, out31, g); + fft_WIDTH(ldsF2, uF2, smallTrigF2, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(uF2, outF2, g, me); + + fft_WIDTH(lds31, u31, smallTrig31, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(u31, out31, g, me); } @@ -322,7 +322,7 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTabFP32 THREAD_WEIG // fftPremul: weight words with IBDWT weights followed by FFT-width. 
KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTabFP32 THREAD_WEIGHTS) { - local GF61 lds61[WIDTH / 2]; + local GF61 lds61[WIDTH * SHUFL_BYTES_W / sizeof(GF61)]; local F2 *ldsF2 = (local F2 *) lds61; F2 uF2[NW]; GF61 u61[NW]; @@ -376,11 +376,11 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTabFP32 THREAD_WEIG if (weight_shift > 61) weight_shift -= 61; } - fft_WIDTH(ldsF2, uF2, smallTrigF2); - writeCarryFusedLine(uF2, outF2, g); - bar(); - fft_WIDTH(lds61, u61, smallTrig61); - writeCarryFusedLine(u61, out61, g); + fft_WIDTH(ldsF2, uF2, smallTrigF2, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(uF2, outF2, g, me); + + fft_WIDTH(lds61, u61, smallTrig61, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(u61, out61, g, me); } @@ -392,7 +392,7 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTabFP32 THREAD_WEIG // fftPremul: weight words with IBDWT weights followed by FFT-width. KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig) { - local GF61 lds61[WIDTH / 2]; + local GF61 lds61[WIDTH * SHUFL_BYTES_W / sizeof(GF61)]; local GF31 *lds31 = (local GF31 *) lds61; GF31 u31[NW]; GF61 u61[NW]; @@ -459,11 +459,11 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig) { m61_weight_shift = adjust_m61_weight_shift(m61_weight_shift); } - fft_WIDTH(lds31, u31, smallTrig31); - writeCarryFusedLine(u31, out31, g); - bar(); - fft_WIDTH(lds61, u61, smallTrig61); - writeCarryFusedLine(u61, out61, g); + fft_WIDTH(lds31, u31, smallTrig31, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(u31, out31, g, me); + + fft_WIDTH(lds61, u61, smallTrig61, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(u61, out61, g, me); } @@ -475,7 +475,7 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig) { // fftPremul: weight words with IBDWT weights followed by FFT-width. 
KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTabFP32 THREAD_WEIGHTS) { - local GF61 lds61[WIDTH / 2]; + local GF61 lds61[WIDTH * SHUFL_BYTES_W / sizeof(GF61)]; local F2 *ldsF2 = (local F2 *) lds61; local GF31 *lds31 = (local GF31 *) lds61; F2 uF2[NW]; @@ -551,14 +551,14 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTabFP32 THREAD_WEIG m61_weight_shift = adjust_m61_weight_shift(m61_weight_shift); } - fft_WIDTH(ldsF2, uF2, smallTrigF2); - writeCarryFusedLine(uF2, outF2, g); - bar(); - fft_WIDTH(lds31, u31, smallTrig31); - writeCarryFusedLine(u31, out31, g); - bar(); - fft_WIDTH(lds61, u61, smallTrig61); - writeCarryFusedLine(u61, out61, g); + fft_WIDTH(ldsF2, uF2, smallTrigF2, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(uF2, outF2, g, me); + + fft_WIDTH(lds31, u31, smallTrig31, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(u31, out31, g, me); + + fft_WIDTH(lds61, u61, smallTrig61, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(u61, out61, g, me); } diff --git a/src/cl/fftw.cl b/src/cl/fftw.cl index a19b26d0..89277649 100644 --- a/src/cl/fftw.cl +++ b/src/cl/fftw.cl @@ -9,13 +9,14 @@ // Do the ending fft_WIDTH after an fftMiddleOut. This is the same as the first half of carryFused. KERNEL(G_W) fftW(P(T2) out, CP(T2) in, Trig smallTrig) { - local T2 lds[WIDTH / 2]; + local T2 lds[WIDTH * SHUFL_BYTES_W / sizeof(T2)]; T2 u[NW]; u32 g = get_group_id(0); + u32 me = get_local_id(0); - readCarryFusedLine(in, u, g); - fft_WIDTH(lds, u, smallTrig); + readCarryFusedLine(in, u, g, me); + fft_WIDTH(lds, u, smallTrig, 1, SHUFL_BYTES_W, me); out += WIDTH * g; write(G_W, NW, u, out, 0); } @@ -31,7 +32,7 @@ KERNEL(G_W) fftW(P(T2) out, CP(T2) in, Trig smallTrig) { // Do the ending fft_WIDTH after an fftMiddleOut. This is the same as the first half of carryFused. 
KERNEL(G_W) fftW(P(T2) out, CP(T2) in, Trig smallTrig) { - local F2 lds[WIDTH / 2]; + local F2 lds[WIDTH * SHUFL_BYTES_W / sizeof(F2)]; CP(F2) inF2 = (CP(F2)) in; P(F2) outF2 = (P(F2)) out; @@ -39,9 +40,10 @@ KERNEL(G_W) fftW(P(T2) out, CP(T2) in, Trig smallTrig) { F2 u[NW]; u32 g = get_group_id(0); + u32 me = get_local_id(0); - readCarryFusedLine(inF2, u, g); - fft_WIDTH(lds, u, smallTrigF2); + readCarryFusedLine(inF2, u, g, me); + fft_WIDTH(lds, u, smallTrigF2, 1, SHUFL_BYTES_W, me); outF2 += WIDTH * g; write(G_W, NW, u, outF2, 0); } @@ -56,7 +58,7 @@ KERNEL(G_W) fftW(P(T2) out, CP(T2) in, Trig smallTrig) { #if NTT_GF31 KERNEL(G_W) fftWGF31(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF31 lds[WIDTH / 2]; + local GF31 lds[WIDTH * SHUFL_BYTES_W / sizeof(GF31)]; CP(GF31) in31 = (CP(GF31)) (in + DISTGF31); P(GF31) out31 = (P(GF31)) (out + DISTGF31); @@ -64,9 +66,10 @@ KERNEL(G_W) fftWGF31(P(T2) out, CP(T2) in, Trig smallTrig) { GF31 u[NW]; u32 g = get_group_id(0); + u32 me = get_local_id(0); - readCarryFusedLine(in31, u, g); - fft_WIDTH(lds, u, smallTrig31); + readCarryFusedLine(in31, u, g, me); + fft_WIDTH(lds, u, smallTrig31, 1, SHUFL_BYTES_W, me); out31 += WIDTH * g; write(G_W, NW, u, out31, 0); } @@ -81,7 +84,7 @@ KERNEL(G_W) fftWGF31(P(T2) out, CP(T2) in, Trig smallTrig) { #if NTT_GF61 KERNEL(G_W) fftWGF61(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF61 lds[WIDTH / 2]; + local GF61 lds[WIDTH * SHUFL_BYTES_W / sizeof(GF61)]; CP(GF61) in61 = (CP(GF61)) (in + DISTGF61); P(GF61) out61 = (P(GF61)) (out + DISTGF61); @@ -89,9 +92,10 @@ KERNEL(G_W) fftWGF61(P(T2) out, CP(T2) in, Trig smallTrig) { GF61 u[NW]; u32 g = get_group_id(0); + u32 me = get_local_id(0); - readCarryFusedLine(in61, u, g); - fft_WIDTH(lds, u, smallTrig61); + readCarryFusedLine(in61, u, g, me); + fft_WIDTH(lds, u, smallTrig61, 1, SHUFL_BYTES_W, me); out61 += WIDTH * g; write(G_W, NW, u, out61, 0); } diff --git a/src/cl/fftwidth.cl b/src/cl/fftwidth.cl index d0589ab0..03935149 100644 --- 
a/src/cl/fftwidth.cl +++ b/src/cl/fftwidth.cl @@ -29,39 +29,38 @@ void OVERLOAD fft_NW(T2 *u) { #error FFT_VARIANT_W == 0 only supported by AMD GPUs #endif -void OVERLOAD fft_WIDTH(local T2 *lds, T2 *u, Trig trig) { - u32 me = get_local_id(0); +void OVERLOAD fft_WIDTH(local T2 *lds, T2 *u, Trig trig, u32 numWG, const u32 sb, u32 lowMe) { + u32 WG = WIDTH / NW; + #if NW == 8 - T2 w = fancyTrig_N(ND / WIDTH * me); + T2 w = fancyTrig_N(ND / WIDTH * lowMe); #else - T2 w = slowTrig_N(ND / WIDTH * me, ND / NW); + T2 w = slowTrig_N(ND / WIDTH * lowMe, ND / NW); #endif - for (u32 s = 1; s < WIDTH / NW; s *= NW) { - if (s > 1) { bar(); } + for (u32 s = 1; s < WG; s *= NW) { fft_NW(u); w = bcast(w, s); chainMul(NW, u, w, 0); - shufl( WIDTH / NW, lds, u, NW, s); + shufl(WG, lds, u, NW, s, numWG, sb, lowMe); } fft_NW(u); } #else -void OVERLOAD fft_WIDTH(local T2 *lds, T2 *u, Trig trig) { - u32 me = get_local_id(0); +void OVERLOAD fft_WIDTH(local T2 *lds, T2 *u, Trig trig, u32 numWG, const u32 sb, u32 lowMe) { + u32 WG = WIDTH / NW; #if !UNROLL_W __attribute__((opencl_unroll_hint(1))) #endif - for (u32 s = 1; s < WIDTH / NW; s *= NW) { - if (s > 1) { bar(); } + for (u32 s = 1; s < WG; s *= NW) { fft_NW(u); - tabMul(WIDTH / NW, trig, u, NW, s, me); - shufl(WIDTH / NW, lds, u, NW, s); + tabMul(WG, trig, u, NW, s, lowMe); + shufl(WG, lds, u, NW, s, numWG, sb, lowMe); } fft_NW(u); } @@ -74,9 +73,12 @@ void OVERLOAD fft_WIDTH(local T2 *lds, T2 *u, Trig trig) { // To maximize FMA opportunities we precompute trig values as cosine and sine/cosine rather than cosine and sine. // The downside is sine/cosine cannot be computed with chained multiplies. 
-void OVERLOAD new_fft_WIDTH(local T2 *lds, T2 *u, Trig trig, int callnum) { +void OVERLOAD new_fft_WIDTH(local T2 *lds, T2 *u, Trig trig, u32 numWG, u32 lowMe, const u32 sb, int callnum) { u32 WG = WIDTH / NW; - u32 me = get_local_id(0); + + // This line mimics shufl -- partition lds + local T2* partitioned_lds = lds; + if (numWG > 1) partitioned_lds += ((u32) get_local_id(0) / WG) * WIDTH * sb / sizeof(T2); // Custom code for various WIDTH values @@ -88,27 +90,25 @@ void OVERLOAD new_fft_WIDTH(local T2 *lds, T2 *u, Trig trig, int callnum) { trig += WG*4 + 2*WG*4; // Skip past old FFT_width trig values. Also skip past !save_one_more_mul trig values. // Preload trig values to hide global memory latencies. As the preloads are used, the next set of trig values are preloaded. - preload_tabMul4_trig(WG, trig, preloads, 1, me); + preload_tabMul4_trig(WG, trig, preloads, 1, numWG, lowMe); // Do first fft4, partial tabMul, and shufl. fft4(u); - partial_tabMul4(WG, lds, trig, preloads, u, 1, me); - shufl(WG, lds, u, NW, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 1, numWG, lowMe); + shufl(WG, lds, u, NW, 1, numWG, sb, lowMe); // Finish the first tabMul and perform second fft4. Do second partial tabMul and shufl. - finish_tabMul4_fft4(WG, lds, trig, preloads, u, 1, me, 1); - partial_tabMul4(WG, lds, trig, preloads, u, 4, me); - bar(WG); - shufl(WG, lds, u, NW, 4); + finish_tabMul4_fft4(WG, trig, preloads, u, 1, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 4, numWG, lowMe); + shufl(WG, lds, u, NW, 4, numWG, sb, lowMe); // Finish the second tabMul and perform third fft4. Do third partial tabMul and shufl. 
- finish_tabMul4_fft4(WG, lds, trig, preloads, u, 4, me, 1); - partial_tabMul4(WG, lds, trig, preloads, u, 16, me); - bar(WG); - shufl(WG, lds, u, NW, 16); + finish_tabMul4_fft4(WG, trig, preloads, u, 4, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 16, numWG, lowMe); + shufl(WG, lds, u, NW, 16, numWG, sb, lowMe); // Finish third tabMul and perform final fft4. - finish_tabMul4_fft4(WG, lds, trig, preloads, u, 16, me, 1); + finish_tabMul4_fft4(WG, trig, preloads, u, 16, numWG, lowMe, 1); #elif WIDTH == 512 && NW == 8 && FFT_VARIANT_W == 2 @@ -118,21 +118,20 @@ void OVERLOAD new_fft_WIDTH(local T2 *lds, T2 *u, Trig trig, int callnum) { trig += WG*8; // Skip past old FFT_width trig values. // Preload trig values to hide global memory latencies. As the preloads are used, the next set of trig values are preloaded. - preload_tabMul8_trig(WG, trig, preloads, 1, me); + preload_tabMul8_trig(WG, trig, preloads, 1, numWG, lowMe); // Do first fft8, partial tabMul, and shufl. fft8(u); - partial_tabMul8(WG, lds, trig, preloads, u, 1, me); - shufl(WG, lds, u, NW, 1); + partial_tabMul8(WG, partitioned_lds, trig, preloads, u, 1, numWG, lowMe); + shufl(WG, lds, u, NW, 1, numWG, sb, lowMe); // Finish the first tabMul and perform second fft8. Do second partial tabMul and shufl. - finish_tabMul8_fft8(WG, lds, trig, preloads, u, 1, me, 0); // We'd rather set save_one_more_mul to 1 - partial_tabMul8(WG, lds, trig, preloads, u, 8, me); - bar(); - shufl(WG, lds, u, NW, 8); + finish_tabMul8_fft8(WG, trig, preloads, u, 1, numWG, lowMe, 0); // We'd rather set save_one_more_mul to 1 + partial_tabMul8(WG, partitioned_lds, trig, preloads, u, 8, numWG, lowMe); + shufl(WG, lds, u, NW, 8, numWG, sb, lowMe); // Finish second tabMul and perform final fft8. 
- finish_tabMul8_fft8(WG, lds, trig, preloads, u, 8, me, 0); // We'd rather set save_one_more_mul to 1 + finish_tabMul8_fft8(WG, trig, preloads, u, 8, numWG, lowMe, 0); // We'd rather set save_one_more_mul to 1 #elif WIDTH == 1024 && NW == 4 && FFT_VARIANT_W == 2 @@ -142,33 +141,30 @@ void OVERLOAD new_fft_WIDTH(local T2 *lds, T2 *u, Trig trig, int callnum) { trig += WG*4 + 2*WG*4; // Skip past old FFT_width trig values. Also skip past !save_one_more_mul trig values. // Preload trig values to hide global memory latencies. As the preloads are used, the next set of trig values are preloaded. - preload_tabMul4_trig(WG, trig, preloads, 1, me); + preload_tabMul4_trig(WG, trig, preloads, 1, numWG, lowMe); // Do first fft4, partial tabMul, and shufl. fft4(u); - partial_tabMul4(WG, lds, trig, preloads, u, 1, me); - shufl(WG, lds, u, NW, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 1, numWG, lowMe); + shufl(WG, lds, u, NW, 1, numWG, sb, lowMe); // Finish the first tabMul and perform second fft4. Do second partial tabMul and shufl. - finish_tabMul4_fft4(WG, lds, trig, preloads, u, 1, me, 1); - partial_tabMul4(WG, lds, trig, preloads, u, 4, me); - bar(WG); - shufl(WG, lds, u, NW, 4); + finish_tabMul4_fft4(WG, trig, preloads, u, 1, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 4, numWG, lowMe); + shufl(WG, lds, u, NW, 4, numWG, sb, lowMe); // Finish the second tabMul and perform third fft4. Do third partial tabMul and shufl. - finish_tabMul4_fft4(WG, lds, trig, preloads, u, 4, me, 1); - partial_tabMul4(WG, lds, trig, preloads, u, 16, me); - bar(WG); - shufl(WG, lds, u, NW, 16); + finish_tabMul4_fft4(WG, trig, preloads, u, 4, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 16, numWG, lowMe); + shufl(WG, lds, u, NW, 16, numWG, sb, lowMe); // Finish the third tabMul and perform fourth fft4. Do fourth partial tabMul and shufl. 
- finish_tabMul4_fft4(WG, lds, trig, preloads, u, 16, me, 1); - partial_tabMul4(WG, lds, trig, preloads, u, 64, me); - bar(WG); - shufl(WG, lds, u, NW, 64); + finish_tabMul4_fft4(WG, trig, preloads, u, 16, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 64, numWG, lowMe); + shufl(WG, lds, u, NW, 64, numWG, sb, lowMe); // Finish fourth tabMul and perform final fft4. - finish_tabMul4_fft4(WG, lds, trig, preloads, u, 64, me, 1); + finish_tabMul4_fft4(WG, trig, preloads, u, 64, numWG, lowMe, 1); #elif WIDTH == 4096 && NW == 8 && FFT_VARIANT_W == 2 @@ -178,39 +174,37 @@ void OVERLOAD new_fft_WIDTH(local T2 *lds, T2 *u, Trig trig, int callnum) { trig += WG*8; // Skip past old FFT_width trig values to the !save_one_more_mul trig values // Preload trig values to hide global memory latencies. As the preloads are used, the next set of trig values are preloaded. - preload_tabMul8_trig(WG, trig, preloads, 1, me); + preload_tabMul8_trig(WG, trig, preloads, 1, numWG, lowMe); // Do first fft8, partial tabMul, and shufl. fft8(u); - partial_tabMul8(WG, lds, trig, preloads, u, 1, me); - shufl(WG, lds, u, NW, 1); + partial_tabMul8(WG, partitioned_lds, trig, preloads, u, 1, numWG, lowMe); + shufl(WG, lds, u, NW, 1, numWG, sb, lowMe); // Finish the first tabMul and perform second fft8. Do second partial tabMul and shufl. - finish_tabMul8_fft8(WG, lds, trig, preloads, u, 1, me, 0); // We'd rather set save_one_more_mul to 1 - partial_tabMul8(WG, lds, trig, preloads, u, 8, me); - bar(); - shufl(WG, lds, u, NW, 8); + finish_tabMul8_fft8(WG, trig, preloads, u, 1, numWG, lowMe, 0); // We'd rather set save_one_more_mul to 1 + partial_tabMul8(WG, partitioned_lds, trig, preloads, u, 8, numWG, lowMe); + shufl(WG, lds, u, NW, 8, numWG, sb, lowMe); // Finish the second tabMul and perform third fft8. Do third partial tabMul and shufl. 
- finish_tabMul8_fft8(WG, lds, trig, preloads, u, 8, me, 0); // We'd rather set save_one_more_mul to 1 - partial_tabMul8(WG, lds, trig, preloads, u, 64, me); - bar(); - shufl(WG, lds, u, NW, 64); + finish_tabMul8_fft8(WG, trig, preloads, u, 8, numWG, lowMe, 0); // We'd rather set save_one_more_mul to 1 + partial_tabMul8(WG, partitioned_lds, trig, preloads, u, 64, numWG, lowMe); + shufl(WG, lds, u, NW, 64, numWG, sb, lowMe); // Finish third tabMul and perform final fft8. - finish_tabMul8_fft8(WG, lds, trig, preloads, u, 64, me, 0); // We'd rather set save_one_more_mul to 1 + finish_tabMul8_fft8(WG, trig, preloads, u, 64, numWG, lowMe, 0); // We'd rather set save_one_more_mul to 1 #else // Old version - fft_WIDTH(lds, u, trig); + fft_WIDTH(lds, u, trig, numWG, sb, lowMe); #endif } // There are two version of new_fft_WIDTH in case we want to try saving some trig values from new_fft_WIDTH1 in LDS memory for later use in new_fft_WIDTH2. -void OVERLOAD new_fft_WIDTH1(local T2 *lds, T2 *u, Trig trig) { new_fft_WIDTH(lds, u, trig, 1); } -void OVERLOAD new_fft_WIDTH2(local T2 *lds, T2 *u, Trig trig) { new_fft_WIDTH(lds, u, trig, 2); } +void OVERLOAD new_fft_WIDTH1(local T2 *lds, T2 *u, Trig trig, u32 numWG, const u32 sb, u32 lowMe) { new_fft_WIDTH(lds, u, trig, numWG, lowMe, sb, 1); } +void OVERLOAD new_fft_WIDTH2(local T2 *lds, T2 *u, Trig trig, u32 numWG, const u32 sb, u32 lowMe) { new_fft_WIDTH(lds, u, trig, numWG, lowMe, sb, 2); } #endif @@ -231,23 +225,22 @@ void OVERLOAD fft_NW(F2 *u) { #endif } -void OVERLOAD fft_WIDTH(local F2 *lds, F2 *u, TrigFP32 trig) { - u32 me = get_local_id(0); +void OVERLOAD fft_WIDTH(local F2 *lds, F2 *u, TrigFP32 trig, u32 numWG, const u32 sb, u32 lowMe) { + u32 WG = WIDTH / NW; #if !UNROLL_W __attribute__((opencl_unroll_hint(1))) #endif - for (u32 s = 1; s < WIDTH / NW; s *= NW) { - if (s > 1) { bar(); } + for (u32 s = 1; s < WG; s *= NW) { fft_NW(u); - tabMul(WIDTH / NW, trig, u, NW, s, me); - shufl(WIDTH / NW, lds, u, NW, s); + 
tabMul(WG, trig, u, NW, s, lowMe); + shufl(WG, lds, u, NW, s, numWG, sb, lowMe); } fft_NW(u); } -void OVERLOAD new_fft_WIDTH1(local F2 *lds, F2 *u, TrigFP32 trig) { fft_WIDTH(lds, u, trig); } -void OVERLOAD new_fft_WIDTH2(local F2 *lds, F2 *u, TrigFP32 trig) { fft_WIDTH(lds, u, trig); } +void OVERLOAD new_fft_WIDTH1(local F2 *lds, F2 *u, TrigFP32 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_WIDTH(lds, u, trig, numWG, sb, lowMe); } +void OVERLOAD new_fft_WIDTH2(local F2 *lds, F2 *u, TrigFP32 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_WIDTH(lds, u, trig, numWG, sb, lowMe); } #endif @@ -268,23 +261,22 @@ void OVERLOAD fft_NW(GF31 *u) { #endif } -void OVERLOAD fft_WIDTH(local GF31 *lds, GF31 *u, TrigGF31 trig) { - u32 me = get_local_id(0); +void OVERLOAD fft_WIDTH(local GF31 *lds, GF31 *u, TrigGF31 trig, u32 numWG, const u32 sb, u32 lowMe) { + u32 WG = WIDTH / NW; #if !UNROLL_W __attribute__((opencl_unroll_hint(1))) #endif - for (u32 s = 1; s < WIDTH / NW; s *= NW) { - if (s > 1) { bar(); } + for (u32 s = 1; s < WG; s *= NW) { fft_NW(u); - tabMul(WIDTH / NW, trig, u, NW, s, me); - shufl(WIDTH / NW, lds, u, NW, s); + tabMul(WG, trig, u, NW, s, lowMe); + shufl(WG, lds, u, NW, s, numWG, sb, lowMe); } fft_NW(u); } -void OVERLOAD new_fft_WIDTH1(local GF31 *lds, GF31 *u, TrigGF31 trig) { fft_WIDTH(lds, u, trig); } -void OVERLOAD new_fft_WIDTH2(local GF31 *lds, GF31 *u, TrigGF31 trig) { fft_WIDTH(lds, u, trig); } +void OVERLOAD new_fft_WIDTH1(local GF31 *lds, GF31 *u, TrigGF31 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_WIDTH(lds, u, trig, numWG, sb, lowMe); } +void OVERLOAD new_fft_WIDTH2(local GF31 *lds, GF31 *u, TrigGF31 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_WIDTH(lds, u, trig, numWG, sb, lowMe); } #endif @@ -305,22 +297,21 @@ void OVERLOAD fft_NW(GF61 *u) { #endif } -void OVERLOAD fft_WIDTH(local GF61 *lds, GF61 *u, TrigGF61 trig) { - u32 me = get_local_id(0); +void OVERLOAD fft_WIDTH(local GF61 *lds, GF61 *u, TrigGF61 trig, u32 numWG, const u32 sb, 
u32 lowMe) { + u32 WG = WIDTH / NW; #if !UNROLL_W __attribute__((opencl_unroll_hint(1))) #endif - for (u32 s = 1; s < WIDTH / NW; s *= NW) { - if (s > 1) { bar(); } + for (u32 s = 1; s < WG; s *= NW) { fft_NW(u); - tabMul(WIDTH / NW, trig, u, NW, s, me); - shufl(WIDTH / NW, lds, u, NW, s); + tabMul(WG, trig, u, NW, s, lowMe); + shufl(WG, lds, u, NW, s, numWG, sb, lowMe); } fft_NW(u); } -void OVERLOAD new_fft_WIDTH1(local GF61 *lds, GF61 *u, TrigGF61 trig) { fft_WIDTH(lds, u, trig); } -void OVERLOAD new_fft_WIDTH2(local GF61 *lds, GF61 *u, TrigGF61 trig) { fft_WIDTH(lds, u, trig); } +void OVERLOAD new_fft_WIDTH1(local GF61 *lds, GF61 *u, TrigGF61 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_WIDTH(lds, u, trig, numWG, sb, lowMe); } +void OVERLOAD new_fft_WIDTH2(local GF61 *lds, GF61 *u, TrigGF61 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_WIDTH(lds, u, trig, numWG, sb, lowMe); } #endif diff --git a/src/cl/math.cl b/src/cl/math.cl index 479238c6..5ef7d515 100644 --- a/src/cl/math.cl +++ b/src/cl/math.cl @@ -171,7 +171,7 @@ i32 optional_sub(i32 a, const i32 b) { // Optionally subtract a value if first arg is greater than value. 
i32 optional_mod(i32 a, const i32 b) { -#if 0 //HAS_PTX >= 100 // setp/sub instruction requires sm_10 support or higher // Not faster on 5xxx GPUs (not sure why) +#if ENABLE_OPTIONAL_MOD && HAS_PTX >= 100 // setp/sub instruction requires sm_10 support or higher // Not faster on 5xxx GPUs (not sure why) __asm("{.reg .pred %%p;\n\t" " setp.ge.s32 %%p, %0, %1;\n\t" // a > b " @%%p sub.s32 %0, %0, %1;}" // if (a > b) a = a - b @@ -207,7 +207,7 @@ u64 OVERLOAD mad32(u32 a, u32 b, u64 c) { } u128 OVERLOAD mad64(u64 a, u64 b, u64 c) { -#if 0 && HAS_PTX >= 200 // mad instruction requires sm_20 support or higher // Slower on TitanV and mobile 4070, don't understand why +#if ENABLE_MAD64 && HAS_PTX >= 200 // mad instruction requires sm_20 support or higher // Slower on TitanV and mobile 4070, don't understand why u64 reslo, reshi; __asm("mad.lo.cc.u64 %0, %2, %3, %4;\n\t" "madc.hi.u64 %1, %2, %3, 0;" : "=l"(reslo), "=l"(reshi) : "l"(a), "l"(b), "l"(u128_lo64(c))); @@ -236,7 +236,7 @@ u128 OVERLOAD mad64(u64 a, u64 b, u64 c) { } u128 OVERLOAD mad64(u64 a, u64 b, u128 c) { -#if 0 && HAS_PTX >= 200 // mad instruction requires sm_20 support or higher // Slower on TitanV and mobile 4070, don't understand why +#if ENABLE_MAD64 && HAS_PTX >= 200 // mad instruction requires sm_20 support or higher // Slower on TitanV and mobile 4070, don't understand why u64 reslo, reshi; __asm("mad.lo.cc.u64 %0, %2, %3, %4;\n\t" "madc.hi.u64 %1, %2, %3, %5;" : "=l"(reslo), "=l"(reshi) : "l"(a), "l"(b), "l"(u128_lo64(c)), "l"(u128_hi64(c))); diff --git a/src/cl/middle.cl b/src/cl/middle.cl index 263937f3..36fcac56 100644 --- a/src/cl/middle.cl +++ b/src/cl/middle.cl @@ -54,13 +54,13 @@ // u[i] i ranges 0...MIDDLE-1 (multiples of SMALL_HEIGHT) // y ranges 0...SMALL_HEIGHT-1 (multiples of one) -void OVERLOAD writeCarryFusedLine(T2 *u, P(T2) out, u32 line) { +void OVERLOAD writeCarryFusedLine(T2 *u, P(T2) out, u32 line, u32 me) { #if PAD_SIZE > 0 u32 BIG_PAD_SIZE = (PAD_SIZE/2+1)*PAD_SIZE; - out += 
line * WIDTH + line * PAD_SIZE + line / SMALL_HEIGHT * BIG_PAD_SIZE + (u32) get_local_id(0); // One pad every line + a big pad every SMALL_HEIGHT lines + out += line * WIDTH + line * PAD_SIZE + line / SMALL_HEIGHT * BIG_PAD_SIZE + me; // One pad every line + a big pad every SMALL_HEIGHT lines for (u32 i = 0; i < NW; ++i) { NTSTORE(out[i * G_W], u[i]); } #else - out += line * WIDTH + (u32) get_local_id(0); + out += line * WIDTH + me; for (u32 i = 0; i < NW; ++i) { NTSTORE(out[i * G_W], u[i]); } #endif } @@ -311,8 +311,7 @@ void OVERLOAD writeMiddleOutLine (P(T2) out, T2 *u, u32 chunk_y, u32 chunk_x) } // Read a line for carryFused or FFTW. This line was written by writeMiddleOutLine above. -void OVERLOAD readCarryFusedLine(CP(T2) in, T2 *u, u32 line) { - u32 me = get_local_id(0); +void OVERLOAD readCarryFusedLine(CP(T2) in, T2 *u, u32 line, u32 me) { u32 SIZEY = OUT_WG / OUT_SIZEX; #if PAD_SIZE > 0 @@ -381,13 +380,13 @@ void OVERLOAD readCarryFusedLine(CP(T2) in, T2 *u, u32 line) { #if FFT_FP32 || NTT_GF31 -void OVERLOAD writeCarryFusedLine(F2 *u, P(F2) out, u32 line) { +void OVERLOAD writeCarryFusedLine(F2 *u, P(F2) out, u32 line, u32 me) { #if PAD_SIZE > 0 u32 BIG_PAD_SIZE = (PAD_SIZE/2+1)*PAD_SIZE; - out += line * WIDTH + line * PAD_SIZE + line / SMALL_HEIGHT * BIG_PAD_SIZE + (u32) get_local_id(0); // One pad every line + a big pad every SMALL_HEIGHT lines + out += line * WIDTH + line * PAD_SIZE + line / SMALL_HEIGHT * BIG_PAD_SIZE + me; // One pad every line + a big pad every SMALL_HEIGHT lines for (u32 i = 0; i < NW; ++i) { NTSTORE(out[i * G_W], u[i]); } #else - out += line * WIDTH + (u32) get_local_id(0); + out += line * WIDTH + me; for (u32 i = 0; i < NW; ++i) { NTSTORE(out[i * G_W], u[i]); } #endif } @@ -537,8 +536,7 @@ void OVERLOAD writeMiddleOutLine (P(F2) out, F2 *u, u32 chunk_y, u32 chunk_x) #endif } -void OVERLOAD readCarryFusedLine(CP(F2) in, F2 *u, u32 line) { - u32 me = get_local_id(0); +void OVERLOAD readCarryFusedLine(CP(F2) in, F2 *u, u32 line, 
u32 me) { u32 SIZEY = OUT_WG / OUT_SIZEX; #if PAD_SIZE > 0 // Adjust in pointer based on the x value used in writeMiddleOutLine @@ -595,8 +593,8 @@ void OVERLOAD readCarryFusedLine(CP(F2) in, F2 *u, u32 line) { // Since F2 and GF31 are the same size we can simply call the floats based code -void OVERLOAD writeCarryFusedLine(GF31 *u, P(GF31) out, u32 line) { - writeCarryFusedLine((F2 *) u, (P(F2)) out, line); +void OVERLOAD writeCarryFusedLine(GF31 *u, P(GF31) out, u32 line, u32 me) { + writeCarryFusedLine((F2 *) u, (P(F2)) out, line, me); } void OVERLOAD readMiddleInLine(GF31 *u, CP(GF31) in, u32 y, u32 x) { @@ -623,8 +621,8 @@ void OVERLOAD writeMiddleOutLine (P(GF31) out, GF31 *u, u32 chunk_y, u32 chunk_x writeMiddleOutLine ((P(F2)) out, (F2 *) u, chunk_y, chunk_x); } -void OVERLOAD readCarryFusedLine(CP(GF31) in, GF31 *u, u32 line) { - readCarryFusedLine((CP(F2)) in, (F2 *) u, line); +void OVERLOAD readCarryFusedLine(CP(GF31) in, GF31 *u, u32 line, u32 me) { + readCarryFusedLine((CP(F2)) in, (F2 *) u, line, me); } #endif @@ -638,8 +636,8 @@ void OVERLOAD readCarryFusedLine(CP(GF31) in, GF31 *u, u32 line) { // Since T2 and GF61 are the same size we can simply call the doubles based code -void OVERLOAD writeCarryFusedLine(GF61 *u, P(GF61) out, u32 line) { - writeCarryFusedLine((T2 *) u, (P(T2)) out, line); +void OVERLOAD writeCarryFusedLine(GF61 *u, P(GF61) out, u32 line, u32 me) { + writeCarryFusedLine((T2 *) u, (P(T2)) out, line, me); } void OVERLOAD readMiddleInLine(GF61 *u, CP(GF61) in, u32 y, u32 x) { @@ -666,8 +664,8 @@ void OVERLOAD writeMiddleOutLine (P(GF61) out, GF61 *u, u32 chunk_y, u32 chunk_x writeMiddleOutLine ((P(T2)) out, (T2 *) u, chunk_y, chunk_x); } -void OVERLOAD readCarryFusedLine(CP(GF61) in, GF61 *u, u32 line) { - readCarryFusedLine((CP(T2)) in, (T2 *) u, line); +void OVERLOAD readCarryFusedLine(CP(GF61) in, GF61 *u, u32 line, u32 me) { + readCarryFusedLine((CP(T2)) in, (T2 *) u, line, me); } #endif @@ -778,8 +776,7 @@ void OVERLOAD 
readCarryFusedLine(CP(GF61) in, GF61 *u, u32 line) { // line ranges 0...BIG_HEIGHT-1 (multiples of one) // Read a line for carryFused or FFTW. This line was written by writeMiddleOutLine above. -void OVERLOAD readCarryFusedLine(CP(T2) in, T2 *u, u32 line) { - u32 me = get_local_id(0); // Multiples of BIG_HEIGHT +void OVERLOAD readCarryFusedLine(CP(T2) in, T2 *u, u32 line, u32 me) { u32 middle = line / SMALL_HEIGHT; // Multiples of SMALL_HEIGHT line = line % SMALL_HEIGHT; // Multiples of one in += (me / 16 * SIZEW) + (middle * SIZEM) + (line % 16 * SIZEBLK) + SWIZ(line % 16, line / 16) * 16 + (me % 16); @@ -787,8 +784,7 @@ void OVERLOAD readCarryFusedLine(CP(T2) in, T2 *u, u32 line) { } // Write a line from carryFused. This data will be read by fftMiddleIn. -void OVERLOAD writeCarryFusedLine(T2 *u, P(T2) out, u32 line) { - u32 me = get_local_id(0); // Multiples of BIG_HEIGHT +void OVERLOAD writeCarryFusedLine(T2 *u, P(T2) out, u32 line, u32 me) { // me is multiples of BIG_HEIGHT u32 middle = line / SMALL_HEIGHT; // Multiples of SMALL_HEIGHT line = line % SMALL_HEIGHT; // Multiples of one out += (me / 16 * SIZEW) + (middle * SIZEM) + (line % 16 * SIZEBLK) + SWIZ(line % 16, line / 16) * 16 + (me % 16); @@ -899,8 +895,7 @@ void OVERLOAD writeMiddleOutLine (P(T2) out, T2 *u, u32 y, u32 x) // line ranges 0...BIG_HEIGHT-1 (multiples of one) // Read a line for carryFused or FFTW. This line was written by writeMiddleOutLine above. -void OVERLOAD readCarryFusedLine(CP(F2) in, F2 *u, u32 line) { - u32 me = get_local_id(0); // Multiples of BIG_HEIGHT +void OVERLOAD readCarryFusedLine(CP(F2) in, F2 *u, u32 line, u32 me) { u32 middle = line / SMALL_HEIGHT; // Multiples of SMALL_HEIGHT line = line % SMALL_HEIGHT; // Multiples of one in += (me / 16 * SIZEW32) + (middle * SIZEM32) + (line % 16 * SIZEBLK32) + SWIZ32(line % 16, line / 16) * 16 + (me % 16); @@ -908,8 +903,7 @@ void OVERLOAD readCarryFusedLine(CP(F2) in, F2 *u, u32 line) { } // Write a line from carryFused. 
This data will be read by fftMiddleIn. -void OVERLOAD writeCarryFusedLine(F2 *u, P(F2) out, u32 line) { - u32 me = get_local_id(0); // Multiples of BIG_HEIGHT +void OVERLOAD writeCarryFusedLine(F2 *u, P(F2) out, u32 line, u32 me) { // me is multiples of BIG_HEIGHT u32 middle = line / SMALL_HEIGHT; // Multiples of SMALL_HEIGHT line = line % SMALL_HEIGHT; // Multiples of one out += (me / 16 * SIZEW32) + (middle * SIZEM32) + (line % 16 * SIZEBLK32) + SWIZ32(line % 16, line / 16) * 16 + (me % 16); @@ -990,12 +984,12 @@ void OVERLOAD writeMiddleOutLine (P(F2) out, F2 *u, u32 y, u32 x) // Since F2 and GF31 are the same size we can simply call the floats based code -void OVERLOAD readCarryFusedLine(CP(GF31) in, GF31 *u, u32 line) { - readCarryFusedLine((CP(F2)) in, (F2 *) u, line); +void OVERLOAD readCarryFusedLine(CP(GF31) in, GF31 *u, u32 line, u32 me) { + readCarryFusedLine((CP(F2)) in, (F2 *) u, line, me); } -void OVERLOAD writeCarryFusedLine(GF31 *u, P(GF31) out, u32 line) { - writeCarryFusedLine((F2 *) u, (P(F2)) out, line); +void OVERLOAD writeCarryFusedLine(GF31 *u, P(GF31) out, u32 line, u32 me) { + writeCarryFusedLine((F2 *) u, (P(F2)) out, line, me); } void OVERLOAD readMiddleInLine(GF31 *u, CP(GF31) in, u32 y, u32 x) { @@ -1033,12 +1027,12 @@ void OVERLOAD writeMiddleOutLine (P(GF31) out, GF31 *u, u32 y, u32 x) { // Since T2 and GF61 are the same size we can simply call the doubles based code -void OVERLOAD readCarryFusedLine(CP(GF61) in, GF61 *u, u32 line) { - readCarryFusedLine((CP(T2)) in, (T2 *) u, line); +void OVERLOAD readCarryFusedLine(CP(GF61) in, GF61 *u, u32 line, u32 me) { + readCarryFusedLine((CP(T2)) in, (T2 *) u, line, me); } -void OVERLOAD writeCarryFusedLine(GF61 *u, P(GF61) out, u32 line) { - writeCarryFusedLine((T2 *) u, (P(T2)) out, line); +void OVERLOAD writeCarryFusedLine(GF61 *u, P(GF61) out, u32 line, u32 me) { + writeCarryFusedLine((T2 *) u, (P(T2)) out, line, me); } void OVERLOAD readMiddleInLine(GF61 *u, CP(GF61) in, u32 y, u32 x) { 
diff --git a/src/cl/tailmul.cl b/src/cl/tailmul.cl index 1cdd5db0..3e2299f8 100644 --- a/src/cl/tailmul.cl +++ b/src/cl/tailmul.cl @@ -49,7 +49,8 @@ void OVERLOAD pairMul(u32 N, T2 *u, T2 *v, T2 *p, T2 *q, T2 base_squared, bool s } KERNEL(G_H) tailMul(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { - local T2 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local T2 lds[lds_bytes / sizeof(T2)]; T2 u[NH], v[NH]; T2 p[NH], q[NH]; @@ -65,7 +66,9 @@ KERNEL(G_H) tailMul(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { readTailFusedLine(in, u, line1, me); readTailFusedLine(in, v, line2, me); -#if NH == 8 +#if FFT_VARIANT_H != 0 + T2 w; +#elif NH == 8 T2 w = fancyTrig_N(ND / SMALL_HEIGHT * me); #else T2 w = slowTrig_N(ND / SMALL_HEIGHT * me, ND / NH); @@ -74,19 +77,15 @@ KERNEL(G_H) tailMul(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { #if MUL_LOW read(G_H, NH, p, a, memline1 * SMALL_HEIGHT); read(G_H, NH, q, a, memline2 * SMALL_HEIGHT); - fft_HEIGHT(lds, u, smallTrig, w); - bar(); - fft_HEIGHT(lds, v, smallTrig, w); + fft_HEIGHT(lds, u, smallTrig, w, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, v, smallTrig, w, 1, SHUFL_BYTES_H, me); #else readTailFusedLine(a, p, line1, me); readTailFusedLine(a, q, line2, me); - fft_HEIGHT(lds, u, smallTrig, w); - bar(); - fft_HEIGHT(lds, v, smallTrig, w); - bar(); - fft_HEIGHT(lds, p, smallTrig, w); - bar(); - fft_HEIGHT(lds, q, smallTrig, w); + fft_HEIGHT(lds, u, smallTrig, w, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, v, smallTrig, w, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, p, smallTrig, w, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, q, smallTrig, w, 1, SHUFL_BYTES_H, me); #endif T2 trig = slowTrig_N(line1 + me * H, ND / NH); @@ -109,10 +108,8 @@ KERNEL(G_H) tailMul(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { reverseLine(G_H, lds, v); } - bar(); - fft_HEIGHT(lds, v, smallTrig, w); - bar(); - fft_HEIGHT(lds, u, smallTrig, w); + fft_HEIGHT(lds, v, smallTrig, w, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, u, 
smallTrig, w, 1, SHUFL_BYTES_H, me); writeTailFusedLine(v, out, memline2, me); writeTailFusedLine(u, out, memline1, me); } @@ -164,7 +161,8 @@ void OVERLOAD pairMul(u32 N, F2 *u, F2 *v, F2 *p, F2 *q, F2 base_squared, bool s } KERNEL(G_H) tailMul(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { - local F2 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local F2 lds[lds_bytes / sizeof(F2)]; CP(F2) inF2 = (CP(F2)) in; CP(F2) aF2 = (CP(F2)) a; @@ -188,19 +186,15 @@ KERNEL(G_H) tailMul(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { #if MUL_LOW read(G_H, NH, p, aF2, memline1 * SMALL_HEIGHT); read(G_H, NH, q, aF2, memline2 * SMALL_HEIGHT); - fft_HEIGHT(lds, u, smallTrigF2); - bar(); - fft_HEIGHT(lds, v, smallTrigF2); + fft_HEIGHT(lds, u, smallTrigF2, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, v, smallTrigF2, 1, SHUFL_BYTES_H, me); #else readTailFusedLine(aF2, p, line1, me); readTailFusedLine(aF2, q, line2, me); - fft_HEIGHT(lds, u, smallTrigF2); - bar(); - fft_HEIGHT(lds, v, smallTrigF2); - bar(); - fft_HEIGHT(lds, p, smallTrigF2); - bar(); - fft_HEIGHT(lds, q, smallTrigF2); + fft_HEIGHT(lds, u, smallTrigF2, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, v, smallTrigF2, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, p, smallTrigF2, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, q, smallTrigF2, 1, SHUFL_BYTES_H, me); #endif F2 trig = slowTrig_N(line1 + me * H, ND / NH); @@ -223,10 +217,8 @@ KERNEL(G_H) tailMul(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { reverseLine(G_H, lds, v); } - bar(); - fft_HEIGHT(lds, v, smallTrigF2); - bar(); - fft_HEIGHT(lds, u, smallTrigF2); + fft_HEIGHT(lds, v, smallTrigF2, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, u, smallTrigF2, 1, SHUFL_BYTES_H, me); writeTailFusedLine(v, outF2, memline2, me); writeTailFusedLine(u, outF2, memline1, me); } @@ -278,7 +270,8 @@ void OVERLOAD pairMul(u32 N, GF31 *u, GF31 *v, GF31 *p, GF31 *q, GF31 base_squar } KERNEL(G_H) tailMulGF31(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { - local GF31 
lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF31 lds[lds_bytes / sizeof(GF31)]; CP(GF31) in31 = (CP(GF31)) (in + DISTGF31); CP(GF31) a31 = (CP(GF31)) (a + DISTGF31); @@ -302,19 +295,15 @@ KERNEL(G_H) tailMulGF31(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { #if MUL_LOW read(G_H, NH, p, a31, memline1 * SMALL_HEIGHT); read(G_H, NH, q, a31, memline2 * SMALL_HEIGHT); - fft_HEIGHT(lds, u, smallTrig31); - bar(); - fft_HEIGHT(lds, v, smallTrig31); + fft_HEIGHT(lds, u, smallTrig31, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, v, smallTrig31, 1, SHUFL_BYTES_H, me); #else readTailFusedLine(a31, p, line1, me); readTailFusedLine(a31, q, line2, me); - fft_HEIGHT(lds, u, smallTrig31); - bar(); - fft_HEIGHT(lds, v, smallTrig31); - bar(); - fft_HEIGHT(lds, p, smallTrig31); - bar(); - fft_HEIGHT(lds, q, smallTrig31); + fft_HEIGHT(lds, u, smallTrig31, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, v, smallTrig31, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, p, smallTrig31, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, q, smallTrig31, 1, SHUFL_BYTES_H, me); #endif // Calculate number of trig values used by fft_HEIGHT (see genSmallTrigCombo in trigBufCache.cpp) @@ -354,10 +343,8 @@ KERNEL(G_H) tailMulGF31(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { reverseLine(G_H, lds, v); } - bar(); - fft_HEIGHT(lds, v, smallTrig31); - bar(); - fft_HEIGHT(lds, u, smallTrig31); + fft_HEIGHT(lds, v, smallTrig31, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, u, smallTrig31, 1, SHUFL_BYTES_H, me); writeTailFusedLine(v, out31, memline2, me); writeTailFusedLine(u, out31, memline1, me); } @@ -407,7 +394,8 @@ void OVERLOAD pairMul(u32 N, GF61 *u, GF61 *v, GF61 *p, GF61 *q, GF61 base_squar } KERNEL(G_H) tailMulGF61(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { - local GF61 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF61 lds[lds_bytes / sizeof(GF61)]; CP(GF61) in61 = (CP(GF61)) (in + DISTGF61); CP(GF61) a61 = (CP(GF61)) (a + DISTGF61); @@ 
-431,19 +419,15 @@ KERNEL(G_H) tailMulGF61(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { #if MUL_LOW read(G_H, NH, p, a61, memline1 * SMALL_HEIGHT); read(G_H, NH, q, a61, memline2 * SMALL_HEIGHT); - fft_HEIGHT(lds, u, smallTrig61); - bar(); - fft_HEIGHT(lds, v, smallTrig61); + fft_HEIGHT(lds, u, smallTrig61, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, v, smallTrig61, 1, SHUFL_BYTES_H, me); #else readTailFusedLine(a61, p, line1, me); readTailFusedLine(a61, q, line2, me); - fft_HEIGHT(lds, u, smallTrig61); - bar(); - fft_HEIGHT(lds, v, smallTrig61); - bar(); - fft_HEIGHT(lds, p, smallTrig61); - bar(); - fft_HEIGHT(lds, q, smallTrig61); + fft_HEIGHT(lds, u, smallTrig61, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, v, smallTrig61, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, p, smallTrig61, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, q, smallTrig61, 1, SHUFL_BYTES_H, me); #endif // Calculate number of trig values used by fft_HEIGHT (see genSmallTrigCombo in trigBufCache.cpp) @@ -483,10 +467,8 @@ KERNEL(G_H) tailMulGF61(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { reverseLine(G_H, lds, v); } - bar(); - fft_HEIGHT(lds, v, smallTrig61); - bar(); - fft_HEIGHT(lds, u, smallTrig61); + fft_HEIGHT(lds, v, smallTrig61, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, u, smallTrig61, 1, SHUFL_BYTES_H, me); writeTailFusedLine(v, out61, memline2, me); writeTailFusedLine(u, out61, memline1, me); } diff --git a/src/cl/tailsquare.cl b/src/cl/tailsquare.cl index bf960f1c..d9edc1f7 100644 --- a/src/cl/tailsquare.cl +++ b/src/cl/tailsquare.cl @@ -54,7 +54,8 @@ void OVERLOAD pairSq(u32 N, T2 *u, T2 *v, T2 base_squared, bool special) { // The kernel tailSquareZero handles the special cases in tailSquare, i.e. the lines 0 and H/2 // This kernel is launched with 2 workgroups (handling line 0, resp. 
H/2) KERNEL(G_H) tailSquareZero(P(T2) out, CP(T2) in, Trig smallTrig) { - local T2 lds[SMALL_HEIGHT / 2]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local T2 lds[lds_bytes / sizeof(T2)]; T2 u[NH]; u32 H = ND / SMALL_HEIGHT; @@ -66,7 +67,9 @@ KERNEL(G_H) tailSquareZero(P(T2) out, CP(T2) in, Trig smallTrig) { u32 me = get_local_id(0); readTailFusedLine(in, u, line, me); -#if NH == 8 +#if FFT_VARIANT_H != 0 + T2 w; +#elif NH == 8 T2 w = fancyTrig_N(ND / SMALL_HEIGHT * me); #else T2 w = slowTrig_N(ND / SMALL_HEIGHT * me, ND / NH); @@ -74,20 +77,20 @@ KERNEL(G_H) tailSquareZero(P(T2) out, CP(T2) in, Trig smallTrig) { T2 trig = slowTrig_N(line + me * H, ND / NH); - fft_HEIGHT(lds, u, smallTrig, w); + fft_HEIGHT(lds, u, smallTrig, w, 1, SHUFL_BYTES_H, me); reverse(G_H, lds, u + NH/2, !which); pairSq(NH/2, u, u + NH/2, trig, !which); reverse(G_H, lds, u + NH/2, !which); - bar(); - fft_HEIGHT(lds, u, smallTrig, w); + fft_HEIGHT(lds, u, smallTrig, w, 1, SHUFL_BYTES_H, me); writeTailFusedLine(u, out, transPos(line, MIDDLE, WIDTH), me); } #if SINGLE_WIDE KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { - local T2 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local T2 lds[lds_bytes / sizeof(T2)]; T2 u[NH], v[NH]; @@ -107,22 +110,17 @@ KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { readTailFusedLine(in, u, line1, me); readTailFusedLine(in, v, line2, me); -#if NH == 8 +#if FFT_VARIANT_H != 0 + T2 w; +#elif NH == 8 T2 w = fancyTrig_N(ND / SMALL_HEIGHT * me); #else T2 w = slowTrig_N(ND / SMALL_HEIGHT * me, ND / NH); #endif -#if ZEROHACK_H - u32 zerohack = (u32) get_group_id(0) / 131072; - fft_HEIGHT(lds + zerohack, u, smallTrig + zerohack, w); - bar(); - fft_HEIGHT(lds + zerohack, v, smallTrig + zerohack, w); -#else - fft_HEIGHT(lds, u, smallTrig, w); - bar(); - fft_HEIGHT(lds, v, smallTrig, w); -#endif + u32 zerohack = ZEROHACK_H * (u32) get_group_id(0) / 131072; + fft_HEIGHT(lds + zerohack, u, smallTrig + 
zerohack, w, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds + zerohack, v, smallTrig + zerohack, w, 1, SHUFL_BYTES_H, me); // Compute trig values from scratch. Good on GPUs with high DP throughput. #if TAIL_TRIGS == 2 @@ -169,10 +167,8 @@ KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { reverseLine(G_H, lds, v); } - bar(); - fft_HEIGHT(lds, v, smallTrig, w); - bar(); - fft_HEIGHT(lds, u, smallTrig, w); + fft_HEIGHT(lds, v, smallTrig, w, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, u, smallTrig, w, 1, SHUFL_BYTES_H, me); writeTailFusedLine(v, out, memline2, me); writeTailFusedLine(u, out, memline1, me); @@ -202,7 +198,8 @@ void OVERLOAD pairSq2_special(T2 *u, T2 base_squared) { } KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { - local T2 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local T2 lds[lds_bytes * 2 / sizeof(T2)]; T2 u[NH]; @@ -227,18 +224,16 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { // Read lines u and v readTailFusedLine(in, u, line, lowMe); -#if NH == 8 +#if FFT_VARIANT_H != 0 + T2 w; +#elif NH == 8 T2 w = fancyTrig_N(H * lowMe); #else T2 w = slowTrig_N(H * lowMe, ND / NH); #endif -#if ZEROHACK_H - u32 zerohack = (u32) get_group_id(0) / 131072; - new_fft_HEIGHT2_1(lds + zerohack, u, smallTrig + zerohack, w); -#else - new_fft_HEIGHT2_1(lds, u, smallTrig, w); -#endif + u32 zerohack = ZEROHACK_H * (u32) get_group_id(0) / 131072; + new_fft_HEIGHT1(lds + zerohack, u, smallTrig + zerohack, w, 2, SHUFL_BYTES_H, lowMe); // Compute trig values from scratch. Good on GPUs with high DP throughput. #if TAIL_TRIGS == 2 @@ -263,8 +258,6 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { T2 trig = NTLOAD(smallTrig[height_trigs + line_u*G_H*2 + me]); #endif - bar(G_H); - #if SINGLE_KERNEL // Line 0 and H/2 are special: they pair with themselves, line 0 is offseted by 1. 
if (line_u == 0) { @@ -276,15 +269,12 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { #else if (1) { #endif - revCrossLine(G_H, lds, u + NH/2, NH/2, isSecondHalf); + revCrossLine(lds, u); pairSq(NH/2, u, u + NH/2, trig, false); - bar(G_H); - revCrossLine(G_H, lds, u + NH/2, NH/2, !isSecondHalf); + revCrossLine(lds, u); } - bar(G_H); - - new_fft_HEIGHT2_2(lds, u, smallTrig, w); + new_fft_HEIGHT2(lds, u, smallTrig, w, 2, SHUFL_BYTES_H, lowMe); // Write lines u and v writeTailFusedLine(u, out, transPos(line, MIDDLE, WIDTH), lowMe); @@ -342,7 +332,8 @@ void OVERLOAD pairSq(u32 N, F2 *u, F2 *v, F2 base_squared, bool special) { // The kernel tailSquareZero handles the special cases in tailSquare, i.e. the lines 0 and H/2 // This kernel is launched with 2 workgroups (handling line 0, resp. H/2) KERNEL(G_H) tailSquareZero(P(T2) out, CP(T2) in, Trig smallTrig) { - local F2 lds[SMALL_HEIGHT / 2]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local F2 lds[lds_bytes / sizeof(F2)]; F2 u[NH]; u32 H = ND / SMALL_HEIGHT; @@ -360,20 +351,20 @@ KERNEL(G_H) tailSquareZero(P(T2) out, CP(T2) in, Trig smallTrig) { F2 trig = slowTrig_N(line + me * H, ND / NH); - fft_HEIGHT(lds, u, smallTrigF2); + fft_HEIGHT(lds, u, smallTrigF2, 1, SHUFL_BYTES_H, me); reverse(G_H, lds, u + NH/2, !which); pairSq(NH/2, u, u + NH/2, trig, !which); reverse(G_H, lds, u + NH/2, !which); - bar(); - fft_HEIGHT(lds, u, smallTrigF2); + fft_HEIGHT(lds, u, smallTrigF2, 1, SHUFL_BYTES_H, me); writeTailFusedLine(u, outF2, transPos(line, MIDDLE, WIDTH), me); } #if SINGLE_WIDE KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { - local F2 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local F2 lds[lds_bytes / sizeof(F2)]; CP(F2) inF2 = (CP(F2)) in; P(F2) outF2 = (P(F2)) out; @@ -397,16 +388,9 @@ KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { readTailFusedLine(inF2, u, line1, me); readTailFusedLine(inF2, v, line2, me); -#if ZEROHACK_H - 
u32 zerohack = get_group_id(0) / 131072; - fft_HEIGHT(lds + zerohack, u, smallTrigF2 + zerohack); - bar(); - fft_HEIGHT(lds + zerohack, v, smallTrigF2 + zerohack); -#else - fft_HEIGHT(lds, u, smallTrigF2); - bar(); - fft_HEIGHT(lds, v, smallTrigF2); -#endif + u32 zerohack = ZEROHACK_H * (u32) get_group_id(0) / 131072; + fft_HEIGHT(lds + zerohack, u, smallTrigF2 + zerohack, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds + zerohack, v, smallTrigF2 + zerohack, 1, SHUFL_BYTES_H, me); // Compute trig values from scratch. Good on GPUs with high DP throughput. #if TAIL_TRIGS32 == 2 @@ -453,10 +437,8 @@ KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { reverseLine(G_H, lds, v); } - bar(); - fft_HEIGHT(lds, v, smallTrigF2); - bar(); - fft_HEIGHT(lds, u, smallTrigF2); + fft_HEIGHT(lds, v, smallTrigF2, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, u, smallTrigF2, 1, SHUFL_BYTES_H, me); writeTailFusedLine(v, outF2, memline2, me); writeTailFusedLine(u, outF2, memline1, me); @@ -486,7 +468,8 @@ void OVERLOAD pairSq2_special(F2 *u, F2 base_squared) { } KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { - local F2 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local F2 lds[lds_bytes * 2 / sizeof(F2)]; CP(F2) inF2 = (CP(F2)) in; P(F2) outF2 = (P(F2)) out; @@ -515,12 +498,8 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { // Read lines u and v readTailFusedLine(inF2, u, line, lowMe); -#if ZEROHACK_H - u32 zerohack = (u32) get_group_id(0) / 131072; - new_fft_HEIGHT2_1(lds + zerohack, u, smallTrigF2 + zerohack); -#else - new_fft_HEIGHT2_1(lds, u, smallTrigF2); -#endif + u32 zerohack = ZEROHACK_H * (u32) get_group_id(0) / 131072; + new_fft_HEIGHT1(lds + zerohack, u, smallTrigF2 + zerohack, 2, SHUFL_BYTES_H, lowMe); // Compute trig values from scratch. Good on GPUs with high DP throughput. 
#if TAIL_TRIGS32 == 2 @@ -545,8 +524,6 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { F2 trig = NTLOAD(smallTrigF2[height_trigs + line_u*G_H*2 + me]); #endif - bar(G_H); - #if SINGLE_KERNEL // Line 0 and H/2 are special: they pair with themselves, line 0 is offseted by 1. if (line_u == 0) { @@ -558,15 +535,12 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { #else if (1) { #endif - revCrossLine(G_H, lds, u + NH/2, NH/2, isSecondHalf); + revCrossLine(lds, u); pairSq(NH/2, u, u + NH/2, trig, false); - bar(G_H); - revCrossLine(G_H, lds, u + NH/2, NH/2, !isSecondHalf); + revCrossLine(lds, u); } - bar(G_H); - - new_fft_HEIGHT2_2(lds, u, smallTrigF2); + new_fft_HEIGHT2(lds, u, smallTrigF2, 2, SHUFL_BYTES_H, lowMe); // Write lines u and v writeTailFusedLine(u, outF2, transPos(line, MIDDLE, WIDTH), lowMe); @@ -627,7 +601,8 @@ void OVERLOAD pairSq(u32 N, GF31 *u, GF31 *v, GF31 base_squared, bool special) { // The kernel tailSquareZero handles the special cases in tailSquare, i.e. the lines 0 and H/2 // This kernel is launched with 2 workgroups (handling line 0, resp. 
H/2) KERNEL(G_H) tailSquareZeroGF31(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF31 lds[SMALL_HEIGHT / 2]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF31 lds[lds_bytes / sizeof(GF31)]; CP(GF31) in31 = (CP(GF31)) (in + DISTGF31); P(GF31) out31 = (P(GF31)) (out + DISTGF31); @@ -663,19 +638,20 @@ KERNEL(G_H) tailSquareZeroGF31(P(T2) out, CP(T2) in, Trig smallTrig) { #endif #endif - fft_HEIGHT(lds, u, smallTrig31); + fft_HEIGHT(lds, u, smallTrig31, 1, SHUFL_BYTES_H, me); reverse(G_H, lds, u + NH/2, !which); pairSq(NH/2, u, u + NH/2, trig, !which); reverse(G_H, lds, u + NH/2, !which); - bar(); - fft_HEIGHT(lds, u, smallTrig31); + + fft_HEIGHT(lds, u, smallTrig31, 1, SHUFL_BYTES_H, me); writeTailFusedLine(u, out31, transPos(line, MIDDLE, WIDTH), me); } #if SINGLE_WIDE KERNEL(G_H) tailSquareGF31(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF31 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF31 lds[lds_bytes / sizeof(GF31)]; CP(GF31) in31 = (CP(GF31)) (in + DISTGF31); P(GF31) out31 = (P(GF31)) (out + DISTGF31); @@ -699,16 +675,9 @@ KERNEL(G_H) tailSquareGF31(P(T2) out, CP(T2) in, Trig smallTrig) { readTailFusedLine(in31, u, line1, me); readTailFusedLine(in31, v, line2, me); -#if ZEROHACK_H - u32 zerohack = (u32) get_group_id(0) / 131072; - fft_HEIGHT(lds + zerohack, u, smallTrig31 + zerohack); - bar(); - fft_HEIGHT(lds + zerohack, v, smallTrig31 + zerohack); -#else - fft_HEIGHT(lds, u, smallTrig31); - bar(); - fft_HEIGHT(lds, v, smallTrig31); -#endif + u32 zerohack = ZEROHACK_H * (u32) get_group_id(0) / 131072; + fft_HEIGHT(lds + zerohack, u, smallTrig31 + zerohack, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds + zerohack, v, smallTrig31 + zerohack, 1, SHUFL_BYTES_H, me); // Do a little bit of memory access and a little bit of math. 
#if TAIL_TRIGS31 >= 1 @@ -751,10 +720,8 @@ KERNEL(G_H) tailSquareGF31(P(T2) out, CP(T2) in, Trig smallTrig) { reverseLine(G_H, lds, v); } - bar(); - fft_HEIGHT(lds, v, smallTrig31); - bar(); - fft_HEIGHT(lds, u, smallTrig31); + fft_HEIGHT(lds, v, smallTrig31, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, u, smallTrig31, 1, SHUFL_BYTES_H, me); writeTailFusedLine(v, out31, memline2, me); writeTailFusedLine(u, out31, memline1, me); @@ -783,7 +750,8 @@ void OVERLOAD pairSq2_special(GF31 *u, GF31 base_squared) { } KERNEL(G_H * 2) tailSquareGF31(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF31 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF31 lds[lds_bytes * 2 / sizeof(GF31)]; CP(GF31) in31 = (CP(GF31)) (in + DISTGF31); P(GF31) out31 = (P(GF31)) (out + DISTGF31); @@ -812,12 +780,8 @@ KERNEL(G_H * 2) tailSquareGF31(P(T2) out, CP(T2) in, Trig smallTrig) { // Read lines u and v readTailFusedLine(in31, u, line, lowMe); -#if ZEROHACK_H - u32 zerohack = (u32) get_group_id(0) / 131072; - new_fft_HEIGHT2_1(lds + zerohack, u, smallTrig31 + zerohack); -#else - new_fft_HEIGHT2_1(lds, u, smallTrig31); -#endif + u32 zerohack = ZEROHACK_H * (u32) get_group_id(0) / 131072; + new_fft_HEIGHT1(lds + zerohack, u, smallTrig31 + zerohack, 2, SHUFL_BYTES_H, lowMe); // Do a little bit of memory access and a little bit of math. Good on a Radeon VII. #if TAIL_TRIGS31 >= 1 @@ -838,8 +802,6 @@ KERNEL(G_H * 2) tailSquareGF31(P(T2) out, CP(T2) in, Trig smallTrig) { GF31 trig = NTLOAD(smallTrig31[height_trigs + line_u*G_H*2 + me]); #endif - bar(G_H); - #if SINGLE_KERNEL // Line 0 and H/2 are special: they pair with themselves, line 0 is offseted by 1. 
if (line_u == 0) { @@ -851,15 +813,12 @@ KERNEL(G_H * 2) tailSquareGF31(P(T2) out, CP(T2) in, Trig smallTrig) { #else if (1) { #endif - revCrossLine(G_H, lds, u + NH/2, NH/2, isSecondHalf); + revCrossLine(lds, u); pairSq(NH/2, u, u + NH/2, trig, false); - bar(G_H); - revCrossLine(G_H, lds, u + NH/2, NH/2, !isSecondHalf); + revCrossLine(lds, u); } - bar(G_H); - - new_fft_HEIGHT2_2(lds, u, smallTrig31); + new_fft_HEIGHT2(lds, u, smallTrig31, 2, SHUFL_BYTES_H, lowMe); // Write lines u and v writeTailFusedLine(u, out31, transPos(line, MIDDLE, WIDTH), lowMe); @@ -920,7 +879,8 @@ void OVERLOAD pairSq(u32 N, GF61 *u, GF61 *v, GF61 base_squared, bool special) { // The kernel tailSquareZero handles the special cases in tailSquare, i.e. the lines 0 and H/2 // This kernel is launched with 2 workgroups (handling line 0, resp. H/2) KERNEL(G_H) tailSquareZeroGF61(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF61 lds[SMALL_HEIGHT / 2]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF61 lds[lds_bytes / sizeof(GF61)]; CP(GF61) in61 = (CP(GF61)) (in + DISTGF61); P(GF61) out61 = (P(GF61)) (out + DISTGF61); @@ -956,19 +916,20 @@ KERNEL(G_H) tailSquareZeroGF61(P(T2) out, CP(T2) in, Trig smallTrig) { #endif #endif - fft_HEIGHT(lds, u, smallTrig61); + fft_HEIGHT(lds, u, smallTrig61, 1, SHUFL_BYTES_H, me); reverse(G_H, lds, u + NH/2, !which); pairSq(NH/2, u, u + NH/2, trig, !which); reverse(G_H, lds, u + NH/2, !which); - bar(); - fft_HEIGHT(lds, u, smallTrig61); + + fft_HEIGHT(lds, u, smallTrig61, 1, SHUFL_BYTES_H, me); writeTailFusedLine(u, out61, transPos(line, MIDDLE, WIDTH), me); } #if SINGLE_WIDE KERNEL(G_H) tailSquareGF61(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF61 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF61 lds[lds_bytes / sizeof(GF61)]; CP(GF61) in61 = (CP(GF61)) (in + DISTGF61); P(GF61) out61 = (P(GF61)) (out + DISTGF61); @@ -992,16 +953,9 @@ KERNEL(G_H) tailSquareGF61(P(T2) out, CP(T2) in, Trig smallTrig) { 
readTailFusedLine(in61, u, line1, me); readTailFusedLine(in61, v, line2, me); -#if ZEROHACK_H - u32 zerohack = (u32) get_group_id(0) / 131072; - fft_HEIGHT(lds + zerohack, u, smallTrig61 + zerohack); - bar(); - fft_HEIGHT(lds + zerohack, v, smallTrig61 + zerohack); -#else - fft_HEIGHT(lds, u, smallTrig61); - bar(); - fft_HEIGHT(lds, v, smallTrig61); -#endif + u32 zerohack = ZEROHACK_H * (u32) get_group_id(0) / 131072; + fft_HEIGHT(lds + zerohack, u, smallTrig61 + zerohack, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds + zerohack, v, smallTrig61 + zerohack, 1, SHUFL_BYTES_H, me); // Do a little bit of memory access and a little bit of math. #if TAIL_TRIGS61 >= 1 @@ -1044,10 +998,8 @@ KERNEL(G_H) tailSquareGF61(P(T2) out, CP(T2) in, Trig smallTrig) { reverseLine(G_H, lds, v); } - bar(); - fft_HEIGHT(lds, v, smallTrig61); - bar(); - fft_HEIGHT(lds, u, smallTrig61); + fft_HEIGHT(lds, v, smallTrig61, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, u, smallTrig61, 1, SHUFL_BYTES_H, me); writeTailFusedLine(v, out61, memline2, me); writeTailFusedLine(u, out61, memline1, me); @@ -1076,7 +1028,8 @@ void OVERLOAD pairSq2_special(GF61 *u, GF61 base_squared) { } KERNEL(G_H * 2) tailSquareGF61(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF61 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF61 lds[lds_bytes * 2 / sizeof(GF61)]; CP(GF61) in61 = (CP(GF61)) (in + DISTGF61); P(GF61) out61 = (P(GF61)) (out + DISTGF61); @@ -1105,12 +1058,8 @@ KERNEL(G_H * 2) tailSquareGF61(P(T2) out, CP(T2) in, Trig smallTrig) { // Read lines u and v readTailFusedLine(in61, u, line, lowMe); -#if ZEROHACK_H - u32 zerohack = (u32) get_group_id(0) / 131072; - new_fft_HEIGHT2_1(lds + zerohack, u, smallTrig61 + zerohack); -#else - new_fft_HEIGHT2_1(lds, u, smallTrig61); -#endif + u32 zerohack = ZEROHACK_H * (u32) get_group_id(0) / 131072; + new_fft_HEIGHT1(lds + zerohack, u, smallTrig61 + zerohack, 2, SHUFL_BYTES_H, lowMe); // Do a little bit of memory access and a little bit of 
math. Good on a Radeon VII. #if TAIL_TRIGS61 >= 1 @@ -1131,8 +1080,6 @@ KERNEL(G_H * 2) tailSquareGF61(P(T2) out, CP(T2) in, Trig smallTrig) { GF61 trig = NTLOAD(smallTrig61[height_trigs + line_u*G_H*2 + me]); #endif - bar(G_H); - #if SINGLE_KERNEL // Line 0 and H/2 are special: they pair with themselves, line 0 is offseted by 1. if (line_u == 0) { @@ -1144,15 +1091,12 @@ KERNEL(G_H * 2) tailSquareGF61(P(T2) out, CP(T2) in, Trig smallTrig) { #else if (1) { #endif - revCrossLine(G_H, lds, u + NH/2, NH/2, isSecondHalf); + revCrossLine(lds, u); pairSq(NH/2, u, u + NH/2, trig, false); - bar(G_H); - revCrossLine(G_H, lds, u + NH/2, NH/2, !isSecondHalf); + revCrossLine(lds, u); } - bar(G_H); - - new_fft_HEIGHT2_2(lds, u, smallTrig61); + new_fft_HEIGHT2(lds, u, smallTrig61, 2, SHUFL_BYTES_H, lowMe); // Write lines u and v writeTailFusedLine(u, out61, transPos(line, MIDDLE, WIDTH), lowMe); diff --git a/src/cl/tailutil.cl b/src/cl/tailutil.cl index 01710cf1..9234f858 100644 --- a/src/cl/tailutil.cl +++ b/src/cl/tailutil.cl @@ -30,84 +30,181 @@ #define SINGLE_WIDE TAIL_KERNELS < 2 // Old single-wide tailSquare vs. new double-wide tailSquare #define SINGLE_KERNEL (TAIL_KERNELS & 1) == 0 // TailSquare uses a single kernel vs. two kernels -#if FFT_FP64 +// 64-bit implementations of reverse routines -void OVERLOAD reverse(u32 WG, local T2 *lds, T2 *u, bool bump) { +#if FFT_FP64 | NTT_GF61 + +void OVERLOAD reverse(u32 WG, local T2 *lds2, T2 *u, bool bump) { u32 me = get_local_id(0); u32 revMe = WG - 1 - me + bump; - bar(); - + if (SHUFL_BYTES_H >= 8) { + local T2 *lds = lds2; + bar(WG); #if NH == 8 - lds[revMe + 0 * WG] = u[3]; - lds[revMe + 1 * WG] = u[2]; - lds[revMe + 2 * WG] = u[1]; - lds[bump ? ((revMe + 3 * WG) % (4 * WG)) : (revMe + 3 * WG)] = u[0]; + lds[revMe + 0 * WG] = u[3]; + lds[revMe + 1 * WG] = u[2]; + lds[revMe + 2 * WG] = u[1]; + lds[bump ? ((revMe + 3 * WG) % (4 * WG)) : (revMe + 3 * WG)] = u[0]; #elif NH == 4 - lds[revMe + 0 * WG] = u[1]; - lds[bump ? 
((revMe + WG) % (2 * WG)) : (revMe + WG)] = u[0]; -#else -#error + lds[revMe + 0 * WG] = u[1]; + lds[bump ? ((revMe + WG) % (2 * WG)) : (revMe + WG)] = u[0]; #endif + bar(WG); + for (i32 i = 0; i < NH/2; ++i) { u[i] = lds[i * WG + me]; } + } - bar(); - for (i32 i = 0; i < NH/2; ++i) { u[i] = lds[i * WG + me]; } + else if (SHUFL_BYTES_H == 4) { + local T *lds = (local T *) lds2; + bar(WG); +#if NH == 8 + lds[revMe + 0 * WG] = u[3].x; + lds[revMe + 1 * WG] = u[2].x; + lds[revMe + 2 * WG] = u[1].x; + lds[bump ? ((revMe + 3 * WG) % (4 * WG)) : (revMe + 3 * WG)] = u[0].x; +#elif NH == 4 + lds[revMe + 0 * WG] = u[1].x; + lds[bump ? ((revMe + WG) % (2 * WG)) : (revMe + WG)] = u[0].x; +#endif + bar(WG); + for (i32 i = 0; i < NH/2; ++i) { u[i].x = lds[i * WG + me]; } + bar(WG); +#if NH == 8 + lds[revMe + 0 * WG] = u[3].y; + lds[revMe + 1 * WG] = u[2].y; + lds[revMe + 2 * WG] = u[1].y; + lds[bump ? ((revMe + 3 * WG) % (4 * WG)) : (revMe + 3 * WG)] = u[0].y; +#elif NH == 4 + lds[revMe + 0 * WG] = u[1].y; + lds[bump ? ((revMe + WG) % (2 * WG)) : (revMe + WG)] = u[0].y; +#endif + bar(WG); + for (i32 i = 0; i < NH/2; ++i) { u[i].y = lds[i * WG + me]; } + } } -void OVERLOAD reverseLine(u32 WG, local T2 *lds2, T2 *u) { +void OVERLOAD reverseLine(u32 WG, local T2 *lds, T2 *u) { u32 me = get_local_id(0); u32 revMe = WG - 1 - me; - local T2 *lds = lds2 + revMe; - bar(); - for (u32 i = 0; i < NH; ++i) { lds[WG * (NH - 1 - i)] = u[i]; } - - lds = lds2 + me; - bar(); - for (u32 i = 0; i < NH; ++i) { u[i] = lds[WG * i]; } -} - -// This is used to reverse the second part of a line, and cross the reversed parts between the halves. 
-void OVERLOAD revCrossLine(u32 WG, local T2* lds2, T2 *u, u32 n, bool writeSecondHalf) { - u32 me = get_local_id(0); - u32 lowMe = me % WG; - - u32 revLowMe = WG - 1 - lowMe; - - for (u32 i = 0; i < n; ++i) { lds2[WG * n * writeSecondHalf + WG * (n - 1 - i) + revLowMe] = u[i]; } + if (SHUFL_BYTES_H == 16) { + local T2 *ldsOut = lds + revMe; + local T2 *ldsIn = lds + me; + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = u[i]; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { u[i] = ldsIn[WG * i]; } + } - bar(); // we need a full bar because we're crossing halves + else if (SHUFL_BYTES_H == 8) { + local T *ldsOut = (local T *) lds + revMe; + local T *ldsIn = (local T *) lds + me; + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = u[i].x; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { u[i].x = ldsIn[WG * i]; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = u[i].y; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { u[i].y = ldsIn[WG * i]; } + } - for (u32 i = 0; i < n; ++i) { u[i] = lds2[WG * n * !writeSecondHalf + WG * i + lowMe]; } + else if (SHUFL_BYTES_H == 4) { + local int *ldsOut = (local int *) lds + revMe; + local int *ldsIn = (local int *) lds + me; + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = as_int4(u[i]).x; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { int4 tmp = as_int4(u[i]); tmp.x = ldsIn[WG * i]; u[i] = as_double2(tmp); } + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = as_int4(u[i]).y; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { int4 tmp = as_int4(u[i]); tmp.y = ldsIn[WG * i]; u[i] = as_double2(tmp); } + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = as_int4(u[i]).z; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { int4 tmp = as_int4(u[i]); tmp.z = ldsIn[WG * i]; u[i] = as_double2(tmp); } + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = as_int4(u[i]).w; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { 
int4 tmp = as_int4(u[i]); tmp.w = ldsIn[WG * i]; u[i] = as_double2(tmp); } + } } // -// These versions are for the kernel(s) that uses a double-wide workgroup (u in half the workgroup, v in the other half) +// These versions are for the kernel(s) that use a double-wide workgroup (u in half the workgroup, v in the other half) // -void OVERLOAD reverse2(local T2 *lds, T2 *u) { +void OVERLOAD reverse2(local T2 *lds2, T2 *u) { u32 me = get_local_id(0); - - // For NH=8, u[0] to u[3] are left unchanged. Write to lds: - // u[7]rev u[6]rev - // u[5]rev u[4]rev - // v[7]rev v[6]rev - // v[5]rev v[4]rev - bar(); - for (u32 i = 0; i < NH / 2; ++i) { - u32 j = (i * G_H + me % G_H); - lds[me < G_H ? ((NH/2)*G_H - j) % ((NH/2)*G_H) : NH*G_H-1 - j] = u[NH/2 + i]; + u32 lowMe = me % G_H; + + if (SHUFL_BYTES_H >= 8) { + local T2 *lds = lds2; + if (me >= G_H) lds += SMALL_HEIGHT * SHUFL_BYTES_H / sizeof(T2); + // For NH=8, u[0] to u[3] are left unchanged. Write to lds: + // u[7]rev u[6]rev u[5]rev u[4]rev + // v[7]rev v[6]rev v[5]rev v[4]rev + bar(G_H); + for (u32 i = 0; i < NH/2; ++i) { lds[((NH/2 - i) * G_H - (me >= G_H ? 1 : 0) - lowMe) % (NH/2 * G_H)] = u[NH/2 + i]; } + // For NH=8, read from lds into u[i]: + // u[4] = u[7]rev v[7]rev + // u[5] = u[6]rev v[6]rev + // u[6] = u[5]rev v[5]rev + // u[7] = u[4]rev v[4]rev + bar(G_H); + for (u32 i = 0; i < NH/2; ++i) { u[NH/2 + i] = lds[i * G_H + lowMe]; } + } + + else if (SHUFL_BYTES_H == 4) { + local T *lds = (local T *) lds2; + if (me >= G_H) lds += SMALL_HEIGHT * SHUFL_BYTES_H / sizeof(T); + bar(G_H); + for (u32 i = 0; i < NH/2; ++i) { lds[((NH/2 - i) * G_H - (me >= G_H ? 1 : 0) - lowMe) % (NH/2 * G_H)] = u[NH/2 + i].x; } + bar(G_H); + for (u32 i = 0; i < NH/2; ++i) { u[NH/2 + i].x = lds[i * G_H + lowMe]; } + bar(G_H); + for (u32 i = 0; i < NH/2; ++i) { lds[((NH/2 - i) * G_H - (me >= G_H ? 
1 : 0) - lowMe) % (NH/2 * G_H)] = u[NH/2 + i].y; } + bar(G_H); + for (u32 i = 0; i < NH/2; ++i) { u[NH/2 + i].y = lds[i * G_H + lowMe]; } } - // For NH=8, read from lds into u[i]: - // u[4] = u[7]rev v[7]rev - // u[5] = u[6]rev v[6]rev - // u[6] = u[5]rev v[5]rev - // u[7] = u[4]rev v[4]rev - bar(); - lds += me % G_H + (me / G_H) * NH/2 * G_H; - for (u32 i = 0; i < NH / 2; ++i) { u[NH/2 + i] = lds[i * G_H]; } } +// This is used to reverse the second part of a line, and cross the reversed parts between the halves. +void OVERLOAD revCrossLine(local T2* lds2, T2 *u) { + u32 me = get_local_id(0); + u32 lowMe = me % G_H; + u32 revLowMe = G_H - 1 - lowMe; + + if (SHUFL_BYTES_H >= 8) { + local T2 *ldsOut = lds2; + local T2 *ldsIn = lds2; + if (me < G_H) ldsOut += SMALL_HEIGHT * SHUFL_BYTES_H / sizeof(T2); // Crossing LDS halves + else ldsIn += SMALL_HEIGHT * SHUFL_BYTES_H / sizeof(T2); // Staying within LDS halves (just like shufl) + bar(); // we need a full bar because we're crossing halves + for (u32 i = 0; i < NH/2; ++i) { ldsOut[G_H * (NH/2 - 1 - i) + revLowMe] = u[i + NH/2]; } + bar(); // we need a full bar because we just crossed halves. LDS reads are compatible with future shufl calls. 
+ for (u32 i = 0; i < NH/2; ++i) { u[i + NH/2] = ldsIn[G_H * i + lowMe]; } + } + + else if (SHUFL_BYTES_H == 4) { + local T *ldsOut = (local T *) lds2; + local T *ldsIn = (local T *) lds2; + if (me < G_H) ldsOut += SMALL_HEIGHT * SHUFL_BYTES_H / sizeof(T); + else ldsIn += SMALL_HEIGHT * SHUFL_BYTES_H / sizeof(T); + bar(); // we need a full bar because we're crossing halves + for (u32 i = 0; i < NH/2; ++i) { ldsOut[G_H * (NH/2 - 1 - i) + revLowMe] = u[i + NH/2].x; } + bar(); // we need a full bar because we just crossed halves + for (u32 i = 0; i < NH/2; ++i) { u[i + NH/2].x = ldsIn[G_H * i + lowMe]; } + bar(); // we need a full bar because we're crossing halves + for (u32 i = 0; i < NH/2; ++i) { ldsOut[G_H * (NH/2 - 1 - i) + revLowMe] = u[i + NH/2].y; } + bar(); // we need a full bar because we just crossed halves. LDS reads are compatible with future shufl calls. + for (u32 i = 0; i < NH/2; ++i) { u[i + NH/2].y = ldsIn[G_H * i + lowMe]; } + } +} + +#if 0 // Unused + // Somewhat similar to reverseLine. // The u values are in threads < G_H, the v values to reverse in threads >= G_H. // Whereas reverseLine leaves u values alone. This reverseLine moves u values around @@ -119,7 +216,7 @@ void OVERLOAD reverse2(local T2 *lds, T2 *u) { void OVERLOAD reverseLine2(local T2 *lds, T2 *u) { u32 me = get_local_id(0); -// NOTE: It is important that this routine use lds memory in coordination with shufl2. Failure to do so would require an +// NOTE: It is important that this routine use lds memory in coordination with shufl. Failure to do so would require an // unqualified bar() call here. Specifically, the u values are stored in the upper half of lds memory (SMALL_HEIGHT T2 values). // The v values are stored in the lower half of lds memory (the next SMALL_HEIGHT T2 values). 
@@ -191,89 +288,107 @@ void OVERLOAD unreverseLine2(local T2 *lds, T2 *u) { #endif +#endif + /**************************************************************************/ /* Similar to above, but for an FFT based on FP32 */ /**************************************************************************/ -#if FFT_FP32 +#if FFT_FP32 | NTT_GF31 void OVERLOAD reverse(u32 WG, local F2 *lds, F2 *u, bool bump) { u32 me = get_local_id(0); u32 revMe = WG - 1 - me + bump; - bar(); - + if (SHUFL_BYTES_H >= 4) { + bar(WG); #if NH == 8 - lds[revMe + 0 * WG] = u[3]; - lds[revMe + 1 * WG] = u[2]; - lds[revMe + 2 * WG] = u[1]; - lds[bump ? ((revMe + 3 * WG) % (4 * WG)) : (revMe + 3 * WG)] = u[0]; + lds[revMe + 0 * WG] = u[3]; + lds[revMe + 1 * WG] = u[2]; + lds[revMe + 2 * WG] = u[1]; + lds[bump ? ((revMe + 3 * WG) % (4 * WG)) : (revMe + 3 * WG)] = u[0]; #elif NH == 4 - lds[revMe + 0 * WG] = u[1]; - lds[bump ? ((revMe + WG) % (2 * WG)) : (revMe + WG)] = u[0]; -#else -#error + lds[revMe + 0 * WG] = u[1]; + lds[bump ? ((revMe + WG) % (2 * WG)) : (revMe + WG)] = u[0]; #endif - - bar(); - for (i32 i = 0; i < NH/2; ++i) { u[i] = lds[i * WG + me]; } + bar(WG); + for (i32 i = 0; i < NH/2; ++i) { u[i] = lds[i * WG + me]; } + } } -void OVERLOAD reverseLine(u32 WG, local F2 *lds2, F2 *u) { +void OVERLOAD reverseLine(u32 WG, local F2 *lds, F2 *u) { u32 me = get_local_id(0); u32 revMe = WG - 1 - me; - local F2 *lds = lds2 + revMe; - bar(); - for (u32 i = 0; i < NH; ++i) { lds[WG * (NH - 1 - i)] = u[i]; } - - lds = lds2 + me; - bar(); - for (u32 i = 0; i < NH; ++i) { u[i] = lds[WG * i]; } -} - -// This is used to reverse the second part of a line, and cross the reversed parts between the halves. 
-void OVERLOAD revCrossLine(u32 WG, local F2* lds2, F2 *u, u32 n, bool writeSecondHalf) { - u32 me = get_local_id(0); - u32 lowMe = me % WG; - - u32 revLowMe = WG - 1 - lowMe; - - for (u32 i = 0; i < n; ++i) { lds2[WG * n * writeSecondHalf + WG * (n - 1 - i) + revLowMe] = u[i]; } - - bar(); // we need a full bar because we're crossing halves + if (SHUFL_BYTES_H >= 8) { + local F2 *ldsOut = lds + revMe; + local F2 *ldsIn = lds + me; + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = u[i]; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { u[i] = ldsIn[WG * i]; } + } - for (u32 i = 0; i < n; ++i) { u[i] = lds2[WG * n * !writeSecondHalf + WG * i + lowMe]; } + else if (SHUFL_BYTES_H == 4) { + local F *ldsOut = (local F *) lds + revMe; + local F *ldsIn = (local F *) lds + me; + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = u[i].x; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { u[i].x = ldsIn[WG * i]; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = u[i].y; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { u[i].y = ldsIn[WG * i]; } + } } // -// These versions are for the kernel(s) that uses a double-wide workgroup (u in half the workgroup, v in the other half) +// These versions are for the kernel(s) that use a double-wide workgroup (u in half the workgroup, v in the other half) // void OVERLOAD reverse2(local F2 *lds, F2 *u) { u32 me = get_local_id(0); + u32 lowMe = me % G_H; + + if (SHUFL_BYTES_H >= 4) { + if (me >= G_H) lds += SMALL_HEIGHT * SHUFL_BYTES_H / sizeof(F2); + // For NH=8, u[0] to u[3] are left unchanged. Write to lds: + // u[7]rev u[6]rev u[5]rev u[4]rev + // v[7]rev v[6]rev v[5]rev v[4]rev + bar(G_H); + for (u32 i = 0; i < NH/2; ++i) { lds[((NH/2 - i) * G_H - (me >= G_H ? 
1 : 0) - lowMe) % (NH/2 * G_H)] = u[NH/2 + i]; } + // For NH=8, read from lds into u[i]: + // u[4] = u[7]rev v[7]rev + // u[5] = u[6]rev v[6]rev + // u[6] = u[5]rev v[5]rev + // u[7] = u[4]rev v[4]rev + bar(G_H); + for (u32 i = 0; i < NH/2; ++i) { u[NH/2 + i] = lds[i * G_H + lowMe]; } + } +} - // For NH=8, u[0] to u[3] are left unchanged. Write to lds: - // u[7]rev u[6]rev - // u[5]rev u[4]rev - // v[7]rev v[6]rev - // v[5]rev v[4]rev - bar(); - for (u32 i = 0; i < NH / 2; ++i) { - u32 j = (i * G_H + me % G_H); - lds[me < G_H ? ((NH/2)*G_H - j) % ((NH/2)*G_H) : NH*G_H-1 - j] = u[NH/2 + i]; +// This is used to reverse the second part of a line, and cross the reversed parts between the halves. +void OVERLOAD revCrossLine(local F2* lds2, F2 *u) { + u32 me = get_local_id(0); + u32 lowMe = me % G_H; + u32 revLowMe = G_H - 1 - lowMe; + + if (SHUFL_BYTES_H >= 4) { + local F2 *ldsOut = lds2; + local F2 *ldsIn = lds2; + if (me < G_H) ldsOut += SMALL_HEIGHT * SHUFL_BYTES_H / sizeof(F2); + else ldsIn += SMALL_HEIGHT * SHUFL_BYTES_H / sizeof(F2); + bar(); // we need a full bar because we're crossing halves + for (u32 i = 0; i < NH/2; ++i) { ldsOut[G_H * (NH/2 - 1 - i) + revLowMe] = u[i + NH/2]; } + bar(); // we need a full bar because we just crossed halves. LDS reads are compatible with future shufl calls. + for (u32 i = 0; i < NH/2; ++i) { u[i + NH/2] = ldsIn[G_H * i + lowMe]; } } - // For NH=8, read from lds into u[i]: - // u[4] = u[7]rev v[7]rev - // u[5] = u[6]rev v[6]rev - // u[6] = u[5]rev v[5]rev - // u[7] = u[4]rev v[4]rev - bar(); - lds += me % G_H + (me / G_H) * NH/2 * G_H; - for (u32 i = 0; i < NH / 2; ++i) { u[NH/2 + i] = lds[i * G_H]; } } +#if 0 // Unused + // Somewhat similar to reverseLine. // The u values are in threads < G_H, the v values to reverse in threads >= G_H. // Whereas reverseLine leaves u values alone. 
This reverseLine moves u values around @@ -357,6 +472,8 @@ void OVERLOAD unreverseLine2(local F2 *lds, F2 *u) { #endif +#endif + /**************************************************************************/ /* Similar to above, but for an NTT based on GF(M31^2) */ @@ -365,161 +482,32 @@ void OVERLOAD unreverseLine2(local F2 *lds, F2 *u) { #if NTT_GF31 void OVERLOAD reverse(u32 WG, local GF31 *lds, GF31 *u, bool bump) { - u32 me = get_local_id(0); - u32 revMe = WG - 1 - me + bump; - - bar(); - -#if NH == 8 - lds[revMe + 0 * WG] = u[3]; - lds[revMe + 1 * WG] = u[2]; - lds[revMe + 2 * WG] = u[1]; - lds[bump ? ((revMe + 3 * WG) % (4 * WG)) : (revMe + 3 * WG)] = u[0]; -#elif NH == 4 - lds[revMe + 0 * WG] = u[1]; - lds[bump ? ((revMe + WG) % (2 * WG)) : (revMe + WG)] = u[0]; -#else -#error -#endif - - bar(); - for (i32 i = 0; i < NH/2; ++i) { u[i] = lds[i * WG + me]; } + reverse(WG, (local F2 *) lds, (F2 *) u, bump); } -void OVERLOAD reverseLine(u32 WG, local GF31 *lds2, GF31 *u) { - u32 me = get_local_id(0); - u32 revMe = WG - 1 - me; - - local GF31 *lds = lds2 + revMe; - bar(); - for (u32 i = 0; i < NH; ++i) { lds[WG * (NH - 1 - i)] = u[i]; } - - lds = lds2 + me; - bar(); - for (u32 i = 0; i < NH; ++i) { u[i] = lds[WG * i]; } +void OVERLOAD reverseLine(u32 WG, local GF31 *lds, GF31 *u) { + reverseLine(WG, (local F2 *) lds, (F2 *) u); } -// This is used to reverse the second part of a line, and cross the reversed parts between the halves. 
-void OVERLOAD revCrossLine(u32 WG, local GF31* lds2, GF31 *u, u32 n, bool writeSecondHalf) { - u32 me = get_local_id(0); - u32 lowMe = me % WG; - - u32 revLowMe = WG - 1 - lowMe; - - for (u32 i = 0; i < n; ++i) { lds2[WG * n * writeSecondHalf + WG * (n - 1 - i) + revLowMe] = u[i]; } - - bar(); // we need a full bar because we're crossing halves - - for (u32 i = 0; i < n; ++i) { u[i] = lds2[WG * n * !writeSecondHalf + WG * i + lowMe]; } -} - -// -// These versions are for the kernel(s) that uses a double-wide workgroup (u in half the workgroup, v in the other half) -// - void OVERLOAD reverse2(local GF31 *lds, GF31 *u) { - u32 me = get_local_id(0); - - // For NH=8, u[0] to u[3] are left unchanged. Write to lds: - // u[7]rev u[6]rev - // u[5]rev u[4]rev - // v[7]rev v[6]rev - // v[5]rev v[4]rev - bar(); - for (u32 i = 0; i < NH / 2; ++i) { - u32 j = (i * G_H + me % G_H); - lds[me < G_H ? ((NH/2)*G_H - j) % ((NH/2)*G_H) : NH*G_H-1 - j] = u[NH/2 + i]; - } - // For NH=8, read from lds into u[i]: - // u[4] = u[7]rev v[7]rev - // u[5] = u[6]rev v[6]rev - // u[6] = u[5]rev v[5]rev - // u[7] = u[4]rev v[4]rev - bar(); - lds += me % G_H + (me / G_H) * NH/2 * G_H; - for (u32 i = 0; i < NH / 2; ++i) { u[NH/2 + i] = lds[i * G_H]; } + reverse2((local F2 *) lds, (F2 *) u); } -// Somewhat similar to reverseLine. -// The u values are in threads < G_H, the v values to reverse in threads >= G_H. -// Whereas reverseLine leaves u values alone. This reverseLine moves u values around -// so that pairSq2 can easily operate on pairs. This means for NH = 4, web output: -// u[0] u[1] // Returned in u[0] -// u[2] u[3] // Returned in u[1] -// v[3]rev v[2]rev // Returned in u[2] -// v[1]rev v[0]rev // Returned in u[3] -void OVERLOAD reverseLine2(local GF31 *lds, GF31 *u) { - u32 me = get_local_id(0); - -// NOTE: It is important that this routine use lds memory in coordination with shufl2. Failure to do so would require an -// unqualified bar() call here. 
Specifically, the u values are stored in the upper half of lds memory (SMALL_HEIGHT GF31 values). -// The v values are stored in the lower half of lds memory (the next SMALL_HEIGHT GF31 values). - - if (G_H > WAVEFRONT) bar(); - -// For NH=4, the lds indices (where to write each incoming u[i] which has v[i] in the upper threads) looks like this: -// 0..GH-1 +0*G_H GH-1..0 +7*G_H -// 0..GH-1 +1*G_H GH-1..0 +6*G_H -// 0..GH-1 +2*G_H GH-1..0 +5*G_H -// 0..GH-1 +3*G_H GH-1..0 +4*G_H -// That means saving to lds using index: me < G_H ? me % G_H + i * G_H : 8*G_H-1 - me % G_H - i * G_H - -#if 1 - local GF31 *ldsOut = lds + (me < G_H ? me % G_H : (NH*2)*G_H-1 - me % G_H); - i32 ldsOutInc = (me < G_H) ? G_H : -G_H; - for (u32 i = 0; i < NH; ++i, ldsOut += ldsOutInc) { *ldsOut = u[i]; } +void OVERLOAD revCrossLine(local GF31* lds, GF31 *u) { + revCrossLine((local F2 *) lds, (F2 *) u); +} - lds += me; - bar(); - for (u32 i = 0; i < NH; ++i) { u[i] = lds[i * 2*G_H]; } -#else - local Z61 *ldsOut = (local Z61 *) lds + (me < G_H ? me % G_H : (NH*2)*G_H-1 - me % G_H); - i32 ldsOutInc = (me < G_H) ? G_H : -G_H; - for (u32 i = 0; i < NH; ++i, ldsOut += ldsOutInc) { ldsOut[0] = u[i].x; ldsOut[NH*2*G_H] = u[i].y; } +#if 0 // Unused - local ZF61 *ldsIn = (local T *) lds + me; - bar(); - for (u32 i = 0; i < NH; ++i) { u[i].x = ldsIn[i * 2*G_H]; u[i].y = ldsIn[NH*2*G_H + i * 2*G_H]; } -#endif +void OVERLOAD reverseLine2(local GF31 *lds, GF31 *u) { + reverseLine2((local F2 *) lds, (F2 *) u); } -// Undo a reverseLine2 void OVERLOAD unreverseLine2(local GF31 *lds, GF31 *u) { - u32 me = get_local_id(0); - -// NOTE: It is important that this routine use lds memory in coordination with reverseLine2 and shufl2. By initially -// writing to the lds locations that reverseLine2 read from we do not need an initial bar() call here. 
Also, by reading -// from the lds locations that shufl2 will use (u values in the upper half of lds memory, v values in the lower half of -// lds memory) we can issue a qualified bar() call before calling FFT_HEIGHT2. - -#if 1 - local GF31 *ldsOut = lds + me; - for (u32 i = 0; i < NH; ++i) { ldsOut[i * 2*G_H] = u[i]; } - -// For NH=4, the lds indices (where to read each outgoing u[i] which has v[i] in the upper threads) looks like this: -// 0..GH-1 +0*G_H GH-1..0 +7*G_H -// 0..GH-1 +1*G_H GH-1..0 +6*G_H -// 0..GH-1 +2*G_H GH-1..0 +5*G_H -// 0..GH-1 +3*G_H GH-1..0 +4*G_H - lds += (me < G_H) ? me % G_H : (NH*2)*G_H-1 - me % G_H; - i32 ldsInc = (me < G_H) ? G_H : -G_H; - bar(); - for (u32 i = 0; i < NH; ++i, lds += ldsInc) { u[i] = *lds; } -#else - local Z61 *ldsOut = (local T *) lds + me; - for (u32 i = 0; i < NH; ++i) { ldsOut[i * 2*G_H] = u[i].x; ldsOut[NH*2*G_H + i * 2*G_H] = u[i].y; } + unreverseLine2((local F2 *) lds, (F2 *) u); +} -// For NH=4, the lds indices (where to read each outgoing u[i] which has v[i] in the upper threads) looks like this: -// 0..GH-1 +0*G_H GH-1..0 +7*G_H -// 0..GH-1 +1*G_H GH-1..0 +6*G_H -// 0..GH-1 +2*G_H GH-1..0 +5*G_H -// 0..GH-1 +3*G_H GH-1..0 +4*G_H - local Z61 *ldsIn = (local T *) lds + ((me < G_H) ? me % G_H : (NH*2)*G_H-1 - me % G_H); - i32 ldsInc = (me < G_H) ? G_H : -G_H; - bar(); - for (u32 i = 0; i < NH; ++i, ldsIn += ldsInc) { u[i].x = ldsIn[0]; u[i].y = ldsIn[NH*2*G_H]; } #endif -} #endif @@ -531,160 +519,31 @@ void OVERLOAD unreverseLine2(local GF31 *lds, GF31 *u) { #if NTT_GF61 void OVERLOAD reverse(u32 WG, local GF61 *lds, GF61 *u, bool bump) { - u32 me = get_local_id(0); - u32 revMe = WG - 1 - me + bump; - - bar(); - -#if NH == 8 - lds[revMe + 0 * WG] = u[3]; - lds[revMe + 1 * WG] = u[2]; - lds[revMe + 2 * WG] = u[1]; - lds[bump ? ((revMe + 3 * WG) % (4 * WG)) : (revMe + 3 * WG)] = u[0]; -#elif NH == 4 - lds[revMe + 0 * WG] = u[1]; - lds[bump ? 
((revMe + WG) % (2 * WG)) : (revMe + WG)] = u[0]; -#else -#error -#endif - - bar(); - for (i32 i = 0; i < NH/2; ++i) { u[i] = lds[i * WG + me]; } + reverse(WG, (local T2 *) lds, (T2 *) u, bump); } -void OVERLOAD reverseLine(u32 WG, local GF61 *lds2, GF61 *u) { - u32 me = get_local_id(0); - u32 revMe = WG - 1 - me; - - local GF61 *lds = lds2 + revMe; - bar(); - for (u32 i = 0; i < NH; ++i) { lds[WG * (NH - 1 - i)] = u[i]; } - - lds = lds2 + me; - bar(); - for (u32 i = 0; i < NH; ++i) { u[i] = lds[WG * i]; } +void OVERLOAD reverseLine(u32 WG, local GF61 *lds, GF61 *u) { + reverseLine(WG, (local T2 *) lds, (T2 *) u); } -// This is used to reverse the second part of a line, and cross the reversed parts between the halves. -void OVERLOAD revCrossLine(u32 WG, local GF61* lds2, GF61 *u, u32 n, bool writeSecondHalf) { - u32 me = get_local_id(0); - u32 lowMe = me % WG; - - u32 revLowMe = WG - 1 - lowMe; - - for (u32 i = 0; i < n; ++i) { lds2[WG * n * writeSecondHalf + WG * (n - 1 - i) + revLowMe] = u[i]; } - - bar(); // we need a full bar because we're crossing halves - - for (u32 i = 0; i < n; ++i) { u[i] = lds2[WG * n * !writeSecondHalf + WG * i + lowMe]; } -} - -// -// These versions are for the kernel(s) that uses a double-wide workgroup (u in half the workgroup, v in the other half) -// - void OVERLOAD reverse2(local GF61 *lds, GF61 *u) { - u32 me = get_local_id(0); - - // For NH=8, u[0] to u[3] are left unchanged. Write to lds: - // u[7]rev u[6]rev - // u[5]rev u[4]rev - // v[7]rev v[6]rev - // v[5]rev v[4]rev - bar(); - for (u32 i = 0; i < NH / 2; ++i) { - u32 j = (i * G_H + me % G_H); - lds[me < G_H ? 
((NH/2)*G_H - j) % ((NH/2)*G_H) : NH*G_H-1 - j] = u[NH/2 + i]; - } - // For NH=8, read from lds into u[i]: - // u[4] = u[7]rev v[7]rev - // u[5] = u[6]rev v[6]rev - // u[6] = u[5]rev v[5]rev - // u[7] = u[4]rev v[4]rev - bar(); - lds += me % G_H + (me / G_H) * NH/2 * G_H; - for (u32 i = 0; i < NH / 2; ++i) { u[NH/2 + i] = lds[i * G_H]; } + reverse2((local T2 *) lds, (T2 *) u); } -// Somewhat similar to reverseLine. -// The u values are in threads < G_H, the v values to reverse in threads >= G_H. -// Whereas reverseLine leaves u values alone. This reverseLine moves u values around -// so that pairSq2 can easily operate on pairs. This means for NH = 4, web output: -// u[0] u[1] // Returned in u[0] -// u[2] u[3] // Returned in u[1] -// v[3]rev v[2]rev // Returned in u[2] -// v[1]rev v[0]rev // Returned in u[3] -void OVERLOAD reverseLine2(local GF61 *lds, GF61 *u) { - u32 me = get_local_id(0); - -// NOTE: It is important that this routine use lds memory in coordination with shufl2. Failure to do so would require an -// unqualified bar() call here. Specifically, the u values are stored in the upper half of lds memory (SMALL_HEIGHT GF61 values). -// The v values are stored in the lower half of lds memory (the next SMALL_HEIGHT GF61 values). - - if (G_H > WAVEFRONT) bar(); - -// For NH=4, the lds indices (where to write each incoming u[i] which has v[i] in the upper threads) looks like this: -// 0..GH-1 +0*G_H GH-1..0 +7*G_H -// 0..GH-1 +1*G_H GH-1..0 +6*G_H -// 0..GH-1 +2*G_H GH-1..0 +5*G_H -// 0..GH-1 +3*G_H GH-1..0 +4*G_H -// That means saving to lds using index: me < G_H ? me % G_H + i * G_H : 8*G_H-1 - me % G_H - i * G_H - -#if 1 - local GF61 *ldsOut = lds + (me < G_H ? me % G_H : (NH*2)*G_H-1 - me % G_H); - i32 ldsOutInc = (me < G_H) ? 
G_H : -G_H; - for (u32 i = 0; i < NH; ++i, ldsOut += ldsOutInc) { *ldsOut = u[i]; } +void OVERLOAD revCrossLine(local GF61* lds, GF61 *u) { + revCrossLine((local T2 *) lds, (T2 *) u); +} - lds += me; - bar(); - for (u32 i = 0; i < NH; ++i) { u[i] = lds[i * 2*G_H]; } -#else - local Z61 *ldsOut = (local Z61 *) lds + (me < G_H ? me % G_H : (NH*2)*G_H-1 - me % G_H); - i32 ldsOutInc = (me < G_H) ? G_H : -G_H; - for (u32 i = 0; i < NH; ++i, ldsOut += ldsOutInc) { ldsOut[0] = u[i].x; ldsOut[NH*2*G_H] = u[i].y; } +#if 0 // Unused - local ZF61 *ldsIn = (local T *) lds + me; - bar(); - for (u32 i = 0; i < NH; ++i) { u[i].x = ldsIn[i * 2*G_H]; u[i].y = ldsIn[NH*2*G_H + i * 2*G_H]; } -#endif +void OVERLOAD reverseLine2(local GF61 *lds, GF61 *u) { + reverseLine2((local T2 *) lds, (T2 *) u); } -// Undo a reverseLine2 void OVERLOAD unreverseLine2(local GF61 *lds, GF61 *u) { - u32 me = get_local_id(0); - -// NOTE: It is important that this routine use lds memory in coordination with reverseLine2 and shufl2. By initially -// writing to the lds locations that reverseLine2 read from we do not need an initial bar() call here. Also, by reading -// from the lds locations that shufl2 will use (u values in the upper half of lds memory, v values in the lower half of -// lds memory) we can issue a qualified bar() call before calling FFT_HEIGHT2. - -#if 1 - local GF61 *ldsOut = lds + me; - for (u32 i = 0; i < NH; ++i) { ldsOut[i * 2*G_H] = u[i]; } - -// For NH=4, the lds indices (where to read each outgoing u[i] which has v[i] in the upper threads) looks like this: -// 0..GH-1 +0*G_H GH-1..0 +7*G_H -// 0..GH-1 +1*G_H GH-1..0 +6*G_H -// 0..GH-1 +2*G_H GH-1..0 +5*G_H -// 0..GH-1 +3*G_H GH-1..0 +4*G_H - lds += (me < G_H) ? me % G_H : (NH*2)*G_H-1 - me % G_H; - i32 ldsInc = (me < G_H) ? 
G_H : -G_H; - bar(); - for (u32 i = 0; i < NH; ++i, lds += ldsInc) { u[i] = *lds; } -#else - local Z61 *ldsOut = (local T *) lds + me; - for (u32 i = 0; i < NH; ++i) { ldsOut[i * 2*G_H] = u[i].x; ldsOut[NH*2*G_H + i * 2*G_H] = u[i].y; } + unreverseLine2((local T2 *) lds, (T2 *) u); +} -// For NH=4, the lds indices (where to read each outgoing u[i] which has v[i] in the upper threads) looks like this: -// 0..GH-1 +0*G_H GH-1..0 +7*G_H -// 0..GH-1 +1*G_H GH-1..0 +6*G_H -// 0..GH-1 +2*G_H GH-1..0 +5*G_H -// 0..GH-1 +3*G_H GH-1..0 +4*G_H - local Z61 *ldsIn = (local T *) lds + ((me < G_H) ? me % G_H : (NH*2)*G_H-1 - me % G_H); - i32 ldsInc = (me < G_H) ? G_H : -G_H; - bar(); - for (u32 i = 0; i < NH; ++i, ldsIn += ldsInc) { u[i].x = ldsIn[0]; u[i].y = ldsIn[NH*2*G_H]; } #endif -} #endif diff --git a/src/tune.cpp b/src/tune.cpp index 4d613d0f..ae376054 100644 --- a/src/tune.cpp +++ b/src/tune.cpp @@ -911,7 +911,7 @@ void Tune::tune() { } // Find best BIGLIT setting - if (time_FFTs) { + if (0 && time_FFTs) { // Deprecated FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_biglit = 0; @@ -954,7 +954,7 @@ void Tune::tune() { config.write("\n -log 1000000\n"); } if (args->workers < 2) { - config.write("\n# Running two workers sometimes gives better throughput. Autoprimenet will need to create up a second worktodo file."); + config.write("\n# Running two workers sometimes gives better throughput. AutoPrimenet will need to create a second worktodo file."); config.write("\n# -workers 2\n"); config.write("\n# Changing TAIL_KERNELS to 3 when running two workers may be better."); config.write("\n# -use TAIL_KERNELS=3\n"); From 08036f356c6b6a0fa8c28d37880426ed649e939c Mon Sep 17 00:00:00 2001 From: george Date: Thu, 12 Mar 2026 02:00:30 +0000 Subject: [PATCH 9/9] Testing indicates WMUL=2 should be the default. RTX4xxx and RTX5xxx GPUs benefit from L2STORE and LULOAD. Added support for those options officially.
Since FAST_BARRIER seems to now work on nVidia, the option is now tuned. --- src/Gpu.cpp | 4 +++- src/cl/base.cl | 8 +++++++ src/tune.cpp | 63 +++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 2 deletions(-) diff --git a/src/Gpu.cpp b/src/Gpu.cpp index 84cc4434..6d00c0fd 100644 --- a/src/Gpu.cpp +++ b/src/Gpu.cpp @@ -228,7 +228,7 @@ string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector< // Default value for -use options that must also be parsed in C++ code tail_single_wide = 0, tail_single_kernel = 1; // Default tailSquare is double-wide in one kernel in_place = 0; // Default is not in-place - wmul = 1; // Default is carryFused processes one workgroup at a time + wmul = 2; // Default is carryFused processes two lines at a time pad_size = isAmdGpu(id) ? 256 : 0; // Default is 256 bytes for AMD, 0 for others // Validate -use options @@ -263,6 +263,8 @@ string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector< "TABMUL_CHAIN32", "TABMUL_CHAIN61", "MODM31", + "ENABLE_L2STORE", + "ENABLE_LULOAD", "WMUL" }); if (!isValid) { diff --git a/src/cl/base.cl b/src/cl/base.cl index f252cfe1..1765fdfe 100644 --- a/src/cl/base.cl +++ b/src/cl/base.cl @@ -189,6 +189,14 @@ G_H "group height" == SMALL_HEIGHT / NH #define ZEROHACK_H 1 #endif +#if !defined(ENABLE_L2STORE) +#define ENABLE_L2STORE 1 +#endif + +#if !defined(ENABLE_LULOAD) +#define ENABLE_LULOAD 1 +#endif + // Expected defines: EXP the exponent. // WIDTH, SMALL_HEIGHT, MIDDLE. 
diff --git a/src/tune.cpp b/src/tune.cpp index ae376054..76474bcb 100644 --- a/src/tune.cpp +++ b/src/tune.cpp @@ -343,6 +343,7 @@ void Tune::tune() { // There are some options and variants that are different based on GPU manufacturer bool AMDGPU = isAmdGpu(q->context->deviceId()); + bool NVIDIAGPU = isNvidiaGpu(q->context->deviceId()); bool tune_config = 1; bool time_FFTs = 0; @@ -601,7 +602,7 @@ void Tune::tune() { } // Find best FAST_BARRIER setting - if (AMDGPU) { + if (1 /*AMDGPU*/) { // FAST_BARRIER now works for nVidia GPUs too (from what I've seen) FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_fast_barrier = 0; @@ -910,6 +911,66 @@ void Tune::tune() { args->flags["ZEROHACK_H"] = to_string(best_zerohack_h); } + // Find best WMUL setting + if (1) { + FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; + u64 exponent = primes.prevPrime(fft.maxExp()); + u32 best_wmul = 0; + u32 current_wmul = args->value("WMUL", 2); + double best_cost = -1.0; + double current_cost = -1.0; + for (u32 wmul : {1, 2, 4}) { + args->flags["WMUL"] = to_string(wmul); + double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP(quick); + log("Time for %12s using WMUL=%u is %6.1f\n", fft.spec().c_str(), wmul, cost); + if (wmul == current_wmul) current_cost = cost; + if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_wmul = wmul; } + } + log("Best WMUL is %u. 
Default WMUL is 2.\n", best_wmul); + configsUpdate(current_cost, best_cost, 0.003, "WMUL", best_wmul, newConfigKeyVals, suggestedConfigKeyVals); + args->flags["WMUL"] = to_string(best_wmul); + } + + // Find best ENABLE_L2STORE setting + if (NVIDIAGPU) { + FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; + u64 exponent = primes.prevPrime(fft.maxExp()); + u32 best_enable_l2store = 0; + u32 current_enable_l2store = args->value("ENABLE_L2STORE", 2); + double best_cost = -1.0; + double current_cost = -1.0; + for (u32 enable_l2store : {0, 1}) { + args->flags["ENABLE_L2STORE"] = to_string(enable_l2store); + double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP(quick); + log("Time for %12s using ENABLE_L2STORE=%u is %6.1f\n", fft.spec().c_str(), enable_l2store, cost); + if (enable_l2store == current_enable_l2store) current_cost = cost; + if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_enable_l2store = enable_l2store; } + } + log("Best ENABLE_L2STORE is %u. Default ENABLE_L2STORE is 1.\n", best_enable_l2store); + configsUpdate(current_cost, best_cost, 0.003, "ENABLE_L2STORE", best_enable_l2store, newConfigKeyVals, suggestedConfigKeyVals); + args->flags["ENABLE_L2STORE"] = to_string(best_enable_l2store); + } + + // Find best ENABLE_LULOAD setting + if (NVIDIAGPU) { + FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; + u64 exponent = primes.prevPrime(fft.maxExp()); + u32 best_enable_luload = 0; + u32 current_enable_luload = args->value("ENABLE_LULOAD", 2); + double best_cost = -1.0; + double current_cost = -1.0; + for (u32 enable_luload : {0, 1}) { + args->flags["ENABLE_LULOAD"] = to_string(enable_luload); + double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP(quick); + log("Time for %12s using ENABLE_LULOAD=%u is %6.1f\n", fft.spec().c_str(), enable_luload, cost); + if (enable_luload == current_enable_luload) current_cost = cost; + if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_enable_luload = 
enable_luload; } + } + log("Best ENABLE_LULOAD is %u. Default ENABLE_LULOAD is 1.\n", best_enable_luload); + configsUpdate(current_cost, best_cost, 0.003, "ENABLE_LULOAD", best_enable_luload, newConfigKeyVals, suggestedConfigKeyVals); + args->flags["ENABLE_LULOAD"] = to_string(best_enable_luload); + } + // Find best BIGLIT setting if (0 && time_FFTs) { // Deprecated FFTConfig fft{*defaultShape, variant, CARRY_AUTO};