From 981c442b7e917b7b0417d4231fbad2c6ac03b17c Mon Sep 17 00:00:00 2001 From: george Date: Fri, 26 Dec 2025 03:45:54 +0000 Subject: [PATCH 1/9] Changed exponent from u32 to u64. Exponents over 2^32-1 don't work yet -- debugging is needed. --- src/Args.cpp | 8 ++-- src/Args.h | 6 +-- src/FFTConfig.cpp | 10 ++--- src/FFTConfig.h | 6 +-- src/Gpu.cpp | 93 ++++++++++++++++++++++++----------------------- src/Gpu.h | 18 ++++----- src/PRPState.cpp | 3 +- src/PRPState.h | 10 ++--- src/Primes.cpp | 16 ++++---- src/Primes.h | 12 +++--- src/Proof.cpp | 60 ++++++++++++++++-------------- src/Proof.h | 46 +++++++++++------------ src/Saver.cpp | 45 ++++++++++++----------- src/Saver.h | 14 +++---- src/Task.cpp | 12 +++--- src/Task.h | 2 +- src/TuneEntry.cpp | 13 ++++--- src/Worktodo.cpp | 4 +- src/common.h | 6 +-- src/shared.h | 4 +- src/state.cpp | 4 +- src/state.h | 12 +++--- src/tune.cpp | 56 ++++++++++++++-------------- 23 files changed, 236 insertions(+), 224 deletions(-) diff --git a/src/Args.cpp b/src/Args.cpp index 041202ad..5556e6a5 100644 --- a/src/Args.cpp +++ b/src/Args.cpp @@ -86,7 +86,7 @@ void Args::readConfig(const fs::path& path) { } } -u32 Args::getProofPow(u32 exponent) const { +u32 Args::getProofPow(u64 exponent) const { if (proofPow == -1) { return ProofSet::bestPower(exponent); } assert(proofPow >= 1); return proofPow; @@ -310,9 +310,9 @@ void Args::parse(const string& line) { } else if (key == "-tune") { doTune = true; if (!s.empty()) { tune = s; } - } else if (key == "-ctune") { - doCtune = true; - if (!s.empty()) { ctune.push_back(s); } +// } else if (key == "-ctune") { +// doCtune = true; +// if (!s.empty()) { ctune.push_back(s); } } else if (key == "-ztune") { doZtune = true; } else if (key == "-carryTune") { diff --git a/src/Args.h b/src/Args.h index 795cd99c..d7273afe 100644 --- a/src/Args.h +++ b/src/Args.h @@ -30,7 +30,7 @@ class Args { bool uses(const std::string& key) const { return flags.find(key) != flags.end(); } int value(const std::string& key, 
int valNotFound = -1) const; void readConfig(const fs::path& path); - u32 getProofPow(u32 exponent) const; + u32 getProofPow(u64 exponent) const; string tailDir() const; bool hasFlag(const string& key) const; @@ -78,8 +78,8 @@ class Args { u32 logStep = 20000; string fftSpec; - u32 prpExp = 0; - u32 llExp = 0; + u64 prpExp = 0; + u64 llExp = 0; size_t maxAlloc = 0; diff --git a/src/FFTConfig.cpp b/src/FFTConfig.cpp index 2308a037..c21607e1 100644 --- a/src/FFTConfig.cpp +++ b/src/FFTConfig.cpp @@ -182,7 +182,7 @@ if (18.35 + 0.5 * (log2(13 * 1024 * 512) - log2(size())) > 19.0) return 19.0; return 18.35 + 0.5 * (log2(13 * 1024 * 512) - log2(size())); } -bool FFTShape::needsLargeCarry(u32 E) const { +bool FFTShape::needsLargeCarry(u64 E) const { return E / double(size()) > carry32BPW(); } @@ -271,12 +271,12 @@ float FFTConfig::maxBpw() const { return (carry == CARRY_32 && (shape.fft_type == FFT64 || shape.fft_type == FFT3231)) ? std::min(shape.carry32BPW(), b) : b; } -FFTConfig FFTConfig::bestFit(const Args& args, u32 E, const string& spec) { +FFTConfig FFTConfig::bestFit(const Args& args, u64 E, const string& spec) { // A FFT-spec was given, simply take the first FFT from the spec that can handle E if (!spec.empty()) { FFTConfig fft{spec}; if (fft.maxExp() * args.fftOverdrive < E) { - log("Warning: %s (max %" PRIu64 ") may be too small for %u\n", fft.spec().c_str(), fft.maxExp(), E); + log("Warning: %s (max %" PRIu64 ") may be too small for %" PRIu64 "\n", fft.spec().c_str(), fft.maxExp(), E); } return fft; } @@ -288,7 +288,7 @@ FFTConfig FFTConfig::bestFit(const Args& args, u32 E, const string& spec) { if (E <= e.fft.maxExp() * args.fftOverdrive) { return e.fft; } } - log("No FFTs found in tune.txt that can handle %u. Consider tuning with -tune\n", E); + log("No FFTs found in tune.txt that can handle %" PRIu64 ". 
Consider tuning with -tune\n", E); // Take the first FFT that can handle E for (const FFTShape& shape : FFTShape::allShapes()) { @@ -297,7 +297,7 @@ FFTConfig FFTConfig::bestFit(const Args& args, u32 E, const string& spec) { } } - log("No FFT found for %u\n", E); + log("No FFT found for %" PRIu64 "\n", E); throw "No FFT"; } diff --git a/src/FFTConfig.h b/src/FFTConfig.h index c873eb8c..a9ad052c 100644 --- a/src/FFTConfig.h +++ b/src/FFTConfig.h @@ -27,7 +27,7 @@ class FFTShape { public: static std::vector allShapes(u32 from=0, u32 to = -1); - static tuple getChainLengths(u32 fftSize, u32 exponent, u32 middle); + static tuple getChainLengths(u32 fftSize, u64 exponent, u32 middle); static vector multiSpec(const string& spec); @@ -51,7 +51,7 @@ class FFTShape { std::string spec() const { return (fft_type ? to_string(fft_type) + ':' : "") + numberK(width) + ':' + numberK(middle) + ':' + numberK(height); } float carry32BPW() const; - bool needsLargeCarry(u32 E) const; + bool needsLargeCarry(u64 E) const; bool isFavoredShape() const; }; @@ -73,7 +73,7 @@ enum CARRY_KIND {CARRY_32=0, CARRY_64=1, CARRY_AUTO=2}; struct FFTConfig { public: - static FFTConfig bestFit(const Args& args, u32 E, const std::string& spec); + static FFTConfig bestFit(const Args& args, u64 E, const std::string& spec); // Which FP and NTT primes are involved in the FFT bool FFT_FP64; diff --git a/src/Gpu.cpp b/src/Gpu.cpp index 44656fa1..433ce0a7 100644 --- a/src/Gpu.cpp +++ b/src/Gpu.cpp @@ -43,43 +43,43 @@ namespace { u32 kAt(u32 H, u32 line, u32 col) { return (line + col * H) * 2; } -double weight(u32 N, u32 E, u32 H, u32 line, u32 col, u32 rep) { +double weight(u32 N, u64 E, u32 H, u32 line, u32 col, u32 rep) { return exp2l((long double)(extra(N, E, kAt(H, line, col) + rep)) / N); } -double invWeight(u32 N, u32 E, u32 H, u32 line, u32 col, u32 rep) { +double invWeight(u32 N, u64 E, u32 H, u32 line, u32 col, u32 rep) { return exp2l(-(long double)(extra(N, E, kAt(H, line, col) + rep)) / N); } 
-double weightM1(u32 N, u32 E, u32 H, u32 line, u32 col, u32 rep) { +double weightM1(u32 N, u64 E, u32 H, u32 line, u32 col, u32 rep) { return exp2l((long double)(extra(N, E, kAt(H, line, col) + rep)) / N) - 1; } -double invWeightM1(u32 N, u32 E, u32 H, u32 line, u32 col, u32 rep) { +double invWeightM1(u32 N, u64 E, u32 H, u32 line, u32 col, u32 rep) { return exp2l(- (long double)(extra(N, E, kAt(H, line, col) + rep)) / N) - 1; } double boundUnderOne(double x) { return std::min(x, nexttoward(1, 0)); } -float weight32(u32 N, u32 E, u32 H, u32 line, u32 col, u32 rep) { +float weight32(u32 N, u64 E, u32 H, u32 line, u32 col, u32 rep) { return exp2((double)(extra(N, E, kAt(H, line, col) + rep)) / N); } -float invWeight32(u32 N, u32 E, u32 H, u32 line, u32 col, u32 rep) { +float invWeight32(u32 N, u64 E, u32 H, u32 line, u32 col, u32 rep) { return exp2(-(double)(extra(N, E, kAt(H, line, col) + rep)) / N); } -float weightM132(u32 N, u32 E, u32 H, u32 line, u32 col, u32 rep) { +float weightM132(u32 N, u64 E, u32 H, u32 line, u32 col, u32 rep) { return exp2((double)(extra(N, E, kAt(H, line, col) + rep)) / N) - 1; } -float invWeightM132(u32 N, u32 E, u32 H, u32 line, u32 col, u32 rep) { +float invWeightM132(u32 N, u64 E, u32 H, u32 line, u32 col, u32 rep) { return exp2(- (double)(extra(N, E, kAt(H, line, col) + rep)) / N) - 1; } float boundUnderOne(float x) { return std::min(x, nexttowardf(1, 0)); } -Weights genWeights(FFTConfig fft, u32 E, u32 W, u32 H, u32 nW, bool AmdGpu) { +Weights genWeights(FFTConfig fft, u64 E, u32 W, u32 H, u32 nW, bool AmdGpu) { u32 N = 2u * W * H; u32 groupWidth = W / nW; @@ -227,7 +227,7 @@ constexpr bool isInList(const string& s, initializer_list list) { return false; } -string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector& extraConf, u32 E, bool doLog, +string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector& extraConf, u64 E, bool doLog, bool &tail_single_wide, bool &tail_single_kernel, u32 
&in_place, u32 &pad_size) { map config; @@ -457,15 +457,15 @@ RoeInfo roeStat(const vector& roe) { class IterationTimer { Timer timer; - u32 kStart; + u64 kStart; public: - explicit IterationTimer(u32 kStart) : kStart(kStart) { } + explicit IterationTimer(u64 kStart) : kStart(kStart) { } - float reset(u32 k) { + float reset(u64 k) { float secs = timer.reset(); - u32 its = max(1u, k - kStart); + u64 its = max(u64(1), k - kStart); kStart = k; return secs / its; } @@ -506,7 +506,7 @@ string toHex(const vector& v) { // -------- -unique_ptr Gpu::make(Queue* q, u32 E, GpuCommon shared, FFTConfig fftConfig, const vector& extraConf, bool logFftSize) { +unique_ptr Gpu::make(Queue* q, u64 E, GpuCommon shared, FFTConfig fftConfig, const vector& extraConf, bool logFftSize) { return make_unique(q, shared, fftConfig, E, extraConf, logFftSize); } @@ -518,7 +518,7 @@ Gpu::~Gpu() { #define ROE_SIZE 100000 #define CARRY_SIZE 100000 -Gpu::Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u32 E, const vector& extraConf, bool logFftSize) : +Gpu::Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u64 E, const vector& extraConf, bool logFftSize) : queue(q), background{shared.background}, args{*shared.args}, @@ -649,7 +649,7 @@ Gpu::Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u32 E, const vector& // Sometimes we do want to run a FFT beyond a reasonable BPW (e.g. 
during -ztune), and these situations // coincide with logFftSize == false if (fft.maxExp() < E) { - log("Warning: %s (max %" PRIu64 ") may be too small for %u\n", fft.spec().c_str(), fft.maxExp(), E); + log("Warning: %s (max %" PRIu64 ") may be too small for %" PRIu64 "\n", fft.spec().c_str(), fft.maxExp(), E); } } @@ -992,7 +992,7 @@ void Gpu::modMul(Buffer& ioA, Buffer& inB, enum LEAD_TYPE leadInB, b mul(ioA, buf1, buf2, buf3, mul3); }; -void Gpu::writeState(u32 k, const vector& check, u32 blockSize) { +void Gpu::writeState(u64 k, const vector& check, u32 blockSize) { assert(blockSize > 0); writeIn(bufCheck, check); @@ -1022,7 +1022,7 @@ void Gpu::writeState(u32 k, const vector& check, u32 blockSize) { } modMul(bufData, bufAux, true); } - + bool Gpu::doCheck(u32 blockSize) { squareLoop(bufAux, bufCheck, 0, blockSize, true); modMul(bufCheck, bufData); @@ -1278,10 +1278,10 @@ void Gpu::square(Buffer& out, Buffer& in, enum LEAD_TYPE leadIn, enu } } -u32 Gpu::squareLoop(Buffer& out, Buffer& in, u32 from, u32 to, bool doTailMul3) { +u32 Gpu::squareLoop(Buffer& out, Buffer& in, u64 from, u64 to, bool doTailMul3) { assert(from < to); enum LEAD_TYPE leadIn = LEAD_NONE; - for (u32 k = from; k < to; ++k) { + for (u64 k = from; k < to; ++k) { enum LEAD_TYPE leadOut = useLongCarry || (k == to - 1) ? LEAD_NONE : LEAD_WIDTH; square(out, (k==from) ? 
in : out, leadIn, leadOut, doTailMul3 && (k == to - 1)); leadIn = leadOut; @@ -1350,16 +1350,16 @@ string RoeInfo::toString() const { return buf; } -static string makeLogStr(const string& status, u32 k, u64 res, float secsPerIt, u32 nIters) { +static string makeLogStr(const string& status, u64 k, u64 res, float secsPerIt, u64 nIters) { char buf[256]; - snprintf(buf, sizeof(buf), "%2s %9u %016" PRIx64 " %4.0f ETA %s; ", + snprintf(buf, sizeof(buf), "%2s %9" PRIu64 " %016" PRIx64 " %4.0f ETA %s; ", status.c_str(), k, res, /* k / float(nIters) * 100, */ secsPerIt * 1'000'000, getETA(k, nIters, secsPerIt).c_str()); return buf; } -void Gpu::doBigLog(u32 k, u64 res, bool checkOK, float secsPerIt, u32 nIters, u32 nErrors) { +void Gpu::doBigLog(u64 k, u64 res, bool checkOK, float secsPerIt, u64 nIters, u32 nErrors) { auto [roeSq, roeMul] = readROE(); double z = roeSq.z(); zAvg.update(z, roeSq.N); @@ -1480,7 +1480,7 @@ static u32 mod3(const std::vector &words) { return r % 3; } -static void doDiv3(u32 E, Words& words) { +static void doDiv3(u64 E, Words& words) { u32 r = (3 - mod3(words)) % 3; assert(r < 3); int topBits = E % 32; @@ -1497,7 +1497,7 @@ static void doDiv3(u32 E, Words& words) { } } -void Gpu::doDiv9(u32 E, Words& words) { +void Gpu::doDiv9(u64 E, Words& words) { doDiv3(E, words); doDiv3(E, words); } @@ -1532,12 +1532,12 @@ PRPState Gpu::loadPRP(Saver& saver) { u64 res = dataResidue(); if (res == state.res64) { - log("OK %9u on-load: blockSize %d, %016" PRIx64 "\n", state.k, state.blockSize, res); + log("OK %9" PRIu64 " on-load: blockSize %d, %016" PRIx64 "\n", state.k, state.blockSize, res); return state; // return {loaded.k, loaded.blockSize, loaded.nErrors}; } - log("EE %9u on-load: %016" PRIx64 " vs. %016" PRIx64 "\n", state.k, res, state.res64); + log("EE %9" PRIu64 " on-load: %016" PRIx64 " vs. 
%016" PRIx64 "\n", state.k, res, state.res64); if (!state.k) { break; } // We failed on PRP start } @@ -1545,7 +1545,7 @@ PRPState Gpu::loadPRP(Saver& saver) { throw "Error on load"; } -u32 Gpu::getProofPower(u32 k) { +u32 Gpu::getProofPower(u64 k) { u32 power = ProofSet::effectivePower(E, args.getProofPow(E), k); if (power != args.getProofPow(E)) { @@ -1785,7 +1785,8 @@ PRPResult Gpu::isPrimePRP(const Task& task) { reload: elapsedTimer.reset(); - u32 blockSize{}, k{}; + u32 blockSize{}; + u64 k{}; double elapsedBefore = 0; { @@ -1814,28 +1815,28 @@ PRPResult Gpu::isPrimePRP(const Task& task) { // For M=2^E-1, residue "type-3" == 3^(M+1), and residue "type-1" == type-3 / 9, // See http://www.mersenneforum.org/showpost.php?p=468378&postcount=209 // For both type-1 and type-3 we need to do E squarings (as M+1==2^E). - const u32 kEnd = E; + const u64 kEnd = E; assert(k < kEnd); // We continue beyound kEnd: to the next multiple of blockSize, to do a check there - u32 kEndEnd = roundUp(kEnd, blockSize); + u64 kEndEnd = roundUp(kEnd, blockSize); bool skipNextCheckUpdate = false; - u32 persistK = proofSet.next(k); + u64 persistK = proofSet.next(k); enum LEAD_TYPE leadIn = LEAD_NONE; assert(k % blockSize == 0); assert(checkStep % blockSize == 0); - const u32 startK = k; + const u64 startK = k; IterationTimer iterationTimer{k}; wantROE = 0; // skip the initial iterations while (true) { assert(k < kEndEnd); - + if (!wantROE && k - startK > 30) { wantROE = args.logROE ? ROE_SIZE : 2'000; } if (skipNextCheckUpdate) { @@ -1876,7 +1877,7 @@ PRPResult Gpu::isPrimePRP(const Task& task) { res2048.clear(); assert(words.size() >= 64); res2048.insert(res2048.end(), words.begin(), std::next(words.begin(), 64)); - log("%s %8d / %d, %s\n", isPrime ? "PP" : "CC", kEnd, E, hex(finalRes64).c_str()); + log("%s %8" PRIu64 " / %" PRIu64 ", %s\n", isPrime ? 
"PP" : "CC", kEnd, E, hex(finalRes64).c_str()); } if (!doCheck && !doLog) continue; @@ -1888,7 +1889,7 @@ PRPResult Gpu::isPrimePRP(const Task& task) { vector rawCheck = readChecked(bufCheck); if (rawCheck.empty()) { ++nErrors; - log("%9u %016" PRIx64 " read NULL check\n", k, res); + log("%9" PRIu64 " %016" PRIx64 " read NULL check\n", k, res); if (++nSeqErrors > 2) { throw "sequential errors"; } goto reload; } @@ -1899,7 +1900,7 @@ PRPResult Gpu::isPrimePRP(const Task& task) { elapsedBefore + elapsedTimer.at()}); }); - log(" %9u %016" PRIx64 " %4.0f\n", k, res, /*k / float(kEndEnd) * 100*,*/ secsPerIt * 1'000'000); + log(" %9" PRIu64 " %016" PRIx64 " %4.0f\n", k, res, /*k / float(kEndEnd) * 100*,*/ secsPerIt * 1'000'000); RoeInfo carryStats = readCarryStats(); if (carryStats.N) { u32 m = ldexp(carryStats.max, 32); @@ -1965,7 +1966,7 @@ LLResult Gpu::isPrimeLL(const Task& task) { reload: elapsedTimer.reset(); - u32 startK = 0; + u64 startK = 0; double elapsedBefore = 0; { LLState state = saver.load(); @@ -1977,13 +1978,13 @@ LLResult Gpu::isPrimeLL(const Task& task) { u64 res = dataResidue(); if (res != expectedRes) { throw "Invalid savefile (res64)"; } assert(res == expectedRes); - log("LL loaded @ %u : %016" PRIx64 "\n", startK, res); + log("LL loaded @ %" PRIu64 " : %016" PRIx64 "\n", startK, res); } IterationTimer iterationTimer{startK}; - u32 k = startK; - u32 kEnd = E - 2; + u64 k = startK; + u64 kEnd = E - 2; enum LEAD_TYPE leadIn = LEAD_NONE; while (true) { @@ -2009,7 +2010,7 @@ LLResult Gpu::isPrimeLL(const Task& task) { if (isAllZero) { if (k < kEnd) { - log("Error: early ZERO @ %u\n", k); + log("Error: early ZERO @ %" PRIu64 "\n", k); if (doStop) { throw "stop requested"; } else { @@ -2025,7 +2026,7 @@ LLResult Gpu::isPrimeLL(const Task& task) { float secsPerIt = iterationTimer.reset(k); queue->setSquareTime((int) (secsPerIt * 1'000'000)); - log("%9u %016" PRIx64 " %4.0f\n", k, res64, secsPerIt * 1'000'000); + log("%9" PRIu64 " %016" PRIx64 " %4.0f\n", 
k, res64, secsPerIt * 1'000'000); if (k >= kEnd) { return {isAllZero, res64}; } @@ -2039,13 +2040,13 @@ array Gpu::isCERT(const Task& task) { // Get CERT start value char fname[32]; - sprintf(fname, "M%u.cert", E); + sprintf(fname, "M%" PRIu64 ".cert", E); // Autoprimenet.py does not add the cert entry to worktodo.txt until it has successfully downloaded the .cert file. { // Enclosing this code in braces ensures the file will be closed by the File destructor. The later file deletion requires the file be closed in Windows. File fi = File::openReadThrow(fname); - u32 nBytes = (E - 1) / 8 + 1; + u32 nBytes = u32((E - 1) / 8 + 1); Words B = fi.readBytesLE(nBytes); writeIn(bufData, std::move(B)); } diff --git a/src/Gpu.h b/src/Gpu.h index fc5166f3..ad859eac 100644 --- a/src/Gpu.h +++ b/src/Gpu.h @@ -93,7 +93,7 @@ class Gpu { private: std::unique_ptr> saver; - u32 E; + u64 E; u32 N; FFTConfig fft; @@ -250,8 +250,8 @@ class Gpu { void squareCERT(Buffer& io, enum LEAD_TYPE leadIn, enum LEAD_TYPE leadOut) { square(io, io, leadIn, leadOut, false, false); } void squareLL(Buffer& io, enum LEAD_TYPE leadIn, enum LEAD_TYPE leadOut) { square(io, io, leadIn, leadOut, false, true); } - u32 squareLoop(Buffer& out, Buffer& in, u32 from, u32 to, bool doTailMul3); - u32 squareLoop(Buffer& io, u32 from, u32 to) { return squareLoop(io, io, from, to, false); } + u32 squareLoop(Buffer& out, Buffer& in, u64 from, u64 to, bool doTailMul3); + u32 squareLoop(Buffer& io, u64 from, u64 to) { return squareLoop(io, io, from, to, false); } bool isEqual(Buffer& bufCheck, Buffer& bufAux); u64 bufResidue(Buffer& buf); @@ -260,7 +260,7 @@ class Gpu { void exponentiate(Buffer& bufInOut, u64 exp, Buffer& buf1, Buffer& buf2, Buffer& buf3); - void writeState(u32 k, const vector& check, u32 blockSize); + void writeState(u64 k, const vector& check, u32 blockSize); // does either carrryFused() or the expanded version depending on useLongCarry void doCarry(Buffer& out, Buffer& in, Buffer& tmp); @@ -283,13 
+283,13 @@ class Gpu { // void measureTransferSpeed(); - static void doDiv9(u32 E, Words& words); + static void doDiv9(u64 E, Words& words); static bool equals9(const Words& words); void selftestTrig(); public: - Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u32 E, const vector& extraConf, bool logFftSize); - static unique_ptr make(Queue* q, u32 E, GpuCommon shared, FFTConfig fft, + Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u64 E, const vector& extraConf, bool logFftSize); + static unique_ptr make(Queue* q, u64 E, GpuCommon shared, FFTConfig fft, const vector& extraConf = {}, bool logFftSize = true); ~Gpu(); @@ -337,8 +337,8 @@ class Gpu { void clear(bool isPRP); private: - u32 getProofPower(u32 k); - void doBigLog(u32 k, u64 res, bool checkOK, float secsPerIt, u32 nIters, u32 nErrors); + u32 getProofPower(u64 k); + void doBigLog(u64 k, u64 res, bool checkOK, float secsPerIt, u64 nIters, u32 nErrors); }; // Compute the size of an FFT/NTT data buffer depending on the FFT/NTT float/prime. Size is returned in units of sizeof(double). 
diff --git a/src/PRPState.cpp b/src/PRPState.cpp index bdc2c9c8..02b9ae68 100644 --- a/src/PRPState.cpp +++ b/src/PRPState.cpp @@ -8,7 +8,8 @@ PRPState::PRPState(File&& fi) { string header = fi.readLine(); - u32 fileE, fileK, blockSize, nErrors, crc; + u64 fileE, fileK; + u32 blockSize, nErrors, crc; u64 res64; vector check; u32 b1, nBits, start, nextK; diff --git a/src/PRPState.h b/src/PRPState.h index edf0ccf9..1eab5085 100644 --- a/src/PRPState.h +++ b/src/PRPState.h @@ -10,23 +10,23 @@ class File; class PRPState { // E, k, block-size, res64, nErrors - static constexpr const char *PRP_v10 = "OWL PRP 10 %u %u %u %016" SCNx64 " %u\n"; + static constexpr const char *PRP_v10 = "OWL PRP 10 %" PRIu64 " %" PRIu64 " %u %016" SCNx64 " %u\n"; // Exponent, iteration, block-size, res64, nErrors // B1, nBits, start, nextK, crc - static constexpr const char *PRP_v11 = "OWL PRP 11 %u %u %u %016" SCNx64 " %u %u %u %u %u %u\n"; + static constexpr const char *PRP_v11 = "OWL PRP 11 %" PRIu64 " %" PRIu64 " %u %016" SCNx64 " %u %u %u %u %u %u\n"; // E, k, block-size, res64, nErrors, CRC - static constexpr const char *PRP_v12 = "OWL PRP 12 %u %u %u %016" SCNx64 " %u %u\n"; + static constexpr const char *PRP_v12 = "OWL PRP 12 %" PRIu64 " %" PRIu64 " %u %016" SCNx64 " %u %u\n"; public: - u32 k{}; + u64 k{}; u32 blockSize{}; u64 res64{}; vector check; u32 nErrors{}; - // PRPState(u32 k, u32 blockSize, u64 res64, vector<) + // PRPState(u64 k, u32 blockSize, u64 res64, vector<) PRPState(File&& f); void saveTo(const File& f); }; diff --git a/src/Primes.cpp b/src/Primes.cpp index 865cfaec..6e3ceb36 100644 --- a/src/Primes.cpp +++ b/src/Primes.cpp @@ -15,14 +15,14 @@ Primes::Primes() { } } -bool Primes::isPrimeOdd(u32 n) const { +bool Primes::isPrimeOdd(u64 n) const { assert(n % 2); // must be odd to call here if (n < 3) { return false; } for (u32 k = 0; k < sieve.size(); ++k) { if (sieve[k]) { u32 p = k * 2 + 3; - if (p * p > n) { return true; } + if (u64(p) * u64(p) > n) { return true; } 
if (n % p == 0) { return false; } } } @@ -30,11 +30,11 @@ bool Primes::isPrimeOdd(u32 n) const { return false; } -bool Primes::isPrime(u32 n) const { +bool Primes::isPrime(u64 n) const { return (n%2 && isPrimeOdd(n)) || (n == 2); } -u32 Primes::prevPrime(u32 n) const { +u64 Primes::prevPrime(u64 n) const { --n; if (n % 2 == 0) { --n; } @@ -43,7 +43,7 @@ return 0; } -u32 Primes::nextPrime(u32 n) const { +u64 Primes::nextPrime(u64 n) const { ++n; if (n % 2 == 0) { ++n; } for (; ; n += 2) { if (isPrimeOdd(n)) { return n; }} @@ -51,10 +51,10 @@ return 0; } -u32 Primes::nearestPrime(u32 n) const { +u64 Primes::nearestPrime(u64 n) const { if (isPrime(n)) { return n; } - u32 a = prevPrime(n); - u32 b = nextPrime(n); + u64 a = prevPrime(n); + u64 b = nextPrime(n); assert(a < n && n < b); return n-a < b-n ? a : b; } diff --git a/src/Primes.h b/src/Primes.h index b951c02d..7f1f16fb 100644 --- a/src/Primes.h +++ b/src/Primes.h @@ -6,14 +6,14 @@ #include "common.h" class Primes { - std::bitset<50000> sieve; - bool isPrimeOdd(u32 n) const; + std::bitset<50000> sieve; // Allows for testing primes up to 10 billion + bool isPrimeOdd(u64 n) const; public: Primes(); - bool isPrime(u32 n) const; - u32 prevPrime(u32 n) const; - u32 nextPrime(u32 n) const; - u32 nearestPrime(u32 n) const; + bool isPrime(u64 n) const; + u64 prevPrime(u64 n) const; + u64 nextPrime(u64 n) const; + u64 nearestPrime(u64 n) const; }; diff --git a/src/Proof.cpp b/src/Proof.cpp index 31c488f8..bd8d4be6 100644 --- a/src/Proof.cpp +++ b/src/Proof.cpp @@ -19,11 +19,11 @@ namespace proof { -array hashWords(u32 E, const Words& words) { +array hashWords(u64 E, const Words& words) { return std::move(SHA3{}.update(words.data(), (E-1)/8+1)).finish(); } -array hashWords(u32 E, array prefix, const Words& words) { +array hashWords(u64 E, array prefix, const Words& words) { return std::move(SHA3{}.update(prefix).update(words.data(),
(E-1)/8+1)).finish(); } @@ -39,7 +39,8 @@ string fileHash(const fs::path& filePath) { ProofInfo getInfo(const fs::path& proofFile) { string hash = proof::fileHash(proofFile); File fi = File::openReadThrow(proofFile); - u32 E = 0, power = 0; + u64 E = 0; + u32 power = 0; char c = 0; if (fi.scanf(Proof::HEADER_v2, &power, &E, &c) != 3 || c != '\n') { log("Proof file '%s' has invalid header\n", proofFile.string().c_str()); @@ -68,7 +69,8 @@ void Proof::save(const fs::path& proofFile) const { Proof Proof::load(const fs::path& path) { File fi = File::openReadThrow(path); - u32 E = 0, power = 0; + u64 E = 0; + u32 power = 0; char c = 0; if (fi.scanf(HEADER_v2, &power, &E, &c) != 3 || c != '\n') { log("Proof file '%s' has invalid header\n", path.string().c_str()); @@ -84,7 +86,7 @@ Proof Proof::load(const fs::path& path) { bool Proof::verify(Gpu *gpu, const vector& hashes) const { // log("B %016" PRIx64 "\n", res64(B)); // for (u32 i = 0; i < middles.size(); ++i) { log("Middle[%u] %016" PRIx64 "\n", i, res64(middles[i])); } - + u32 power = middles.size(); assert(power > 0); @@ -92,10 +94,10 @@ bool Proof::verify(Gpu *gpu, const vector& hashes) const { Words A{makeWords(E, 3)}; Words B{this->B}; - + auto hash = proof::hashWords(E, B); - u32 span = E; + u64 span = E; for (u32 i = 0; i < power; ++i, span = (span + 1) / 2) { const Words& M = middles[i]; hash = proof::hashWords(E, hash, M); @@ -113,12 +115,12 @@ bool Proof::verify(Gpu *gpu, const vector& hashes) const { if (gpu->args.verbose) { log("proof [%u] : A %016" PRIx64 ", B %016" PRIx64 ", h %016" PRIx64 "\n", i, res64(A), res64(B), h); } } - log("proof verification: doing %d iterations\n", span); + log("proof verification: doing %" PRIu64 " iterations\n", span); A = gpu->expExp2(A, span); bool ok = (A == B); if (ok) { - log("proof: %u proved %s\n", E, isPrime ? "probable prime" : "composite"); + log("proof: %" PRIu64 " proved %s\n", E, isPrime ? 
"probable prime" : "composite"); } else { log("proof: invalid (%016" PRIx64 " expected %016" PRIx64 ")\n", res64(A), res64(B)); } @@ -127,9 +129,9 @@ bool Proof::verify(Gpu *gpu, const vector& hashes) const { // ---- ProofSet ---- -ProofSet::ProofSet(u32 E, u32 power) +ProofSet::ProofSet(u64 E, u32 power) : E{E}, power{power} { - + assert(E & 1); // E is supposed to be prime if (power <= 0 || power > 12) { log("Invalid proof power: %u\n", power); @@ -138,11 +140,13 @@ ProofSet::ProofSet(u32 E, u32 power) fs::create_directories(proofPath(E)); - vector spans; - for (u32 span = (E + 1) / 2; spans.size() < power; span = (span + 1) / 2) { spans.push_back(span); } + vector spans; + for (u64 span = (E + 1) / 2; spans.size() < power; span = (span + 1) / 2) { spans.push_back(span); } points.push_back(0); - for (u32 p = 0, span = (E + 1) / 2; p < power; ++p, span = (span + 1) / 2) { + u32 p; + u64 span; + for (p = 0, span = (E + 1) / 2; p < power; ++p, span = (span + 1) / 2) { for (u32 i = 0, end = points.size(); i < end; ++i) { points.push_back(points[i] + span); } @@ -160,15 +164,17 @@ ProofSet::ProofSet(u32 E, u32 power) points.push_back(u32(-1)); // guard element cacheIt = points.begin(); - for ([[maybe_unused]] u32 p : points) { + for ([[maybe_unused]] u64 p : points) { assert(p > E || isInPoints(E, power, p)); } } -bool ProofSet::isInPoints(u32 E, u32 power, u32 k) { +bool ProofSet::isInPoints(u64 E, u32 power, u64 k) { if (k == E) { return true; } // special-case E - u32 start = 0; - for (u32 p = 0, span = (E + 1) / 2; p < power; ++p, span = (span + 1) / 2) { + u64 start = 0; + u32 p; + u64 span; + for (p = 0, span = (E + 1) / 2; p < power; ++p, span = (span + 1) / 2) { assert(k >= start); if (k > start + span) { start += span; @@ -179,12 +185,12 @@ bool ProofSet::isInPoints(u32 E, u32 power, u32 k) { return false; } -bool ProofSet::canDo(u32 E, u32 power, u32 currentK) { +bool ProofSet::canDo(u64 E, u32 power, u64 currentK) { assert(power > 0 && power <= 12); return 
ProofSet{E, power}.isValidTo(currentK); } -u32 ProofSet::bestPower(u32 E) { +u32 ProofSet::bestPower(u64 E) { // Best proof powers assuming no disk space concern. // We increment power by 1 for each fourfold increase of the exponent. // The values below produce power=10 at wavefront, and power=11 at 100Mdigits: @@ -197,7 +203,7 @@ u32 ProofSet::bestPower(u32 E) { return power; } -double ProofSet::diskUsageGB(u32 E, u32 power) { +double ProofSet::diskUsageGB(u64 E, u32 power) { // -3 because convert exponent bits to bytes // -30 because convert bytes to GB // +power because needs 2^power residues for proof generation @@ -205,7 +211,7 @@ double ProofSet::diskUsageGB(u32 E, u32 power) { return power ? ldexp(E, -33 + int(power)) * 1.05 : 0.0; } -u32 ProofSet::effectivePower(u32 E, u32 power, u32 currentK) { +u32 ProofSet::effectivePower(u64 E, u32 power, u64 currentK) { for (u32 p = power; p > 0; --p) { // log("validating proof residues for power %u\n", p); if (canDo(E, p, currentK)) { return p; } @@ -213,11 +219,11 @@ u32 ProofSet::effectivePower(u32 E, u32 power, u32 currentK) { return 0; } -bool ProofSet::fileExists(u32 k) const { +bool ProofSet::fileExists(u64 k) const { return File::size(proofPath(E) / to_string(k)) == i64(E / 32 + 2) * 4; } -bool ProofSet::isValidTo(u32 limitK) const { +bool ProofSet::isValidTo(u64 limitK) const { auto it = upper_bound(points.begin(), points.end(), limitK); if (it == points.begin()) { @@ -238,14 +244,14 @@ bool ProofSet::isValidTo(u32 limitK) const { return true; } -u32 ProofSet::next(u32 k) const { +u64 ProofSet::next(u64 k) const { if (*cacheIt <= k || (cacheIt > points.begin() && *prev(cacheIt) > k)) { cacheIt = upper_bound(points.begin(), points.end(), k); } return *cacheIt; } -void ProofSet::save(u32 E, u32 power, u32 k, const Words& words) { +void ProofSet::save(u64 E, u32 power, u64 k, const Words& words) { assert(k && k <= E); assert(isInPoints(E, power, k)); @@ -253,7 +259,7 @@ void ProofSet::save(u32 E, u32 power, u32 
k, const Words& words) { assert(load(E, power, k) == words); } -Words ProofSet::load(u32 E, u32 power, u32 k) { +Words ProofSet::load(u64 E, u32 power, u64 k) { assert(k && k <= E); assert(isInPoints(E, power, k)); return File::openReadThrow(proofPath(E) / to_string(k)).readChecked(E/32 + 1); diff --git a/src/Proof.h b/src/Proof.h index 97e9056d..c83d8ace 100644 --- a/src/Proof.h +++ b/src/Proof.h @@ -4,6 +4,7 @@ #include "File.h" #include "common.h" +#include namespace fs = std::filesystem; @@ -11,15 +12,15 @@ class Gpu; struct ProofInfo { u32 power; - u32 exp; + u64 exp; string md5; }; namespace proof { -array hashWords(u32 E, const Words& words); +array hashWords(u64 E, const Words& words); -array hashWords(u32 E, array prefix, const Words& words); +array hashWords(u64 E, array prefix, const Words& words); string fileHash(const fs::path& filePath); @@ -29,7 +30,7 @@ ProofInfo getInfo(const fs::path& proofFile); class Proof { public: - const u32 E; + const u64 E; const Words B; const vector middles; @@ -40,7 +41,7 @@ class Proof { POWER=8\n NUMBER=M216091\n */ - static const constexpr char* HEADER_v2 = "PRP PROOF\nVERSION=2\nHASHSIZE=64\nPOWER=%u\nNUMBER=M%u%c"; + static const constexpr char* HEADER_v2 = "PRP PROOF\nVERSION=2\nHASHSIZE=64\nPOWER=%u\nNUMBER=M%" PRIu64 "%c"; static Proof load(const fs::path& path); @@ -53,38 +54,37 @@ class Proof { class ProofSet { public: - const u32 E; + const u64 E; const u32 power; private: - vector points; + vector points; - bool isValidTo(u32 limitK) const; + bool isValidTo(u64 limitK) const; - static bool canDo(u32 E, u32 power, u32 currentK); + static bool canDo(u64 E, u32 power, u64 currentK); mutable decltype(points)::const_iterator cacheIt{}; - bool fileExists(u32 k) const; + bool fileExists(u64 k) const; - static fs::path proofPath(u32 E) { return fs::path(to_string(E)) / "proof"; } + static fs::path proofPath(u64 E) { return fs::path(to_string(E)) / "proof"; } public: - static u32 bestPower(u32 E); - static u32 
effectivePower(u32 E, u32 power, u32 currentK); - static double diskUsageGB(u32 E, u32 power); - static bool isInPoints(u32 E, u32 power, u32 k); + static u32 bestPower(u64 E); + static u32 effectivePower(u64 E, u32 power, u64 currentK); + static double diskUsageGB(u64 E, u32 power); + static bool isInPoints(u64 E, u32 power, u64 k); - ProofSet(u32 E, u32 power); - - u32 next(u32 k) const; + ProofSet(u64 E, u32 power); - static void save(u32 E, u32 power, u32 k, const Words& words); - static Words load(u32 E, u32 power, u32 k); - - void save(u32 k, const Words& words) const { return save(E, power, k, words); } - Words load(u32 k) const { return load(E, power, k); } + u64 next(u64 k) const; + static void save(u64 E, u32 power, u64 k, const Words& words); + static Words load(u64 E, u32 power, u64 k); + + void save(u64 k, const Words& words) const { return save(E, power, k, words); } + Words load(u64 k) const { return load(E, power, k); } std::pair> computeProof(Gpu *gpu) const; }; diff --git a/src/Saver.cpp b/src/Saver.cpp index 118d1eae..68caba41 100644 --- a/src/Saver.cpp +++ b/src/Saver.cpp @@ -15,19 +15,19 @@ namespace { // E, k, block-size, res64, nErrors, CRC -static constexpr const char *PRP_v12 = "OWL PRP 12 %u %u %u %016" SCNx64 " %u %u\n"; +static constexpr const char *PRP_v12 = "OWL PRP 12 %" PRIu64 " %" PRIu64 " %u %016" SCNx64 " %u %u\n"; // Anticipated next version of the header. // Has general number form N=k*b^E+c, and labels for values. 
-static constexpr const char *PRP_v13 = "OWL PRP 13 N=1*2^%u-1 k=%u block=%u res64=%016" SCNx64 " err=%u time=%lf\n"; +static constexpr const char *PRP_v13 = "OWL PRP 13 N=1*2^%" PRIu64 "-1 k=%" PRIu64 " block=%u res64=%016" SCNx64 " err=%u time=%lf\n"; // static constexpr const char *PRP_v13_PRI = "OWL PRP 13 N=1*2^%u-1 k=%u block=%u res64=%016" PRIx64 " err=%u time=%.0lf\n"; // E, k, CRC -static constexpr const char *LL_v1 = "OWL LL 1 E=%u k=%u CRC=%u\n"; +static constexpr const char *LL_v1 = "OWL LL 1 E=%" PRIu64 " k=%" PRIu64 " CRC=%u\n"; // Anticipated next version. // Push version number to sync it with PRP. -static constexpr const char *LL_v13 = "OWL LL 13 N=1*2^%u-1 k=%u time=%lf\n"; +static constexpr const char *LL_v13 = "OWL LL 13 N=1*2^%" PRIu64 "-1 k=%" PRIu64 " time=%lf\n"; struct BadHeaderError { string name; }; @@ -35,8 +35,8 @@ bool startsWith(const string& s, const string& prefix) { return s.rfind(prefix, 0) == 0; } -vector savefiles(fs::path dir, const string& prefix, const string& kind) { - vector v; +vector savefiles(fs::path dir, const string& prefix, const string& kind) { + vector v; for (const auto& entry: fs::directory_iterator(dir)) { if (entry.is_regular_file()) { string filename = entry.path().filename().string(); @@ -45,7 +45,7 @@ vector savefiles(fs::path dir, const string& prefix, const string& kind) { assert(dot > prefix.size()); string id = filename.substr(prefix.size(), dot - prefix.size()); if (id == "unverified") { continue; } - u32 k = 0; + u64 k = 0; const char* first = id.data(); const char* end = first + id.size(); auto res = from_chars(first, end, k); @@ -61,13 +61,13 @@ vector savefiles(fs::path dir, const string& prefix, const string& kind) { return v; } -string str9(u32 k) { +string str9(u64 k) { char buf[32]; - snprintf(buf, sizeof(buf), "%09u", k); + snprintf(buf, sizeof(buf), "%09" PRIu64, k); return buf; } -fs::path pathFor(fs::path base, const string& prefix, const string& kind, u32 k) { +fs::path pathFor(fs::path 
base, const string& prefix, const string& kind, u64 k) { return base / (prefix + str9(k) + '.' + kind); } @@ -79,16 +79,17 @@ fs::path pathUnverified(fs::path base, const string& prefix) { // . // e.g.: 125784077-010000000.prp fs::path findLast(fs::path dir, const string& prefix, const string& kind) { - vector v = savefiles(dir, prefix, kind); + vector v = savefiles(dir, prefix, kind); if (v.empty()) { return {}; } - u32 lastK = v.back(); + u64 lastK = v.back(); fs::path path = pathFor(dir, prefix, kind, lastK); assert(is_regular_file(path)); return path; } PRPState readState(const PRPState& dummy, File fi) { - u32 exponent{}, k{}, blockSize{}, nErrors{}; + u64 exponent{}, k{}; + u32 blockSize{}, nErrors{}; u64 res64{}; double elapsed{}; @@ -108,7 +109,7 @@ PRPState readState(const PRPState& dummy, File fi) { } LLState readState(const LLState& dummy, File fi) { - u32 exponent{}, k{}; + u64 exponent{}, k{}; double elapsed{}; string header = fi.readLine(); @@ -142,7 +143,7 @@ void writeState(const File& fo, const LLState& state) { fo.writeChecked(state.data); } -double roundNumberScore(u32 x) { +double roundNumberScore(u64 x) { if (x == 0) { return 1; } double score = 0; @@ -169,7 +170,7 @@ template<> LLState Saver::initState() { // ---- Saver ---- template -Saver::Saver(u32 exponent, u32 blockSize, u32 nSavefiles) : +Saver::Saver(u64 exponent, u32 blockSize, u32 nSavefiles) : exponent{exponent}, blockSize{blockSize}, prefix{to_string(exponent) + '-'}, @@ -188,7 +189,7 @@ template Saver::~Saver() = default; template -void Saver::clear(u32 exponent) { +void Saver::clear(u64 exponent) { error_code dummy; fs::path base = std::is_same_v ? 
fs::current_path() / to_string(exponent) @@ -244,16 +245,16 @@ State Saver::load() { template void Saver::trimFiles() { - vector v = savefiles(base, prefix, State::KIND); + vector v = savefiles(base, prefix, State::KIND); assert(nSavefiles > 0); while (v.size() > nSavefiles) { int bestIdx = -1; double bestSpan = 1e20; - u32 prevK = 0; + u64 prevK = 0; for (u32 i = 0; i < v.size() - 1; ++i) { - u32 k = v[i]; + u64 k = v[i]; double niceBias = std::min(1.0, roundNumberScore(k) - 4); double span = (v[i + 1] - prevK) * niceBias; prevK = k; @@ -263,8 +264,8 @@ void Saver::trimFiles() { } } assert(bestIdx >= 0); - u32 k = v[bestIdx]; - // log("Deleting savefile %u\n", k); + u64 k = v[bestIdx]; + // log("Deleting savefile %" PRIu64 "\n", k); fs::path path = pathFor(base, prefix, State::KIND, k); fs::remove(path); v.erase(v.begin() + bestIdx); diff --git a/src/Saver.h b/src/Saver.h index 3bf8e6e7..5a3a7ba6 100644 --- a/src/Saver.h +++ b/src/Saver.h @@ -13,8 +13,8 @@ class SaveMan; struct PRPState { static const constexpr char* KIND = "prp"; - u32 exponent; - u32 k; + u64 exponent; + u64 k; u32 blockSize; u64 res64; vector check; @@ -25,15 +25,15 @@ struct PRPState { struct LLState { static const constexpr char* KIND = "ll"; - u32 exponent; - u32 k; + u64 exponent; + u64 k; vector data; double elapsed{}; }; template class Saver { - u32 exponent; + u64 exponent; u32 blockSize; fs::path base; string prefix; @@ -45,7 +45,7 @@ class Saver { fs::path mostRecentSavefile(); public: - Saver(u32 exponent, u32 blockSize, u32 nSavefiles); + Saver(u64 exponent, u32 blockSize, u32 nSavefiles); ~Saver(); State load(); @@ -53,7 +53,7 @@ class Saver { void dropMostRecent(); - static void clear(u32 exponent); + static void clear(u64 exponent); // For PRP, we can save a verified save (see save() above) or an unverified save. 
void saveUnverified(const PRPState& s) const; diff --git a/src/Task.cpp b/src/Task.cpp index d72f4db2..9a49654f 100644 --- a/src/Task.cpp +++ b/src/Task.cpp @@ -103,7 +103,9 @@ string json(const vector& v) { } string json(const string& s) { return '"' + s + '"'; } +string json(int x) { return to_string(x); } string json(u32 x) { return to_string(x); } +string json(u64 x) { return to_string(x); } template string json(const string& key, const T& value) { return json(key) + ':' + json(value); } @@ -112,7 +114,7 @@ string maybe(const string& key, const string& value) { return value.empty() ? "" template void operator+=(vector& a, const vector& b) { a.insert(a.end(), b.begin(), b.end()); } -vector commonFields(u32 E, const char *worktype, const string &status) { +vector commonFields(u64 E, const char *worktype, const string &status) { return { json("status", status), json("exponent", E), @@ -140,7 +142,7 @@ vector tailFields(const std::string &AID, const Args &args) { }; } -void writeResult(u32 instance, u32 E, const char *workType, const string &status, const std::string &AID, const Args &args, +void writeResult(u32 instance, u64 E, const char *workType, const string &status, const std::string &AID, const Args &args, const vector& extras) { fs::path resultsFile = "results-" + to_string(instance) + ".txt"; vector fields = commonFields(E, workType, status); @@ -220,8 +222,8 @@ void Task::execute(GpuCommon shared, Queue *q, u32 instance) { { Primes primes; if (!primes.isPrime(exponent)) { - u32 new_exponent = primes.prevPrime(exponent); - log("Warning: Exponent %u is not prime. Using exponent %u instead.\n", exponent, new_exponent); + u64 new_exponent = primes.prevPrime(exponent); + log("Warning: Exponent %" PRIu64 " is not prime. 
Using exponent %" PRIu64 " instead.\n", exponent, new_exponent); exponent = new_exponent; } } @@ -253,7 +255,7 @@ void Task::execute(GpuCommon shared, Queue *q, u32 instance) { Worktodo::deleteTask(*this, instance); if (isPrime) { - log("%u is PRIME!\n", exponent); + log("%" PRIu64 " is PRIME!\n", exponent); } else if (shared.args->clean) { gpu->clear(kind == PRP); } diff --git a/src/Task.h b/src/Task.h index 95f08024..4c130446 100644 --- a/src/Task.h +++ b/src/Task.h @@ -20,7 +20,7 @@ class Task { enum Kind {PRP, VERIFY, LL, CERT}; Kind kind; - u32 exponent; + u64 exponent; string AID; // Assignment ID string line; // the verbatim worktodo line, used in deleteTask(). u32 squarings; // For CERTs diff --git a/src/TuneEntry.cpp b/src/TuneEntry.cpp index c3288d24..68b6915d 100644 --- a/src/TuneEntry.cpp +++ b/src/TuneEntry.cpp @@ -3,10 +3,11 @@ #include "CycleFile.h" #include +#include // Returns whether *results* was updated. bool TuneEntry::update(vector& results) const { - u32 maxExp = fft.maxExp(); + u64 maxExp = fft.maxExp(); [[maybe_unused]] bool didErase = false; int i{}; @@ -28,7 +29,7 @@ bool TuneEntry::update(vector& results) const { // Returns whether entry *e* represents an improvement over *results* (i.e. would update the results). 
bool TuneEntry::willUpdate(const vector& results) const { - u32 maxExp = fft.maxExp(); + u64 maxExp = fft.maxExp(); for (const auto& r : results) { if (r.cost > cost) { break; @@ -51,7 +52,7 @@ vector TuneEntry::readTuneFile(const Args& args) { File fi = File::openRead(tuneFile); if (!fi) { return {}; } - [[maybe_unused]] u32 prevMaxExp{}; + [[maybe_unused]] u64 prevMaxExp{}; [[maybe_unused]] double prevCost{}; for (const string& line : fi) { @@ -71,14 +72,14 @@ vector TuneEntry::readTuneFile(const Args& args) { } void TuneEntry::writeTuneFile(const vector& results) { - [[maybe_unused]] u32 prevMaxExp{}; + [[maybe_unused]] u64 prevMaxExp{}; [[maybe_unused]] double prevCost{}; CycleFile tune{"tune.txt"}; for (const TuneEntry& r : results) { - u32 maxExp = r.fft.maxExp(); + u64 maxExp = r.fft.maxExp(); assert(r.cost >= prevCost && maxExp > prevMaxExp); prevCost = r.cost; prevMaxExp = maxExp; - tune->printf("%6.1f %14s # %u\n", r.cost, r.fft.spec().c_str(), maxExp); + tune->printf("%6.1f %14s # %" PRIu64 "\n", r.cost, r.fft.spec().c_str(), maxExp); } } diff --git a/src/Worktodo.cpp b/src/Worktodo.cpp index 0a981a39..80afdc51 100644 --- a/src/Worktodo.cpp +++ b/src/Worktodo.cpp @@ -167,11 +167,11 @@ optional getWork(Args& args, i32 instance) { std::optional Worktodo::getTask(Args &args, i32 instance) { if (instance == 0) { if (args.prpExp) { - u32 exp = args.prpExp; + u64 exp = args.prpExp; args.prpExp = 0; return Task{Task::PRP, exp}; } else if (args.llExp) { - u32 exp = args.llExp; + u64 exp = args.llExp; args.llExp = 0; return Task{Task::LL, exp}; } else if (!args.verifyPath.empty()) { diff --git a/src/common.h b/src/common.h index 516b099d..530795bb 100644 --- a/src/common.h +++ b/src/common.h @@ -46,15 +46,15 @@ using Words = vector; inline u64 res64(const Words& words) { return words.empty() ? 
0 : ((u64(words[1]) << 32) | words[0]); } -inline u32 nWords(u32 E) { return (E - 1) / 32 + 1; } +inline u32 nWords(u64 E) { return u32((E - 1) / 32 + 1); } -inline Words makeWords(u32 E, u32 value) { +inline Words makeWords(u64 E, u32 value) { Words ret(nWords(E)); ret[0] = value; return ret; } -inline u32 roundUp(u32 x, u32 multiple) { return ((x - 1) / multiple + 1) * multiple; } +inline u64 roundUp(u64 x, u32 multiple) { return ((x - 1) / multiple + 1) * multiple; } u32 crc32(const void* data, size_t size); diff --git a/src/shared.h b/src/shared.h index c2d90dbc..6405d9c1 100644 --- a/src/shared.h +++ b/src/shared.h @@ -1,4 +1,4 @@ // included from both C++ and OpenCL. -u32 bitposToWord(u32 E, u32 N, u32 offset) { return offset * ((u64) N) / E; } -u32 wordToBitpos(u32 E, u32 N, u32 word) { return (word * ((u64) E) + (N - 1)) / N; } +u32 bitposToWord(u64 E, u32 N, u32 offset) { return offset * ((u64) N) / E; } +u32 wordToBitpos(u64 E, u32 N, u32 word) { return (word * ((u64) E) + (N - 1)) / N; } diff --git a/src/state.cpp b/src/state.cpp index 4ac159d8..77ebff8f 100644 --- a/src/state.cpp +++ b/src/state.cpp @@ -10,7 +10,7 @@ static i64 lowBits(i64 u, int bits) { return (u << (64 - bits)) >> (64 - bits); } -std::vector compactBits(const vector &dataVect, u32 E) { +std::vector compactBits(const vector &dataVect, u64 E) { if (dataVect.empty()) { return {}; } // Indicating all zero u32 N = dataVect.size(); @@ -87,7 +87,7 @@ struct BitBucket { } }; -vector expandBits(const vector &compactBits, u32 N, u32 E) { +vector expandBits(const vector &compactBits, u32 N, u64 E) { assert(E % 32 != 0); std::vector out(N); diff --git a/src/state.h b/src/state.h index 9b37c0fd..02fab3fb 100644 --- a/src/state.h +++ b/src/state.h @@ -8,10 +8,10 @@ #include #include -vector compactBits(const vector &dataVect, u32 E); -vector expandBits(const vector &compactBits, u32 N, u32 E); +vector compactBits(const vector &dataVect, u64 E); +vector expandBits(const vector &compactBits, u32 N, 
u64 E); -constexpr u32 step(u32 N, u32 E) { return N - (E % N); } -constexpr u32 extra(u32 N, u32 E, u32 k) { return u64(step(N, E)) * k % N; } -constexpr bool isBigWord(u32 N, u32 E, u32 k) { return extra(N, E, k) + step(N, E) < N; } -constexpr u32 bitlen(u32 N, u32 E, u32 k) { return E / N + isBigWord(N, E, k); } +constexpr u32 step(u32 N, u64 E) { return N - (E % N); } +constexpr u32 extra(u32 N, u64 E, u32 k) { return u64(step(N, E)) * k % N; } +constexpr bool isBigWord(u32 N, u64 E, u32 k) { return extra(N, E, k) + step(N, E) < N; } +constexpr u32 bitlen(u32 N, u64 E, u32 k) { return E / N + isBigWord(N, E, k); } diff --git a/src/tune.cpp b/src/tune.cpp index 3b773437..a3a12321 100644 --- a/src/tune.cpp +++ b/src/tune.cpp @@ -173,7 +173,7 @@ printf ("Reguess bpw for %s is %.2f first Z22 is %.2f\n", fft.spec().c_str(), bp } float Tune::zForBpw(float bpw, FFTConfig fft, u32 count) { - u32 exponent = (count == 1) ? primes.prevPrime(fft.size() * bpw) : primes.nextPrime(fft.size() * bpw); + u64 exponent = (count == 1) ? 
primes.prevPrime(fft.size() * bpw) : primes.nextPrime(fft.size() * bpw); float total_z = 0.0; for (u32 i = 0; i < count; i++, exponent = primes.nextPrime (exponent + 1)) { auto [ok, res, roeSq, roeMul] = Gpu::make(q, exponent, shared, fft, {}, false)->measureROE(true); @@ -249,7 +249,7 @@ void Tune::carryTune() { double m = 0; const float mid = fft.shape.carry32BPW(); for (float bpw : {mid - 0.05, mid + 0.05}) { - u32 exponent = primes.nearestPrime(fft.size() * bpw); + u64 exponent = primes.nearestPrime(fft.size() * bpw); auto [ok, carry] = Gpu::make(q, exponent, shared, fft, {}, false)->measureCarry(); m = carry.max; if (!ok) { log("Error %s at %f\n", fft.spec().c_str(), bpw); } @@ -257,7 +257,7 @@ void Tune::carryTune() { } float avg = (zv[0] + zv[1]) / 2; - u32 exponent = fft.shape.carry32BPW() * fft.size(); + u64 exponent = fft.shape.carry32BPW() * fft.size(); double pErr100 = -expm1(-exp(-avg) * exponent * 100); log("%14s %.3f : %.3f (%.3f %.3f) %f %.0f%%\n", fft.spec().c_str(), mid, avg, zv[0], zv[1], m, pErr100 * 100); fo.printf("%f %f\n", log2(fft.size()), avg); @@ -292,8 +292,8 @@ void Tune::ctune() { for (FFTShape shape : shapes) { FFTConfig fft{shape, 101, CARRY_32}; - u32 exponent = primes.prevPrime(fft.maxExp()); - // log("tuning %10s with exponent %u\n", fft.shape.spec().c_str(), exponent); + u64 exponent = primes.prevPrime(fft.maxExp()); + // log("tuning %10s with exponent %" PRIu64 "\n", fft.shape.spec().c_str(), exponent); vector bestPos(configsVect.size()); Entry best{{1, 1, 1}, {}, 1e9}; @@ -448,7 +448,7 @@ void Tune::tune() { // Find best IN_WG,IN_SIZEX,OUT_WG,OUT_SIZEX settings if (1/*option to time IN/OUT settings*/) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_in_wg = 0; u32 best_in_sizex = 0; u32 current_in_wg = args->value("IN_WG", 128); @@ -497,7 +497,7 @@ void Tune::tune() { // Find best PAD setting. 
Default is 256 bytes for AMD, 0 for all others. if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_pad = 0; u32 current_pad = args->value("PAD", AMDGPU ? 256 : 0); double best_cost = -1.0; @@ -517,7 +517,7 @@ void Tune::tune() { // Find best MIDDLE_IN_LDS_TRANSPOSE setting if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_middle_in_lds_transpose = 0; u32 current_middle_in_lds_transpose = args->value("MIDDLE_IN_LDS_TRANSPOSE", 1); double best_cost = -1.0; @@ -537,7 +537,7 @@ void Tune::tune() { // Find best MIDDLE_OUT_LDS_TRANSPOSE setting if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_middle_out_lds_transpose = 0; u32 current_middle_out_lds_transpose = args->value("MIDDLE_OUT_LDS_TRANSPOSE", 1); double best_cost = -1.0; @@ -557,7 +557,7 @@ void Tune::tune() { // Find best INPLACE setting if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_inplace = 0; double best_cost = -1.0; double current_cost = -1.0; @@ -576,7 +576,7 @@ void Tune::tune() { // Find best NONTEMPORAL setting if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_nontemporal = 0; u32 current_nontemporal = args->value("NONTEMPORAL", 0); double best_cost = -1.0; @@ -596,7 +596,7 @@ void Tune::tune() { // Find best FAST_BARRIER setting if (AMDGPU) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_fast_barrier = 0; u32 
current_fast_barrier = args->value("FAST_BARRIER", 0); double best_cost = -1.0; @@ -616,7 +616,7 @@ void Tune::tune() { // Find best TAIL_KERNELS setting if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_tail_kernels = 0; u32 current_tail_kernels = args->value("TAIL_KERNELS", 2); double best_cost = -1.0; @@ -639,7 +639,7 @@ void Tune::tune() { // Find best TAIL_TRIGS setting if (time_FFTs) { FFTConfig fft{defaultFFTShape, 101, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_tail_trigs = 0; u32 current_tail_trigs = args->value("TAIL_TRIGS", 2); double best_cost = -1.0; @@ -660,7 +660,7 @@ void Tune::tune() { if (time_NTTs) { FFTConfig fft{defaultNTTShape, 202, CARRY_AUTO}; if (!fft.NTT_GF31) fft = FFTConfig(FFTShape(FFT3161, 512, 8, 512), 202, CARRY_AUTO); - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_tail_trigs = 0; u32 current_tail_trigs = args->value("TAIL_TRIGS31", 0); double best_cost = -1.0; @@ -681,7 +681,7 @@ void Tune::tune() { if (time_NTTs && time_FP32) { FFTConfig fft{defaultNTTShape, 202, CARRY_AUTO}; if (!fft.FFT_FP32) fft = FFTConfig(FFTShape(FFT3261, 512, 8, 512), 202, CARRY_AUTO); - u32 exponent = primes.prevPrime(fft.maxBpw() * 0.95 * fft.shape.size()); // Back off the maxExp as different settings will have different maxBpw + u64 exponent = primes.prevPrime(fft.maxBpw() * 0.95 * fft.shape.size()); // Back off the maxExp as different settings will have different maxBpw u32 best_tail_trigs = 0; u32 current_tail_trigs = args->value("TAIL_TRIGS32", 2); double best_cost = -1.0; @@ -702,7 +702,7 @@ void Tune::tune() { if (time_NTTs) { FFTConfig fft{defaultNTTShape, 202, CARRY_AUTO}; if (!fft.NTT_GF61) fft = FFTConfig(FFTShape(FFT3161, 512, 8, 512), 202, CARRY_AUTO); - u32 exponent = 
primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_tail_trigs = 0; u32 current_tail_trigs = args->value("TAIL_TRIGS61", 0); double best_cost = -1.0; @@ -722,7 +722,7 @@ void Tune::tune() { // Find best TABMUL_CHAIN setting if (time_FFTs) { FFTConfig fft{defaultFFTShape, 101, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_tabmul_chain = 0; u32 current_tabmul_chain = args->value("TABMUL_CHAIN", 0); double best_cost = -1.0; @@ -743,7 +743,7 @@ void Tune::tune() { if (time_NTTs) { FFTConfig fft{defaultNTTShape, 202, CARRY_AUTO}; if (!fft.NTT_GF31) fft = FFTConfig(FFTShape(FFT3161, 512, 8, 512), 202, CARRY_AUTO); - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_tabmul_chain = 0; u32 current_tabmul_chain = args->value("TABMUL_CHAIN31", 0); double best_cost = -1.0; @@ -764,7 +764,7 @@ void Tune::tune() { if (time_NTTs && time_FP32) { FFTConfig fft{defaultNTTShape, 202, CARRY_AUTO}; if (!fft.FFT_FP32) fft = FFTConfig(FFTShape(FFT3261, 512, 8, 512), 202, CARRY_AUTO); - u32 exponent = primes.prevPrime(fft.maxBpw() * 0.95 * fft.shape.size()); // Back off the maxExp as different settings will have different maxBpw + u64 exponent = primes.prevPrime(fft.maxBpw() * 0.95 * fft.shape.size()); // Back off the maxExp as different settings will have different maxBpw u32 best_tabmul_chain = 0; u32 current_tabmul_chain = args->value("TABMUL_CHAIN32", 0); double best_cost = -1.0; @@ -785,7 +785,7 @@ void Tune::tune() { if (time_NTTs) { FFTConfig fft{defaultNTTShape, 202, CARRY_AUTO}; if (!fft.NTT_GF61) fft = FFTConfig(FFTShape(FFT3161, 512, 8, 512), 202, CARRY_AUTO); - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_tabmul_chain = 0; u32 current_tabmul_chain = args->value("TABMUL_CHAIN61", 0); double best_cost = -1.0; @@ -806,7 +806,7 @@ void Tune::tune() { if 
(time_NTTs) { FFTConfig fft{defaultNTTShape, 202, CARRY_AUTO}; if (!fft.NTT_GF31) fft = FFTConfig(FFTShape(FFT3161, 512, 8, 512), 202, CARRY_AUTO); - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_modm31 = 0; u32 current_modm31 = args->value("MODM31", 0); double best_cost = -1.0; @@ -826,7 +826,7 @@ void Tune::tune() { // Find best UNROLL_W setting if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_unroll_w = 0; u32 current_unroll_w = args->value("UNROLL_W", AMDGPU ? 0 : 1); double best_cost = -1.0; @@ -846,7 +846,7 @@ void Tune::tune() { // Find best UNROLL_H setting if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_unroll_h = 0; u32 current_unroll_h = args->value("UNROLL_H", AMDGPU && defaultShape->height >= 1024 ? 
0 : 1); double best_cost = -1.0; @@ -866,7 +866,7 @@ void Tune::tune() { // Find best ZEROHACK_W setting if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_zerohack_w = 0; u32 current_zerohack_w = args->value("ZEROHACK_W", 1); double best_cost = -1.0; @@ -886,7 +886,7 @@ void Tune::tune() { // Find best ZEROHACK_H setting if (1) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_zerohack_h = 0; u32 current_zerohack_h = args->value("ZEROHACK_H", 1); double best_cost = -1.0; @@ -906,7 +906,7 @@ void Tune::tune() { // Find best BIGLIT setting if (time_FFTs) { FFTConfig fft{*defaultShape, 101, CARRY_AUTO}; - u32 exponent = primes.prevPrime(fft.maxExp()); + u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_biglit = 0; u32 current_biglit = args->value("BIGLIT", 1); double best_cost = -1.0; @@ -987,7 +987,7 @@ skip_1K_256 = 0; if ((shape.fft_type == FFT3261 || shape.fft_type == FFT323161 || shape.fft_type == FFT3231 || shape.fft_type == FFT32) && !time_FP32) continue; // Time an exponent that's good for all variants and carry-config. - u32 exponent = primes.prevPrime(FFTConfig{shape, shape.width <= 1024 ? 0u : 100u, CARRY_32}.maxExp()); + u64 exponent = primes.prevPrime(FFTConfig{shape, shape.width <= 1024 ? 0u : 100u, CARRY_32}.maxExp()); u32 adjusted_quick = (exponent < 50000000) ? quick - 1 : (exponent < 170000000) ? quick : (exponent < 350000000) ? 
quick + 1 : quick + 2; if (adjusted_quick < 1) adjusted_quick = 1; if (adjusted_quick > 10) adjusted_quick = 10; From 205754df4610dfccef54f314dee11fe1fde413e5 Mon Sep 17 00:00:00 2001 From: george Date: Fri, 26 Dec 2025 03:58:25 +0000 Subject: [PATCH 2/9] Merged in previous inplace changes to tune.cpp --- src/tune.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/tune.cpp b/src/tune.cpp index a3a12321..4d613d0f 100644 --- a/src/tune.cpp +++ b/src/tune.cpp @@ -348,6 +348,7 @@ void Tune::tune() { bool time_FFTs = 0; bool time_NTTs = 0; bool time_FP32 = 1; + bool time_inplace_only = 0; int quick = 7; // Run config from slowest (quick=1) to fastest (quick=10) u64 min_exponent = 75000000; u64 max_exponent = 350000000; @@ -360,6 +361,7 @@ void Tune::tune() { if (s == "fp64") time_FFTs = 1; if (s == "ntt") time_NTTs = 1; if (s == "nofp32") time_FP32 = 0; + if (s == "inplace") time_inplace_only = 1; auto keyVal = split(s, '='); if (keyVal.size() == 2) { if (keyVal.front() == "quick") quick = stod(keyVal.back()); @@ -446,7 +448,7 @@ void Tune::tune() { args->flags["INPLACE"] = to_string(0); // Find best IN_WG,IN_SIZEX,OUT_WG,OUT_SIZEX settings - if (1/*option to time IN/OUT settings*/) { + if (!time_inplace_only) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_in_wg = 0; @@ -495,7 +497,7 @@ void Tune::tune() { } // Find best PAD setting. Default is 256 bytes for AMD, 0 for all others. 
- if (1) { + if (!time_inplace_only) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_pad = 0; @@ -515,7 +517,7 @@ void Tune::tune() { } // Find best MIDDLE_IN_LDS_TRANSPOSE setting - if (1) { + if (!time_inplace_only) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_middle_in_lds_transpose = 0; @@ -535,7 +537,7 @@ void Tune::tune() { } // Find best MIDDLE_OUT_LDS_TRANSPOSE setting - if (1) { + if (!time_inplace_only) { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_middle_out_lds_transpose = 0; @@ -554,8 +556,13 @@ void Tune::tune() { args->flags["MIDDLE_OUT_LDS_TRANSPOSE"] = to_string(best_middle_out_lds_transpose); } + // If only timing INPLACE=1 options, then set INPLACE + if (time_inplace_only) { + args->flags["INPLACE"] = to_string(1); + newConfigKeyVals.push_back({"INPLACE", 1}); + } // Find best INPLACE setting - if (1) { + else { FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_inplace = 0; @@ -905,7 +912,7 @@ void Tune::tune() { // Find best BIGLIT setting if (time_FFTs) { - FFTConfig fft{*defaultShape, 101, CARRY_AUTO}; + FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_biglit = 0; u32 current_biglit = args->value("BIGLIT", 1); From a3719b01449edf18a961cc5b44acbced44deca15 Mon Sep 17 00:00:00 2001 From: george Date: Fri, 26 Dec 2025 04:10:47 +0000 Subject: [PATCH 3/9] Fixed carry propagation bug when BPW was very low (lower than one would see in normal usage). The optimization that generated 32-bit FFT data words requiring long carries was modified. 
--- src/cl/carryutil.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cl/carryutil.cl b/src/cl/carryutil.cl index 6cdff372..4e04812b 100644 --- a/src/cl/carryutil.cl +++ b/src/cl/carryutil.cl @@ -681,7 +681,7 @@ Word OVERLOAD carryStepSignedSloppy(i96 x, i64 *outCarry, bool isBigWord) { // i64 xhi = i96_hi64(x) + xmid_topbit; // *outCarry = xhi >> (nBits - 32); // return as_long((int2)(i96_lo32(x), whi)); -#elif EXP / NWORDS == 31 || SLOPPY_MAXBPW >= 3200 // nBits = 31 or 32, bigwordBits = 32 (or allowed to create 32-bit word for better performance) +#elif EXP / NWORDS == 31 || (SLOPPY_MAXBPW >= 3200 && EXP / NWORDS >= 22) // nBits = 31 or 32, bigwordBits = 32 (or allowed to create 32-bit word for better performance) i32 w = i96_lo32(x); // lowBits(x, bigwordBits = 32); *outCarry = (i96_hi64(x) + (w < 0)) << (32 - nBits); return w; From eceaad6173232be7af9980f8cd961ad9792ea9ac Mon Sep 17 00:00:00 2001 From: george Date: Tue, 30 Dec 2025 01:34:09 +0000 Subject: [PATCH 4/9] Output num squarings a cert will perform as well as an ETA. --- src/Gpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Gpu.cpp b/src/Gpu.cpp index 433ce0a7..e9d20522 100644 --- a/src/Gpu.cpp +++ b/src/Gpu.cpp @@ -2086,7 +2086,7 @@ array Gpu::isCERT(const Task& task) { float secsPerIt = iterationTimer.reset(k); queue->setSquareTime((int) (secsPerIt * 1'000'000)); - log("%9u %016" PRIx64 " %4.0f\n", k, res64, secsPerIt * 1'000'000); + log("%7u / %7u %016" PRIx64 " %4.0f ETA %s\n", k, kEnd, res64, secsPerIt * 1'000'000, getETA(k, kEnd, secsPerIt).c_str()); if (k >= kEnd) { fs::remove (fname); From 5cfbe0b3d0adb42ede81fdea15a3d17a3c59fceb Mon Sep 17 00:00:00 2001 From: george Date: Tue, 30 Dec 2025 01:43:17 +0000 Subject: [PATCH 5/9] Output error message when worktodo-N.txt is empty. Helpful for novice users. 
--- src/Worktodo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Worktodo.cpp b/src/Worktodo.cpp index 80afdc51..1628a391 100644 --- a/src/Worktodo.cpp +++ b/src/Worktodo.cpp @@ -114,7 +114,7 @@ optional getWork(Args& args, i32 instance) { // Try to get a task from the local worktodo- file. if (optional task = bestTask(localWork, args.smallest)) { return task; } - if (args.masterDir.empty()) { return {}; } + if (args.masterDir.empty()) { log("No work to do found. Add work to %s.\n", localWork.c_str()); return {}; } fs::path worktodo = args.masterDir / "worktodo.txt"; From 3a7a76ddb33a3c69999065b2274fe2d0e4e7788b Mon Sep 17 00:00:00 2001 From: Teal Dulcet Date: Sun, 14 Dec 2025 03:29:45 -0800 Subject: [PATCH 6/9] Fixed GitHub Actions CI and replaced macOS 13. --- .github/workflows/ci.yml | 47 +++++++++++++++++++++++++--------------- Makefile | 8 +++---- genbundle.sh | 17 ++++++++------- src/Args.cpp | 2 +- src/common.h | 2 +- src/tune.cpp | 2 +- 6 files changed, 45 insertions(+), 33 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3947c982..6d43b725 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,13 +13,16 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-22.04, ubuntu-24.04] + os: [ubuntu-22.04, ubuntu-24.04, ubuntu-22.04-arm, ubuntu-24.04-arm] cxx: [g++, clang++] + exclude: + - os: ubuntu-22.04-arm + cxx: clang++ fail-fast: false env: CXX: ${{ matrix.cxx }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Install run: | sudo apt-get update -y @@ -27,14 +30,14 @@ jobs: $CXX --version - name: Script run: | - make prpll -O -j "$(nproc)" + make -O -j "$(nproc)" cd build-release rm -f -- *.o ./prpll -h - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v7 if: always() with: - name: ${{ matrix.os }}_${{ matrix.cxx }}_prpll + name: ${{ matrix.os }}_${{ endsWith(matrix.os, '-arm') && 'arm' || 'x86' }}_${{ matrix.cxx 
}}_prpll path: ${{ github.workspace }} - name: Cppcheck run: cppcheck --enable=all --force . @@ -49,15 +52,17 @@ jobs: Windows: name: Windows - runs-on: windows-latest + runs-on: ${{ matrix.os }} strategy: matrix: + os: [windows-latest] # windows-11-arm cxx: [g++, clang++] fail-fast: false env: CXX: ${{ matrix.cxx }} + PACKAGE_PREFIX: mingw-w64-${{ endsWith(matrix.os, '-arm') && 'clang-aarch64' || 'x86_64' }}- steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Before Install run: | echo "C:\msys64\mingw64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append @@ -66,7 +71,7 @@ jobs: echo "LIBPATH=-LC:\msys64\mingw64\lib" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append - name: Install run: | - pacman -S --noconfirm mingw-w64-x86_64-gmp mingw-w64-x86_64-opencl-icd + pacman -S --noconfirm "${env:PACKAGE_PREFIX}opencl-icd" & $env:CXX --version - name: Install Clang if: ${{ matrix.cxx == 'clang++' }} @@ -74,34 +79,40 @@ jobs: pacman -S --noconfirm mingw-w64-x86_64-clang & $env:CXX --version - name: Script - run: | # Cannot use `make exe`, as the OpenCL ICD Loader does not support static linking - make prpll -O -j $env:NUMBER_OF_PROCESSORS + run: | + make -O -j $env:NUMBER_OF_PROCESSORS cd build-release rm *.o .\prpll.exe -h - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v7 if: always() with: - name: win_${{ matrix.cxx }}_prpll + name: win_${{ endsWith(matrix.os, '-arm') && 'arm' || 'x86' }}_${{ matrix.cxx }}_prpll path: ${{ github.workspace }} macOS: name: macOS - runs-on: macos-13 + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [macos-15-intel, macos-latest] + fail-fast: false + env: + CXX: g++-15 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Install run: | - brew install gcc@14 + $CXX --version - name: Script run: | - make prpll -j "$(sysctl -n hw.ncpu)" + make -j "$(sysctl -n hw.ncpu)" cd build-release rm -f -- *.o ./prpll -h - - uses: 
actions/upload-artifact@v4 + - uses: actions/upload-artifact@v7 if: always() with: - name: macos_prpll + name: macos_${{ endsWith(matrix.os, '-intel') && 'x86' || 'arm' }}_prpll path: ${{ github.workspace }} diff --git a/Makefile b/Makefile index 26bffeda..b1c021e0 100644 --- a/Makefile +++ b/Makefile @@ -14,9 +14,9 @@ HOST_OS = $(shell uname -s) ifeq ($(HOST_OS), Darwin) # Real GCC (not clang), needed for 128-bit floats and std::filesystem::path -CXX = g++-14 +CXX ?= g++-15 else -CXX = g++ +CXX ?= g++ endif ifneq ($(findstring MINGW, $(HOST_OS)), MINGW) @@ -45,7 +45,7 @@ else BIN=build-release -CXXFLAGS = -O2 -DNDEBUG $(COMMON_FLAGS) +CXXFLAGS = -O3 -DNDEBUG $(COMMON_FLAGS) STRIP=-s endif @@ -90,7 +90,7 @@ $(BIN)/%.o : src/%.cpp $(DEPDIR)/%.d # src/bundle.cpp is just a wrapping of the OpenCL sources (*.cl) as a C string. src/bundle.cpp: genbundle.sh src/cl/*.cl - ./genbundle.sh $^ > src/bundle.cpp + bash genbundle.sh $^ > src/bundle.cpp $(DEPDIR)/%.d: ; .PRECIOUS: $(DEPDIR)/%.d diff --git a/genbundle.sh b/genbundle.sh index ec042bb2..9d176062 100755 --- a/genbundle.sh +++ b/genbundle.sh @@ -1,3 +1,4 @@ +#!/bin/bash cat < CL_FILE_NAMES\{${names}\}\; +echo "static const std::vector CL_FILE_NAMES{${names}};" cat <& getClFileNames() { return CL_FILE_NAMES; } diff --git a/src/Args.cpp b/src/Args.cpp index 041202ad..b18a95ba 100644 --- a/src/Args.cpp +++ b/src/Args.cpp @@ -118,7 +118,7 @@ and should be able to run. PRPLL keeps the active tasks in per-worker files worktodo-0.txt, worktodo-1.txt etc in the local directory. These per-worker files are supplied from the global worktodo.txt file if -pool is used. 
In turn the global worktodo.txt can be supplied through the primenet.py script, -either the one located at gpuowl/tools/primenet.py or https://download.mersenne.ca/primenet.py +either the one located at gpuowl/tools/primenet.py or https://download.mersenne.ca/AutoPrimeNet It is also possible to manually add exponents by adding lines of the form "PRP=118063003" to worktodo-.txt diff --git a/src/common.h b/src/common.h index 516b099d..6303e539 100644 --- a/src/common.h +++ b/src/common.h @@ -13,7 +13,7 @@ using i64 = int64_t; using u64 = uint64_t; using i128 = __int128; using u128 = unsigned __int128; -using f128 = __float128; +// using f128 = __float128; static_assert(sizeof(u8) == 1, "size u8"); static_assert(sizeof(u32) == 4, "size u32"); diff --git a/src/tune.cpp b/src/tune.cpp index 3b773437..b9cf1f71 100644 --- a/src/tune.cpp +++ b/src/tune.cpp @@ -947,7 +947,7 @@ void Tune::tune() { config.write("\n -log 1000000\n"); } if (args->workers < 2) { - config.write("\n# Running two workers sometimes gives better throughput. Autoprimenet will need to create up a second worktodo file."); + config.write("\n# Running two workers sometimes gives better throughput. AutoPrimeNet will need to create up a second worktodo file (use --num-workers 2)."); config.write("\n# -workers 2\n"); config.write("\n# Changing TAIL_KERNELS to 3 when running two workers may be better."); config.write("\n# -use TAIL_KERNELS=3\n"); From 89be60a188253b74593e4f36cb14b189d08a7c3c Mon Sep 17 00:00:00 2001 From: george Date: Wed, 4 Mar 2026 19:09:21 +0000 Subject: [PATCH 7/9] Changed type 3 4M FFT max bpw. LL test of 100028317 failed under old limit. 
--- src/fftbpw.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fftbpw.h b/src/fftbpw.h index c62db530..f3af7da7 100644 --- a/src/fftbpw.h +++ b/src/fftbpw.h @@ -138,9 +138,9 @@ {"3:256:16:256", {24.15, 24.15, 24.15, 24.15, 24.15, 24.15}}, { "3:512:8:256", {24.15, 24.15, 24.15, 24.15, 24.15, 24.15}}, { "3:512:4:512", {24.15, 24.15, 24.15, 24.15, 24.15, 24.15}}, -{ "3:1K:8:256", {23.94, 23.94, 23.94, 23.94, 23.94, 23.94}}, -{"3:512:16:256", {23.94, 23.94, 23.94, 23.94, 23.94, 23.94}}, -{ "3:512:8:512", {23.94, 23.94, 23.94, 23.94, 23.94, 23.94}}, +{ "3:1K:8:256", {23.84, 23.84, 23.84, 23.84, 23.84, 23.84}}, // LL of 100028317 failed (ROEmax=0.294, ROEavg=0.247). Lowering bpw from 23.94 to 23.84. +{"3:512:16:256", {23.84, 23.84, 23.84, 23.84, 23.84, 23.84}}, +{ "3:512:8:512", {23.84, 23.84, 23.84, 23.84, 23.84, 23.84}}, { "3:1K:16:256", {23.65, 23.65, 23.65, 23.65, 23.65, 23.65}}, { "3:1K:8:512", {23.65, 23.65, 23.65, 23.65, 23.65, 23.65}}, {"3:512:16:512", {23.65, 23.65, 23.65, 23.65, 23.65, 23.65}}, From 6b39a5f660ba21f103a0dc3f4bb36867f6301af8 Mon Sep 17 00:00:00 2001 From: george Date: Wed, 11 Mar 2026 22:42:11 +0000 Subject: [PATCH 8/9] CarryFused can now process multiple lines at the same time -- 1% performance increase on TitanV. Standardized LDS memory layout and bar() strategy. Made a cleaner, common shufl routine to handle multiple lines using new constants SHUFL_BYTES_W and SHUFL_BYTES_H. Reverse line routines overhauled to use LDS memory layout and bar() strategy. Added L2STORE and LULOAD routines for nVidia. Need to study which GPUs might benefit. Deprecated BIGLIT=0. 
--- src/Gpu.cpp | 52 +- src/Gpu.h | 2 +- src/cl/base.cl | 63 +- src/cl/carryfused.cl | 1349 +++++++++++++++++++++--------------------- src/cl/fftbase.cl | 412 +++++-------- src/cl/fftheight.cl | 214 ++----- src/cl/ffthin.cl | 25 +- src/cl/fftp.cl | 90 +-- src/cl/fftw.cl | 28 +- src/cl/fftwidth.cl | 171 +++--- src/cl/math.cl | 6 +- src/cl/middle.cl | 62 +- src/cl/tailmul.cl | 104 ++-- src/cl/tailsquare.cl | 222 +++---- src/cl/tailutil.cl | 639 ++++++++------------ src/tune.cpp | 4 +- 16 files changed, 1513 insertions(+), 1930 deletions(-) diff --git a/src/Gpu.cpp b/src/Gpu.cpp index e9d20522..84cc4434 100644 --- a/src/Gpu.cpp +++ b/src/Gpu.cpp @@ -85,7 +85,6 @@ Weights genWeights(FFTConfig fft, u64 E, u32 W, u32 H, u32 nW, bool AmdGpu) { vector weightsConstIF; vector weightsIF; - vector bits; if (fft.FFT_FP64) { // Inverse + Forward @@ -141,24 +140,7 @@ Weights genWeights(FFTConfig fft, u64 E, u32 W, u32 H, u32 nW, bool AmdGpu) { memcpy((double *) weightsIF.data(), weightsIF32.data(), weightsIF32.size() * sizeof(float)); } - if (fft.FFT_FP64 || fft.FFT_FP32) { - for (u32 line = 0; line < H; ++line) { - for (u32 thread = 0; thread < groupWidth; ) { - std::bitset<32> b; - for (u32 bitoffset = 0; bitoffset < 32; bitoffset += nW*2, ++thread) { - for (u32 block = 0; block < nW; ++block) { - for (u32 rep = 0; rep < 2; ++rep) { - if (isBigWord(N, E, kAt(H, line, block * groupWidth + thread) + rep)) { b.set(bitoffset + block * 2 + rep); } - } - } - } - bits.push_back(b.to_ulong()); - } - } - assert(bits.size() == N / 32); - } - - return Weights{weightsConstIF, weightsIF, bits}; + return Weights{weightsConstIF, weightsIF}; } string toLiteral(i32 value) { return to_string(value); } @@ -228,7 +210,7 @@ constexpr bool isInList(const string& s, initializer_list list) { } string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector& extraConf, u64 E, bool doLog, - bool &tail_single_wide, bool &tail_single_kernel, u32 &in_place, u32 &pad_size) { + bool 
&tail_single_wide, bool &tail_single_kernel, u32 &in_place, u32 &pad_size, u32 &wmul) { map config; // Highest priority is the requested "extra" conf @@ -246,6 +228,7 @@ string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector< // Default value for -use options that must also be parsed in C++ code tail_single_wide = 0, tail_single_kernel = 1; // Default tailSquare is double-wide in one kernel in_place = 0; // Default is not in-place + wmul = 1; // Default is carryFused processes one workgroup at a time pad_size = isAmdGpu(id) ? 256 : 0; // Default is 256 bytes for AMD, 0 for others // Validate -use options @@ -264,7 +247,7 @@ string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector< "NO_ASM", "DEBUG", "CARRY64", - "BIGLIT", + "BIGLIT", // Deprecated "NONTEMPORAL", "INPLACE", "PAD", @@ -279,7 +262,8 @@ string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector< "TABMUL_CHAIN31", "TABMUL_CHAIN32", "TABMUL_CHAIN61", - "MODM31" + "MODM31", + "WMUL" }); if (!isValid) { log("Warning: unrecognized -use key '%s'\n", k.c_str()); @@ -293,6 +277,7 @@ string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector< if (atoi(v.c_str()) == 3) tail_single_wide = 0, tail_single_kernel = 0; } if (k == "INPLACE") in_place = atoi(v.c_str()); + if (k == "WMUL") wmul = atoi(v.c_str()); if (k == "PAD") pad_size = atoi(v.c_str()); } @@ -532,7 +517,7 @@ Gpu::Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u64 E, const vector& nW(fft.shape.nW()), nH(fft.shape.nH()), useLongCarry{args.carry == Args::CARRY_LONG}, - compiler{args, queue->context, clDefines(args, queue->context->deviceId(), fft, extraConf, E, logFftSize, tail_single_wide, tail_single_kernel, in_place, pad_size)}, + compiler{args, queue->context, clDefines(args, queue->context->deviceId(), fft, extraConf, E, logFftSize, tail_single_wide, tail_single_kernel, in_place, pad_size, wmul)}, #define K(name, ...) 
name(#name, &compiler, profile.make(#name), queue, __VA_ARGS__) @@ -581,11 +566,11 @@ Gpu::Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u64 E, const vector& K(kCarryM, "carry.cl", "carry", hN / CARRY_LEN, "-DMUL3=1"), K(kCarryMROE, "carry.cl", "carry", hN / CARRY_LEN, "-DMUL3=1 -DROE=1"), K(kCarryLL, "carry.cl", "carry", hN / CARRY_LEN, "-DLL=1"), - K(kCarryFused, "carryfused.cl", "carryFused", WIDTH * (BIG_H + 1) / nW), - K(kCarryFusedROE, "carryfused.cl", "carryFused", WIDTH * (BIG_H + 1) / nW, "-DROE=1"), - K(kCarryFusedMul, "carryfused.cl", "carryFused", WIDTH * (BIG_H + 1) / nW, "-DMUL3=1"), - K(kCarryFusedMulROE, "carryfused.cl", "carryFused", WIDTH * (BIG_H + 1) / nW, "-DMUL3=1 -DROE=1"), - K(kCarryFusedLL, "carryfused.cl", "carryFused", WIDTH * (BIG_H + 1) / nW, "-DLL=1"), + K(kCarryFused, "carryfused.cl", "carryFused", WIDTH * (BIG_H + wmul) / nW), + K(kCarryFusedROE, "carryfused.cl", "carryFused", WIDTH * (BIG_H + wmul) / nW, "-DROE=1"), + K(kCarryFusedMul, "carryfused.cl", "carryFused", WIDTH * (BIG_H + wmul) / nW, "-DMUL3=1"), + K(kCarryFusedMulROE, "carryfused.cl", "carryFused", WIDTH * (BIG_H + wmul) / nW, "-DMUL3=1 -DROE=1"), + K(kCarryFusedLL, "carryfused.cl", "carryFused", WIDTH * (BIG_H + wmul) / nW, "-DLL=1"), K(carryB, "carryb.cl", "carryB", hN / CARRY_LEN), @@ -615,7 +600,6 @@ Gpu::Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u64 E, const vector& weights{genWeights(fft, E, WIDTH, BIG_H, nW, isAmdGpu(q->context->deviceId()))}, bufConstWeights{q->context, std::move(weights.weightsConstIF)}, bufWeights{q->context, std::move(weights.weightsIF)}, - bufBits{q->context, std::move(weights.bitsCF)}, #define BUF(name, ...) 
name{profile.make(#name), queue, __VA_ARGS__} @@ -695,16 +679,16 @@ Gpu::Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u64 E, const vector& kfftWGF61.setFixedArgs(2, bufTrigW); } - if (fft.FFT_FP64 || fft.FFT_FP32) { // The FP versions take bufWeight arguments (and bufBits which may be deleted) + if (fft.FFT_FP64 || fft.FFT_FP32) { // The FP versions take bufWeight arguments kfftP.setFixedArgs(2, bufTrigW, bufWeights); for (Kernel* k : {&kCarryA, &kCarryAROE, &kCarryM, &kCarryMROE, &kCarryLL}) { k->setFixedArgs(3, bufCarry, bufWeights); } for (Kernel* k : {&kCarryA, &kCarryM, &kCarryLL}) { k->setFixedArgs(5, bufStatsCarry); } for (Kernel* k : {&kCarryAROE, &kCarryMROE}) { k->setFixedArgs(5, bufROE); } for (Kernel* k : {&kCarryFused, &kCarryFusedROE, &kCarryFusedMul, &kCarryFusedMulROE, &kCarryFusedLL}) { - k->setFixedArgs(3, bufCarry, bufReady, bufTrigW, bufBits, bufConstWeights, bufWeights); + k->setFixedArgs(3, bufCarry, bufReady, bufTrigW, bufConstWeights, bufWeights); } - for (Kernel* k : {&kCarryFusedROE, &kCarryFusedMulROE}) { k->setFixedArgs(9, bufROE); } - for (Kernel* k : {&kCarryFused, &kCarryFusedMul, &kCarryFusedLL}) { k->setFixedArgs(9, bufStatsCarry); } + for (Kernel* k : {&kCarryFusedROE, &kCarryFusedMulROE}) { k->setFixedArgs(8, bufROE); } + for (Kernel* k : {&kCarryFused, &kCarryFusedMul, &kCarryFusedLL}) { k->setFixedArgs(8, bufStatsCarry); } } else { kfftP.setFixedArgs(2, bufTrigW); for (Kernel* k : {&kCarryA, &kCarryAROE, &kCarryM, &kCarryMROE, &kCarryLL}) { k->setFixedArgs(3, bufCarry); } @@ -2042,7 +2026,7 @@ array Gpu::isCERT(const Task& task) { char fname[32]; sprintf(fname, "M%" PRIu64 ".cert", E); -// Autoprimenet.py does not add the cert entry to worktodo.txt until it has successfully downloaded the .cert file. +// AutoPrimenet.py does not add the cert entry to worktodo.txt until it has successfully downloaded the .cert file. { // Enclosing this code in braces ensures the file will be closed by the File destructor. 
The later file deletion requires the file be closed in Windows. File fi = File::openReadThrow(fname); diff --git a/src/Gpu.h b/src/Gpu.h index ad859eac..b184c4d6 100644 --- a/src/Gpu.h +++ b/src/Gpu.h @@ -172,6 +172,7 @@ class Gpu { bool tail_single_wide; // TailSquare processes one line at a time bool tail_single_kernel; // TailSquare does not use a separate kernel for line zero u32 in_place; // Should GPU perform transform in-place. 1 = nVidia friendly memory layout, 2 = AMD friendly. + u32 wmul; // Number of workgroups carryFused kernel should process ("width multiplier"). u32 pad_size; // Pad size in bytes as specified on the command line or config.txt. Maximum value is 512. // Twiddles: trigonometry constant buffers, used in FFTs. @@ -185,7 +186,6 @@ class Gpu { Weights weights; Buffer bufConstWeights; Buffer bufWeights; - Buffer bufBits; // bigWord bits aligned for CarryFused/fftP // "integer word" buffers. These are "small buffers": N x int. Buffer bufData; // Main int buffer with the words. diff --git a/src/cl/base.cl b/src/cl/base.cl index df1ef02b..f252cfe1 100644 --- a/src/cl/base.cl +++ b/src/cl/base.cl @@ -58,6 +58,13 @@ G_H "group height" == SMALL_HEIGHT / NH //__builtin_assume(condition) #endif // DEBUG +#ifndef AMDGPU +#define AMDGPU 0 +#endif +#ifndef NVIDIAGPU +#define NVIDIAGPU 0 +#endif + #if NO_ASM #define HAS_ASM 0 #define HAS_PTX 0 @@ -128,8 +135,14 @@ G_H "group height" == SMALL_HEIGHT / NH #endif #endif -#if !defined(BIGLIT) -#define BIGLIT 1 +// Shufl width in bytes (can be 4, 8, or 16). See fftbase.cl. Allow different shufl widths for fft_width and fft_height. +// Default is 8 bytes (one double). Historically best for Radeon VII and TitanV. This setting will affect how much LDS +// memory is needed which in turn may affect occupancy and thus performance. 
+#if !defined(SHUFL_BYTES_W) +#define SHUFL_BYTES_W 8 +#endif +#if !defined(SHUFL_BYTES_H) +#define SHUFL_BYTES_H 8 #endif #if !defined(TABMUL_CHAIN) @@ -259,6 +272,42 @@ ulong2 OVERLOAD U2(ulong a, ulong b) { return (ulong2) (a, b); } #define NTSTORE(mem,val) (mem) = val #endif +// Routines for storing to L2 cache bypassing L1 cache. +void OVERLOAD L2STORE(i64 *mem, i64 val) { +#if ENABLE_L2STORE && HAS_PTX >= 200 // Cache hints requires sm_20 support or higher + __asm("st.global.cg.b64 [%0], %1;" : : "l"(mem), "l"(val)); +#else + *mem = val; +#endif +} +void OVERLOAD L2STORE(i32 *mem, i32 val) { +#if ENABLE_L2STORE && HAS_PTX >= 200 // Cache hints requires sm_20 support or higher + __asm("st.global.cg.b32 [%0], %1;" : : "l"(mem), "r"(val)); +#else + *mem = val; +#endif +} + +// Routines for loading a value and marking it for "last use". +i64 OVERLOAD LULOAD(i64 *mem) { +#if ENABLE_LULOAD && HAS_PTX >= 200 // Cache hints requires sm_20 support or higher + i64 retval; + __asm("ld.global.lu.b64 %0, [%1];" : "=l"(retval) : "l"(mem)); + return retval; +#else + return *mem; +#endif +} +i32 OVERLOAD LULOAD(i32 *mem) { +#if ENABLE_LULOAD && HAS_PTX >= 200 // Cache hints requires sm_20 support or higher + i32 retval; + __asm("ld.global.lu.b32 %0, [%1];" : "=r"(retval) : "l"(mem)); + return retval; +#else + return *mem; +#endif +} + // Prefetch macros. Unused at present, I tried using them in fftMiddleInGF61 on a 5080 with no benefit. void PREFETCHL1(const __global void *addr) { #if HAS_PTX >= 200 // Prefetch instruction requires sm_20 support or higher @@ -371,7 +420,15 @@ void OVERLOAD bar(void) { #endif } -void OVERLOAD bar(u32 WG) { if (WG > WAVEFRONT) { bar(); } } +void OVERLOAD bar(const u32 WG) { + if (WG > WAVEFRONT) { +#if ENABLE_BARSYNC && HAS_PTX >= 200 // bar.sync with thread count requires sm_20 support or higher. Slower on TitanV, need to try on later nVidia GPUs. 
+ __asm("bar.sync %0, %1;" : : "r"(get_local_id(0) / WG + 1), "n"(WG)); +#else + bar(); +#endif + } +} // A half-barrier is only needed when half-a-workgroup needs a barrier. // This is used e.g. by the double-wide tailSquare, where LDS is split between the halves. diff --git a/src/cl/carryfused.cl b/src/cl/carryfused.cl index 05e4ca4c..ba48c8bb 100644 --- a/src/cl/carryfused.cl +++ b/src/cl/carryfused.cl @@ -16,18 +16,101 @@ void spin() { #endif } +// Increasing WMUL to 2 will reduce carryShuttle activity. This led to a 1% speedup on Titan V. Testing on other GPUs is needed. +#ifndef WMUL +#define WMUL 1 +#endif + +#if AMDGPU +#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions +//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 +#else +#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better +#endif + +// The last WMUL workgroup's carries have been written to global memory. Now we shuffle WMUL-1 workgroups carries up using local memory. +void OVERLOAD shufl_carries_up(local void *lds2, i64 *carry, u32 me, u32 lowMe) { + // If WMUL is one, there is no shuffling of carries + if (WMUL == 1) return; + + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes used by shufl for each WMUL workgroup + const u32 lds_i64s = lds_bytes / sizeof(i64); // Number of i64s in LDS used by shufl for each WMUL workgroup + local i64 *lds = (local i64 *) lds2; + + // Handle nasty case where we are writing 8-byte quantities but SHUFL_BYTES_W is only 4 bytes + if (SHUFL_BYTES_W == 4) { + if (WMUL == 2) { + // Full barrier needed as we are using the entire LDS buffer. + bar(); + // Write the carries. This will use the entire LDS buffer. 
+ if (me < G_W) for (i32 i = 0; i < NW; ++i) lds[i * G_W + lowMe] = carry[i]; + // Read carries from previous WMUL workgroup + bar(); + if (me >= G_W) for (i32 i = 0; i < NW; ++i) carry[i] = lds[i * G_W + lowMe]; + // Full barrier needed as one workgroup just read data from two workgroups LDS buffer. Not compatible with shufl(). + bar(); + } + + // The really nasty case where all the carries will not fit in LDS memory + else { + lds += (me / G_W) * lds_i64s + lowMe; // This WMUL workgroup's LDS area + // Write half the carries to next WMUL's workgroup LDS area + bar(); + if (me < (WMUL-1) * G_W) for (i32 i = 0; i < NW/2; ++i) lds[lds_i64s + i * G_W] = carry[i]; + // Read carries from our WMUL workgroup LDS area + bar(); + if (me >= G_W) for (i32 i = 0; i < NW/2; ++i) carry[i] = lds[i * G_W]; + // Write the other half of the carries + bar(); + if (me < (WMUL-1) * G_W) for (i32 i = 0; i < NW/2; ++i) lds[lds_i64s + i * G_W] = carry[i + NW/2]; + // Read carries from our WMUL workgroup LDS area. Compatible with shufl, no trailing bar() needed. + bar(); + if (me >= G_W) for (i32 i = 0; i < NW/2; ++i) carry[i + NW/2] = lds[i * G_W]; + } + } + + // Easy case. Write carries to local memory (except last WMUL workgroup which was written to global memory). + else { + lds += (me / G_W) * lds_i64s + lowMe; // This WMUL workgroup's LDS area + // Full barrier needed as we are moving data to next WMUL workgroup's LDS area + bar(); + if (me < (WMUL-1) * G_W) for (i32 i = 0; i < NW; ++i) lds[lds_i64s + i * G_W] = carry[i]; + // Full barrier needed as we just moved data from one WMUL workgroup LDS area to the another WMUL workgroup's LDS area + bar(); + // Read carries from our WMUL workgroup's LDS area. This is compatible with shufl and no trailing bar() is required. + if (me >= G_W) for (i32 i = 0; i < NW; ++i) carry[i] = lds[i * G_W]; + } +} + +// The last WMUL workgroup's carries have been written to global memory. Now we shuffle WMUL-1 workgroup carries up using local memory. 
+void OVERLOAD shufl_carries_up(local void *lds2, i32 *carry, u32 me, u32 lowMe) { + // If WMUL is one, there is no shuffling of carries + if (WMUL == 1) return; + + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes used by shufl for each WMUL workgroup + const u32 lds_i32s = lds_bytes / sizeof(i32); // Number of i32s in LDS used by shufl for each WMUL workgroup + local i32 *lds = (local i32 *) lds2; + lds += (me / G_W) * lds_i32s + lowMe; // This WMUL workgroup's LDS area + + // Write carries to local memory (except last WMUL workgroup which was written to global memory) + // Full barrier needed as we are moving data to next WMUL workgroup's LDS area + bar(); + if (me < (WMUL-1) * G_W) for (i32 i = 0; i < NW; ++i) lds[lds_i32s + i * G_W] = carry[i]; + // Full barrier needed as we just moved data from one WMUL workgroup LDS area to the another WMUL workgroup's LDS area + bar(); + // Read carries from our WMUL workgroup's LDS area. This is compatible with shufl and no trailing bar() is required. + if (me >= G_W) for (i32 i = 0; i < NW; ++i) carry[i] = lds[i * G_W]; +} + + #if FFT_TYPE == FFT64 // The "carryFused" is equivalent to the sequence: fftW, carryA, carryB, fftPremul. 
// It uses "stairway forwarding" (forwarding carry data from one workgroup to the next) -KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, - CP(u32) bits, ConstBigTab CONST_THREAD_WEIGHTS, BigTab THREAD_WEIGHTS, P(uint) bufROE) { - -#if 0 // fft_WIDTH uses shufl_int instead of shufl - local T2 lds[WIDTH / 4]; -#else - local T2 lds[WIDTH / 2]; -#endif +KERNEL(G_W * WMUL) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, + ConstBigTab CONST_THREAD_WEIGHTS, BigTab THREAD_WEIGHTS, P(uint) bufROE) { + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes needed for each WMUL workgroup + local T2 lds[WMUL * lds_bytes / sizeof(T2)]; T2 u[NW]; @@ -35,36 +118,31 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( u32 me = get_local_id(0); u32 H = BIG_HEIGHT; +#if WMUL == 1 + u32 lowMe = me; u32 line = gr % H; +#else + u32 lowMe = me % G_W; // lane-id in one of the WMUL sub-workgroups. + u32 line = (gr * WMUL + me / G_W) % H; +#endif #if HAS_ASM __asm("s_setprio 3"); #endif - readCarryFusedLine(in, u, line); - -// Split 32 bits into NW groups of 2 bits. See later for different way to do this. -#if !BIGLIT -#define GPW (16 / NW) - u32 b = NTLOAD(bits[(G_W * line + me) / GPW]) >> (me % GPW * (2 * NW)); -#undef GPW -#endif + readCarryFusedLine(in, u, line, lowMe); // Try this weird FFT_width call that adds a "hidden zero" when unrolling. This prevents the compiler from finding // common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs // which causes a terrible reduction in occupancy. 
-#if ZEROHACK_W - u32 zerohack = get_group_id(0) / 131072; - new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack); -#else - new_fft_WIDTH1(lds, u, smallTrig); -#endif + u32 zerohack = ZEROHACK_W * (u32) get_group_id(0) / 131072; + new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack, WMUL, SHUFL_BYTES_W, lowMe); Word2 wu[NW]; #if AMDGPU - T2 weights = fancyMul(THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); + T2 weights = fancyMul(THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); #else - T2 weights = fancyMul(CONST_THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights + T2 weights = fancyMul(CONST_THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights #endif #if MUL3 @@ -75,24 +153,13 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( CFcarry carry[NW+1]; #endif -#if AMDGPU -#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions -//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 -#else -#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better -#endif - float roundMax = 0; float carryMax = 0; - // On Titan V it is faster to derive the big vs. little flags from the fractional number of bits in each FFT word rather than read the flags from memory. - // On Radeon VII this code is about the same speed. Not sure which is better on other GPUs. -#if BIGLIT // Calculate the most significant 32-bits of FRAC_BPW * the word index. Also add FRAC_BPW_HI to test first biglit flag. 
- u32 word_index = (me * H + line) * 2; + u32 word_index = (lowMe * H + line) * 2; u32 frac_bits = word_index * FRAC_BPW_HI + mad_hi (word_index, FRAC_BPW_LO, FRAC_BPW_HI); const u32 frac_bits_bigstep = ((G_W * H * 2) * FRAC_BPW_HI + (u32)(((u64)(G_W * H * 2) * FRAC_BPW_LO) >> 32)); -#endif // Apply the inverse weights and carry propagate pairs to generate the output carries @@ -103,13 +170,8 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( T invWeight2 = optionalDouble(fancyMul(invWeight1, IWEIGHT_STEP)); // Generate big-word/little-word flags -#if BIGLIT bool biglit0 = frac_bits + i * frac_bits_bigstep <= FRAC_BPW_HI; bool biglit1 = frac_bits + i * frac_bits_bigstep >= -FRAC_BPW_HI; // Same as frac_bits + i * frac_bits_bigstep + FRAC_BPW_HI <= FRAC_BPW_HI; -#else - bool biglit0 = test(b, 2 * i); - bool biglit1 = test(b, 2 * i + 1); -#endif // Apply the inverse weights, optionally compute roundoff error, and convert to integer. Also apply MUL3 here. // Then propagate carries through two words (the first carry does not have to be accurately calculated because it will @@ -126,28 +188,28 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( updateStats(bufROE, posROE, carryMax); #endif - // Write out our carries. Only groups 0 to H-1 need to write carries out. - // Group H is a duplicate of group 0 (producing the same results) so we don't care about group H writing out, + // Write out our carries for the last line in this group. Only groups 0 to H/WMUL-1 need to write carries out. + // Group H/WMUL is a duplicate of group 0 (producing the same results) so we don't care about that group writing out, // but it's fine either way. 
- if (gr < H) { for (i32 i = 0; i < NW; ++i) { carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(me, i)] = carry[i]; } } + if (gr < H / WMUL && me >= (WMUL-1) * G_W) { + for (i32 i = 0; i < NW; ++i) { L2STORE(&carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(lowMe, i)], carry[i]); } - // Tell next line that its carries are ready - if (gr < H) { + // Tell next group that its carries are ready #if OLD_FENCE // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); write_mem_fence(CLK_GLOBAL_MEM_FENCE); - bar(); - if (me == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } + bar(G_W); + if (lowMe == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } #else write_mem_fence(CLK_GLOBAL_MEM_FENCE); - if (me % WAVEFRONT == 0) { - u32 pos = gr * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (lowMe % WAVEFRONT == 0) { + u32 pos = gr * (G_W / WAVEFRONT) + lowMe / WAVEFRONT; atomic_store((atomic_uint *) &ready[pos], 1); } #endif } - // Line zero will be redone when gr == H + // Group zero will be redone when gr == H / WMUL if (gr == 0) { return; } // Do some work while our carries may not be ready @@ -163,68 +225,66 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( u[i] = U2(weight1, weight2); } + // Shuffle carries up + shufl_carries_up(lds, carry, me, lowMe); + // Wait until our carries are ready + if (me < G_W) { #if OLD_FENCE - if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } - // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); - bar(); - read_mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me == 0) ready[gr - 1] = 0; + if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } + // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); + bar(); + read_mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag 
for next iteration + if (me == 0) ready[gr - 1] = 0; #else - u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; - if (me % WAVEFRONT == 0) { - do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); - } - mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; + u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (me % WAVEFRONT == 0) { + do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); + } + mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; #endif #if HAS_ASM - __asm("s_setprio 1"); + __asm("s_setprio 1"); #endif - // Read from the carryShuttle carries produced by the previous WIDTH row. Rotate carries from the last WIDTH row. - // The new carry layout lets the compiler generate global_load_dwordx4 instructions. - if (gr < H) { - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]; - } - } else { + // Read from the carryShuttle carries produced by the previous WIDTH group. Rotate carries from the last WIDTH line. + // The new carry layout lets the AMD compiler generate global_load_dwordx4 instructions. + if (gr < H / WMUL) { + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]); + } + } else { #if !OLD_FENCE - // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. - bar(); + // For gr==H/WMUL we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. 
+ bar(); #endif - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]; - } + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]); + } - if (me == 0) { - carry[NW] = carry[NW-1]; - for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } - carry[0] = carry[NW]; + if (me == 0) { + carry[NW] = carry[NW-1]; + for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } + carry[0] = carry[NW]; + } } } // Apply each 32 or 64 bit carry to the 2 words for (i32 i = 0; i < NW; ++i) { -#if BIGLIT bool biglit0 = frac_bits + i * frac_bits_bigstep <= FRAC_BPW_HI; -#else - bool biglit0 = test(b, 2 * i); -#endif wu[i] = carryFinal(wu[i], carry[i], biglit0); u[i] = U2(u[i].x * wu[i].x, u[i].y * wu[i].y); } - bar(); + new_fft_WIDTH2(lds, u, smallTrig, WMUL, SHUFL_BYTES_W, lowMe); -// fft_WIDTH(lds, u, smallTrig); - new_fft_WIDTH2(lds, u, smallTrig); - - writeCarryFusedLine(u, out, line); + writeCarryFusedLine(u, out, line, lowMe); } @@ -236,14 +296,10 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( // The "carryFused" is equivalent to the sequence: fftW, carryA, carryB, fftPremul. 
// It uses "stairway forwarding" (forwarding carry data from one workgroup to the next) -KERNEL(G_W) carryFused(P(F2) out, CP(F2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, TrigFP32 smallTrig, - CP(u32) bits, ConstBigTabFP32 CONST_THREAD_WEIGHTS, BigTabFP32 THREAD_WEIGHTS, P(uint) bufROE) { - -#if 0 // fft_WIDTH uses shufl_int instead of shufl - local F2 lds[WIDTH / 4]; -#else - local F2 lds[WIDTH / 2]; -#endif +KERNEL(G_W * WMUL) carryFused(P(F2) out, CP(F2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, TrigFP32 smallTrig, + ConstBigTabFP32 CONST_THREAD_WEIGHTS, BigTabFP32 THREAD_WEIGHTS, P(uint) bufROE) { + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes needed for each WMUL workgroup + local F2 lds[WMUL * lds_bytes / sizeof(F2)]; F2 u[NW]; @@ -251,46 +307,41 @@ KERNEL(G_W) carryFused(P(F2) out, CP(F2) in, u32 posROE, P(i64) carryShuttle, P( u32 me = get_local_id(0); u32 H = BIG_HEIGHT; +#if WMUL == 1 + u32 lowMe = me; u32 line = gr % H; +#else + u32 lowMe = me % G_W; // lane-id in one of the WMUL sub-workgroups. + u32 line = (gr * WMUL + me / G_W) % H; +#endif #if HAS_ASM __asm("s_setprio 3"); #endif - readCarryFusedLine(in, u, line); + readCarryFusedLine(in, u, line, lowMe); // Try this weird FFT_width call that adds a "hidden zero" when unrolling. This prevents the compiler from finding // common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs // which causes a terrible reduction in occupancy. 
-#if ZEROHACK_W - u32 zerohack = get_group_id(0) / 131072; - new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack); -#else - new_fft_WIDTH1(lds, u, smallTrig); -#endif + u32 zerohack = ZEROHACK_W * (u32) get_group_id(0) / 131072; + new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack, WMUL, SHUFL_BYTES_W, lowMe); Word2 wu[NW]; #if AMDGPU - F2 weights = fancyMul(THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); + F2 weights = fancyMul(THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); #else - F2 weights = fancyMul(CONST_THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights + F2 weights = fancyMul(CONST_THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights #endif P(CFcarry) carryShuttlePtr = (P(CFcarry)) carryShuttle; CFcarry carry[NW+1]; -#if AMDGPU -#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions -//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 -#else -#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better -#endif - float roundMax = 0; float carryMax = 0; // Calculate the most significant 32-bits of FRAC_BPW * the word index. Also add FRAC_BPW_HI to test first biglit flag. - u32 word_index = (me * H + line) * 2; + u32 word_index = (lowMe * H + line) * 2; u32 frac_bits = word_index * FRAC_BPW_HI + mad_hi (word_index, FRAC_BPW_LO, FRAC_BPW_HI); const u32 frac_bits_bigstep = ((G_W * H * 2) * FRAC_BPW_HI + (u32)(((u64)(G_W * H * 2) * FRAC_BPW_LO) >> 32)); @@ -321,28 +372,28 @@ KERNEL(G_W) carryFused(P(F2) out, CP(F2) in, u32 posROE, P(i64) carryShuttle, P( updateStats(bufROE, posROE, carryMax); #endif - // Write out our carries. Only groups 0 to H-1 need to write carries out. 
- // Group H is a duplicate of group 0 (producing the same results) so we don't care about group H writing out, + // Write out our carries for the last line in this group. Only groups 0 to H/WMUL-1 need to write carries out. + // Group H/WMUL is a duplicate of group 0 (producing the same results) so we don't care about that group writing out, // but it's fine either way. - if (gr < H) { for (i32 i = 0; i < NW; ++i) { carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(me, i)] = carry[i]; } } + if (gr < H / WMUL && me >= (WMUL-1) * G_W) { + for (i32 i = 0; i < NW; ++i) { L2STORE(&carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(lowMe, i)], carry[i]); } - // Tell next line that its carries are ready - if (gr < H) { + // Tell next group that its carries are ready #if OLD_FENCE // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); write_mem_fence(CLK_GLOBAL_MEM_FENCE); - bar(); - if (me == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } + bar(G_W); + if (lowMe == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } #else write_mem_fence(CLK_GLOBAL_MEM_FENCE); - if (me % WAVEFRONT == 0) { - u32 pos = gr * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (lowMe % WAVEFRONT == 0) { + u32 pos = gr * (G_W / WAVEFRONT) + lowMe / WAVEFRONT; atomic_store((atomic_uint *) &ready[pos], 1); } #endif } - // Line zero will be redone when gr == H + // Group zero will be redone when gr == H / WMUL if (gr == 0) { return; } // Do some work while our carries may not be ready @@ -358,48 +409,53 @@ KERNEL(G_W) carryFused(P(F2) out, CP(F2) in, u32 posROE, P(i64) carryShuttle, P( u[i] = U2(weight1, weight2); } + // Shuffle carries up + shufl_carries_up(lds, carry, me, lowMe); + // Wait until our carries are ready + if (me < G_W) { #if OLD_FENCE - if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } - // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); - bar(); - 
read_mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me == 0) ready[gr - 1] = 0; + if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } + // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); + bar(); + read_mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me == 0) ready[gr - 1] = 0; #else - u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; - if (me % WAVEFRONT == 0) { - do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); - } - mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; + u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (me % WAVEFRONT == 0) { + do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); + } + mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; #endif #if HAS_ASM - __asm("s_setprio 1"); + __asm("s_setprio 1"); #endif - // Read from the carryShuttle carries produced by the previous WIDTH row. Rotate carries from the last WIDTH row. - // The new carry layout lets the compiler generate global_load_dwordx4 instructions. - if (gr < H) { - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]; - } - } else { + // Read from the carryShuttle carries produced by the previous WIDTH group. Rotate carries from the last WIDTH line. + // The new carry layout lets the AMD compiler generate global_load_dwordx4 instructions. 
+ if (gr < H / WMUL) { + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]); + } + } else { #if !OLD_FENCE - // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. - bar(); + // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. + bar(); #endif - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]; - } + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]); + } - if (me == 0) { - carry[NW] = carry[NW-1]; - for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } - carry[0] = carry[NW]; + if (me == 0) { + carry[NW] = carry[NW-1]; + for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } + carry[0] = carry[NW]; + } } } @@ -410,12 +466,9 @@ KERNEL(G_W) carryFused(P(F2) out, CP(F2) in, u32 posROE, P(i64) carryShuttle, P( u[i] = U2(u[i].x * wu[i].x, u[i].y * wu[i].y); } - bar(); - -// fft_WIDTH(lds, u, smallTrig); - new_fft_WIDTH2(lds, u, smallTrig); + new_fft_WIDTH2(lds, u, smallTrig, WMUL, SHUFL_BYTES_W, lowMe); - writeCarryFusedLine(u, out, line); + writeCarryFusedLine(u, out, line, lowMe); } @@ -427,13 +480,9 @@ KERNEL(G_W) carryFused(P(F2) out, CP(F2) in, u32 posROE, P(i64) carryShuttle, P( // The "carryFused" is equivalent to the sequence: fftW, carryA, carryB, fftPremul. 
// It uses "stairway forwarding" (forwarding carry data from one workgroup to the next) -KERNEL(G_W) carryFused(P(GF31) out, CP(GF31) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, TrigGF31 smallTrig, P(uint) bufROE) { - -#if 0 // fft_WIDTH uses shufl_int instead of shufl - local GF31 lds[WIDTH / 4]; -#else - local GF31 lds[WIDTH / 2]; -#endif +KERNEL(G_W * WMUL) carryFused(P(GF31) out, CP(GF31) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, TrigGF31 smallTrig, P(uint) bufROE) { + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes needed for each WMUL workgroup + local GF31 lds[WMUL * lds_bytes / sizeof(GF31)]; GF31 u[NW]; @@ -441,40 +490,35 @@ KERNEL(G_W) carryFused(P(GF31) out, CP(GF31) in, u32 posROE, P(i64) carryShuttle u32 me = get_local_id(0); u32 H = BIG_HEIGHT; +#if WMUL == 1 + u32 lowMe = me; u32 line = gr % H; +#else + u32 lowMe = me % G_W; // lane-id in one of the WMUL sub-workgroups. + u32 line = (gr * WMUL + me / G_W) % H; +#endif #if HAS_ASM __asm("s_setprio 3"); #endif - readCarryFusedLine(in, u, line); + readCarryFusedLine(in, u, line, lowMe); // Try this weird FFT_width call that adds a "hidden zero" when unrolling. This prevents the compiler from finding // common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs // which causes a terrible reduction in occupancy. 
-#if ZEROHACK_W - u32 zerohack = get_group_id(0) / 131072; - new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack); -#else - new_fft_WIDTH1(lds, u, smallTrig); -#endif + u32 zerohack = ZEROHACK_W * (u32) get_group_id(0) / 131072; + new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack, WMUL, SHUFL_BYTES_W, lowMe); Word2 wu[NW]; P(CFcarry) carryShuttlePtr = (P(CFcarry)) carryShuttle; CFcarry carry[NW+1]; -#if AMDGPU -#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions -//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 -#else -#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better -#endif - u32 roundMax = 0; float carryMax = 0; - u32 word_index = (me * H + line) * 2; + u32 word_index = (lowMe * H + line) * 2; // Weight is 2^[ceil(qj / n) - qj/n] where j is the word index, q is the Mersenne exponent, and n is the number of words. // Weights can be applied with shifts because 2 is the 60th root GF31. @@ -537,28 +581,28 @@ KERNEL(G_W) carryFused(P(GF31) out, CP(GF31) in, u32 posROE, P(i64) carryShuttle updateStats(bufROE, posROE, carryMax); #endif - // Write out our carries. Only groups 0 to H-1 need to write carries out. - // Group H is a duplicate of group 0 (producing the same results) so we don't care about group H writing out, + // Write out our carries for the last line in this group. Only groups 0 to H/WMUL-1 need to write carries out. + // Group H/WMUL is a duplicate of group 0 (producing the same results) so we don't care about that group writing out, // but it's fine either way. 
- if (gr < H) { for (i32 i = 0; i < NW; ++i) { carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(me, i)] = carry[i]; } } + if (gr < H / WMUL && me >= (WMUL-1) * G_W) { + for (i32 i = 0; i < NW; ++i) { L2STORE(&carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(lowMe, i)], carry[i]); } - // Tell next line that its carries are ready - if (gr < H) { + // Tell next group that its carries are ready #if OLD_FENCE // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); write_mem_fence(CLK_GLOBAL_MEM_FENCE); - bar(); - if (me == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } + bar(G_W); + if (lowMe == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } #else write_mem_fence(CLK_GLOBAL_MEM_FENCE); - if (me % WAVEFRONT == 0) { - u32 pos = gr * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (lowMe % WAVEFRONT == 0) { + u32 pos = gr * (G_W / WAVEFRONT) + lowMe / WAVEFRONT; atomic_store((atomic_uint *) &ready[pos], 1); } #endif } - // Line zero will be redone when gr == H + // Group zero will be redone when gr == H / WMUL if (gr == 0) { return; } // Do some work while our carries may not be ready @@ -566,48 +610,52 @@ KERNEL(G_W) carryFused(P(GF31) out, CP(GF31) in, u32 posROE, P(i64) carryShuttle __asm("s_setprio 0"); #endif + // Shuffle carries up + shufl_carries_up(lds, carry, me, lowMe); + // Wait until our carries are ready + if (me < G_W) { #if OLD_FENCE - if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } - // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); - bar(); - read_mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me == 0) ready[gr - 1] = 0; + if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } + // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); + bar(); + read_mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for 
next iteration + if (me == 0) ready[gr - 1] = 0; #else - u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; - if (me % WAVEFRONT == 0) { - do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); - } - mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; + u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (me % WAVEFRONT == 0) { + do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); + } + mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; #endif #if HAS_ASM - __asm("s_setprio 1"); + __asm("s_setprio 1"); #endif - // Read from the carryShuttle carries produced by the previous WIDTH row. Rotate carries from the last WIDTH row. - // The new carry layout lets the compiler generate global_load_dwordx4 instructions. - if (gr < H) { - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]; - } - } else { + // Read from the carryShuttle carries produced by the previous WIDTH group. Rotate carries from the last WIDTH line. + if (gr < H / WMUL) { + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]); + } + } else { #if !OLD_FENCE - // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. - bar(); + // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. 
+ bar(); #endif - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]; - } + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]); + } - if (me == 0) { - carry[NW] = carry[NW-1]; - for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } - carry[0] = carry[NW]; + if (me == 0) { + carry[NW] = carry[NW-1]; + for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } + carry[0] = carry[NW]; + } } } @@ -627,11 +675,9 @@ KERNEL(G_W) carryFused(P(GF31) out, CP(GF31) in, u32 posROE, P(i64) carryShuttle if (weight_shift > 31) weight_shift -= 31; } - bar(); - - new_fft_WIDTH2(lds, u, smallTrig); + new_fft_WIDTH2(lds, u, smallTrig, WMUL, SHUFL_BYTES_W, lowMe); - writeCarryFusedLine(u, out, line); + writeCarryFusedLine(u, out, line, lowMe); } @@ -643,13 +689,9 @@ KERNEL(G_W) carryFused(P(GF31) out, CP(GF31) in, u32 posROE, P(i64) carryShuttle // The "carryFused" is equivalent to the sequence: fftW, carryA, carryB, fftPremul. 
// It uses "stairway forwarding" (forwarding carry data from one workgroup to the next) -KERNEL(G_W) carryFused(P(GF61) out, CP(GF61) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, TrigGF61 smallTrig, P(uint) bufROE) { - -#if 0 // fft_WIDTH uses shufl_int instead of shufl - local GF61 lds[WIDTH / 4]; -#else - local GF61 lds[WIDTH / 2]; -#endif +KERNEL(G_W * WMUL) carryFused(P(GF61) out, CP(GF61) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, TrigGF61 smallTrig, P(uint) bufROE) { + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes needed for each WMUL workgroup + local GF61 lds[WMUL * lds_bytes / sizeof(GF61)]; GF61 u[NW]; @@ -657,23 +699,25 @@ KERNEL(G_W) carryFused(P(GF61) out, CP(GF61) in, u32 posROE, P(i64) carryShuttle u32 me = get_local_id(0); u32 H = BIG_HEIGHT; +#if WMUL == 1 + u32 lowMe = me; u32 line = gr % H; +#else + u32 lowMe = me % G_W; // lane-id in one of the WMUL sub-workgroups. + u32 line = (gr * WMUL + me / G_W) % H; +#endif #if HAS_ASM __asm("s_setprio 3"); #endif - readCarryFusedLine(in, u, line); + readCarryFusedLine(in, u, line, lowMe); // Try this weird FFT_width call that adds a "hidden zero" when unrolling. This prevents the compiler from finding // common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs // which causes a terrible reduction in occupancy. 
-#if ZEROHACK_W - u32 zerohack = get_group_id(0) / 131072; - new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack); -#else - new_fft_WIDTH1(lds, u, smallTrig); -#endif + u32 zerohack = ZEROHACK_W * (u32) get_group_id(0) / 131072; + new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack, WMUL, SHUFL_BYTES_W, lowMe); Word2 wu[NW]; @@ -685,17 +729,10 @@ KERNEL(G_W) carryFused(P(GF61) out, CP(GF61) in, u32 posROE, P(i64) carryShuttle CFcarry carry[NW+1]; #endif -#if AMDGPU -#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions -//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 -#else -#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better -#endif - u32 roundMax = 0; float carryMax = 0; - u32 word_index = (me * H + line) * 2; + u32 word_index = (lowMe * H + line) * 2; // Weight is 2^[ceil(qj / n) - qj/n] where j is the word index, q is the Mersenne exponent, and n is the number of words. // Weights can be applied with shifts because 2 is the 60th root GF61. @@ -758,28 +795,28 @@ KERNEL(G_W) carryFused(P(GF61) out, CP(GF61) in, u32 posROE, P(i64) carryShuttle updateStats(bufROE, posROE, carryMax); #endif - // Write out our carries. Only groups 0 to H-1 need to write carries out. - // Group H is a duplicate of group 0 (producing the same results) so we don't care about group H writing out, + // Write out our carries for the last line in this group. Only groups 0 to H/WMUL-1 need to write carries out. + // Group H/WMUL is a duplicate of group 0 (producing the same results) so we don't care about that group writing out, // but it's fine either way. 
- if (gr < H) { for (i32 i = 0; i < NW; ++i) { carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(me, i)] = carry[i]; } } + if (gr < H / WMUL && me >= (WMUL-1) * G_W) { + for (i32 i = 0; i < NW; ++i) { L2STORE(&carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(lowMe, i)], carry[i]); } - // Tell next line that its carries are ready - if (gr < H) { + // Tell next group that its carries are ready #if OLD_FENCE // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); write_mem_fence(CLK_GLOBAL_MEM_FENCE); - bar(); - if (me == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } + bar(G_W); + if (lowMe == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } #else write_mem_fence(CLK_GLOBAL_MEM_FENCE); - if (me % WAVEFRONT == 0) { - u32 pos = gr * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (lowMe % WAVEFRONT == 0) { + u32 pos = gr * (G_W / WAVEFRONT) + lowMe / WAVEFRONT; atomic_store((atomic_uint *) &ready[pos], 1); } #endif } - // Line zero will be redone when gr == H + // Group zero will be redone when gr == H / WMUL if (gr == 0) { return; } // Do some work while our carries may not be ready @@ -787,48 +824,53 @@ KERNEL(G_W) carryFused(P(GF61) out, CP(GF61) in, u32 posROE, P(i64) carryShuttle __asm("s_setprio 0"); #endif + // Shuffle carries up + shufl_carries_up(lds, carry, me, lowMe); + // Wait until our carries are ready + if (me < G_W) { #if OLD_FENCE - if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } - // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); - bar(); - read_mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me == 0) ready[gr - 1] = 0; + if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } + // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); + bar(); + read_mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for 
next iteration + if (me == 0) ready[gr - 1] = 0; #else - u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; - if (me % WAVEFRONT == 0) { - do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); - } - mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; + u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (me % WAVEFRONT == 0) { + do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); + } + mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; #endif #if HAS_ASM - __asm("s_setprio 1"); + __asm("s_setprio 1"); #endif - // Read from the carryShuttle carries produced by the previous WIDTH row. Rotate carries from the last WIDTH row. - // The new carry layout lets the compiler generate global_load_dwordx4 instructions. - if (gr < H) { - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]; - } - } else { + // Read from the carryShuttle carries produced by the previous WIDTH group. Rotate carries from the last WIDTH line. + // The new carry layout lets the AMD compiler generate global_load_dwordx4 instructions. + if (gr < H / WMUL) { + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]); + } + } else { #if !OLD_FENCE - // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. - bar(); + // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. 
+ bar(); #endif - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]; - } + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]); + } - if (me == 0) { - carry[NW] = carry[NW-1]; - for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } - carry[0] = carry[NW]; + if (me == 0) { + carry[NW] = carry[NW-1]; + for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } + carry[0] = carry[NW]; + } } } @@ -848,11 +890,9 @@ KERNEL(G_W) carryFused(P(GF61) out, CP(GF61) in, u32 posROE, P(i64) carryShuttle if (weight_shift > 61) weight_shift -= 61; } - bar(); - - new_fft_WIDTH2(lds, u, smallTrig); + new_fft_WIDTH2(lds, u, smallTrig, WMUL, SHUFL_BYTES_W, lowMe); - writeCarryFusedLine(u, out, line); + writeCarryFusedLine(u, out, line, lowMe); } @@ -864,10 +904,10 @@ KERNEL(G_W) carryFused(P(GF61) out, CP(GF61) in, u32 posROE, P(i64) carryShuttle // The "carryFused" is equivalent to the sequence: fftW, carryA, carryB, fftPremul. 
// It uses "stairway forwarding" (forwarding carry data from one workgroup to the next) -KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, - CP(u32) bits, ConstBigTab CONST_THREAD_WEIGHTS, BigTab THREAD_WEIGHTS, P(uint) bufROE) { - - local T2 lds[WIDTH / 2]; +KERNEL(G_W * WMUL) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, + ConstBigTab CONST_THREAD_WEIGHTS, BigTab THREAD_WEIGHTS, P(uint) bufROE) { + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes needed for each WMUL workgroup + local T2 lds[WMUL * lds_bytes / sizeof(T2)]; local GF31 *lds31 = (local GF31 *) lds; T2 u[NW]; @@ -877,7 +917,13 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( u32 me = get_local_id(0); u32 H = BIG_HEIGHT; +#if WMUL == 1 + u32 lowMe = me; u32 line = gr % H; +#else + u32 lowMe = me % G_W; // lane-id in one of the WMUL sub-workgroups. + u32 line = (gr * WMUL + me / G_W) % H; +#endif CP(GF31) in31 = (CP(GF31)) (in + DISTGF31); P(GF31) out31 = (P(GF31)) (out + DISTGF31); @@ -887,43 +933,30 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( __asm("s_setprio 3"); #endif - readCarryFusedLine(in, u, line); - readCarryFusedLine(in31, u31, line); + readCarryFusedLine(in, u, line, lowMe); + readCarryFusedLine(in31, u31, line, lowMe); // Try this weird FFT_width call that adds a "hidden zero" when unrolling. This prevents the compiler from finding // common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs // which causes a terrible reduction in occupancy. 
-#if ZEROHACK_W - u32 zerohack = get_group_id(0) / 131072; - new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack); - bar(); - new_fft_WIDTH1(lds31 + zerohack, u31, smallTrig31 + zerohack); -#else - new_fft_WIDTH1(lds, u, smallTrig); - bar(); - new_fft_WIDTH1(lds31, u31, smallTrig31); -#endif + u32 zerohack = ZEROHACK_W * (u32) get_group_id(0) / 131072; + new_fft_WIDTH1(lds + zerohack, u, smallTrig + zerohack, WMUL, SHUFL_BYTES_W, lowMe); + new_fft_WIDTH1(lds31 + zerohack, u31, smallTrig31 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); Word2 wu[NW]; #if AMDGPU - T2 weights = fancyMul(THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); + T2 weights = fancyMul(THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); #else - T2 weights = fancyMul(CONST_THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights + T2 weights = fancyMul(CONST_THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights #endif + P(i64) carryShuttlePtr = (P(i64)) carryShuttle; i64 carry[NW+1]; -#if AMDGPU -#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions -//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 -#else -#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better -#endif - float roundMax = 0; float carryMax = 0; - u32 word_index = (me * H + line) * 2; + u32 word_index = (lowMe * H + line) * 2; // Weight is 2^[ceil(qj / n) - qj/n] where j is the word index, q is the Mersenne exponent, and n is the number of words. // Let s be the shift amount for word 1. The shift amount for word x is ceil(x * (s - 1) + num_big_words_less_than_x) % 31. 
@@ -987,28 +1020,28 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( updateStats(bufROE, posROE, carryMax); #endif - // Write out our carries. Only groups 0 to H-1 need to write carries out. - // Group H is a duplicate of group 0 (producing the same results) so we don't care about group H writing out, + // Write out our carries for the last line in this group. Only groups 0 to H/WMUL-1 need to write carries out. + // Group H/WMUL is a duplicate of group 0 (producing the same results) so we don't care about that group writing out, // but it's fine either way. - if (gr < H) { for (i32 i = 0; i < NW; ++i) { carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(me, i)] = carry[i]; } } + if (gr < H / WMUL && me >= (WMUL-1) * G_W) { + for (i32 i = 0; i < NW; ++i) { L2STORE(&carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(lowMe, i)], carry[i]); } - // Tell next line that its carries are ready - if (gr < H) { + // Tell next group that its carries are ready #if OLD_FENCE // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); write_mem_fence(CLK_GLOBAL_MEM_FENCE); - bar(); - if (me == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } + bar(G_W); + if (lowMe == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } #else write_mem_fence(CLK_GLOBAL_MEM_FENCE); - if (me % WAVEFRONT == 0) { - u32 pos = gr * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (lowMe % WAVEFRONT == 0) { + u32 pos = gr * (G_W / WAVEFRONT) + lowMe / WAVEFRONT; atomic_store((atomic_uint *) &ready[pos], 1); } #endif } - // Line zero will be redone when gr == H + // Group zero will be redone when gr == H / WMUL if (gr == 0) { return; } // Do some work while our carries may not be ready @@ -1024,48 +1057,53 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( u[i] = U2(weight1, weight2); } + // Shuffle carries up + shufl_carries_up(lds, carry, me, lowMe); + // Wait until our carries are ready + if (me < G_W) { #if OLD_FENCE - if (me == 0) { do { 
spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } - // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); - bar(); - read_mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me == 0) ready[gr - 1] = 0; + if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } + // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); + bar(); + read_mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me == 0) ready[gr - 1] = 0; #else - u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; - if (me % WAVEFRONT == 0) { - do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); - } - mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; + u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (me % WAVEFRONT == 0) { + do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); + } + mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; #endif #if HAS_ASM - __asm("s_setprio 1"); + __asm("s_setprio 1"); #endif - // Read from the carryShuttle carries produced by the previous WIDTH row. Rotate carries from the last WIDTH row. - // The new carry layout lets the compiler generate global_load_dwordx4 instructions. - if (gr < H) { - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]; - } - } else { + // Read from the carryShuttle carries produced by the previous WIDTH group. Rotate carries from the last WIDTH line. 
+ // The new carry layout lets the AMD compiler generate global_load_dwordx4 instructions. + if (gr < H / WMUL) { + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]); + } + } else { #if !OLD_FENCE - // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. - bar(); + // For gr==H/WMUL we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. + bar(); #endif - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]; - } + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]); + } - if (me == 0) { - carry[NW] = carry[NW-1]; - for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } - carry[0] = carry[NW]; + if (me == 0) { + carry[NW] = carry[NW-1]; + for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } + carry[0] = carry[NW]; + } } } @@ -1087,15 +1125,11 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( if (weight_shift > 31) weight_shift -= 31; } - bar(); - - new_fft_WIDTH2(lds, u, smallTrig); - writeCarryFusedLine(u, out, line); + new_fft_WIDTH2(lds, u, smallTrig, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(u, out, line, lowMe); - bar(); - - new_fft_WIDTH2(lds31, u31, smallTrig31); - writeCarryFusedLine(u31, out31, line); + new_fft_WIDTH2(lds31, u31, smallTrig31, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(u31, out31, line, lowMe); } @@ -1107,10 +1141,10 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( // The "carryFused" is equivalent to the sequence: fftW, carryA, carryB, fftPremul. 
// It uses "stairway forwarding" (forwarding carry data from one workgroup to the next) -KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, - CP(u32) bits, ConstBigTabFP32 CONST_THREAD_WEIGHTS, BigTabFP32 THREAD_WEIGHTS, P(uint) bufROE) { - - local F2 ldsF2[WIDTH / 2]; +KERNEL(G_W * WMUL) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, + ConstBigTabFP32 CONST_THREAD_WEIGHTS, BigTabFP32 THREAD_WEIGHTS, P(uint) bufROE) { + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes needed for each WMUL workgroup + local F2 ldsF2[WMUL * lds_bytes / sizeof(F2)]; local GF31 *lds31 = (local GF31 *) ldsF2; F2 uF2[NW]; @@ -1120,7 +1154,13 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( u32 me = get_local_id(0); u32 H = BIG_HEIGHT; +#if WMUL == 1 + u32 lowMe = me; u32 line = gr % H; +#else + u32 lowMe = me % G_W; // lane-id in one of the WMUL sub-workgroups. + u32 line = (gr * WMUL + me / G_W) % H; +#endif CP(F2) inF2 = (CP(F2)) in; P(F2) outF2 = (P(F2)) out; @@ -1133,43 +1173,30 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( __asm("s_setprio 3"); #endif - readCarryFusedLine(inF2, uF2, line); - readCarryFusedLine(in31, u31, line); + readCarryFusedLine(inF2, uF2, line, lowMe); + readCarryFusedLine(in31, u31, line, lowMe); // Try this weird FFT_width call that adds a "hidden zero" when unrolling. This prevents the compiler from finding // common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs // which causes a terrible reduction in occupancy. 
-#if ZEROHACK_W - u32 zerohack = get_group_id(0) / 131072; - new_fft_WIDTH1(ldsF2 + zerohack, uF2, smallTrigF2 + zerohack); - bar(); - new_fft_WIDTH1(lds31 + zerohack, u31, smallTrig31 + zerohack); -#else - new_fft_WIDTH1(ldsF2, uF2, smallTrigF2); - bar(); - new_fft_WIDTH1(lds31, u31, smallTrig31); -#endif + u32 zerohack = ZEROHACK_W * (u32) get_group_id(0) / 131072; + new_fft_WIDTH1(ldsF2 + zerohack, uF2, smallTrigF2 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); + new_fft_WIDTH1(lds31 + zerohack, u31, smallTrig31 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); Word2 wu[NW]; #if AMDGPU - F2 weights = fancyMul(THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); + F2 weights = fancyMul(THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); #else - F2 weights = fancyMul(CONST_THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights + F2 weights = fancyMul(CONST_THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights #endif + P(i32) carryShuttlePtr = (P(i32)) carryShuttle; i32 carry[NW+1]; -#if AMDGPU -#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions -//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 -#else -#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better -#endif - float roundMax = 0; float carryMax = 0; - u32 word_index = (me * H + line) * 2; + u32 word_index = (lowMe * H + line) * 2; // Weight is 2^[ceil(qj / n) - qj/n] where j is the word index, q is the Mersenne exponent, and n is the number of words. // Let s be the shift amount for word 1. The shift amount for word x is ceil(x * (s - 1) + num_big_words_less_than_x) % 31. 
@@ -1233,28 +1260,28 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( updateStats(bufROE, posROE, carryMax); #endif - // Write out our carries. Only groups 0 to H-1 need to write carries out. - // Group H is a duplicate of group 0 (producing the same results) so we don't care about group H writing out, + // Write out our carries for the last line in this group. Only groups 0 to H/WMUL-1 need to write carries out. + // Group H/WMUL is a duplicate of group 0 (producing the same results) so we don't care about that group writing out, // but it's fine either way. - if (gr < H) { for (i32 i = 0; i < NW; ++i) { carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(me, i)] = carry[i]; } } + if (gr < H / WMUL && me >= (WMUL-1) * G_W) { + for (i32 i = 0; i < NW; ++i) { L2STORE(&carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(lowMe, i)], carry[i]); } - // Tell next line that its carries are ready - if (gr < H) { + // Tell next group that its carries are ready #if OLD_FENCE // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); write_mem_fence(CLK_GLOBAL_MEM_FENCE); - bar(); - if (me == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } + bar(G_W); + if (lowMe == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } #else write_mem_fence(CLK_GLOBAL_MEM_FENCE); - if (me % WAVEFRONT == 0) { - u32 pos = gr * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (lowMe % WAVEFRONT == 0) { + u32 pos = gr * (G_W / WAVEFRONT) + lowMe / WAVEFRONT; atomic_store((atomic_uint *) &ready[pos], 1); } #endif } - // Line zero will be redone when gr == H + // Group zero will be redone when gr == H / WMUL if (gr == 0) { return; } // Do some work while our carries may not be ready @@ -1270,48 +1297,53 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( uF2[i] = U2(weight1, weight2); } + // Shuffle carries up + shufl_carries_up(ldsF2, carry, me, lowMe); + // Wait until our carries are ready + if (me < G_W) { #if OLD_FENCE - if (me == 0) { 
do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } - // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); - bar(); - read_mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me == 0) ready[gr - 1] = 0; + if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } + // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); + bar(); + read_mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me == 0) ready[gr - 1] = 0; #else - u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; - if (me % WAVEFRONT == 0) { - do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); - } - mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; + u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (me % WAVEFRONT == 0) { + do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); + } + mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; #endif #if HAS_ASM - __asm("s_setprio 1"); + __asm("s_setprio 1"); #endif - // Read from the carryShuttle carries produced by the previous WIDTH row. Rotate carries from the last WIDTH row. - // The new carry layout lets the compiler generate global_load_dwordx4 instructions. - if (gr < H) { - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]; - } - } else { + // Read from the carryShuttle carries produced by the previous WIDTH group. Rotate carries from the last WIDTH line. 
+ // The new carry layout lets the AMD compiler generate global_load_dwordx4 instructions. + if (gr < H / WMUL) { + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]); + } + } else { #if !OLD_FENCE - // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. - bar(); + // For gr==H/WMUL we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. + bar(); #endif - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]; - } + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]); + } - if (me == 0) { - carry[NW] = carry[NW-1]; - for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } - carry[0] = carry[NW]; + if (me == 0) { + carry[NW] = carry[NW-1]; + for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } + carry[0] = carry[NW]; + } } } @@ -1333,15 +1365,11 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( if (weight_shift > 31) weight_shift -= 31; } - bar(); - - new_fft_WIDTH2(ldsF2, uF2, smallTrigF2); - writeCarryFusedLine(uF2, outF2, line); + new_fft_WIDTH2(ldsF2, uF2, smallTrigF2, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(uF2, outF2, line, lowMe); - bar(); - - new_fft_WIDTH2(lds31, u31, smallTrig31); - writeCarryFusedLine(u31, out31, line); + new_fft_WIDTH2(lds31, u31, smallTrig31, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(u31, out31, line, lowMe); } @@ -1353,10 +1381,10 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( // The "carryFused" is equivalent to the sequence: fftW, carryA, carryB, fftPremul. 
// It uses "stairway forwarding" (forwarding carry data from one workgroup to the next) -KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, - CP(u32) bits, ConstBigTabFP32 CONST_THREAD_WEIGHTS, BigTabFP32 THREAD_WEIGHTS, P(uint) bufROE) { - - local GF61 lds61[WIDTH / 2]; +KERNEL(G_W * WMUL) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, + ConstBigTabFP32 CONST_THREAD_WEIGHTS, BigTabFP32 THREAD_WEIGHTS, P(uint) bufROE) { + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes needed for each WMUL workgroup + local GF61 lds61[WMUL * lds_bytes / sizeof(GF61)]; local F2 *ldsF2 = (local F2 *) lds61; F2 uF2[NW]; @@ -1366,7 +1394,13 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( u32 me = get_local_id(0); u32 H = BIG_HEIGHT; +#if WMUL == 1 + u32 lowMe = me; u32 line = gr % H; +#else + u32 lowMe = me % G_W; // lane-id in one of the WMUL sub-workgroups. + u32 line = (gr * WMUL + me / G_W) % H; +#endif CP(F2) inF2 = (CP(F2)) in; P(F2) outF2 = (P(F2)) out; @@ -1379,43 +1413,30 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( __asm("s_setprio 3"); #endif - readCarryFusedLine(inF2, uF2, line); - readCarryFusedLine(in61, u61, line); + readCarryFusedLine(inF2, uF2, line, lowMe); + readCarryFusedLine(in61, u61, line, lowMe); // Try this weird FFT_width call that adds a "hidden zero" when unrolling. This prevents the compiler from finding // common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs // which causes a terrible reduction in occupancy. 
-#if ZEROHACK_W - u32 zerohack = get_group_id(0) / 131072; - new_fft_WIDTH1(ldsF2 + zerohack, uF2, smallTrigF2 + zerohack); - bar(); - new_fft_WIDTH1(lds61 + zerohack, u61, smallTrig61 + zerohack); -#else - new_fft_WIDTH1(ldsF2, uF2, smallTrigF2); - bar(); - new_fft_WIDTH1(lds61, u61, smallTrig61); -#endif + u32 zerohack = ZEROHACK_W * (u32) get_group_id(0) / 131072; + new_fft_WIDTH1(ldsF2 + zerohack, uF2, smallTrigF2 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); + new_fft_WIDTH1(lds61 + zerohack, u61, smallTrig61 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); Word2 wu[NW]; #if AMDGPU - F2 weights = fancyMul(THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); + F2 weights = fancyMul(THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); #else - F2 weights = fancyMul(CONST_THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights + F2 weights = fancyMul(CONST_THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights #endif + P(i64) carryShuttlePtr = (P(i64)) carryShuttle; i64 carry[NW+1]; -#if AMDGPU -#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions -//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 -#else -#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better -#endif - float roundMax = 0; float carryMax = 0; - u32 word_index = (me * H + line) * 2; + u32 word_index = (lowMe * H + line) * 2; // Weight is 2^[ceil(qj / n) - qj/n] where j is the word index, q is the Mersenne exponent, and n is the number of words. // Let s be the shift amount for word 1. The shift amount for word x is ceil(x * (s - 1) + num_big_words_less_than_x) % 61. 
@@ -1479,28 +1500,28 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( updateStats(bufROE, posROE, carryMax); #endif - // Write out our carries. Only groups 0 to H-1 need to write carries out. - // Group H is a duplicate of group 0 (producing the same results) so we don't care about group H writing out, + // Write out our carries for the last line in this group. Only groups 0 to H/WMUL-1 need to write carries out. + // Group H/WMUL is a duplicate of group 0 (producing the same results) so we don't care about that group writing out, // but it's fine either way. - if (gr < H) { for (i32 i = 0; i < NW; ++i) { carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(me, i)] = carry[i]; } } + if (gr < H / WMUL && me >= (WMUL-1) * G_W) { + for (i32 i = 0; i < NW; ++i) { L2STORE(&carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(lowMe, i)], carry[i]); } - // Tell next line that its carries are ready - if (gr < H) { + // Tell next group that its carries are ready #if OLD_FENCE // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); write_mem_fence(CLK_GLOBAL_MEM_FENCE); - bar(); - if (me == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } + bar(G_W); + if (lowMe == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } #else write_mem_fence(CLK_GLOBAL_MEM_FENCE); - if (me % WAVEFRONT == 0) { - u32 pos = gr * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (lowMe % WAVEFRONT == 0) { + u32 pos = gr * (G_W / WAVEFRONT) + lowMe / WAVEFRONT; atomic_store((atomic_uint *) &ready[pos], 1); } #endif } - // Line zero will be redone when gr == H + // Group zero will be redone when gr == H / WMUL if (gr == 0) { return; } // Do some work while our carries may not be ready @@ -1516,48 +1537,53 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( uF2[i] = U2(weight1, weight2); } + // Shuffle carries up + shufl_carries_up(lds61, carry, me, lowMe); + // Wait until our carries are ready + if (me < G_W) { #if OLD_FENCE - if (me == 0) { 
do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } - // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); - bar(); - read_mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me == 0) ready[gr - 1] = 0; + if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } + // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); + bar(); + read_mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me == 0) ready[gr - 1] = 0; #else - u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; - if (me % WAVEFRONT == 0) { - do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); - } - mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; + u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (me % WAVEFRONT == 0) { + do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); + } + mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; #endif #if HAS_ASM - __asm("s_setprio 1"); + __asm("s_setprio 1"); #endif - // Read from the carryShuttle carries produced by the previous WIDTH row. Rotate carries from the last WIDTH row. - // The new carry layout lets the compiler generate global_load_dwordx4 instructions. - if (gr < H) { - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]; - } - } else { + // Read from the carryShuttle carries produced by the previous WIDTH group. Rotate carries from the last WIDTH line. 
+ // The new carry layout lets the AMD compiler generate global_load_dwordx4 instructions. + if (gr < H / WMUL) { + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]); + } + } else { #if !OLD_FENCE - // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. - bar(); + // For gr==H/WMUL we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. + bar(); #endif - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]; - } + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]); + } - if (me == 0) { - carry[NW] = carry[NW-1]; - for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } - carry[0] = carry[NW]; + if (me == 0) { + carry[NW] = carry[NW-1]; + for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } + carry[0] = carry[NW]; + } } } @@ -1579,15 +1605,11 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( if (weight_shift > 61) weight_shift -= 61; } - bar(); + new_fft_WIDTH2(ldsF2, uF2, smallTrigF2, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(uF2, outF2, line, lowMe); - new_fft_WIDTH2(ldsF2, uF2, smallTrigF2); - writeCarryFusedLine(uF2, outF2, line); - - bar(); - - new_fft_WIDTH2(lds61, u61, smallTrig61); - writeCarryFusedLine(u61, out61, line); + new_fft_WIDTH2(lds61, u61, smallTrig61, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(u61, out61, line, lowMe); } @@ -1599,13 +1621,9 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( // The "carryFused" is equivalent to the sequence: fftW, carryA, carryB, fftPremul. 
// It uses "stairway forwarding" (forwarding carry data from one workgroup to the next) -KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, P(uint) bufROE) { - -#if 0 // fft_WIDTH uses shufl_int instead of shufl - local GF61 lds61[WIDTH / 4]; -#else - local GF61 lds61[WIDTH / 2]; -#endif +KERNEL(G_W * WMUL) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, P(uint) bufROE) { + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes needed for each WMUL workgroup + local GF61 lds61[WMUL * lds_bytes / sizeof(GF61)]; local GF31 *lds31 = (local GF31 *) lds61; GF31 u31[NW]; @@ -1615,7 +1633,13 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( u32 me = get_local_id(0); u32 H = BIG_HEIGHT; +#if WMUL == 1 + u32 lowMe = me; u32 line = gr % H; +#else + u32 lowMe = me % G_W; // lane-id in one of the WMUL sub-workgroups. + u32 line = (gr * WMUL + me / G_W) % H; +#endif CP(GF31) in31 = (CP(GF31)) (in + DISTGF31); P(GF31) out31 = (P(GF31)) (out + DISTGF31); @@ -1628,38 +1652,25 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( __asm("s_setprio 3"); #endif - readCarryFusedLine(in31, u31, line); - readCarryFusedLine(in61, u61, line); + readCarryFusedLine(in31, u31, line, lowMe); + readCarryFusedLine(in61, u61, line, lowMe); // Try this weird FFT_width call that adds a "hidden zero" when unrolling. This prevents the compiler from finding // common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs // which causes a terrible reduction in occupancy. 
-#if ZEROHACK_W - u32 zerohack = get_group_id(0) / 131072; - new_fft_WIDTH1(lds31 + zerohack, u31, smallTrig31 + zerohack); - bar(); - new_fft_WIDTH1(lds61 + zerohack, u61, smallTrig61 + zerohack); -#else - new_fft_WIDTH1(lds31, u31, smallTrig31); - bar(); - new_fft_WIDTH1(lds61, u61, smallTrig61); -#endif + u32 zerohack = ZEROHACK_W * (u32) get_group_id(0) / 131072; + new_fft_WIDTH1(lds31 + zerohack, u31, smallTrig31 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); + new_fft_WIDTH1(lds61 + zerohack, u61, smallTrig61 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); Word2 wu[NW]; + P(i64) carryShuttlePtr = (P(i64)) carryShuttle; i64 carry[NW+1]; -#if AMDGPU -#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions -//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 -#else -#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better -#endif - u32 roundMax = 0; float carryMax = 0; - u32 word_index = (me * H + line) * 2; + u32 word_index = (lowMe * H + line) * 2; // Weight is 2^[ceil(qj / n) - qj/n] where j is the word index, q is the Mersenne exponent, and n is the number of words. // Let s be the shift amount for word 1. The shift amount for word x is ceil(x * (s - 1) + num_big_words_less_than_x) % 31. @@ -1738,28 +1749,28 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( updateStats(bufROE, posROE, carryMax); #endif - // Write out our carries. Only groups 0 to H-1 need to write carries out. - // Group H is a duplicate of group 0 (producing the same results) so we don't care about group H writing out, + // Write out our carries for the last line in this group. Only groups 0 to H/WMUL-1 need to write carries out. + // Group H/WMUL is a duplicate of group 0 (producing the same results) so we don't care about that group writing out, // but it's fine either way. 
- if (gr < H) { for (i32 i = 0; i < NW; ++i) { carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(me, i)] = carry[i]; } } + if (gr < H / WMUL && me >= (WMUL-1) * G_W) { + for (i32 i = 0; i < NW; ++i) { L2STORE(&carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(lowMe, i)], carry[i]); } - // Tell next line that its carries are ready - if (gr < H) { + // Tell next group that its carries are ready #if OLD_FENCE // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); write_mem_fence(CLK_GLOBAL_MEM_FENCE); - bar(); - if (me == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } + bar(G_W); + if (lowMe == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } #else write_mem_fence(CLK_GLOBAL_MEM_FENCE); - if (me % WAVEFRONT == 0) { - u32 pos = gr * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (lowMe % WAVEFRONT == 0) { + u32 pos = gr * (G_W / WAVEFRONT) + lowMe / WAVEFRONT; atomic_store((atomic_uint *) &ready[pos], 1); } #endif } - // Line zero will be redone when gr == H + // Group zero will be redone when gr == H / WMUL if (gr == 0) { return; } // Do some work while our carries may not be ready @@ -1767,48 +1778,53 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( __asm("s_setprio 0"); #endif + // Shuffle carries up + shufl_carries_up(lds61, carry, me, lowMe); + // Wait until our carries are ready + if (me < G_W) { #if OLD_FENCE - if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } - // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); - bar(); - read_mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me == 0) ready[gr - 1] = 0; + if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } + // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); + bar(); + read_mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag 
for next iteration + if (me == 0) ready[gr - 1] = 0; #else - u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; - if (me % WAVEFRONT == 0) { - do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); - } - mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; + u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (me % WAVEFRONT == 0) { + do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); + } + mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; #endif #if HAS_ASM - __asm("s_setprio 1"); + __asm("s_setprio 1"); #endif - // Read from the carryShuttle carries produced by the previous WIDTH row. Rotate carries from the last WIDTH row. - // The new carry layout lets the compiler generate global_load_dwordx4 instructions. - if (gr < H) { - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]; - } - } else { + // Read from the carryShuttle carries produced by the previous WIDTH group. Rotate carries from the last WIDTH line. + // The new carry layout lets the AMD compiler generate global_load_dwordx4 instructions. + if (gr < H / WMUL) { + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]); + } + } else { #if !OLD_FENCE - // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. - bar(); + // For gr==H/WMUL we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. 
+ bar(); #endif - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]; - } + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]); + } - if (me == 0) { - carry[NW] = carry[NW-1]; - for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } - carry[0] = carry[NW]; + if (me == 0) { + carry[NW] = carry[NW-1]; + for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } + carry[0] = carry[NW]; + } } } @@ -1836,15 +1852,11 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( m61_weight_shift = adjust_m61_weight_shift(m61_weight_shift); } - bar(); - - new_fft_WIDTH2(lds31, u31, smallTrig31); - writeCarryFusedLine(u31, out31, line); - - bar(); + new_fft_WIDTH2(lds31, u31, smallTrig31, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(u31, out31, line, lowMe); - new_fft_WIDTH2(lds61, u61, smallTrig61); - writeCarryFusedLine(u61, out61, line); + new_fft_WIDTH2(lds61, u61, smallTrig61, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(u61, out61, line, lowMe); } @@ -1856,14 +1868,10 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( // The "carryFused" is equivalent to the sequence: fftW, carryA, carryB, fftPremul. 
// It uses "stairway forwarding" (forwarding carry data from one workgroup to the next) -KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, - CP(u32) bits, ConstBigTabFP32 CONST_THREAD_WEIGHTS, BigTabFP32 THREAD_WEIGHTS, P(uint) bufROE) { - -#if 0 // fft_WIDTH uses shufl_int instead of shufl - local GF61 lds61[WIDTH / 4]; -#else - local GF61 lds61[WIDTH / 2]; -#endif +KERNEL(G_W * WMUL) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(u32) ready, Trig smallTrig, + ConstBigTabFP32 CONST_THREAD_WEIGHTS, BigTabFP32 THREAD_WEIGHTS, P(uint) bufROE) { + const u32 lds_bytes = WIDTH * SHUFL_BYTES_W; // LDS bytes needed for each WMUL workgroup + local GF61 lds61[WMUL * lds_bytes / sizeof(GF61)]; local F2 *ldsF2 = (local F2 *) lds61; local GF31 *lds31 = (local GF31 *) lds61; @@ -1875,7 +1883,13 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( u32 me = get_local_id(0); u32 H = BIG_HEIGHT; +#if WMUL == 1 + u32 lowMe = me; u32 line = gr % H; +#else + u32 lowMe = me % G_W; // lane-id in one of the WMUL sub-workgroups. + u32 line = (gr * WMUL + me / G_W) % H; +#endif CP(F2) inF2 = (CP(F2)) in; P(F2) outF2 = (P(F2)) out; @@ -1891,48 +1905,32 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( __asm("s_setprio 3"); #endif - readCarryFusedLine(inF2, uF2, line); - readCarryFusedLine(in31, u31, line); - readCarryFusedLine(in61, u61, line); + readCarryFusedLine(inF2, uF2, line, lowMe); + readCarryFusedLine(in31, u31, line, lowMe); + readCarryFusedLine(in61, u61, line, lowMe); // Try this weird FFT_width call that adds a "hidden zero" when unrolling. This prevents the compiler from finding // common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs // which causes a terrible reduction in occupancy. 
-#if ZEROHACK_W - u32 zerohack = get_group_id(0) / 131072; - new_fft_WIDTH1(ldsF2 + zerohack, uF2, smallTrigF2 + zerohack); - bar(); - new_fft_WIDTH1(lds31 + zerohack, u31, smallTrig31 + zerohack); - bar(); - new_fft_WIDTH1(lds61 + zerohack, u61, smallTrig61 + zerohack); -#else - new_fft_WIDTH1(ldsF2, uF2, smallTrigF2); - bar(); - new_fft_WIDTH1(lds31, u31, smallTrig31); - bar(); - new_fft_WIDTH1(lds61, u61, smallTrig61); -#endif + u32 zerohack = ZEROHACK_W * (u32) get_group_id(0) / 131072; + new_fft_WIDTH1(ldsF2 + zerohack, uF2, smallTrigF2 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); + new_fft_WIDTH1(lds31 + zerohack, u31, smallTrig31 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); + new_fft_WIDTH1(lds61 + zerohack, u61, smallTrig61 + zerohack, WMUL, SHUFL_BYTES_W, lowMe); Word2 wu[NW]; #if AMDGPU - F2 weights = fancyMul(THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); + F2 weights = fancyMul(THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); #else - F2 weights = fancyMul(CONST_THREAD_WEIGHTS[me], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights + F2 weights = fancyMul(CONST_THREAD_WEIGHTS[lowMe], THREAD_WEIGHTS[G_W + line]); // On nVidia, don't pollute the constant cache with line weights #endif + P(i64) carryShuttlePtr = (P(i64)) carryShuttle; i64 carry[NW+1]; -#if AMDGPU -#define CarryShuttleAccess(me,i) ((me) * NW + (i)) // Generates denser global_load_dwordx4 instructions -//#define CarryShuttleAccess(me,i) ((me) * 4 + (i)%4 + (i)/4 * 4*G_W) // Also generates global_load_dwordx4 instructions and unit stride when NW=8 -#else -#define CarryShuttleAccess(me,i) ((me) + (i) * G_W) // nVidia likes this unit stride better -#endif - float roundMax = 0; float carryMax = 0; - u32 word_index = (me * H + line) * 2; + u32 word_index = (lowMe * H + line) * 2; // Weight is 2^[ceil(qj / n) - qj/n] where j is the word index, q is the Mersenne exponent, and n is the number of words. // Let s be the shift amount for word 1. 
The shift amount for word x is ceil(x * (s - 1) + num_big_words_less_than_x) % 31. @@ -2013,28 +2011,28 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( updateStats(bufROE, posROE, carryMax); #endif - // Write out our carries. Only groups 0 to H-1 need to write carries out. - // Group H is a duplicate of group 0 (producing the same results) so we don't care about group H writing out, + // Write out our carries for the last line in this group. Only groups 0 to H/WMUL-1 need to write carries out. + // Group H/WMUL is a duplicate of group 0 (producing the same results) so we don't care about that group writing out, // but it's fine either way. - if (gr < H) { for (i32 i = 0; i < NW; ++i) { carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(me, i)] = carry[i]; } } + if (gr < H / WMUL && me >= (WMUL-1) * G_W) { + for (i32 i = 0; i < NW; ++i) { L2STORE(&carryShuttlePtr[gr * WIDTH + CarryShuttleAccess(lowMe, i)], carry[i]); } - // Tell next line that its carries are ready - if (gr < H) { + // Tell next group that its carries are ready #if OLD_FENCE // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); write_mem_fence(CLK_GLOBAL_MEM_FENCE); - bar(); - if (me == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } + bar(G_W); + if (lowMe == 0) { atomic_store((atomic_uint *) &ready[gr], 1); } #else write_mem_fence(CLK_GLOBAL_MEM_FENCE); - if (me % WAVEFRONT == 0) { - u32 pos = gr * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (lowMe % WAVEFRONT == 0) { + u32 pos = gr * (G_W / WAVEFRONT) + lowMe / WAVEFRONT; atomic_store((atomic_uint *) &ready[pos], 1); } #endif } - // Line zero will be redone when gr == H + // Group zero will be redone when gr == H / WMUL if (gr == 0) { return; } // Do some work while our carries may not be ready @@ -2050,48 +2048,53 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( uF2[i] = U2(weight1, weight2); } + // Shuffle carries up + shufl_carries_up(lds61, carry, me, lowMe); + 
// Wait until our carries are ready + if (me < G_W) { #if OLD_FENCE - if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } - // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); - bar(); - read_mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me == 0) ready[gr - 1] = 0; + if (me == 0) { do { spin(); } while(!atomic_load_explicit((atomic_uint *) &ready[gr - 1], memory_order_relaxed, memory_scope_device)); } + // work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device); + bar(); + read_mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me == 0) ready[gr - 1] = 0; #else - u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; - if (me % WAVEFRONT == 0) { - do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); - } - mem_fence(CLK_GLOBAL_MEM_FENCE); - // Clear carry ready flag for next iteration - if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; + u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT; + if (me % WAVEFRONT == 0) { + do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0); + } + mem_fence(CLK_GLOBAL_MEM_FENCE); + // Clear carry ready flag for next iteration + if (me % WAVEFRONT == 0) ready[(gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT] = 0; #endif #if HAS_ASM - __asm("s_setprio 1"); + __asm("s_setprio 1"); #endif - // Read from the carryShuttle carries produced by the previous WIDTH row. Rotate carries from the last WIDTH row. - // The new carry layout lets the compiler generate global_load_dwordx4 instructions. 
- if (gr < H) { - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]; - } - } else { + // Read from the carryShuttle carries produced by the previous WIDTH group. Rotate carries from the last WIDTH line. + // The new carry layout lets the AMD compiler generate global_load_dwordx4 instructions. + if (gr < H / WMUL) { + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess(me, i)]); + } + } else { #if !OLD_FENCE - // For gr==H we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. - bar(); + // For gr==H/WMUL we need the barrier since the carry reading is shifted, thus the per-wavefront trick does not apply. + bar(); #endif - for (i32 i = 0; i < NW; ++i) { - carry[i] = carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]; - } + for (i32 i = 0; i < NW; ++i) { + carry[i] = LULOAD(&carryShuttlePtr[(gr - 1) * WIDTH + CarryShuttleAccess((me + G_W - 1) % G_W, i) /* ((me!=0) + NW - 1 + i) % NW*/]); + } - if (me == 0) { - carry[NW] = carry[NW-1]; - for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } - carry[0] = carry[NW]; + if (me == 0) { + carry[NW] = carry[NW-1]; + for (i32 i = NW-1; i; --i) { carry[i] = carry[i-1]; } + carry[0] = carry[NW]; + } } } @@ -2120,20 +2123,14 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P( m61_weight_shift = adjust_m61_weight_shift(m61_weight_shift); } - bar(); - - new_fft_WIDTH2(ldsF2, uF2, smallTrigF2); - writeCarryFusedLine(uF2, outF2, line); + new_fft_WIDTH2(ldsF2, uF2, smallTrigF2, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(uF2, outF2, line, lowMe); - bar(); - - new_fft_WIDTH2(lds31, u31, smallTrig31); - writeCarryFusedLine(u31, out31, line); - - bar(); + new_fft_WIDTH2(lds31, u31, smallTrig31, WMUL, SHUFL_BYTES_W, lowMe); + writeCarryFusedLine(u31, out31, line, lowMe); - 
new_fft_WIDTH2(lds61, u61, smallTrig61);
-  writeCarryFusedLine(u61, out61, line);
+  new_fft_WIDTH2(lds61, u61, smallTrig61, WMUL, SHUFL_BYTES_W, lowMe);
+  writeCarryFusedLine(u61, out61, line, lowMe);
 }
 
 
diff --git a/src/cl/fftbase.cl b/src/cl/fftbase.cl
index c955e803..9d2fde10 100644
--- a/src/cl/fftbase.cl
+++ b/src/cl/fftbase.cl
@@ -5,6 +5,119 @@
 #include "trig.cl"
 // #include "math.cl"
 
+
+#if FFT_FP64 | NTT_GF61
+
+// Shufl two or more fft_WIDTHs or FFT_HEIGHTs operating on 64-bit values. Each WG uses WG * sb bytes of LDS memory.
+// Care is taken that each simultaneous workgroup does not interfere with the LDS memory of other simultaneous workgroups --
+// even when operating on different sized data elements as can happen in an M31+M61 NTT.
+// WG = workgroup size of a single fft_WIDTH or fft_HEIGHT
+// n = sizeof array u (nW or nH). n * WG = WIDTH or HEIGHT
+// sb = The number of bytes to write to LDS memory at a time. SHUFL_BYTES_W or SHUFL_BYTES_H
+// numWG = number of fft_WIDTHs or fft_HEIGHTs being processed simultaneously
+// lowMe = me % WG
+// NOTE: shufl routines perform a bar(WG) at the start but not at the end. After calling shufl, a bar(WG) is required
+// before next LDS memory usage. All routines that use LDS memory MUST OBEY THIS PROTOCOL of bar() before LDS use and
+// only bar(WG) required before next use. ALSO NOTE: the first shufl call does not need to do bar(WG). A relatively
+// minor optimization would be to special case the first shufl call.
+void OVERLOAD shufl64(u32 WG, local T2 *lds2, T2 *u, u32 n, u32 f, u32 numWG, const u32 sb, u32 lowMe) { + + u32 mask = f - 1; + assert((mask & (mask + 1)) == 0); + + if (sb == 16) { + local T2* lds = ((local T2*) lds2); + if (numWG > 1) lds += ((u32) get_local_id(0) / WG) * n * WG * sb / sizeof(T2); + + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = u[i]; } + bar(WG); + for (u32 i = 0; i < n; ++i) { u[i] = lds[i * WG + lowMe]; } + } + + else if (sb == 8) { + // Accessing lds memory as doubles is faster than T2 accesses on Radeon VII (halving LDS memory requirements) + local T* lds = ((local T*) lds2); + if (numWG > 1) lds += ((u32) get_local_id(0) / WG) * n * WG * sb / sizeof(T); + + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = u[i].x; } + bar(WG); + for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + lowMe]; } + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = u[i].y; } + bar(WG); + for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + lowMe]; } + } + + else if (sb == 4) { + // Lower LDS requirements may let the optimizer use fewer VGPRs and increase occupancy for WIDTHs >= 1024. + // Alas, the increased occupancy does not offset extra code needed for shufl_int (the assembly + // code generated is not pretty). This might not be true for nVidia or future ROCm optimizers. 
+ local int* lds = (local int*) lds2; + if (numWG > 1) lds += ((u32) get_local_id(0) / WG) * n * WG * sb / sizeof(int); + + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = as_int4(u[i]).x; } + bar(WG); + for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.x = lds[i * WG + lowMe]; u[i] = as_double2(tmp); } + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = as_int4(u[i]).y; } + bar(WG); + for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.y = lds[i * WG + lowMe]; u[i] = as_double2(tmp); } + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = as_int4(u[i]).z; } + bar(WG); + for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.z = lds[i * WG + lowMe]; u[i] = as_double2(tmp); } + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = as_int4(u[i]).w; } + bar(WG); + for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.w = lds[i * WG + lowMe]; u[i] = as_double2(tmp); } + } +} + +#endif + + +#if FFT_FP32 | NTT_GF31 + +// Shufl two or more fft_WIDTHs or FFT_HEIGHTs using two 4-byte floats. +void OVERLOAD shufl32(u32 WG, local F2 *lds2, F2 *u, u32 n, u32 f, u32 numWG, const u32 sb, u32 lowMe) { + + u32 mask = f - 1; + assert((mask & (mask + 1)) == 0); + + //GW - would a 16 byte implementation be useful? 
+ + if (sb >= 8) { + local F2* lds = ((local F2*) lds2); + if (numWG > 1) lds += ((u32) get_local_id(0) / WG) * n * WG * sb / sizeof(F2); + + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = u[i]; } + bar(WG); + for (u32 i = 0; i < n; ++i) { u[i] = lds[i * WG + lowMe]; } + } + + else if (sb == 4) { + // Accessing lds memory as ints might be faster than F2 accesses (halving LDS memory requirements) + local F* lds = ((local F*) lds2); + if (numWG > 1) lds += ((u32) get_local_id(0) / WG) * n * WG * sb / sizeof(F); + + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = u[i].x; } + bar(WG); + for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + lowMe]; } + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (lowMe & ~mask) * n + (lowMe & mask)] = u[i].y; } + bar(WG); + for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + lowMe]; } + } +} + +#endif + + #if FFT_FP64 void OVERLOAD chainMul4(T2 *u, T2 w) { @@ -45,7 +158,7 @@ void OVERLOAD chainMul8(T2 *u, T2 w, u32 tailSquareBcast) { u[3] = cmulFancy(u[3], w3); w3.x += 1; - T2 base = cmulFancy (w3, w); + T2 base = cmulFancy(w3, w); for (int i = 4; i < 8; ++i) { u[i] = cmul(u[i], base); base = cmulFancy(base, w); @@ -106,83 +219,8 @@ T2 bcast(T2 src, u32 span) { #endif -void OVERLOAD shuflBigLDS(u32 WG, local T2 *lds, T2 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i]; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i] = lds[i * WG + me]; } -} - -void OVERLOAD shufl(u32 WG, local T2 *lds2, T2 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - local T* lds = (local T*) lds2; - - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].x; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; } - bar(); - for (u32 i 
= 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].y; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } -} - -// Same as shufl but use ints instead of doubles to reduce LDS memory requirements. -// Lower LDS requirements should let the optimizer use fewer VGPRs and increase occupancy for WIDTHs >= 1024. -// Alas, the increased occupancy does not offset extra code needed for shufl_int (the assembly -// code generated is not pretty). This might not be true for nVidia or future ROCm optimizers. -void OVERLOAD shufl_int(u32 WG, local T2 *lds2, T2 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - local int* lds = (local int*) lds2; - - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = as_int4(u[i]).x; } - bar(); - for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.x = lds[i * WG + me]; u[i] = as_double2(tmp); } - bar(); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = as_int4(u[i]).y; } - bar(); - for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.y = lds[i * WG + me]; u[i] = as_double2(tmp); } - bar(); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = as_int4(u[i]).z; } - bar(); - for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.z = lds[i * WG + me]; u[i] = as_double2(tmp); } - bar(); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = as_int4(u[i]).w; } - bar(); - for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.w = lds[i * WG + me]; u[i] = as_double2(tmp); } - bar(); // I'm not sure why this barrier call is needed -} - -// Shufl two simultaneous FFT_HEIGHTs. Needed for tailSquared where u and v are computed simultaneously in different threads. -// NOTE: It is very important for this routine to use lds memory in coordination with reverseLine2 and unreverseLine2. 
-// Failure to do so would result in the need for more bar() calls. Specifically, the u values are stored in the upper half -// of lds memory (first SMALL_HEIGHT T2 values). The v values are stored in the lower half of lds memory (next SMALL_HEIGHT T2 values). -void OVERLOAD shufl2(u32 WG, local T2 *lds2, T2 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - - // Partition lds memory into upper and lower halves - assert(WG == G_H); - - // Accessing lds memory as doubles is faster than T2 accesses - local T* lds = ((local T*) lds2) + (me / WG) * SMALL_HEIGHT; - - me = me % WG; - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].x; } - bar(WG); - for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; } - bar(WG); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].y; } - bar(WG); - for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } +void OVERLOAD shufl(u32 WG, local T2 *lds, T2 *u, u32 n, u32 f, u32 numWG, const u32 sb, u32 lowMe) { + shufl64(WG, lds, u, n, f, numWG, sb, lowMe); } void OVERLOAD tabMul(u32 WG, Trig trig, T2 *u, u32 n, u32 f, u32 me) { @@ -209,7 +247,7 @@ void OVERLOAD tabMul(u32 WG, Trig trig, T2 *u, u32 n, u32 f, u32 me) { if (TABMUL_CHAIN) { T2 w = trig[p]; - chainMul (n, u, w, 0); + chainMul(n, u, w, 0); return; } @@ -226,8 +264,7 @@ void OVERLOAD tabMul(u32 WG, Trig trig, T2 *u, u32 n, u32 f, u32 me) { } for (u32 i = 2; i < n; ++i) { - T2 base = trig[(i-1)*WG + p]; - u[i] = cmul(u[i], base); + u[i] = cmul(u[i], trig[(i-1)*WG + p]); } return; } @@ -250,7 +287,7 @@ T2 partial_cmul(T2 u, T sine_over_cosine) { #define X2_via_FMA(a, c, b) { T2 t = a; a = fma(c, b, t); b = fma(-c, b, t); } // Preload trig values for the first partial tabMul. We load the sine/cosine values early so that F64 ops can hide the read latency. 
-void preload_tabMul4_trig(u32 WG, Trig trig, T *preloads, u32 f, u32 me) { +void preload_tabMul4_trig(u32 WG, Trig trig, T *preloads, u32 f, u32 numWG, u32 me) { TrigSingle trig1 = (TrigSingle) trig; // Read 3 lines of sine/cosine values for the first fft4. Read two of the lines as a pair as AMD likes T2 global memory reads @@ -263,19 +300,20 @@ void preload_tabMul4_trig(u32 WG, Trig trig, T *preloads, u32 f, u32 me) { } // Do a partial tabMul. Save the mul-by-cosine for later FMA instructions. -void partial_tabMul4(u32 WG, local T2 *lds, Trig trig, T *preloads, T2 *u, u32 f, u32 me) { +void partial_tabMul4(u32 WG, local T2 *lds, Trig trig, T *preloads, T2 *u, u32 f, u32 numWG, u32 me) { local T *lds1 = (local T *) lds; TrigSingle trig1 = (TrigSingle) trig; trig1 += 4*WG; // Skip past sine_over_cosine values // Use LDS memory to distribute preloaded trig values. if (f > 1) { + bar(WG); lds1[me] = preloads[4]; // Preloaded sine/cosine values lds1[WG+me] = preloads[5]; // Preloaded cosine values - bar(WG); } // Apply sine/cosines + bar(WG); for (u32 i = 1; i < 4; ++i) { T sine_over_cosine; if (f == 1) sine_over_cosine = preloads[i-1]; @@ -299,13 +337,11 @@ void partial_tabMul4(u32 WG, local T2 *lds, Trig trig, T *preloads, T2 *u, u32 f preloads[2] = lds1[WG + ((me/f) & 3) * WG/4 + (2 * WG + me)/(4*f) * f/4]; preloads[3] = lds1[WG + ((me/f) & 3) * WG/4 + (3 * WG + me)/(4*f) * f/4]; preloads[1] = lds1[WG + ((me/f) & 3) * WG/4 + (1 * WG + me)/(4*f) * f/4]; - bar(WG); } } // Finish off a partial tabMul while doing next fft4 making more use of FMA. 
-void finish_tabMul4_fft4(u32 WG, local T2 *lds, Trig trig, T *preloads, T2 *u, u32 f, u32 me, u32 save_one_more_mul) { - local T *lds1 = (local T *) lds; +void finish_tabMul4_fft4(u32 WG, Trig trig, T *preloads, T2 *u, u32 f, u32 numWG, u32 me, u32 save_one_more_mul) { TrigSingle trig1 = (TrigSingle) trig; // @@ -338,7 +374,7 @@ void finish_tabMul4_fft4(u32 WG, local T2 *lds, Trig trig, T *preloads, T2 *u, u //************************************************************************************ // Preload trig values for the first partial tabMul. We load the sine/cosine values early so that F64 ops can hide the read latency. -void preload_tabMul8_trig(u32 WG, Trig trig, T *preloads, u32 f, u32 me) { +void preload_tabMul8_trig(u32 WG, Trig trig, T *preloads, u32 f, u32 numWG, u32 me) { TrigSingle trig1 = (TrigSingle) trig; // Read 7 lines of sine/cosine values for the first fft8. Read six of the lines as pairs as AMD likes T2 global memory reads @@ -353,19 +389,20 @@ void preload_tabMul8_trig(u32 WG, Trig trig, T *preloads, u32 f, u32 me) { } // Do a partial tabMul. Save the mul-by-cosine for later FMA instructions. -void partial_tabMul8(u32 WG, local T2 *lds, Trig trig, T *preloads, T2 *u, u32 f, u32 me) { +void partial_tabMul8(u32 WG, local T2 *lds, Trig trig, T *preloads, T2 *u, u32 f, u32 numWG, u32 me) { local T *lds1 = (local T *) lds; TrigSingle trig1 = (TrigSingle) trig; trig1 += 8*WG; // Skip past sine_over_cosine values // Use LDS memory to distribute preloaded trig values. 
if (f > 1) { + bar(WG); lds1[me] = preloads[8]; // Preloaded sine/cosine values lds1[WG+me] = preloads[9]; // Preloaded cosine values - bar(WG); } // Apply sine/cosines + bar(WG); for (u32 i = 1; i < 8; ++i) { T sine_over_cosine; if (f == 1) sine_over_cosine = preloads[i-1]; @@ -394,13 +431,11 @@ void partial_tabMul8(u32 WG, local T2 *lds, Trig trig, T *preloads, T2 *u, u32 f preloads[7] = lds1[WG + ((me/f) & 7) * WG/8 + (7 * WG + me)/(8*f) * f/8]; preloads[2] = lds1[WG + ((me/f) & 7) * WG/8 + (2 * WG + me)/(8*f) * f/8]; preloads[3] = lds1[WG + ((me/f) & 7) * WG/8 + (3 * WG + me)/(8*f) * f/8]; - bar(WG); } } // Finish off a partial tabMul while doing next fft8 making more use of FMA. -void finish_tabMul8_fft8(u32 WG, local T2 *lds, Trig trig, T *preloads, T2 *u, u32 f, u32 me, u32 save_one_more_mul) { - local T *lds1 = (local T *) lds; +void finish_tabMul8_fft8(u32 WG, Trig trig, T *preloads, T2 *u, u32 f, u32 numWG, u32 me, u32 save_one_more_mul) { TrigSingle trig1 = (TrigSingle) trig; // @@ -503,7 +538,7 @@ void OVERLOAD chainMul8(F2 *u, F2 w, u32 tailSquareBcast) { u[3] = cmulFancy(u[3], w3); w3.x += 1; - F2 base = cmulFancy (w3, w); + F2 base = cmulFancy(w3, w); for (int i = 4; i < 8; ++i) { u[i] = cmul(u[i], base); base = cmulFancy(base, w); @@ -517,55 +552,8 @@ void OVERLOAD chainMul(u32 len, F2 *u, F2 w, u32 tailSquareBcast) { if (len == 8) chainMul8(u, w, tailSquareBcast); } -void OVERLOAD shuflBigLDS(u32 WG, local F2 *lds, F2 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i]; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i] = lds[i * WG + me]; } -} - -void OVERLOAD shufl(u32 WG, local F2 *lds2, F2 *u, u32 n, u32 f) { //GWBUG - is shufl of int2 faster (BigLDS)? 
- u32 me = get_local_id(0); - local F* lds = (local F*) lds2; - - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].x; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; } - bar(); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].y; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } -} - -// Shufl two simultaneous FFT_HEIGHTs. Needed for tailSquared where u and v are computed simultaneously in different threads. -// NOTE: It is very important for this routine to use lds memory in coordination with reverseLine2 and unreverseLine2. -// Failure to do so would result in the need for more bar() calls. Specifically, the u values are stored in the upper half -// of lds memory (first SMALL_HEIGHT GF31 values). The v values are stored in the lower half of lds memory (next SMALL_HEIGHT GF31 values). -void OVERLOAD shufl2(u32 WG, local F2 *lds2, F2 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - - // Partition lds memory into upper and lower halves - assert(WG == G_H); - - // Accessing lds memory as F is faster than F2 accesses //GWBUG??? 
- local F* lds = ((local F*) lds2) + (me / WG) * SMALL_HEIGHT; - - me = me % WG; - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].x; } - bar(WG); - for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; } - bar(WG); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].y; } - bar(WG); - for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } +void OVERLOAD shufl(u32 WG, local F2 *lds, F2 *u, u32 n, u32 f, u32 numWG, const u32 sb, u32 lowMe) { + shufl32(WG, lds, u, n, f, numWG, sb, lowMe); } void OVERLOAD tabMul(u32 WG, TrigFP32 trig, F2 *u, u32 n, u32 f, u32 me) { @@ -574,7 +562,7 @@ void OVERLOAD tabMul(u32 WG, TrigFP32 trig, F2 *u, u32 n, u32 f, u32 me) { // This code uses chained complex multiplies which could be faster on GPUs with great mul throughput or poor memory bandwidth or caching. if (TABMUL_CHAIN32) { - chainMul (n, u, trig[p], 0); + chainMul(n, u, trig[p], 0); return; } @@ -632,55 +620,8 @@ void OVERLOAD chainMul(u32 len, GF31 *u, GF31 w) { if (len == 8) chainMul8(u, w); } -void OVERLOAD shuflBigLDS(u32 WG, local GF31 *lds, GF31 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i]; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i] = lds[i * WG + me]; } -} - -void OVERLOAD shufl(u32 WG, local GF31 *lds2, GF31 *u, u32 n, u32 f) { //GWBUG - is shufl of int2 faster (BigLDS)? 
- u32 me = get_local_id(0); - local Z31* lds = (local Z31*) lds2; - - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].x; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; } - bar(); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].y; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } -} - -// Shufl two simultaneous FFT_HEIGHTs. Needed for tailSquared where u and v are computed simultaneously in different threads. -// NOTE: It is very important for this routine to use lds memory in coordination with reverseLine2 and unreverseLine2. -// Failure to do so would result in the need for more bar() calls. Specifically, the u values are stored in the upper half -// of lds memory (first SMALL_HEIGHT GF31 values). The v values are stored in the lower half of lds memory (next SMALL_HEIGHT GF31 values). -void OVERLOAD shufl2(u32 WG, local GF31 *lds2, GF31 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - - // Partition lds memory into upper and lower halves - assert(WG == G_H); - - // Accessing lds memory as Z31s is faster than GF31 accesses //GWBUG??? 
- local Z31* lds = ((local Z31*) lds2) + (me / WG) * SMALL_HEIGHT; - - me = me % WG; - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].x; } - bar(WG); - for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; } - bar(WG); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].y; } - bar(WG); - for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } +void OVERLOAD shufl(u32 WG, local GF31 *lds, GF31 *u, u32 n, u32 f, u32 numWG, const u32 sb, u32 lowMe) { + shufl32(WG, (local F2 *) lds, (local F2 *) u, n, f, numWG, sb, lowMe); } void OVERLOAD tabMul(u32 WG, TrigGF31 trig, GF31 *u, u32 n, u32 f, u32 me) { @@ -689,7 +630,7 @@ void OVERLOAD tabMul(u32 WG, TrigGF31 trig, GF31 *u, u32 n, u32 f, u32 me) { // This code uses chained complex multiplies which could be faster on GPUs with great mul throughput or poor memory bandwidth or caching. if (TABMUL_CHAIN31) { - chainMul (n, u, trig[p]); + chainMul(n, u, trig[p]); return; } @@ -728,7 +669,7 @@ void OVERLOAD chainMul8(GF61 *u, GF61 w, u32 tailSquareBcast) { GF61 w2 = csq(w); u[2] = cmul(u[2], w2); - GF61 base = cmul (w2, w); //GWBUG - see FP64 version for many possible optimizations + GF61 base = cmul(w2, w); //GWBUG - see FP64 version for many possible optimizations for (int i = 3; i < 8; ++i) { u[i] = cmul(u[i], base); base = cmul(base, w); @@ -742,83 +683,8 @@ void OVERLOAD chainMul(u32 len, GF61 *u, GF61 w, u32 tailSquareBcast) { if (len == 8) chainMul8(u, w, tailSquareBcast); } -void OVERLOAD shuflBigLDS(u32 WG, local GF61 *lds, GF61 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i]; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i] = lds[i * WG + me]; } -} - -void OVERLOAD shufl(u32 WG, local GF61 *lds2, GF61 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - 
local Z61* lds = (local Z61*) lds2; - - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].x; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; } - bar(); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].y; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } -} - -// Same as shufl but use ints instead of Z61s to reduce LDS memory requirements. -// Lower LDS requirements should let the optimizer use fewer VGPRs and increase occupancy for WIDTHs >= 1024. -// Alas, the increased occupancy does not offset extra code needed for shufl_int (the assembly -// code generated is not pretty). This might not be true for nVidia or future ROCm optimizers. -void OVERLOAD shufl_int(u32 WG, local GF61 *lds2, GF61 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - local int* lds = (local int*) lds2; - - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = as_int4(u[i]).x; } - bar(); - for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.x = lds[i * WG + me]; u[i] = as_ulong2(tmp); } - bar(); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = as_int4(u[i]).y; } - bar(); - for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.y = lds[i * WG + me]; u[i] = as_ulong2(tmp); } - bar(); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = as_int4(u[i]).z; } - bar(); - for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.z = lds[i * WG + me]; u[i] = as_ulong2(tmp); } - bar(); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = as_int4(u[i]).w; } - bar(); - for (u32 i = 0; i < n; ++i) { int4 tmp = as_int4(u[i]); tmp.w = lds[i * WG + me]; u[i] = as_ulong2(tmp); } - bar(); // I'm not sure why this barrier call is needed -} - -// Shufl two simultaneous FFT_HEIGHTs. 
Needed for tailSquared where u and v are computed simultaneously in different threads. -// NOTE: It is very important for this routine to use lds memory in coordination with reverseLine2 and unreverseLine2. -// Failure to do so would result in the need for more bar() calls. Specifically, the u values are stored in the upper half -// of lds memory (first SMALL_HEIGHT GF61 values). The v values are stored in the lower half of lds memory (next SMALL_HEIGHT GF61 values). -void OVERLOAD shufl2(u32 WG, local GF61 *lds2, GF61 *u, u32 n, u32 f) { - u32 me = get_local_id(0); - - // Partition lds memory into upper and lower halves - assert(WG == G_H); - - // Accessing lds memory as Z61s is faster than GF61 accesses - local Z61* lds = ((local Z61*) lds2) + (me / WG) * SMALL_HEIGHT; - - me = me % WG; - u32 mask = f - 1; - assert((mask & (mask + 1)) == 0); - - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].x; } - bar(WG); - for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; } - bar(WG); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].y; } - bar(WG); - for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } +void OVERLOAD shufl(u32 WG, local GF61 *lds, GF61 *u, u32 n, u32 f, u32 numWG, const u32 sb, u32 lowMe) { + shufl64(WG, (local T2 *) lds, (T2 *) u, n, f, numWG, sb, lowMe); } void OVERLOAD tabMul(u32 WG, TrigGF61 trig, GF61 *u, u32 n, u32 f, u32 me) { @@ -827,7 +693,7 @@ void OVERLOAD tabMul(u32 WG, TrigGF61 trig, GF61 *u, u32 n, u32 f, u32 me) { // This code uses chained complex multiplies which could be faster on GPUs with great mul throughput or poor memory bandwidth or caching. 
if (TABMUL_CHAIN61) { - chainMul (n, u, trig[p], 0); + chainMul(n, u, trig[p], 0); return; } diff --git a/src/cl/fftheight.cl b/src/cl/fftheight.cl index 69fa75b8..49323782 100644 --- a/src/cl/fftheight.cl +++ b/src/cl/fftheight.cl @@ -35,76 +35,43 @@ void OVERLOAD fft_NH(T2 *u) { #error FFT_VARIANT_H == 0 only supported by AMD GPUs #endif -void OVERLOAD fft_HEIGHT(local T2 *lds, T2 *u, Trig trig, T2 w) { - for (u32 s = 1; s < SMALL_HEIGHT / NH; s *= NH) { - if (s > 1) { bar(); } - fft_NH(u); - w = bcast(w, s); - - chainMul(NH, u, w, 1); - - shufl(SMALL_HEIGHT / NH, lds, u, NH, s); - } - fft_NH(u); -} - -void OVERLOAD fft_HEIGHT2(local T2 *lds, T2 *u, Trig trig, T2 w) { +void OVERLOAD fft_HEIGHT(local T2 *lds, T2 *u, Trig trig, T2 w, u32 numWG, const u32 sb, u32 lowMe) { u32 WG = SMALL_HEIGHT / NH; - for (u32 s = 1; s < SMALL_HEIGHT / NH; s *= NH) { - if (s > 1) { bar(WG); } + for (u32 s = 1; s < WG; s *= NH) { fft_NH(u); w = bcast(w, s); chainMul(NH, u, w, 1); - shufl2(SMALL_HEIGHT / NH, lds, u, NH, s); + shufl(WG, lds, u, NH, s, numWG, sb, lowMe); } fft_NH(u); } #else -void OVERLOAD fft_HEIGHT(local T2 *lds, T2 *u, Trig trig, T2 w) { - u32 me = get_local_id(0); - -#if !UNROLL_H - __attribute__((opencl_unroll_hint(1))) -#endif - - for (u32 s = 1; s < SMALL_HEIGHT / NH; s *= NH) { - if (s > 1) { bar(); } - fft_NH(u); - tabMul(SMALL_HEIGHT / NH, trig, u, NH, s, me); - shufl(SMALL_HEIGHT / NH, lds, u, NH, s); - } - fft_NH(u); -} - -void OVERLOAD fft_HEIGHT2(local T2 *lds, T2 *u, Trig trig, T2 w) { - u32 me = get_local_id(0); +void OVERLOAD fft_HEIGHT(local T2 *lds, T2 *u, Trig trig, T2 w, u32 numWG, const u32 sb, u32 lowMe) { u32 WG = SMALL_HEIGHT / NH; #if !UNROLL_H __attribute__((opencl_unroll_hint(1))) #endif - for (u32 s = 1; s < WG; s *= NH) { - if (s > 1) { bar(WG); } fft_NH(u); - tabMul(WG, trig, u, NH, s, me % WG); - shufl2(WG, lds, u, NH, s); + tabMul(WG, trig, u, NH, s, lowMe); + shufl(WG, lds, u, NH, s, numWG, sb, lowMe); } fft_NH(u); } #endif -void 
OVERLOAD new_fft_HEIGHT2(local T2 *lds, T2 *u, Trig trig, T2 w, int callnum) { +void OVERLOAD new_fft_HEIGHT(local T2 *lds, T2 *u, Trig trig, T2 w, u32 numWG, const u32 sb, u32 lowMe, int callnum) { u32 WG = SMALL_HEIGHT / NH; - u32 me = get_local_id(0); - // This line mimics shufl2 -- partition lds into halves - local T2* partitioned_lds = lds + (me / WG) * SMALL_HEIGHT / 2; - me = me % WG; + + // This line mimics shufl -- partition lds + local T2* partitioned_lds = lds; + if (numWG > 1) partitioned_lds += ((u32) get_local_id(0) / WG) * SMALL_HEIGHT * sb / sizeof(T2); // Custom code for various SMALL_HEIGHT values @@ -116,27 +83,25 @@ void OVERLOAD new_fft_HEIGHT2(local T2 *lds, T2 *u, Trig trig, T2 w, int callnum trig += WG*4 + 2*WG*4; // Skip past old FFT_width trig values. Also skip past !save_one_more_mul trig values. // Preload trig values to hide global memory latencies. As the preloads are used, the next set of trig values are preloaded. - preload_tabMul4_trig(WG, trig, preloads, 1, me); + preload_tabMul4_trig(WG, trig, preloads, 1, numWG, lowMe); // Do first fft4, partial tabMul, and shufl. fft4(u); - partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 1, me); - shufl2(WG, lds, u, NH, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 1, numWG, lowMe); + shufl(WG, lds, u, NH, 1, numWG, sb, lowMe); // Finish the first tabMul and perform second fft4. Do second partial tabMul and shufl. - finish_tabMul4_fft4(WG, partitioned_lds, trig, preloads, u, 1, me, 1); - partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 4, me); - bar(WG); - shufl2(WG, lds, u, NH, 4); + finish_tabMul4_fft4(WG, trig, preloads, u, 1, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 4, numWG, lowMe); + shufl(WG, lds, u, NH, 4, numWG, sb, lowMe); // Finish the second tabMul and perform third fft4. Do third partial tabMul and shufl. 
- finish_tabMul4_fft4(WG, partitioned_lds, trig, preloads, u, 4, me, 1); - partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 16, me); - bar(WG); - shufl2(WG, lds, u, NH, 16); + finish_tabMul4_fft4(WG, trig, preloads, u, 4, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 16, numWG, lowMe); + shufl(WG, lds, u, NH, 16, numWG, sb, lowMe); // Finish third tabMul and perform final fft4. - finish_tabMul4_fft4(WG, partitioned_lds, trig, preloads, u, 16, me, 1); + finish_tabMul4_fft4(WG, trig, preloads, u, 16, numWG, lowMe, 1); #elif SMALL_HEIGHT == 512 && NH == 8 && FFT_VARIANT_H == 2 @@ -146,21 +111,20 @@ void OVERLOAD new_fft_HEIGHT2(local T2 *lds, T2 *u, Trig trig, T2 w, int callnum trig += WG*8 + 2*WG*8; // Skip past old FFT_width trig values. Also skip past !save_one_more_mul trig values. // Preload trig values to hide global memory latencies. As the preloads are used, the next set of trig values are preloaded. - preload_tabMul8_trig(WG, trig, preloads, 1, me); + preload_tabMul8_trig(WG, trig, preloads, 1, numWG, lowMe); // Do first fft8, partial tabMul, and shufl. fft8(u); - partial_tabMul8(WG, partitioned_lds, trig, preloads, u, 1, me); - shufl2(WG, lds, u, NH, 1); + partial_tabMul8(WG, partitioned_lds, trig, preloads, u, 1, numWG, lowMe); + shufl(WG, lds, u, NH, 1, numWG, sb, lowMe); // Finish the first tabMul and perform second fft8. Do second partial tabMul and shufl. - finish_tabMul8_fft8(WG, partitioned_lds, trig, preloads, u, 1, me, 1); - partial_tabMul8(WG, partitioned_lds, trig, preloads, u, 8, me); - bar(WG); - shufl2(WG, lds, u, NH, 8); + finish_tabMul8_fft8(WG, trig, preloads, u, 1, numWG, lowMe, 1); + partial_tabMul8(WG, partitioned_lds, trig, preloads, u, 8, numWG, lowMe); + shufl(WG, lds, u, NH, 8, numWG, sb, lowMe); // Finish second tabMul and perform final fft8. 
- finish_tabMul8_fft8(WG, partitioned_lds, trig, preloads, u, 8, me, 1); + finish_tabMul8_fft8(WG, trig, preloads, u, 8, numWG, lowMe, 1); #elif SMALL_HEIGHT == 1024 && NH == 4 && FFT_VARIANT_H == 2 @@ -170,44 +134,41 @@ void OVERLOAD new_fft_HEIGHT2(local T2 *lds, T2 *u, Trig trig, T2 w, int callnum trig += WG*4 + 2*WG*4; // Skip past old FFT_width trig values. Also skip past !save_one_more_mul trig values. // Preload trig values to hide global memory latencies. As the preloads are used, the next set of trig values are preloaded. - preload_tabMul4_trig(WG, trig, preloads, 1, me); + preload_tabMul4_trig(WG, trig, preloads, 1, numWG, lowMe); // Do first fft4, partial tabMul, and shufl. fft4(u); - partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 1, me); - shufl2(WG, lds, u, NH, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 1, numWG, lowMe); + shufl(WG, lds, u, NH, 1, numWG, sb, lowMe); // Finish the first tabMul and perform second fft4. Do second partial tabMul and shufl. - finish_tabMul4_fft4(WG, partitioned_lds, trig, preloads, u, 1, me, 1); - partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 4, me); - bar(WG); - shufl2(WG, lds, u, NH, 4); + finish_tabMul4_fft4(WG, trig, preloads, u, 1, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 4, numWG, lowMe); + shufl(WG, lds, u, NH, 4, numWG, sb, lowMe); // Finish the second tabMul and perform third fft4. Do third partial tabMul and shufl. - finish_tabMul4_fft4(WG, partitioned_lds, trig, preloads, u, 4, me, 1); - partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 16, me); - bar(WG); - shufl2(WG, lds, u, NH, 16); + finish_tabMul4_fft4(WG, trig, preloads, u, 4, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 16, numWG, lowMe); + shufl(WG, lds, u, NH, 16, numWG, sb, lowMe); // Finish the third tabMul and perform fourth fft4. Do fourth partial tabMul and shufl. 
- finish_tabMul4_fft4(WG, partitioned_lds, trig, preloads, u, 16, me, 1); - partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 64, me); - bar(WG); - shufl2(WG, lds, u, NH, 64); + finish_tabMul4_fft4(WG, trig, preloads, u, 16, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 64, numWG, lowMe); + shufl(WG, lds, u, NH, 64, numWG, sb, lowMe); // Finish fourth tabMul and perform final fft4. - finish_tabMul4_fft4(WG, partitioned_lds, trig, preloads, u, 64, me, 1); + finish_tabMul4_fft4(WG, trig, preloads, u, 64, numWG, lowMe, 1); #else // Old version - fft_HEIGHT2(lds, u, trig, w); + fft_HEIGHT(lds, u, trig, w, numWG, sb, lowMe); #endif } -void new_fft_HEIGHT2_1(local T2 *lds, T2 *u, Trig trig, T2 w) { new_fft_HEIGHT2(lds, u, trig, w, 1); } -void new_fft_HEIGHT2_2(local T2 *lds, T2 *u, Trig trig, T2 w) { new_fft_HEIGHT2(lds, u, trig, w, 2); } +void new_fft_HEIGHT1(local T2 *lds, T2 *u, Trig trig, T2 w, u32 numWG, const u32 sb, u32 lowMe) { new_fft_HEIGHT(lds, u, trig, w, numWG, sb, lowMe, 1); } +void new_fft_HEIGHT2(local T2 *lds, T2 *u, Trig trig, T2 w, u32 numWG, const u32 sb, u32 lowMe) { new_fft_HEIGHT(lds, u, trig, w, numWG, sb, lowMe, 2); } #endif @@ -228,41 +189,22 @@ void OVERLOAD fft_NH(F2 *u) { #endif } -void OVERLOAD fft_HEIGHT(local F2 *lds, F2 *u, TrigFP32 trig) { - u32 me = get_local_id(0); - -#if !UNROLL_H - __attribute__((opencl_unroll_hint(1))) -#endif - - for (u32 s = 1; s < SMALL_HEIGHT / NH; s *= NH) { - if (s > 1) { bar(); } - fft_NH(u); - tabMul(SMALL_HEIGHT / NH, trig, u, NH, s, me); - shufl(SMALL_HEIGHT / NH, lds, u, NH, s); - } - fft_NH(u); -} - -void OVERLOAD fft_HEIGHT2(local F2 *lds, F2 *u, TrigFP32 trig) { - u32 me = get_local_id(0); +void OVERLOAD fft_HEIGHT(local F2 *lds, F2 *u, TrigFP32 trig, u32 numWG, const u32 sb, u32 lowMe) { u32 WG = SMALL_HEIGHT / NH; #if !UNROLL_H __attribute__((opencl_unroll_hint(1))) #endif - for (u32 s = 1; s < WG; s *= NH) { - if (s > 1) { bar(WG); } fft_NH(u); - tabMul(WG, trig, 
u, NH, s, me % WG); - shufl2(WG, lds, u, NH, s); + tabMul(WG, trig, u, NH, s, lowMe); + shufl(WG, lds, u, NH, s, numWG, sb, lowMe); } fft_NH(u); } -void new_fft_HEIGHT2_1(local F2 *lds, F2 *u, TrigFP32 trig) { fft_HEIGHT2(lds, u, trig); } -void new_fft_HEIGHT2_2(local F2 *lds, F2 *u, TrigFP32 trig) { fft_HEIGHT2(lds, u, trig); } +void new_fft_HEIGHT1(local F2 *lds, F2 *u, TrigFP32 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_HEIGHT(lds, u, trig, numWG, sb, lowMe); } +void new_fft_HEIGHT2(local F2 *lds, F2 *u, TrigFP32 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_HEIGHT(lds, u, trig, numWG, sb, lowMe); } #endif @@ -283,41 +225,22 @@ void OVERLOAD fft_NH(GF31 *u) { #endif } -void OVERLOAD fft_HEIGHT(local GF31 *lds, GF31 *u, TrigGF31 trig) { - u32 me = get_local_id(0); - -#if !UNROLL_H - __attribute__((opencl_unroll_hint(1))) -#endif - - for (u32 s = 1; s < SMALL_HEIGHT / NH; s *= NH) { - if (s > 1) { bar(); } - fft_NH(u); - tabMul(SMALL_HEIGHT / NH, trig, u, NH, s, me); - shufl(SMALL_HEIGHT / NH, lds, u, NH, s); - } - fft_NH(u); -} - -void OVERLOAD fft_HEIGHT2(local GF31 *lds, GF31 *u, TrigGF31 trig) { - u32 me = get_local_id(0); +void OVERLOAD fft_HEIGHT(local GF31 *lds, GF31 *u, TrigGF31 trig, u32 numWG, const u32 sb, u32 lowMe) { u32 WG = SMALL_HEIGHT / NH; #if !UNROLL_H __attribute__((opencl_unroll_hint(1))) #endif - for (u32 s = 1; s < WG; s *= NH) { - if (s > 1) { bar(WG); } fft_NH(u); - tabMul(WG, trig, u, NH, s, me % WG); - shufl2(WG, lds, u, NH, s); + tabMul(WG, trig, u, NH, s, lowMe); + shufl(WG, lds, u, NH, s, numWG, sb, lowMe); } fft_NH(u); } -void OVERLOAD new_fft_HEIGHT2_1(local GF31 *lds, GF31 *u, TrigGF31 trig) { fft_HEIGHT2(lds, u, trig); } -void OVERLOAD new_fft_HEIGHT2_2(local GF31 *lds, GF31 *u, TrigGF31 trig) { fft_HEIGHT2(lds, u, trig); } +void OVERLOAD new_fft_HEIGHT1(local GF31 *lds, GF31 *u, TrigGF31 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_HEIGHT(lds, u, trig, numWG, sb, lowMe); } +void OVERLOAD new_fft_HEIGHT2(local GF31 *lds, 
GF31 *u, TrigGF31 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_HEIGHT(lds, u, trig, numWG, sb, lowMe); } #endif @@ -338,40 +261,21 @@ void OVERLOAD fft_NH(GF61 *u) { #endif } -void OVERLOAD fft_HEIGHT(local GF61 *lds, GF61 *u, TrigGF61 trig) { - u32 me = get_local_id(0); - -#if !UNROLL_H - __attribute__((opencl_unroll_hint(1))) -#endif - - for (u32 s = 1; s < SMALL_HEIGHT / NH; s *= NH) { - if (s > 1) { bar(); } - fft_NH(u); - tabMul(SMALL_HEIGHT / NH, trig, u, NH, s, me); - shufl(SMALL_HEIGHT / NH, lds, u, NH, s); - } - fft_NH(u); -} - -void OVERLOAD fft_HEIGHT2(local GF61 *lds, GF61 *u, TrigGF61 trig) { - u32 me = get_local_id(0); +void OVERLOAD fft_HEIGHT(local GF61 *lds, GF61 *u, TrigGF61 trig, u32 numWG, const u32 sb, u32 lowMe) { u32 WG = SMALL_HEIGHT / NH; #if !UNROLL_H __attribute__((opencl_unroll_hint(1))) #endif - for (u32 s = 1; s < WG; s *= NH) { - if (s > 1) { bar(WG); } fft_NH(u); - tabMul(WG, trig, u, NH, s, me % WG); - shufl2(WG, lds, u, NH, s); + tabMul(WG, trig, u, NH, s, lowMe); + shufl(WG, lds, u, NH, s, numWG, sb, lowMe); } fft_NH(u); } -void OVERLOAD new_fft_HEIGHT2_1(local GF61 *lds, GF61 *u, TrigGF61 trig) { fft_HEIGHT2(lds, u, trig); } -void OVERLOAD new_fft_HEIGHT2_2(local GF61 *lds, GF61 *u, TrigGF61 trig) { fft_HEIGHT2(lds, u, trig); } +void OVERLOAD new_fft_HEIGHT1(local GF61 *lds, GF61 *u, TrigGF61 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_HEIGHT(lds, u, trig, numWG, sb, lowMe); } +void OVERLOAD new_fft_HEIGHT2(local GF61 *lds, GF61 *u, TrigGF61 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_HEIGHT(lds, u, trig, numWG, sb, lowMe); } #endif diff --git a/src/cl/ffthin.cl b/src/cl/ffthin.cl index ae14ec53..82b3377d 100644 --- a/src/cl/ffthin.cl +++ b/src/cl/ffthin.cl @@ -8,11 +8,11 @@ // Do an FFT Height after an fftMiddleIn (which may not have fully transposed data, leading to non-sequential input) KERNEL(G_H) fftHin(P(T2) out, CP(T2) in, Trig smallTrig) { - local T2 lds[SMALL_HEIGHT / 2]; - + const u32 lds_bytes = SMALL_HEIGHT 
* SHUFL_BYTES_H; + local T2 lds[lds_bytes / sizeof(T2)]; + T2 u[NH]; u32 g = get_group_id(0); - u32 me = get_local_id(0); readTailFusedLine(in, u, g, me); @@ -23,7 +23,7 @@ KERNEL(G_H) fftHin(P(T2) out, CP(T2) in, Trig smallTrig) { T2 w = slowTrig_N(ND / SMALL_HEIGHT * me, ND / NH); #endif - fft_HEIGHT(lds, u, smallTrig, w); + fft_HEIGHT(lds, u, smallTrig, w, 1, SHUFL_BYTES_H, me); write(G_H, NH, u, out, SMALL_HEIGHT * transPos(g, MIDDLE, WIDTH)); } @@ -39,7 +39,8 @@ KERNEL(G_H) fftHin(P(T2) out, CP(T2) in, Trig smallTrig) { // Do an FFT Height after an fftMiddleIn (which may not have fully transposed data, leading to non-sequential input) KERNEL(G_H) fftHin(P(T2) out, CP(T2) in, Trig smallTrig) { - local F2 lds[SMALL_HEIGHT / 2]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local F2 lds[lds_bytes / sizeof(F2)]; CP(F2) inF2 = (CP(F2)) in; P(F2) outF2 = (P(F2)) out; @@ -57,7 +58,7 @@ KERNEL(G_H) fftHin(P(T2) out, CP(T2) in, Trig smallTrig) { F2 w = slowTrig_N(ND / SMALL_HEIGHT * me, ND / NH); #endif - fft_HEIGHT(lds, u, smallTrigF2); + fft_HEIGHT(lds, u, smallTrigF2, 1, SHUFL_BYTES_H, me); write(G_H, NH, u, outF2, SMALL_HEIGHT * transPos(g, MIDDLE, WIDTH)); } @@ -73,7 +74,8 @@ KERNEL(G_H) fftHin(P(T2) out, CP(T2) in, Trig smallTrig) { // Do an FFT Height after an fftMiddleIn (which may not have fully transposed data, leading to non-sequential input) KERNEL(G_H) fftHinGF31(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF31 lds[SMALL_HEIGHT / 2]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF31 lds[lds_bytes / sizeof(GF31)]; CP(GF31) in31 = (CP(GF31)) (in + DISTGF31); P(GF31) out31 = (P(GF31)) (out + DISTGF31); @@ -81,12 +83,11 @@ KERNEL(G_H) fftHinGF31(P(T2) out, CP(T2) in, Trig smallTrig) { GF31 u[NH]; u32 g = get_group_id(0); - u32 me = get_local_id(0); readTailFusedLine(in31, u, g, me); - fft_HEIGHT(lds, u, smallTrig31); + fft_HEIGHT(lds, u, smallTrig31, 1, SHUFL_BYTES_H, me); write(G_H, NH, u, out31, SMALL_HEIGHT * transPos(g, 
MIDDLE, WIDTH)); } @@ -102,7 +103,8 @@ KERNEL(G_H) fftHinGF31(P(T2) out, CP(T2) in, Trig smallTrig) { // Do an FFT Height after an fftMiddleIn (which may not have fully transposed data, leading to non-sequential input) KERNEL(G_H) fftHinGF61(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF61 lds[SMALL_HEIGHT / 2]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF61 lds[lds_bytes / sizeof(GF61)]; CP(GF61) in61 = (CP(GF61)) (in + DISTGF61); P(GF61) out61 = (P(GF61)) (out + DISTGF61); @@ -110,12 +112,11 @@ KERNEL(G_H) fftHinGF61(P(T2) out, CP(T2) in, Trig smallTrig) { GF61 u[NH]; u32 g = get_group_id(0); - u32 me = get_local_id(0); readTailFusedLine(in61, u, g, me); - fft_HEIGHT(lds, u, smallTrig61); + fft_HEIGHT(lds, u, smallTrig61, 1, SHUFL_BYTES_H, me); write(G_H, NH, u, out61, SMALL_HEIGHT * transPos(g, MIDDLE, WIDTH)); } diff --git a/src/cl/fftp.cl b/src/cl/fftp.cl index 2fe0d7d4..21cacf77 100644 --- a/src/cl/fftp.cl +++ b/src/cl/fftp.cl @@ -10,7 +10,7 @@ // fftPremul: weight words with IBDWT weights followed by FFT-width. KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTab THREAD_WEIGHTS) { - local T2 lds[WIDTH / 2]; + local T2 lds[WIDTH * SHUFL_BYTES_W / sizeof(T2)]; T2 u[NW]; u32 g = get_group_id(0); @@ -27,9 +27,9 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTab THREAD_WEIGHTS) u[i] = U2(in[p].x * w1, in[p].y * w2); } - fft_WIDTH(lds, u, smallTrig); + fft_WIDTH(lds, u, smallTrig, 1, SHUFL_BYTES_W, me); - writeCarryFusedLine(u, out, g); + writeCarryFusedLine(u, out, g, me); } @@ -41,7 +41,7 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTab THREAD_WEIGHTS) // fftPremul: weight words with IBDWT weights followed by FFT-width. 
KERNEL(G_W) fftP(P(F2) out, CP(Word2) in, TrigFP32 smallTrig, BigTabFP32 THREAD_WEIGHTS) { - local F2 lds[WIDTH / 2]; + local F2 lds[WIDTH * SHUFL_BYTES_W / sizeof(F2)]; F2 u[NW]; u32 g = get_group_id(0); @@ -58,9 +58,9 @@ KERNEL(G_W) fftP(P(F2) out, CP(Word2) in, TrigFP32 smallTrig, BigTabFP32 THREAD_ u[i] = U2(in[p].x * w1, in[p].y * w2); } - fft_WIDTH(lds, u, smallTrig); + fft_WIDTH(lds, u, smallTrig, 1, SHUFL_BYTES_W, me); - writeCarryFusedLine(u, out, g); + writeCarryFusedLine(u, out, g, me); } @@ -72,7 +72,7 @@ KERNEL(G_W) fftP(P(F2) out, CP(Word2) in, TrigFP32 smallTrig, BigTabFP32 THREAD_ // fftPremul: weight words with IBDWT weights followed by FFT-width. KERNEL(G_W) fftP(P(GF31) out, CP(Word2) in, TrigGF31 smallTrig) { - local GF31 lds[WIDTH / 2]; + local GF31 lds[WIDTH * SHUFL_BYTES_W / sizeof(GF31)]; GF31 u[NW]; u32 g = get_group_id(0); @@ -113,9 +113,9 @@ KERNEL(G_W) fftP(P(GF31) out, CP(Word2) in, TrigGF31 smallTrig) { if (weight_shift > 31) weight_shift -= 31; } - fft_WIDTH(lds, u, smallTrig); + fft_WIDTH(lds, u, smallTrig, 1, SHUFL_BYTES_W, me); - writeCarryFusedLine(u, out, g); + writeCarryFusedLine(u, out, g, me); } @@ -127,7 +127,7 @@ KERNEL(G_W) fftP(P(GF31) out, CP(Word2) in, TrigGF31 smallTrig) { // fftPremul: weight words with IBDWT weights followed by FFT-width. KERNEL(G_W) fftP(P(GF61) out, CP(Word2) in, TrigGF61 smallTrig) { - local GF61 lds[WIDTH / 2]; + local GF61 lds[WIDTH * SHUFL_BYTES_W / sizeof(GF61)]; GF61 u[NW]; u32 g = get_group_id(0); @@ -170,9 +170,9 @@ KERNEL(G_W) fftP(P(GF61) out, CP(Word2) in, TrigGF61 smallTrig) { if (weight_shift > 61) weight_shift -= 61; } - fft_WIDTH(lds, u, smallTrig); + fft_WIDTH(lds, u, smallTrig, 1, SHUFL_BYTES_W, me); - writeCarryFusedLine(u, out, g); + writeCarryFusedLine(u, out, g, me); } @@ -184,7 +184,7 @@ KERNEL(G_W) fftP(P(GF61) out, CP(Word2) in, TrigGF61 smallTrig) { // fftPremul: weight words with IBDWT weights followed by FFT-width. 
KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTab THREAD_WEIGHTS) { - local T2 lds[WIDTH / 2]; + local T2 lds[WIDTH * SHUFL_BYTES_W / sizeof(T2)]; local GF31 *lds31 = (local GF31 *) lds; T2 u[NW]; GF31 u31[NW]; @@ -236,11 +236,11 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTab THREAD_WEIGHTS) if (weight_shift > 31) weight_shift -= 31; } - fft_WIDTH(lds, u, smallTrig); - writeCarryFusedLine(u, out, g); - bar(); - fft_WIDTH(lds31, u31, smallTrig31); - writeCarryFusedLine(u31, out31, g); + fft_WIDTH(lds, u, smallTrig, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(u, out, g, me); + + fft_WIDTH(lds31, u31, smallTrig31, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(u31, out31, g, me); } @@ -252,7 +252,7 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTab THREAD_WEIGHTS) // fftPremul: weight words with IBDWT weights followed by FFT-width. KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTabFP32 THREAD_WEIGHTS) { - local F2 ldsF2[WIDTH / 2]; + local F2 ldsF2[WIDTH * SHUFL_BYTES_W / sizeof(F2)]; local GF31 *lds31 = (local GF31 *) ldsF2; F2 uF2[NW]; GF31 u31[NW]; @@ -306,11 +306,11 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTabFP32 THREAD_WEIG if (weight_shift > 31) weight_shift -= 31; } - fft_WIDTH(ldsF2, uF2, smallTrigF2); - writeCarryFusedLine(uF2, outF2, g); - bar(); - fft_WIDTH(lds31, u31, smallTrig31); - writeCarryFusedLine(u31, out31, g); + fft_WIDTH(ldsF2, uF2, smallTrigF2, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(uF2, outF2, g, me); + + fft_WIDTH(lds31, u31, smallTrig31, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(u31, out31, g, me); } @@ -322,7 +322,7 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTabFP32 THREAD_WEIG // fftPremul: weight words with IBDWT weights followed by FFT-width. 
KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTabFP32 THREAD_WEIGHTS) { - local GF61 lds61[WIDTH / 2]; + local GF61 lds61[WIDTH * SHUFL_BYTES_W / sizeof(GF61)]; local F2 *ldsF2 = (local F2 *) lds61; F2 uF2[NW]; GF61 u61[NW]; @@ -376,11 +376,11 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTabFP32 THREAD_WEIG if (weight_shift > 61) weight_shift -= 61; } - fft_WIDTH(ldsF2, uF2, smallTrigF2); - writeCarryFusedLine(uF2, outF2, g); - bar(); - fft_WIDTH(lds61, u61, smallTrig61); - writeCarryFusedLine(u61, out61, g); + fft_WIDTH(ldsF2, uF2, smallTrigF2, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(uF2, outF2, g, me); + + fft_WIDTH(lds61, u61, smallTrig61, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(u61, out61, g, me); } @@ -392,7 +392,7 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTabFP32 THREAD_WEIG // fftPremul: weight words with IBDWT weights followed by FFT-width. KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig) { - local GF61 lds61[WIDTH / 2]; + local GF61 lds61[WIDTH * SHUFL_BYTES_W / sizeof(GF61)]; local GF31 *lds31 = (local GF31 *) lds61; GF31 u31[NW]; GF61 u61[NW]; @@ -459,11 +459,11 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig) { m61_weight_shift = adjust_m61_weight_shift(m61_weight_shift); } - fft_WIDTH(lds31, u31, smallTrig31); - writeCarryFusedLine(u31, out31, g); - bar(); - fft_WIDTH(lds61, u61, smallTrig61); - writeCarryFusedLine(u61, out61, g); + fft_WIDTH(lds31, u31, smallTrig31, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(u31, out31, g, me); + + fft_WIDTH(lds61, u61, smallTrig61, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(u61, out61, g, me); } @@ -475,7 +475,7 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig) { // fftPremul: weight words with IBDWT weights followed by FFT-width. 
KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTabFP32 THREAD_WEIGHTS) { - local GF61 lds61[WIDTH / 2]; + local GF61 lds61[WIDTH * SHUFL_BYTES_W / sizeof(GF61)]; local F2 *ldsF2 = (local F2 *) lds61; local GF31 *lds31 = (local GF31 *) lds61; F2 uF2[NW]; @@ -551,14 +551,14 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTabFP32 THREAD_WEIG m61_weight_shift = adjust_m61_weight_shift(m61_weight_shift); } - fft_WIDTH(ldsF2, uF2, smallTrigF2); - writeCarryFusedLine(uF2, outF2, g); - bar(); - fft_WIDTH(lds31, u31, smallTrig31); - writeCarryFusedLine(u31, out31, g); - bar(); - fft_WIDTH(lds61, u61, smallTrig61); - writeCarryFusedLine(u61, out61, g); + fft_WIDTH(ldsF2, uF2, smallTrigF2, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(uF2, outF2, g, me); + + fft_WIDTH(lds31, u31, smallTrig31, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(u31, out31, g, me); + + fft_WIDTH(lds61, u61, smallTrig61, 1, SHUFL_BYTES_W, me); + writeCarryFusedLine(u61, out61, g, me); } diff --git a/src/cl/fftw.cl b/src/cl/fftw.cl index a19b26d0..89277649 100644 --- a/src/cl/fftw.cl +++ b/src/cl/fftw.cl @@ -9,13 +9,14 @@ // Do the ending fft_WIDTH after an fftMiddleOut. This is the same as the first half of carryFused. KERNEL(G_W) fftW(P(T2) out, CP(T2) in, Trig smallTrig) { - local T2 lds[WIDTH / 2]; + local T2 lds[WIDTH * SHUFL_BYTES_W / sizeof(T2)]; T2 u[NW]; u32 g = get_group_id(0); + u32 me = get_local_id(0); - readCarryFusedLine(in, u, g); - fft_WIDTH(lds, u, smallTrig); + readCarryFusedLine(in, u, g, me); + fft_WIDTH(lds, u, smallTrig, 1, SHUFL_BYTES_W, me); out += WIDTH * g; write(G_W, NW, u, out, 0); } @@ -31,7 +32,7 @@ KERNEL(G_W) fftW(P(T2) out, CP(T2) in, Trig smallTrig) { // Do the ending fft_WIDTH after an fftMiddleOut. This is the same as the first half of carryFused. 
KERNEL(G_W) fftW(P(T2) out, CP(T2) in, Trig smallTrig) { - local F2 lds[WIDTH / 2]; + local F2 lds[WIDTH * SHUFL_BYTES_W / sizeof(F2)]; CP(F2) inF2 = (CP(F2)) in; P(F2) outF2 = (P(F2)) out; @@ -39,9 +40,10 @@ KERNEL(G_W) fftW(P(T2) out, CP(T2) in, Trig smallTrig) { F2 u[NW]; u32 g = get_group_id(0); + u32 me = get_local_id(0); - readCarryFusedLine(inF2, u, g); - fft_WIDTH(lds, u, smallTrigF2); + readCarryFusedLine(inF2, u, g, me); + fft_WIDTH(lds, u, smallTrigF2, 1, SHUFL_BYTES_W, me); outF2 += WIDTH * g; write(G_W, NW, u, outF2, 0); } @@ -56,7 +58,7 @@ KERNEL(G_W) fftW(P(T2) out, CP(T2) in, Trig smallTrig) { #if NTT_GF31 KERNEL(G_W) fftWGF31(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF31 lds[WIDTH / 2]; + local GF31 lds[WIDTH * SHUFL_BYTES_W / sizeof(GF31)]; CP(GF31) in31 = (CP(GF31)) (in + DISTGF31); P(GF31) out31 = (P(GF31)) (out + DISTGF31); @@ -64,9 +66,10 @@ KERNEL(G_W) fftWGF31(P(T2) out, CP(T2) in, Trig smallTrig) { GF31 u[NW]; u32 g = get_group_id(0); + u32 me = get_local_id(0); - readCarryFusedLine(in31, u, g); - fft_WIDTH(lds, u, smallTrig31); + readCarryFusedLine(in31, u, g, me); + fft_WIDTH(lds, u, smallTrig31, 1, SHUFL_BYTES_W, me); out31 += WIDTH * g; write(G_W, NW, u, out31, 0); } @@ -81,7 +84,7 @@ KERNEL(G_W) fftWGF31(P(T2) out, CP(T2) in, Trig smallTrig) { #if NTT_GF61 KERNEL(G_W) fftWGF61(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF61 lds[WIDTH / 2]; + local GF61 lds[WIDTH * SHUFL_BYTES_W / sizeof(GF61)]; CP(GF61) in61 = (CP(GF61)) (in + DISTGF61); P(GF61) out61 = (P(GF61)) (out + DISTGF61); @@ -89,9 +92,10 @@ KERNEL(G_W) fftWGF61(P(T2) out, CP(T2) in, Trig smallTrig) { GF61 u[NW]; u32 g = get_group_id(0); + u32 me = get_local_id(0); - readCarryFusedLine(in61, u, g); - fft_WIDTH(lds, u, smallTrig61); + readCarryFusedLine(in61, u, g, me); + fft_WIDTH(lds, u, smallTrig61, 1, SHUFL_BYTES_W, me); out61 += WIDTH * g; write(G_W, NW, u, out61, 0); } diff --git a/src/cl/fftwidth.cl b/src/cl/fftwidth.cl index d0589ab0..03935149 100644 --- 
a/src/cl/fftwidth.cl +++ b/src/cl/fftwidth.cl @@ -29,39 +29,38 @@ void OVERLOAD fft_NW(T2 *u) { #error FFT_VARIANT_W == 0 only supported by AMD GPUs #endif -void OVERLOAD fft_WIDTH(local T2 *lds, T2 *u, Trig trig) { - u32 me = get_local_id(0); +void OVERLOAD fft_WIDTH(local T2 *lds, T2 *u, Trig trig, u32 numWG, const u32 sb, u32 lowMe) { + u32 WG = WIDTH / NW; + #if NW == 8 - T2 w = fancyTrig_N(ND / WIDTH * me); + T2 w = fancyTrig_N(ND / WIDTH * lowMe); #else - T2 w = slowTrig_N(ND / WIDTH * me, ND / NW); + T2 w = slowTrig_N(ND / WIDTH * lowMe, ND / NW); #endif - for (u32 s = 1; s < WIDTH / NW; s *= NW) { - if (s > 1) { bar(); } + for (u32 s = 1; s < WG; s *= NW) { fft_NW(u); w = bcast(w, s); chainMul(NW, u, w, 0); - shufl( WIDTH / NW, lds, u, NW, s); + shufl(WG, lds, u, NW, s, numWG, sb, lowMe); } fft_NW(u); } #else -void OVERLOAD fft_WIDTH(local T2 *lds, T2 *u, Trig trig) { - u32 me = get_local_id(0); +void OVERLOAD fft_WIDTH(local T2 *lds, T2 *u, Trig trig, u32 numWG, const u32 sb, u32 lowMe) { + u32 WG = WIDTH / NW; #if !UNROLL_W __attribute__((opencl_unroll_hint(1))) #endif - for (u32 s = 1; s < WIDTH / NW; s *= NW) { - if (s > 1) { bar(); } + for (u32 s = 1; s < WG; s *= NW) { fft_NW(u); - tabMul(WIDTH / NW, trig, u, NW, s, me); - shufl(WIDTH / NW, lds, u, NW, s); + tabMul(WG, trig, u, NW, s, lowMe); + shufl(WG, lds, u, NW, s, numWG, sb, lowMe); } fft_NW(u); } @@ -74,9 +73,12 @@ void OVERLOAD fft_WIDTH(local T2 *lds, T2 *u, Trig trig) { // To maximize FMA opportunities we precompute trig values as cosine and sine/cosine rather than cosine and sine. // The downside is sine/cosine cannot be computed with chained multiplies. 
-void OVERLOAD new_fft_WIDTH(local T2 *lds, T2 *u, Trig trig, int callnum) { +void OVERLOAD new_fft_WIDTH(local T2 *lds, T2 *u, Trig trig, u32 numWG, u32 lowMe, const u32 sb, int callnum) { u32 WG = WIDTH / NW; - u32 me = get_local_id(0); + + // This line mimics shufl -- partition lds + local T2* partitioned_lds = lds; + if (numWG > 1) partitioned_lds += ((u32) get_local_id(0) / WG) * WIDTH * sb / sizeof(T2); // Custom code for various WIDTH values @@ -88,27 +90,25 @@ void OVERLOAD new_fft_WIDTH(local T2 *lds, T2 *u, Trig trig, int callnum) { trig += WG*4 + 2*WG*4; // Skip past old FFT_width trig values. Also skip past !save_one_more_mul trig values. // Preload trig values to hide global memory latencies. As the preloads are used, the next set of trig values are preloaded. - preload_tabMul4_trig(WG, trig, preloads, 1, me); + preload_tabMul4_trig(WG, trig, preloads, 1, numWG, lowMe); // Do first fft4, partial tabMul, and shufl. fft4(u); - partial_tabMul4(WG, lds, trig, preloads, u, 1, me); - shufl(WG, lds, u, NW, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 1, numWG, lowMe); + shufl(WG, lds, u, NW, 1, numWG, sb, lowMe); // Finish the first tabMul and perform second fft4. Do second partial tabMul and shufl. - finish_tabMul4_fft4(WG, lds, trig, preloads, u, 1, me, 1); - partial_tabMul4(WG, lds, trig, preloads, u, 4, me); - bar(WG); - shufl(WG, lds, u, NW, 4); + finish_tabMul4_fft4(WG, trig, preloads, u, 1, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 4, numWG, lowMe); + shufl(WG, lds, u, NW, 4, numWG, sb, lowMe); // Finish the second tabMul and perform third fft4. Do third partial tabMul and shufl. 
- finish_tabMul4_fft4(WG, lds, trig, preloads, u, 4, me, 1); - partial_tabMul4(WG, lds, trig, preloads, u, 16, me); - bar(WG); - shufl(WG, lds, u, NW, 16); + finish_tabMul4_fft4(WG, trig, preloads, u, 4, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 16, numWG, lowMe); + shufl(WG, lds, u, NW, 16, numWG, sb, lowMe); // Finish third tabMul and perform final fft4. - finish_tabMul4_fft4(WG, lds, trig, preloads, u, 16, me, 1); + finish_tabMul4_fft4(WG, trig, preloads, u, 16, numWG, lowMe, 1); #elif WIDTH == 512 && NW == 8 && FFT_VARIANT_W == 2 @@ -118,21 +118,20 @@ void OVERLOAD new_fft_WIDTH(local T2 *lds, T2 *u, Trig trig, int callnum) { trig += WG*8; // Skip past old FFT_width trig values. // Preload trig values to hide global memory latencies. As the preloads are used, the next set of trig values are preloaded. - preload_tabMul8_trig(WG, trig, preloads, 1, me); + preload_tabMul8_trig(WG, trig, preloads, 1, numWG, lowMe); // Do first fft8, partial tabMul, and shufl. fft8(u); - partial_tabMul8(WG, lds, trig, preloads, u, 1, me); - shufl(WG, lds, u, NW, 1); + partial_tabMul8(WG, partitioned_lds, trig, preloads, u, 1, numWG, lowMe); + shufl(WG, lds, u, NW, 1, numWG, sb, lowMe); // Finish the first tabMul and perform second fft8. Do second partial tabMul and shufl. - finish_tabMul8_fft8(WG, lds, trig, preloads, u, 1, me, 0); // We'd rather set save_one_more_mul to 1 - partial_tabMul8(WG, lds, trig, preloads, u, 8, me); - bar(); - shufl(WG, lds, u, NW, 8); + finish_tabMul8_fft8(WG, trig, preloads, u, 1, numWG, lowMe, 0); // We'd rather set save_one_more_mul to 1 + partial_tabMul8(WG, partitioned_lds, trig, preloads, u, 8, numWG, lowMe); + shufl(WG, lds, u, NW, 8, numWG, sb, lowMe); // Finish second tabMul and perform final fft8. 
- finish_tabMul8_fft8(WG, lds, trig, preloads, u, 8, me, 0); // We'd rather set save_one_more_mul to 1 + finish_tabMul8_fft8(WG, trig, preloads, u, 8, numWG, lowMe, 0); // We'd rather set save_one_more_mul to 1 #elif WIDTH == 1024 && NW == 4 && FFT_VARIANT_W == 2 @@ -142,33 +141,30 @@ void OVERLOAD new_fft_WIDTH(local T2 *lds, T2 *u, Trig trig, int callnum) { trig += WG*4 + 2*WG*4; // Skip past old FFT_width trig values. Also skip past !save_one_more_mul trig values. // Preload trig values to hide global memory latencies. As the preloads are used, the next set of trig values are preloaded. - preload_tabMul4_trig(WG, trig, preloads, 1, me); + preload_tabMul4_trig(WG, trig, preloads, 1, numWG, lowMe); // Do first fft4, partial tabMul, and shufl. fft4(u); - partial_tabMul4(WG, lds, trig, preloads, u, 1, me); - shufl(WG, lds, u, NW, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 1, numWG, lowMe); + shufl(WG, lds, u, NW, 1, numWG, sb, lowMe); // Finish the first tabMul and perform second fft4. Do second partial tabMul and shufl. - finish_tabMul4_fft4(WG, lds, trig, preloads, u, 1, me, 1); - partial_tabMul4(WG, lds, trig, preloads, u, 4, me); - bar(WG); - shufl(WG, lds, u, NW, 4); + finish_tabMul4_fft4(WG, trig, preloads, u, 1, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 4, numWG, lowMe); + shufl(WG, lds, u, NW, 4, numWG, sb, lowMe); // Finish the second tabMul and perform third fft4. Do third partial tabMul and shufl. - finish_tabMul4_fft4(WG, lds, trig, preloads, u, 4, me, 1); - partial_tabMul4(WG, lds, trig, preloads, u, 16, me); - bar(WG); - shufl(WG, lds, u, NW, 16); + finish_tabMul4_fft4(WG, trig, preloads, u, 4, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 16, numWG, lowMe); + shufl(WG, lds, u, NW, 16, numWG, sb, lowMe); // Finish the third tabMul and perform fourth fft4. Do fourth partial tabMul and shufl. 
- finish_tabMul4_fft4(WG, lds, trig, preloads, u, 16, me, 1); - partial_tabMul4(WG, lds, trig, preloads, u, 64, me); - bar(WG); - shufl(WG, lds, u, NW, 64); + finish_tabMul4_fft4(WG, trig, preloads, u, 16, numWG, lowMe, 1); + partial_tabMul4(WG, partitioned_lds, trig, preloads, u, 64, numWG, lowMe); + shufl(WG, lds, u, NW, 64, numWG, sb, lowMe); // Finish fourth tabMul and perform final fft4. - finish_tabMul4_fft4(WG, lds, trig, preloads, u, 64, me, 1); + finish_tabMul4_fft4(WG, trig, preloads, u, 64, numWG, lowMe, 1); #elif WIDTH == 4096 && NW == 8 && FFT_VARIANT_W == 2 @@ -178,39 +174,37 @@ void OVERLOAD new_fft_WIDTH(local T2 *lds, T2 *u, Trig trig, int callnum) { trig += WG*8; // Skip past old FFT_width trig values to the !save_one_more_mul trig values // Preload trig values to hide global memory latencies. As the preloads are used, the next set of trig values are preloaded. - preload_tabMul8_trig(WG, trig, preloads, 1, me); + preload_tabMul8_trig(WG, trig, preloads, 1, numWG, lowMe); // Do first fft8, partial tabMul, and shufl. fft8(u); - partial_tabMul8(WG, lds, trig, preloads, u, 1, me); - shufl(WG, lds, u, NW, 1); + partial_tabMul8(WG, partitioned_lds, trig, preloads, u, 1, numWG, lowMe); + shufl(WG, lds, u, NW, 1, numWG, sb, lowMe); // Finish the first tabMul and perform second fft8. Do second partial tabMul and shufl. - finish_tabMul8_fft8(WG, lds, trig, preloads, u, 1, me, 0); // We'd rather set save_one_more_mul to 1 - partial_tabMul8(WG, lds, trig, preloads, u, 8, me); - bar(); - shufl(WG, lds, u, NW, 8); + finish_tabMul8_fft8(WG, trig, preloads, u, 1, numWG, lowMe, 0); // We'd rather set save_one_more_mul to 1 + partial_tabMul8(WG, partitioned_lds, trig, preloads, u, 8, numWG, lowMe); + shufl(WG, lds, u, NW, 8, numWG, sb, lowMe); // Finish the second tabMul and perform third fft8. Do third partial tabMul and shufl. 
- finish_tabMul8_fft8(WG, lds, trig, preloads, u, 8, me, 0); // We'd rather set save_one_more_mul to 1 - partial_tabMul8(WG, lds, trig, preloads, u, 64, me); - bar(); - shufl(WG, lds, u, NW, 64); + finish_tabMul8_fft8(WG, trig, preloads, u, 8, numWG, lowMe, 0); // We'd rather set save_one_more_mul to 1 + partial_tabMul8(WG, partitioned_lds, trig, preloads, u, 64, numWG, lowMe); + shufl(WG, lds, u, NW, 64, numWG, sb, lowMe); // Finish third tabMul and perform final fft8. - finish_tabMul8_fft8(WG, lds, trig, preloads, u, 64, me, 0); // We'd rather set save_one_more_mul to 1 + finish_tabMul8_fft8(WG, trig, preloads, u, 64, numWG, lowMe, 0); // We'd rather set save_one_more_mul to 1 #else // Old version - fft_WIDTH(lds, u, trig); + fft_WIDTH(lds, u, trig, numWG, sb, lowMe); #endif } // There are two version of new_fft_WIDTH in case we want to try saving some trig values from new_fft_WIDTH1 in LDS memory for later use in new_fft_WIDTH2. -void OVERLOAD new_fft_WIDTH1(local T2 *lds, T2 *u, Trig trig) { new_fft_WIDTH(lds, u, trig, 1); } -void OVERLOAD new_fft_WIDTH2(local T2 *lds, T2 *u, Trig trig) { new_fft_WIDTH(lds, u, trig, 2); } +void OVERLOAD new_fft_WIDTH1(local T2 *lds, T2 *u, Trig trig, u32 numWG, const u32 sb, u32 lowMe) { new_fft_WIDTH(lds, u, trig, numWG, lowMe, sb, 1); } +void OVERLOAD new_fft_WIDTH2(local T2 *lds, T2 *u, Trig trig, u32 numWG, const u32 sb, u32 lowMe) { new_fft_WIDTH(lds, u, trig, numWG, lowMe, sb, 2); } #endif @@ -231,23 +225,22 @@ void OVERLOAD fft_NW(F2 *u) { #endif } -void OVERLOAD fft_WIDTH(local F2 *lds, F2 *u, TrigFP32 trig) { - u32 me = get_local_id(0); +void OVERLOAD fft_WIDTH(local F2 *lds, F2 *u, TrigFP32 trig, u32 numWG, const u32 sb, u32 lowMe) { + u32 WG = WIDTH / NW; #if !UNROLL_W __attribute__((opencl_unroll_hint(1))) #endif - for (u32 s = 1; s < WIDTH / NW; s *= NW) { - if (s > 1) { bar(); } + for (u32 s = 1; s < WG; s *= NW) { fft_NW(u); - tabMul(WIDTH / NW, trig, u, NW, s, me); - shufl(WIDTH / NW, lds, u, NW, s); + 
tabMul(WG, trig, u, NW, s, lowMe); + shufl(WG, lds, u, NW, s, numWG, sb, lowMe); } fft_NW(u); } -void OVERLOAD new_fft_WIDTH1(local F2 *lds, F2 *u, TrigFP32 trig) { fft_WIDTH(lds, u, trig); } -void OVERLOAD new_fft_WIDTH2(local F2 *lds, F2 *u, TrigFP32 trig) { fft_WIDTH(lds, u, trig); } +void OVERLOAD new_fft_WIDTH1(local F2 *lds, F2 *u, TrigFP32 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_WIDTH(lds, u, trig, numWG, sb, lowMe); } +void OVERLOAD new_fft_WIDTH2(local F2 *lds, F2 *u, TrigFP32 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_WIDTH(lds, u, trig, numWG, sb, lowMe); } #endif @@ -268,23 +261,22 @@ void OVERLOAD fft_NW(GF31 *u) { #endif } -void OVERLOAD fft_WIDTH(local GF31 *lds, GF31 *u, TrigGF31 trig) { - u32 me = get_local_id(0); +void OVERLOAD fft_WIDTH(local GF31 *lds, GF31 *u, TrigGF31 trig, u32 numWG, const u32 sb, u32 lowMe) { + u32 WG = WIDTH / NW; #if !UNROLL_W __attribute__((opencl_unroll_hint(1))) #endif - for (u32 s = 1; s < WIDTH / NW; s *= NW) { - if (s > 1) { bar(); } + for (u32 s = 1; s < WG; s *= NW) { fft_NW(u); - tabMul(WIDTH / NW, trig, u, NW, s, me); - shufl(WIDTH / NW, lds, u, NW, s); + tabMul(WG, trig, u, NW, s, lowMe); + shufl(WG, lds, u, NW, s, numWG, sb, lowMe); } fft_NW(u); } -void OVERLOAD new_fft_WIDTH1(local GF31 *lds, GF31 *u, TrigGF31 trig) { fft_WIDTH(lds, u, trig); } -void OVERLOAD new_fft_WIDTH2(local GF31 *lds, GF31 *u, TrigGF31 trig) { fft_WIDTH(lds, u, trig); } +void OVERLOAD new_fft_WIDTH1(local GF31 *lds, GF31 *u, TrigGF31 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_WIDTH(lds, u, trig, numWG, sb, lowMe); } +void OVERLOAD new_fft_WIDTH2(local GF31 *lds, GF31 *u, TrigGF31 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_WIDTH(lds, u, trig, numWG, sb, lowMe); } #endif @@ -305,22 +297,21 @@ void OVERLOAD fft_NW(GF61 *u) { #endif } -void OVERLOAD fft_WIDTH(local GF61 *lds, GF61 *u, TrigGF61 trig) { - u32 me = get_local_id(0); +void OVERLOAD fft_WIDTH(local GF61 *lds, GF61 *u, TrigGF61 trig, u32 numWG, const u32 sb, 
u32 lowMe) { + u32 WG = WIDTH / NW; #if !UNROLL_W __attribute__((opencl_unroll_hint(1))) #endif - for (u32 s = 1; s < WIDTH / NW; s *= NW) { - if (s > 1) { bar(); } + for (u32 s = 1; s < WG; s *= NW) { fft_NW(u); - tabMul(WIDTH / NW, trig, u, NW, s, me); - shufl(WIDTH / NW, lds, u, NW, s); + tabMul(WG, trig, u, NW, s, lowMe); + shufl(WG, lds, u, NW, s, numWG, sb, lowMe); } fft_NW(u); } -void OVERLOAD new_fft_WIDTH1(local GF61 *lds, GF61 *u, TrigGF61 trig) { fft_WIDTH(lds, u, trig); } -void OVERLOAD new_fft_WIDTH2(local GF61 *lds, GF61 *u, TrigGF61 trig) { fft_WIDTH(lds, u, trig); } +void OVERLOAD new_fft_WIDTH1(local GF61 *lds, GF61 *u, TrigGF61 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_WIDTH(lds, u, trig, numWG, sb, lowMe); } +void OVERLOAD new_fft_WIDTH2(local GF61 *lds, GF61 *u, TrigGF61 trig, u32 numWG, const u32 sb, u32 lowMe) { fft_WIDTH(lds, u, trig, numWG, sb, lowMe); } #endif diff --git a/src/cl/math.cl b/src/cl/math.cl index 479238c6..5ef7d515 100644 --- a/src/cl/math.cl +++ b/src/cl/math.cl @@ -171,7 +171,7 @@ i32 optional_sub(i32 a, const i32 b) { // Optionally subtract a value if first arg is greater than value. 
i32 optional_mod(i32 a, const i32 b) { -#if 0 //HAS_PTX >= 100 // setp/sub instruction requires sm_10 support or higher // Not faster on 5xxx GPUs (not sure why) +#if ENABLE_OPTIONAL_MOD && HAS_PTX >= 100 // setp/sub instruction requires sm_10 support or higher // Not faster on 5xxx GPUs (not sure why) __asm("{.reg .pred %%p;\n\t" " setp.ge.s32 %%p, %0, %1;\n\t" // a > b " @%%p sub.s32 %0, %0, %1;}" // if (a > b) a = a - b @@ -207,7 +207,7 @@ u64 OVERLOAD mad32(u32 a, u32 b, u64 c) { } u128 OVERLOAD mad64(u64 a, u64 b, u64 c) { -#if 0 && HAS_PTX >= 200 // mad instruction requires sm_20 support or higher // Slower on TitanV and mobile 4070, don't understand why +#if ENABLE_MAD64 && HAS_PTX >= 200 // mad instruction requires sm_20 support or higher // Slower on TitanV and mobile 4070, don't understand why u64 reslo, reshi; __asm("mad.lo.cc.u64 %0, %2, %3, %4;\n\t" "madc.hi.u64 %1, %2, %3, 0;" : "=l"(reslo), "=l"(reshi) : "l"(a), "l"(b), "l"(u128_lo64(c))); @@ -236,7 +236,7 @@ u128 OVERLOAD mad64(u64 a, u64 b, u64 c) { } u128 OVERLOAD mad64(u64 a, u64 b, u128 c) { -#if 0 && HAS_PTX >= 200 // mad instruction requires sm_20 support or higher // Slower on TitanV and mobile 4070, don't understand why +#if ENABLE_MAD64 && HAS_PTX >= 200 // mad instruction requires sm_20 support or higher // Slower on TitanV and mobile 4070, don't understand why u64 reslo, reshi; __asm("mad.lo.cc.u64 %0, %2, %3, %4;\n\t" "madc.hi.u64 %1, %2, %3, %5;" : "=l"(reslo), "=l"(reshi) : "l"(a), "l"(b), "l"(u128_lo64(c)), "l"(u128_hi64(c))); diff --git a/src/cl/middle.cl b/src/cl/middle.cl index 263937f3..36fcac56 100644 --- a/src/cl/middle.cl +++ b/src/cl/middle.cl @@ -54,13 +54,13 @@ // u[i] i ranges 0...MIDDLE-1 (multiples of SMALL_HEIGHT) // y ranges 0...SMALL_HEIGHT-1 (multiples of one) -void OVERLOAD writeCarryFusedLine(T2 *u, P(T2) out, u32 line) { +void OVERLOAD writeCarryFusedLine(T2 *u, P(T2) out, u32 line, u32 me) { #if PAD_SIZE > 0 u32 BIG_PAD_SIZE = (PAD_SIZE/2+1)*PAD_SIZE; - out += 
line * WIDTH + line * PAD_SIZE + line / SMALL_HEIGHT * BIG_PAD_SIZE + (u32) get_local_id(0); // One pad every line + a big pad every SMALL_HEIGHT lines + out += line * WIDTH + line * PAD_SIZE + line / SMALL_HEIGHT * BIG_PAD_SIZE + me; // One pad every line + a big pad every SMALL_HEIGHT lines for (u32 i = 0; i < NW; ++i) { NTSTORE(out[i * G_W], u[i]); } #else - out += line * WIDTH + (u32) get_local_id(0); + out += line * WIDTH + me; for (u32 i = 0; i < NW; ++i) { NTSTORE(out[i * G_W], u[i]); } #endif } @@ -311,8 +311,7 @@ void OVERLOAD writeMiddleOutLine (P(T2) out, T2 *u, u32 chunk_y, u32 chunk_x) } // Read a line for carryFused or FFTW. This line was written by writeMiddleOutLine above. -void OVERLOAD readCarryFusedLine(CP(T2) in, T2 *u, u32 line) { - u32 me = get_local_id(0); +void OVERLOAD readCarryFusedLine(CP(T2) in, T2 *u, u32 line, u32 me) { u32 SIZEY = OUT_WG / OUT_SIZEX; #if PAD_SIZE > 0 @@ -381,13 +380,13 @@ void OVERLOAD readCarryFusedLine(CP(T2) in, T2 *u, u32 line) { #if FFT_FP32 || NTT_GF31 -void OVERLOAD writeCarryFusedLine(F2 *u, P(F2) out, u32 line) { +void OVERLOAD writeCarryFusedLine(F2 *u, P(F2) out, u32 line, u32 me) { #if PAD_SIZE > 0 u32 BIG_PAD_SIZE = (PAD_SIZE/2+1)*PAD_SIZE; - out += line * WIDTH + line * PAD_SIZE + line / SMALL_HEIGHT * BIG_PAD_SIZE + (u32) get_local_id(0); // One pad every line + a big pad every SMALL_HEIGHT lines + out += line * WIDTH + line * PAD_SIZE + line / SMALL_HEIGHT * BIG_PAD_SIZE + me; // One pad every line + a big pad every SMALL_HEIGHT lines for (u32 i = 0; i < NW; ++i) { NTSTORE(out[i * G_W], u[i]); } #else - out += line * WIDTH + (u32) get_local_id(0); + out += line * WIDTH + me; for (u32 i = 0; i < NW; ++i) { NTSTORE(out[i * G_W], u[i]); } #endif } @@ -537,8 +536,7 @@ void OVERLOAD writeMiddleOutLine (P(F2) out, F2 *u, u32 chunk_y, u32 chunk_x) #endif } -void OVERLOAD readCarryFusedLine(CP(F2) in, F2 *u, u32 line) { - u32 me = get_local_id(0); +void OVERLOAD readCarryFusedLine(CP(F2) in, F2 *u, u32 line, 
u32 me) { u32 SIZEY = OUT_WG / OUT_SIZEX; #if PAD_SIZE > 0 // Adjust in pointer based on the x value used in writeMiddleOutLine @@ -595,8 +593,8 @@ void OVERLOAD readCarryFusedLine(CP(F2) in, F2 *u, u32 line) { // Since F2 and GF31 are the same size we can simply call the floats based code -void OVERLOAD writeCarryFusedLine(GF31 *u, P(GF31) out, u32 line) { - writeCarryFusedLine((F2 *) u, (P(F2)) out, line); +void OVERLOAD writeCarryFusedLine(GF31 *u, P(GF31) out, u32 line, u32 me) { + writeCarryFusedLine((F2 *) u, (P(F2)) out, line, me); } void OVERLOAD readMiddleInLine(GF31 *u, CP(GF31) in, u32 y, u32 x) { @@ -623,8 +621,8 @@ void OVERLOAD writeMiddleOutLine (P(GF31) out, GF31 *u, u32 chunk_y, u32 chunk_x writeMiddleOutLine ((P(F2)) out, (F2 *) u, chunk_y, chunk_x); } -void OVERLOAD readCarryFusedLine(CP(GF31) in, GF31 *u, u32 line) { - readCarryFusedLine((CP(F2)) in, (F2 *) u, line); +void OVERLOAD readCarryFusedLine(CP(GF31) in, GF31 *u, u32 line, u32 me) { + readCarryFusedLine((CP(F2)) in, (F2 *) u, line, me); } #endif @@ -638,8 +636,8 @@ void OVERLOAD readCarryFusedLine(CP(GF31) in, GF31 *u, u32 line) { // Since T2 and GF61 are the same size we can simply call the doubles based code -void OVERLOAD writeCarryFusedLine(GF61 *u, P(GF61) out, u32 line) { - writeCarryFusedLine((T2 *) u, (P(T2)) out, line); +void OVERLOAD writeCarryFusedLine(GF61 *u, P(GF61) out, u32 line, u32 me) { + writeCarryFusedLine((T2 *) u, (P(T2)) out, line, me); } void OVERLOAD readMiddleInLine(GF61 *u, CP(GF61) in, u32 y, u32 x) { @@ -666,8 +664,8 @@ void OVERLOAD writeMiddleOutLine (P(GF61) out, GF61 *u, u32 chunk_y, u32 chunk_x writeMiddleOutLine ((P(T2)) out, (T2 *) u, chunk_y, chunk_x); } -void OVERLOAD readCarryFusedLine(CP(GF61) in, GF61 *u, u32 line) { - readCarryFusedLine((CP(T2)) in, (T2 *) u, line); +void OVERLOAD readCarryFusedLine(CP(GF61) in, GF61 *u, u32 line, u32 me) { + readCarryFusedLine((CP(T2)) in, (T2 *) u, line, me); } #endif @@ -778,8 +776,7 @@ void OVERLOAD 
readCarryFusedLine(CP(GF61) in, GF61 *u, u32 line) { // line ranges 0...BIG_HEIGHT-1 (multiples of one) // Read a line for carryFused or FFTW. This line was written by writeMiddleOutLine above. -void OVERLOAD readCarryFusedLine(CP(T2) in, T2 *u, u32 line) { - u32 me = get_local_id(0); // Multiples of BIG_HEIGHT +void OVERLOAD readCarryFusedLine(CP(T2) in, T2 *u, u32 line, u32 me) { u32 middle = line / SMALL_HEIGHT; // Multiples of SMALL_HEIGHT line = line % SMALL_HEIGHT; // Multiples of one in += (me / 16 * SIZEW) + (middle * SIZEM) + (line % 16 * SIZEBLK) + SWIZ(line % 16, line / 16) * 16 + (me % 16); @@ -787,8 +784,7 @@ void OVERLOAD readCarryFusedLine(CP(T2) in, T2 *u, u32 line) { } // Write a line from carryFused. This data will be read by fftMiddleIn. -void OVERLOAD writeCarryFusedLine(T2 *u, P(T2) out, u32 line) { - u32 me = get_local_id(0); // Multiples of BIG_HEIGHT +void OVERLOAD writeCarryFusedLine(T2 *u, P(T2) out, u32 line, u32 me) { // me is multiples of BIG_HEIGHT u32 middle = line / SMALL_HEIGHT; // Multiples of SMALL_HEIGHT line = line % SMALL_HEIGHT; // Multiples of one out += (me / 16 * SIZEW) + (middle * SIZEM) + (line % 16 * SIZEBLK) + SWIZ(line % 16, line / 16) * 16 + (me % 16); @@ -899,8 +895,7 @@ void OVERLOAD writeMiddleOutLine (P(T2) out, T2 *u, u32 y, u32 x) // line ranges 0...BIG_HEIGHT-1 (multiples of one) // Read a line for carryFused or FFTW. This line was written by writeMiddleOutLine above. -void OVERLOAD readCarryFusedLine(CP(F2) in, F2 *u, u32 line) { - u32 me = get_local_id(0); // Multiples of BIG_HEIGHT +void OVERLOAD readCarryFusedLine(CP(F2) in, F2 *u, u32 line, u32 me) { u32 middle = line / SMALL_HEIGHT; // Multiples of SMALL_HEIGHT line = line % SMALL_HEIGHT; // Multiples of one in += (me / 16 * SIZEW32) + (middle * SIZEM32) + (line % 16 * SIZEBLK32) + SWIZ32(line % 16, line / 16) * 16 + (me % 16); @@ -908,8 +903,7 @@ void OVERLOAD readCarryFusedLine(CP(F2) in, F2 *u, u32 line) { } // Write a line from carryFused. 
This data will be read by fftMiddleIn. -void OVERLOAD writeCarryFusedLine(F2 *u, P(F2) out, u32 line) { - u32 me = get_local_id(0); // Multiples of BIG_HEIGHT +void OVERLOAD writeCarryFusedLine(F2 *u, P(F2) out, u32 line, u32 me) { // me is multiples of BIG_HEIGHT u32 middle = line / SMALL_HEIGHT; // Multiples of SMALL_HEIGHT line = line % SMALL_HEIGHT; // Multiples of one out += (me / 16 * SIZEW32) + (middle * SIZEM32) + (line % 16 * SIZEBLK32) + SWIZ32(line % 16, line / 16) * 16 + (me % 16); @@ -990,12 +984,12 @@ void OVERLOAD writeMiddleOutLine (P(F2) out, F2 *u, u32 y, u32 x) // Since F2 and GF31 are the same size we can simply call the floats based code -void OVERLOAD readCarryFusedLine(CP(GF31) in, GF31 *u, u32 line) { - readCarryFusedLine((CP(F2)) in, (F2 *) u, line); +void OVERLOAD readCarryFusedLine(CP(GF31) in, GF31 *u, u32 line, u32 me) { + readCarryFusedLine((CP(F2)) in, (F2 *) u, line, me); } -void OVERLOAD writeCarryFusedLine(GF31 *u, P(GF31) out, u32 line) { - writeCarryFusedLine((F2 *) u, (P(F2)) out, line); +void OVERLOAD writeCarryFusedLine(GF31 *u, P(GF31) out, u32 line, u32 me) { + writeCarryFusedLine((F2 *) u, (P(F2)) out, line, me); } void OVERLOAD readMiddleInLine(GF31 *u, CP(GF31) in, u32 y, u32 x) { @@ -1033,12 +1027,12 @@ void OVERLOAD writeMiddleOutLine (P(GF31) out, GF31 *u, u32 y, u32 x) { // Since T2 and GF61 are the same size we can simply call the doubles based code -void OVERLOAD readCarryFusedLine(CP(GF61) in, GF61 *u, u32 line) { - readCarryFusedLine((CP(T2)) in, (T2 *) u, line); +void OVERLOAD readCarryFusedLine(CP(GF61) in, GF61 *u, u32 line, u32 me) { + readCarryFusedLine((CP(T2)) in, (T2 *) u, line, me); } -void OVERLOAD writeCarryFusedLine(GF61 *u, P(GF61) out, u32 line) { - writeCarryFusedLine((T2 *) u, (P(T2)) out, line); +void OVERLOAD writeCarryFusedLine(GF61 *u, P(GF61) out, u32 line, u32 me) { + writeCarryFusedLine((T2 *) u, (P(T2)) out, line, me); } void OVERLOAD readMiddleInLine(GF61 *u, CP(GF61) in, u32 y, u32 x) { 
diff --git a/src/cl/tailmul.cl b/src/cl/tailmul.cl index 1cdd5db0..3e2299f8 100644 --- a/src/cl/tailmul.cl +++ b/src/cl/tailmul.cl @@ -49,7 +49,8 @@ void OVERLOAD pairMul(u32 N, T2 *u, T2 *v, T2 *p, T2 *q, T2 base_squared, bool s } KERNEL(G_H) tailMul(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { - local T2 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local T2 lds[lds_bytes / sizeof(T2)]; T2 u[NH], v[NH]; T2 p[NH], q[NH]; @@ -65,7 +66,9 @@ KERNEL(G_H) tailMul(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { readTailFusedLine(in, u, line1, me); readTailFusedLine(in, v, line2, me); -#if NH == 8 +#if FFT_VARIANT_H != 0 + T2 w; +#elif NH == 8 T2 w = fancyTrig_N(ND / SMALL_HEIGHT * me); #else T2 w = slowTrig_N(ND / SMALL_HEIGHT * me, ND / NH); @@ -74,19 +77,15 @@ KERNEL(G_H) tailMul(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { #if MUL_LOW read(G_H, NH, p, a, memline1 * SMALL_HEIGHT); read(G_H, NH, q, a, memline2 * SMALL_HEIGHT); - fft_HEIGHT(lds, u, smallTrig, w); - bar(); - fft_HEIGHT(lds, v, smallTrig, w); + fft_HEIGHT(lds, u, smallTrig, w, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, v, smallTrig, w, 1, SHUFL_BYTES_H, me); #else readTailFusedLine(a, p, line1, me); readTailFusedLine(a, q, line2, me); - fft_HEIGHT(lds, u, smallTrig, w); - bar(); - fft_HEIGHT(lds, v, smallTrig, w); - bar(); - fft_HEIGHT(lds, p, smallTrig, w); - bar(); - fft_HEIGHT(lds, q, smallTrig, w); + fft_HEIGHT(lds, u, smallTrig, w, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, v, smallTrig, w, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, p, smallTrig, w, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, q, smallTrig, w, 1, SHUFL_BYTES_H, me); #endif T2 trig = slowTrig_N(line1 + me * H, ND / NH); @@ -109,10 +108,8 @@ KERNEL(G_H) tailMul(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { reverseLine(G_H, lds, v); } - bar(); - fft_HEIGHT(lds, v, smallTrig, w); - bar(); - fft_HEIGHT(lds, u, smallTrig, w); + fft_HEIGHT(lds, v, smallTrig, w, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, u, 
smallTrig, w, 1, SHUFL_BYTES_H, me); writeTailFusedLine(v, out, memline2, me); writeTailFusedLine(u, out, memline1, me); } @@ -164,7 +161,8 @@ void OVERLOAD pairMul(u32 N, F2 *u, F2 *v, F2 *p, F2 *q, F2 base_squared, bool s } KERNEL(G_H) tailMul(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { - local F2 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local F2 lds[lds_bytes / sizeof(F2)]; CP(F2) inF2 = (CP(F2)) in; CP(F2) aF2 = (CP(F2)) a; @@ -188,19 +186,15 @@ KERNEL(G_H) tailMul(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { #if MUL_LOW read(G_H, NH, p, aF2, memline1 * SMALL_HEIGHT); read(G_H, NH, q, aF2, memline2 * SMALL_HEIGHT); - fft_HEIGHT(lds, u, smallTrigF2); - bar(); - fft_HEIGHT(lds, v, smallTrigF2); + fft_HEIGHT(lds, u, smallTrigF2, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, v, smallTrigF2, 1, SHUFL_BYTES_H, me); #else readTailFusedLine(aF2, p, line1, me); readTailFusedLine(aF2, q, line2, me); - fft_HEIGHT(lds, u, smallTrigF2); - bar(); - fft_HEIGHT(lds, v, smallTrigF2); - bar(); - fft_HEIGHT(lds, p, smallTrigF2); - bar(); - fft_HEIGHT(lds, q, smallTrigF2); + fft_HEIGHT(lds, u, smallTrigF2, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, v, smallTrigF2, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, p, smallTrigF2, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, q, smallTrigF2, 1, SHUFL_BYTES_H, me); #endif F2 trig = slowTrig_N(line1 + me * H, ND / NH); @@ -223,10 +217,8 @@ KERNEL(G_H) tailMul(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { reverseLine(G_H, lds, v); } - bar(); - fft_HEIGHT(lds, v, smallTrigF2); - bar(); - fft_HEIGHT(lds, u, smallTrigF2); + fft_HEIGHT(lds, v, smallTrigF2, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, u, smallTrigF2, 1, SHUFL_BYTES_H, me); writeTailFusedLine(v, outF2, memline2, me); writeTailFusedLine(u, outF2, memline1, me); } @@ -278,7 +270,8 @@ void OVERLOAD pairMul(u32 N, GF31 *u, GF31 *v, GF31 *p, GF31 *q, GF31 base_squar } KERNEL(G_H) tailMulGF31(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { - local GF31 
lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF31 lds[lds_bytes / sizeof(GF31)]; CP(GF31) in31 = (CP(GF31)) (in + DISTGF31); CP(GF31) a31 = (CP(GF31)) (a + DISTGF31); @@ -302,19 +295,15 @@ KERNEL(G_H) tailMulGF31(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { #if MUL_LOW read(G_H, NH, p, a31, memline1 * SMALL_HEIGHT); read(G_H, NH, q, a31, memline2 * SMALL_HEIGHT); - fft_HEIGHT(lds, u, smallTrig31); - bar(); - fft_HEIGHT(lds, v, smallTrig31); + fft_HEIGHT(lds, u, smallTrig31, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, v, smallTrig31, 1, SHUFL_BYTES_H, me); #else readTailFusedLine(a31, p, line1, me); readTailFusedLine(a31, q, line2, me); - fft_HEIGHT(lds, u, smallTrig31); - bar(); - fft_HEIGHT(lds, v, smallTrig31); - bar(); - fft_HEIGHT(lds, p, smallTrig31); - bar(); - fft_HEIGHT(lds, q, smallTrig31); + fft_HEIGHT(lds, u, smallTrig31, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, v, smallTrig31, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, p, smallTrig31, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, q, smallTrig31, 1, SHUFL_BYTES_H, me); #endif // Calculate number of trig values used by fft_HEIGHT (see genSmallTrigCombo in trigBufCache.cpp) @@ -354,10 +343,8 @@ KERNEL(G_H) tailMulGF31(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { reverseLine(G_H, lds, v); } - bar(); - fft_HEIGHT(lds, v, smallTrig31); - bar(); - fft_HEIGHT(lds, u, smallTrig31); + fft_HEIGHT(lds, v, smallTrig31, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, u, smallTrig31, 1, SHUFL_BYTES_H, me); writeTailFusedLine(v, out31, memline2, me); writeTailFusedLine(u, out31, memline1, me); } @@ -407,7 +394,8 @@ void OVERLOAD pairMul(u32 N, GF61 *u, GF61 *v, GF61 *p, GF61 *q, GF61 base_squar } KERNEL(G_H) tailMulGF61(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { - local GF61 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF61 lds[lds_bytes / sizeof(GF61)]; CP(GF61) in61 = (CP(GF61)) (in + DISTGF61); CP(GF61) a61 = (CP(GF61)) (a + DISTGF61); @@ 
-431,19 +419,15 @@ KERNEL(G_H) tailMulGF61(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { #if MUL_LOW read(G_H, NH, p, a61, memline1 * SMALL_HEIGHT); read(G_H, NH, q, a61, memline2 * SMALL_HEIGHT); - fft_HEIGHT(lds, u, smallTrig61); - bar(); - fft_HEIGHT(lds, v, smallTrig61); + fft_HEIGHT(lds, u, smallTrig61, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, v, smallTrig61, 1, SHUFL_BYTES_H, me); #else readTailFusedLine(a61, p, line1, me); readTailFusedLine(a61, q, line2, me); - fft_HEIGHT(lds, u, smallTrig61); - bar(); - fft_HEIGHT(lds, v, smallTrig61); - bar(); - fft_HEIGHT(lds, p, smallTrig61); - bar(); - fft_HEIGHT(lds, q, smallTrig61); + fft_HEIGHT(lds, u, smallTrig61, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, v, smallTrig61, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, p, smallTrig61, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, q, smallTrig61, 1, SHUFL_BYTES_H, me); #endif // Calculate number of trig values used by fft_HEIGHT (see genSmallTrigCombo in trigBufCache.cpp) @@ -483,10 +467,8 @@ KERNEL(G_H) tailMulGF61(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig) { reverseLine(G_H, lds, v); } - bar(); - fft_HEIGHT(lds, v, smallTrig61); - bar(); - fft_HEIGHT(lds, u, smallTrig61); + fft_HEIGHT(lds, v, smallTrig61, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, u, smallTrig61, 1, SHUFL_BYTES_H, me); writeTailFusedLine(v, out61, memline2, me); writeTailFusedLine(u, out61, memline1, me); } diff --git a/src/cl/tailsquare.cl b/src/cl/tailsquare.cl index bf960f1c..d9edc1f7 100644 --- a/src/cl/tailsquare.cl +++ b/src/cl/tailsquare.cl @@ -54,7 +54,8 @@ void OVERLOAD pairSq(u32 N, T2 *u, T2 *v, T2 base_squared, bool special) { // The kernel tailSquareZero handles the special cases in tailSquare, i.e. the lines 0 and H/2 // This kernel is launched with 2 workgroups (handling line 0, resp. 
H/2) KERNEL(G_H) tailSquareZero(P(T2) out, CP(T2) in, Trig smallTrig) { - local T2 lds[SMALL_HEIGHT / 2]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local T2 lds[lds_bytes / sizeof(T2)]; T2 u[NH]; u32 H = ND / SMALL_HEIGHT; @@ -66,7 +67,9 @@ KERNEL(G_H) tailSquareZero(P(T2) out, CP(T2) in, Trig smallTrig) { u32 me = get_local_id(0); readTailFusedLine(in, u, line, me); -#if NH == 8 +#if FFT_VARIANT_H != 0 + T2 w; +#elif NH == 8 T2 w = fancyTrig_N(ND / SMALL_HEIGHT * me); #else T2 w = slowTrig_N(ND / SMALL_HEIGHT * me, ND / NH); @@ -74,20 +77,20 @@ KERNEL(G_H) tailSquareZero(P(T2) out, CP(T2) in, Trig smallTrig) { T2 trig = slowTrig_N(line + me * H, ND / NH); - fft_HEIGHT(lds, u, smallTrig, w); + fft_HEIGHT(lds, u, smallTrig, w, 1, SHUFL_BYTES_H, me); reverse(G_H, lds, u + NH/2, !which); pairSq(NH/2, u, u + NH/2, trig, !which); reverse(G_H, lds, u + NH/2, !which); - bar(); - fft_HEIGHT(lds, u, smallTrig, w); + fft_HEIGHT(lds, u, smallTrig, w, 1, SHUFL_BYTES_H, me); writeTailFusedLine(u, out, transPos(line, MIDDLE, WIDTH), me); } #if SINGLE_WIDE KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { - local T2 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local T2 lds[lds_bytes / sizeof(T2)]; T2 u[NH], v[NH]; @@ -107,22 +110,17 @@ KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { readTailFusedLine(in, u, line1, me); readTailFusedLine(in, v, line2, me); -#if NH == 8 +#if FFT_VARIANT_H != 0 + T2 w; +#elif NH == 8 T2 w = fancyTrig_N(ND / SMALL_HEIGHT * me); #else T2 w = slowTrig_N(ND / SMALL_HEIGHT * me, ND / NH); #endif -#if ZEROHACK_H - u32 zerohack = (u32) get_group_id(0) / 131072; - fft_HEIGHT(lds + zerohack, u, smallTrig + zerohack, w); - bar(); - fft_HEIGHT(lds + zerohack, v, smallTrig + zerohack, w); -#else - fft_HEIGHT(lds, u, smallTrig, w); - bar(); - fft_HEIGHT(lds, v, smallTrig, w); -#endif + u32 zerohack = ZEROHACK_H * (u32) get_group_id(0) / 131072; + fft_HEIGHT(lds + zerohack, u, smallTrig + 
zerohack, w, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds + zerohack, v, smallTrig + zerohack, w, 1, SHUFL_BYTES_H, me); // Compute trig values from scratch. Good on GPUs with high DP throughput. #if TAIL_TRIGS == 2 @@ -169,10 +167,8 @@ KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { reverseLine(G_H, lds, v); } - bar(); - fft_HEIGHT(lds, v, smallTrig, w); - bar(); - fft_HEIGHT(lds, u, smallTrig, w); + fft_HEIGHT(lds, v, smallTrig, w, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, u, smallTrig, w, 1, SHUFL_BYTES_H, me); writeTailFusedLine(v, out, memline2, me); writeTailFusedLine(u, out, memline1, me); @@ -202,7 +198,8 @@ void OVERLOAD pairSq2_special(T2 *u, T2 base_squared) { } KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { - local T2 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local T2 lds[lds_bytes * 2 / sizeof(T2)]; T2 u[NH]; @@ -227,18 +224,16 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { // Read lines u and v readTailFusedLine(in, u, line, lowMe); -#if NH == 8 +#if FFT_VARIANT_H != 0 + T2 w; +#elif NH == 8 T2 w = fancyTrig_N(H * lowMe); #else T2 w = slowTrig_N(H * lowMe, ND / NH); #endif -#if ZEROHACK_H - u32 zerohack = (u32) get_group_id(0) / 131072; - new_fft_HEIGHT2_1(lds + zerohack, u, smallTrig + zerohack, w); -#else - new_fft_HEIGHT2_1(lds, u, smallTrig, w); -#endif + u32 zerohack = ZEROHACK_H * (u32) get_group_id(0) / 131072; + new_fft_HEIGHT1(lds + zerohack, u, smallTrig + zerohack, w, 2, SHUFL_BYTES_H, lowMe); // Compute trig values from scratch. Good on GPUs with high DP throughput. #if TAIL_TRIGS == 2 @@ -263,8 +258,6 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { T2 trig = NTLOAD(smallTrig[height_trigs + line_u*G_H*2 + me]); #endif - bar(G_H); - #if SINGLE_KERNEL // Line 0 and H/2 are special: they pair with themselves, line 0 is offseted by 1. 
if (line_u == 0) { @@ -276,15 +269,12 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { #else if (1) { #endif - revCrossLine(G_H, lds, u + NH/2, NH/2, isSecondHalf); + revCrossLine(lds, u); pairSq(NH/2, u, u + NH/2, trig, false); - bar(G_H); - revCrossLine(G_H, lds, u + NH/2, NH/2, !isSecondHalf); + revCrossLine(lds, u); } - bar(G_H); - - new_fft_HEIGHT2_2(lds, u, smallTrig, w); + new_fft_HEIGHT2(lds, u, smallTrig, w, 2, SHUFL_BYTES_H, lowMe); // Write lines u and v writeTailFusedLine(u, out, transPos(line, MIDDLE, WIDTH), lowMe); @@ -342,7 +332,8 @@ void OVERLOAD pairSq(u32 N, F2 *u, F2 *v, F2 base_squared, bool special) { // The kernel tailSquareZero handles the special cases in tailSquare, i.e. the lines 0 and H/2 // This kernel is launched with 2 workgroups (handling line 0, resp. H/2) KERNEL(G_H) tailSquareZero(P(T2) out, CP(T2) in, Trig smallTrig) { - local F2 lds[SMALL_HEIGHT / 2]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local F2 lds[lds_bytes / sizeof(F2)]; F2 u[NH]; u32 H = ND / SMALL_HEIGHT; @@ -360,20 +351,20 @@ KERNEL(G_H) tailSquareZero(P(T2) out, CP(T2) in, Trig smallTrig) { F2 trig = slowTrig_N(line + me * H, ND / NH); - fft_HEIGHT(lds, u, smallTrigF2); + fft_HEIGHT(lds, u, smallTrigF2, 1, SHUFL_BYTES_H, me); reverse(G_H, lds, u + NH/2, !which); pairSq(NH/2, u, u + NH/2, trig, !which); reverse(G_H, lds, u + NH/2, !which); - bar(); - fft_HEIGHT(lds, u, smallTrigF2); + fft_HEIGHT(lds, u, smallTrigF2, 1, SHUFL_BYTES_H, me); writeTailFusedLine(u, outF2, transPos(line, MIDDLE, WIDTH), me); } #if SINGLE_WIDE KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { - local F2 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local F2 lds[lds_bytes / sizeof(F2)]; CP(F2) inF2 = (CP(F2)) in; P(F2) outF2 = (P(F2)) out; @@ -397,16 +388,9 @@ KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { readTailFusedLine(inF2, u, line1, me); readTailFusedLine(inF2, v, line2, me); -#if ZEROHACK_H - 
u32 zerohack = get_group_id(0) / 131072; - fft_HEIGHT(lds + zerohack, u, smallTrigF2 + zerohack); - bar(); - fft_HEIGHT(lds + zerohack, v, smallTrigF2 + zerohack); -#else - fft_HEIGHT(lds, u, smallTrigF2); - bar(); - fft_HEIGHT(lds, v, smallTrigF2); -#endif + u32 zerohack = ZEROHACK_H * (u32) get_group_id(0) / 131072; + fft_HEIGHT(lds + zerohack, u, smallTrigF2 + zerohack, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds + zerohack, v, smallTrigF2 + zerohack, 1, SHUFL_BYTES_H, me); // Compute trig values from scratch. Good on GPUs with high DP throughput. #if TAIL_TRIGS32 == 2 @@ -453,10 +437,8 @@ KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { reverseLine(G_H, lds, v); } - bar(); - fft_HEIGHT(lds, v, smallTrigF2); - bar(); - fft_HEIGHT(lds, u, smallTrigF2); + fft_HEIGHT(lds, v, smallTrigF2, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, u, smallTrigF2, 1, SHUFL_BYTES_H, me); writeTailFusedLine(v, outF2, memline2, me); writeTailFusedLine(u, outF2, memline1, me); @@ -486,7 +468,8 @@ void OVERLOAD pairSq2_special(F2 *u, F2 base_squared) { } KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { - local F2 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local F2 lds[lds_bytes * 2 / sizeof(F2)]; CP(F2) inF2 = (CP(F2)) in; P(F2) outF2 = (P(F2)) out; @@ -515,12 +498,8 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { // Read lines u and v readTailFusedLine(inF2, u, line, lowMe); -#if ZEROHACK_H - u32 zerohack = (u32) get_group_id(0) / 131072; - new_fft_HEIGHT2_1(lds + zerohack, u, smallTrigF2 + zerohack); -#else - new_fft_HEIGHT2_1(lds, u, smallTrigF2); -#endif + u32 zerohack = ZEROHACK_H * (u32) get_group_id(0) / 131072; + new_fft_HEIGHT1(lds + zerohack, u, smallTrigF2 + zerohack, 2, SHUFL_BYTES_H, lowMe); // Compute trig values from scratch. Good on GPUs with high DP throughput. 
#if TAIL_TRIGS32 == 2 @@ -545,8 +524,6 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { F2 trig = NTLOAD(smallTrigF2[height_trigs + line_u*G_H*2 + me]); #endif - bar(G_H); - #if SINGLE_KERNEL // Line 0 and H/2 are special: they pair with themselves, line 0 is offseted by 1. if (line_u == 0) { @@ -558,15 +535,12 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { #else if (1) { #endif - revCrossLine(G_H, lds, u + NH/2, NH/2, isSecondHalf); + revCrossLine(lds, u); pairSq(NH/2, u, u + NH/2, trig, false); - bar(G_H); - revCrossLine(G_H, lds, u + NH/2, NH/2, !isSecondHalf); + revCrossLine(lds, u); } - bar(G_H); - - new_fft_HEIGHT2_2(lds, u, smallTrigF2); + new_fft_HEIGHT2(lds, u, smallTrigF2, 2, SHUFL_BYTES_H, lowMe); // Write lines u and v writeTailFusedLine(u, outF2, transPos(line, MIDDLE, WIDTH), lowMe); @@ -627,7 +601,8 @@ void OVERLOAD pairSq(u32 N, GF31 *u, GF31 *v, GF31 base_squared, bool special) { // The kernel tailSquareZero handles the special cases in tailSquare, i.e. the lines 0 and H/2 // This kernel is launched with 2 workgroups (handling line 0, resp. 
H/2) KERNEL(G_H) tailSquareZeroGF31(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF31 lds[SMALL_HEIGHT / 2]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF31 lds[lds_bytes / sizeof(GF31)]; CP(GF31) in31 = (CP(GF31)) (in + DISTGF31); P(GF31) out31 = (P(GF31)) (out + DISTGF31); @@ -663,19 +638,20 @@ KERNEL(G_H) tailSquareZeroGF31(P(T2) out, CP(T2) in, Trig smallTrig) { #endif #endif - fft_HEIGHT(lds, u, smallTrig31); + fft_HEIGHT(lds, u, smallTrig31, 1, SHUFL_BYTES_H, me); reverse(G_H, lds, u + NH/2, !which); pairSq(NH/2, u, u + NH/2, trig, !which); reverse(G_H, lds, u + NH/2, !which); - bar(); - fft_HEIGHT(lds, u, smallTrig31); + + fft_HEIGHT(lds, u, smallTrig31, 1, SHUFL_BYTES_H, me); writeTailFusedLine(u, out31, transPos(line, MIDDLE, WIDTH), me); } #if SINGLE_WIDE KERNEL(G_H) tailSquareGF31(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF31 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF31 lds[lds_bytes / sizeof(GF31)]; CP(GF31) in31 = (CP(GF31)) (in + DISTGF31); P(GF31) out31 = (P(GF31)) (out + DISTGF31); @@ -699,16 +675,9 @@ KERNEL(G_H) tailSquareGF31(P(T2) out, CP(T2) in, Trig smallTrig) { readTailFusedLine(in31, u, line1, me); readTailFusedLine(in31, v, line2, me); -#if ZEROHACK_H - u32 zerohack = (u32) get_group_id(0) / 131072; - fft_HEIGHT(lds + zerohack, u, smallTrig31 + zerohack); - bar(); - fft_HEIGHT(lds + zerohack, v, smallTrig31 + zerohack); -#else - fft_HEIGHT(lds, u, smallTrig31); - bar(); - fft_HEIGHT(lds, v, smallTrig31); -#endif + u32 zerohack = ZEROHACK_H * (u32) get_group_id(0) / 131072; + fft_HEIGHT(lds + zerohack, u, smallTrig31 + zerohack, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds + zerohack, v, smallTrig31 + zerohack, 1, SHUFL_BYTES_H, me); // Do a little bit of memory access and a little bit of math. 
#if TAIL_TRIGS31 >= 1 @@ -751,10 +720,8 @@ KERNEL(G_H) tailSquareGF31(P(T2) out, CP(T2) in, Trig smallTrig) { reverseLine(G_H, lds, v); } - bar(); - fft_HEIGHT(lds, v, smallTrig31); - bar(); - fft_HEIGHT(lds, u, smallTrig31); + fft_HEIGHT(lds, v, smallTrig31, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, u, smallTrig31, 1, SHUFL_BYTES_H, me); writeTailFusedLine(v, out31, memline2, me); writeTailFusedLine(u, out31, memline1, me); @@ -783,7 +750,8 @@ void OVERLOAD pairSq2_special(GF31 *u, GF31 base_squared) { } KERNEL(G_H * 2) tailSquareGF31(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF31 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF31 lds[lds_bytes * 2 / sizeof(GF31)]; CP(GF31) in31 = (CP(GF31)) (in + DISTGF31); P(GF31) out31 = (P(GF31)) (out + DISTGF31); @@ -812,12 +780,8 @@ KERNEL(G_H * 2) tailSquareGF31(P(T2) out, CP(T2) in, Trig smallTrig) { // Read lines u and v readTailFusedLine(in31, u, line, lowMe); -#if ZEROHACK_H - u32 zerohack = (u32) get_group_id(0) / 131072; - new_fft_HEIGHT2_1(lds + zerohack, u, smallTrig31 + zerohack); -#else - new_fft_HEIGHT2_1(lds, u, smallTrig31); -#endif + u32 zerohack = ZEROHACK_H * (u32) get_group_id(0) / 131072; + new_fft_HEIGHT1(lds + zerohack, u, smallTrig31 + zerohack, 2, SHUFL_BYTES_H, lowMe); // Do a little bit of memory access and a little bit of math. Good on a Radeon VII. #if TAIL_TRIGS31 >= 1 @@ -838,8 +802,6 @@ KERNEL(G_H * 2) tailSquareGF31(P(T2) out, CP(T2) in, Trig smallTrig) { GF31 trig = NTLOAD(smallTrig31[height_trigs + line_u*G_H*2 + me]); #endif - bar(G_H); - #if SINGLE_KERNEL // Line 0 and H/2 are special: they pair with themselves, line 0 is offseted by 1. 
if (line_u == 0) { @@ -851,15 +813,12 @@ KERNEL(G_H * 2) tailSquareGF31(P(T2) out, CP(T2) in, Trig smallTrig) { #else if (1) { #endif - revCrossLine(G_H, lds, u + NH/2, NH/2, isSecondHalf); + revCrossLine(lds, u); pairSq(NH/2, u, u + NH/2, trig, false); - bar(G_H); - revCrossLine(G_H, lds, u + NH/2, NH/2, !isSecondHalf); + revCrossLine(lds, u); } - bar(G_H); - - new_fft_HEIGHT2_2(lds, u, smallTrig31); + new_fft_HEIGHT2(lds, u, smallTrig31, 2, SHUFL_BYTES_H, lowMe); // Write lines u and v writeTailFusedLine(u, out31, transPos(line, MIDDLE, WIDTH), lowMe); @@ -920,7 +879,8 @@ void OVERLOAD pairSq(u32 N, GF61 *u, GF61 *v, GF61 base_squared, bool special) { // The kernel tailSquareZero handles the special cases in tailSquare, i.e. the lines 0 and H/2 // This kernel is launched with 2 workgroups (handling line 0, resp. H/2) KERNEL(G_H) tailSquareZeroGF61(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF61 lds[SMALL_HEIGHT / 2]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF61 lds[lds_bytes / sizeof(GF61)]; CP(GF61) in61 = (CP(GF61)) (in + DISTGF61); P(GF61) out61 = (P(GF61)) (out + DISTGF61); @@ -956,19 +916,20 @@ KERNEL(G_H) tailSquareZeroGF61(P(T2) out, CP(T2) in, Trig smallTrig) { #endif #endif - fft_HEIGHT(lds, u, smallTrig61); + fft_HEIGHT(lds, u, smallTrig61, 1, SHUFL_BYTES_H, me); reverse(G_H, lds, u + NH/2, !which); pairSq(NH/2, u, u + NH/2, trig, !which); reverse(G_H, lds, u + NH/2, !which); - bar(); - fft_HEIGHT(lds, u, smallTrig61); + + fft_HEIGHT(lds, u, smallTrig61, 1, SHUFL_BYTES_H, me); writeTailFusedLine(u, out61, transPos(line, MIDDLE, WIDTH), me); } #if SINGLE_WIDE KERNEL(G_H) tailSquareGF61(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF61 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF61 lds[lds_bytes / sizeof(GF61)]; CP(GF61) in61 = (CP(GF61)) (in + DISTGF61); P(GF61) out61 = (P(GF61)) (out + DISTGF61); @@ -992,16 +953,9 @@ KERNEL(G_H) tailSquareGF61(P(T2) out, CP(T2) in, Trig smallTrig) { 
readTailFusedLine(in61, u, line1, me); readTailFusedLine(in61, v, line2, me); -#if ZEROHACK_H - u32 zerohack = (u32) get_group_id(0) / 131072; - fft_HEIGHT(lds + zerohack, u, smallTrig61 + zerohack); - bar(); - fft_HEIGHT(lds + zerohack, v, smallTrig61 + zerohack); -#else - fft_HEIGHT(lds, u, smallTrig61); - bar(); - fft_HEIGHT(lds, v, smallTrig61); -#endif + u32 zerohack = ZEROHACK_H * (u32) get_group_id(0) / 131072; + fft_HEIGHT(lds + zerohack, u, smallTrig61 + zerohack, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds + zerohack, v, smallTrig61 + zerohack, 1, SHUFL_BYTES_H, me); // Do a little bit of memory access and a little bit of math. #if TAIL_TRIGS61 >= 1 @@ -1044,10 +998,8 @@ KERNEL(G_H) tailSquareGF61(P(T2) out, CP(T2) in, Trig smallTrig) { reverseLine(G_H, lds, v); } - bar(); - fft_HEIGHT(lds, v, smallTrig61); - bar(); - fft_HEIGHT(lds, u, smallTrig61); + fft_HEIGHT(lds, v, smallTrig61, 1, SHUFL_BYTES_H, me); + fft_HEIGHT(lds, u, smallTrig61, 1, SHUFL_BYTES_H, me); writeTailFusedLine(v, out61, memline2, me); writeTailFusedLine(u, out61, memline1, me); @@ -1076,7 +1028,8 @@ void OVERLOAD pairSq2_special(GF61 *u, GF61 base_squared) { } KERNEL(G_H * 2) tailSquareGF61(P(T2) out, CP(T2) in, Trig smallTrig) { - local GF61 lds[SMALL_HEIGHT]; + const u32 lds_bytes = SMALL_HEIGHT * SHUFL_BYTES_H; + local GF61 lds[lds_bytes * 2 / sizeof(GF61)]; CP(GF61) in61 = (CP(GF61)) (in + DISTGF61); P(GF61) out61 = (P(GF61)) (out + DISTGF61); @@ -1105,12 +1058,8 @@ KERNEL(G_H * 2) tailSquareGF61(P(T2) out, CP(T2) in, Trig smallTrig) { // Read lines u and v readTailFusedLine(in61, u, line, lowMe); -#if ZEROHACK_H - u32 zerohack = (u32) get_group_id(0) / 131072; - new_fft_HEIGHT2_1(lds + zerohack, u, smallTrig61 + zerohack); -#else - new_fft_HEIGHT2_1(lds, u, smallTrig61); -#endif + u32 zerohack = ZEROHACK_H * (u32) get_group_id(0) / 131072; + new_fft_HEIGHT1(lds + zerohack, u, smallTrig61 + zerohack, 2, SHUFL_BYTES_H, lowMe); // Do a little bit of memory access and a little bit of 
math. Good on a Radeon VII. #if TAIL_TRIGS61 >= 1 @@ -1131,8 +1080,6 @@ KERNEL(G_H * 2) tailSquareGF61(P(T2) out, CP(T2) in, Trig smallTrig) { GF61 trig = NTLOAD(smallTrig61[height_trigs + line_u*G_H*2 + me]); #endif - bar(G_H); - #if SINGLE_KERNEL // Line 0 and H/2 are special: they pair with themselves, line 0 is offseted by 1. if (line_u == 0) { @@ -1144,15 +1091,12 @@ KERNEL(G_H * 2) tailSquareGF61(P(T2) out, CP(T2) in, Trig smallTrig) { #else if (1) { #endif - revCrossLine(G_H, lds, u + NH/2, NH/2, isSecondHalf); + revCrossLine(lds, u); pairSq(NH/2, u, u + NH/2, trig, false); - bar(G_H); - revCrossLine(G_H, lds, u + NH/2, NH/2, !isSecondHalf); + revCrossLine(lds, u); } - bar(G_H); - - new_fft_HEIGHT2_2(lds, u, smallTrig61); + new_fft_HEIGHT2(lds, u, smallTrig61, 2, SHUFL_BYTES_H, lowMe); // Write lines u and v writeTailFusedLine(u, out61, transPos(line, MIDDLE, WIDTH), lowMe); diff --git a/src/cl/tailutil.cl b/src/cl/tailutil.cl index 01710cf1..9234f858 100644 --- a/src/cl/tailutil.cl +++ b/src/cl/tailutil.cl @@ -30,84 +30,181 @@ #define SINGLE_WIDE TAIL_KERNELS < 2 // Old single-wide tailSquare vs. new double-wide tailSquare #define SINGLE_KERNEL (TAIL_KERNELS & 1) == 0 // TailSquare uses a single kernel vs. two kernels -#if FFT_FP64 +// 64-bit implementations of reverse routines -void OVERLOAD reverse(u32 WG, local T2 *lds, T2 *u, bool bump) { +#if FFT_FP64 | NTT_GF61 + +void OVERLOAD reverse(u32 WG, local T2 *lds2, T2 *u, bool bump) { u32 me = get_local_id(0); u32 revMe = WG - 1 - me + bump; - bar(); - + if (SHUFL_BYTES_H >= 8) { + local T2 *lds = lds2; + bar(WG); #if NH == 8 - lds[revMe + 0 * WG] = u[3]; - lds[revMe + 1 * WG] = u[2]; - lds[revMe + 2 * WG] = u[1]; - lds[bump ? ((revMe + 3 * WG) % (4 * WG)) : (revMe + 3 * WG)] = u[0]; + lds[revMe + 0 * WG] = u[3]; + lds[revMe + 1 * WG] = u[2]; + lds[revMe + 2 * WG] = u[1]; + lds[bump ? ((revMe + 3 * WG) % (4 * WG)) : (revMe + 3 * WG)] = u[0]; #elif NH == 4 - lds[revMe + 0 * WG] = u[1]; - lds[bump ? 
((revMe + WG) % (2 * WG)) : (revMe + WG)] = u[0]; -#else -#error + lds[revMe + 0 * WG] = u[1]; + lds[bump ? ((revMe + WG) % (2 * WG)) : (revMe + WG)] = u[0]; #endif + bar(WG); + for (i32 i = 0; i < NH/2; ++i) { u[i] = lds[i * WG + me]; } + } - bar(); - for (i32 i = 0; i < NH/2; ++i) { u[i] = lds[i * WG + me]; } + else if (SHUFL_BYTES_H == 4) { + local T *lds = (local T *) lds2; + bar(WG); +#if NH == 8 + lds[revMe + 0 * WG] = u[3].x; + lds[revMe + 1 * WG] = u[2].x; + lds[revMe + 2 * WG] = u[1].x; + lds[bump ? ((revMe + 3 * WG) % (4 * WG)) : (revMe + 3 * WG)] = u[0].x; +#elif NH == 4 + lds[revMe + 0 * WG] = u[1].x; + lds[bump ? ((revMe + WG) % (2 * WG)) : (revMe + WG)] = u[0].x; +#endif + bar(WG); + for (i32 i = 0; i < NH/2; ++i) { u[i].x = lds[i * WG + me]; } + bar(WG); +#if NH == 8 + lds[revMe + 0 * WG] = u[3].y; + lds[revMe + 1 * WG] = u[2].y; + lds[revMe + 2 * WG] = u[1].y; + lds[bump ? ((revMe + 3 * WG) % (4 * WG)) : (revMe + 3 * WG)] = u[0].y; +#elif NH == 4 + lds[revMe + 0 * WG] = u[1].y; + lds[bump ? ((revMe + WG) % (2 * WG)) : (revMe + WG)] = u[0].y; +#endif + bar(WG); + for (i32 i = 0; i < NH/2; ++i) { u[i].y = lds[i * WG + me]; } + } } -void OVERLOAD reverseLine(u32 WG, local T2 *lds2, T2 *u) { +void OVERLOAD reverseLine(u32 WG, local T2 *lds, T2 *u) { u32 me = get_local_id(0); u32 revMe = WG - 1 - me; - local T2 *lds = lds2 + revMe; - bar(); - for (u32 i = 0; i < NH; ++i) { lds[WG * (NH - 1 - i)] = u[i]; } - - lds = lds2 + me; - bar(); - for (u32 i = 0; i < NH; ++i) { u[i] = lds[WG * i]; } -} - -// This is used to reverse the second part of a line, and cross the reversed parts between the halves. 
-void OVERLOAD revCrossLine(u32 WG, local T2* lds2, T2 *u, u32 n, bool writeSecondHalf) { - u32 me = get_local_id(0); - u32 lowMe = me % WG; - - u32 revLowMe = WG - 1 - lowMe; - - for (u32 i = 0; i < n; ++i) { lds2[WG * n * writeSecondHalf + WG * (n - 1 - i) + revLowMe] = u[i]; } + if (SHUFL_BYTES_H == 16) { + local T2 *ldsOut = lds + revMe; + local T2 *ldsIn = lds + me; + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = u[i]; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { u[i] = ldsIn[WG * i]; } + } - bar(); // we need a full bar because we're crossing halves + else if (SHUFL_BYTES_H == 8) { + local T *ldsOut = (local T *) lds + revMe; + local T *ldsIn = (local T *) lds + me; + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = u[i].x; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { u[i].x = ldsIn[WG * i]; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = u[i].y; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { u[i].y = ldsIn[WG * i]; } + } - for (u32 i = 0; i < n; ++i) { u[i] = lds2[WG * n * !writeSecondHalf + WG * i + lowMe]; } + else if (SHUFL_BYTES_H == 4) { + local int *ldsOut = (local int *) lds + revMe; + local int *ldsIn = (local int *) lds + me; + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = as_int4(u[i]).x; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { int4 tmp = as_int4(u[i]); tmp.x = ldsIn[WG * i]; u[i] = as_double2(tmp); } + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = as_int4(u[i]).y; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { int4 tmp = as_int4(u[i]); tmp.y = ldsIn[WG * i]; u[i] = as_double2(tmp); } + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = as_int4(u[i]).z; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { int4 tmp = as_int4(u[i]); tmp.z = ldsIn[WG * i]; u[i] = as_double2(tmp); } + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = as_int4(u[i]).w; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { 
int4 tmp = as_int4(u[i]); tmp.w = ldsIn[WG * i]; u[i] = as_double2(tmp); } + } } // -// These versions are for the kernel(s) that uses a double-wide workgroup (u in half the workgroup, v in the other half) +// These versions are for the kernel(s) that use a double-wide workgroup (u in half the workgroup, v in the other half) // -void OVERLOAD reverse2(local T2 *lds, T2 *u) { +void OVERLOAD reverse2(local T2 *lds2, T2 *u) { u32 me = get_local_id(0); - - // For NH=8, u[0] to u[3] are left unchanged. Write to lds: - // u[7]rev u[6]rev - // u[5]rev u[4]rev - // v[7]rev v[6]rev - // v[5]rev v[4]rev - bar(); - for (u32 i = 0; i < NH / 2; ++i) { - u32 j = (i * G_H + me % G_H); - lds[me < G_H ? ((NH/2)*G_H - j) % ((NH/2)*G_H) : NH*G_H-1 - j] = u[NH/2 + i]; + u32 lowMe = me % G_H; + + if (SHUFL_BYTES_H >= 8) { + local T2 *lds = lds2; + if (me >= G_H) lds += SMALL_HEIGHT * SHUFL_BYTES_H / sizeof(T2); + // For NH=8, u[0] to u[3] are left unchanged. Write to lds: + // u[7]rev u[6]rev u[5]rev u[4]rev + // v[7]rev v[6]rev v[5]rev v[4]rev + bar(G_H); + for (u32 i = 0; i < NH/2; ++i) { lds[((NH/2 - i) * G_H - (me >= G_H ? 1 : 0) - lowMe) % (NH/2 * G_H)] = u[NH/2 + i]; } + // For NH=8, read from lds into u[i]: + // u[4] = u[7]rev v[7]rev + // u[5] = u[6]rev v[6]rev + // u[6] = u[5]rev v[5]rev + // u[7] = u[4]rev v[4]rev + bar(G_H); + for (u32 i = 0; i < NH/2; ++i) { u[NH/2 + i] = lds[i * G_H + lowMe]; } + } + + else if (SHUFL_BYTES_H == 4) { + local T *lds = (local T *) lds2; + if (me >= G_H) lds += SMALL_HEIGHT * SHUFL_BYTES_H / sizeof(T); + bar(G_H); + for (u32 i = 0; i < NH/2; ++i) { lds[((NH/2 - i) * G_H - (me >= G_H ? 1 : 0) - lowMe) % (NH/2 * G_H)] = u[NH/2 + i].x; } + bar(G_H); + for (u32 i = 0; i < NH/2; ++i) { u[NH/2 + i].x = lds[i * G_H + lowMe]; } + bar(G_H); + for (u32 i = 0; i < NH/2; ++i) { lds[((NH/2 - i) * G_H - (me >= G_H ? 
1 : 0) - lowMe) % (NH/2 * G_H)] = u[NH/2 + i].y; } + bar(G_H); + for (u32 i = 0; i < NH/2; ++i) { u[NH/2 + i].y = lds[i * G_H + lowMe]; } } - // For NH=8, read from lds into u[i]: - // u[4] = u[7]rev v[7]rev - // u[5] = u[6]rev v[6]rev - // u[6] = u[5]rev v[5]rev - // u[7] = u[4]rev v[4]rev - bar(); - lds += me % G_H + (me / G_H) * NH/2 * G_H; - for (u32 i = 0; i < NH / 2; ++i) { u[NH/2 + i] = lds[i * G_H]; } } +// This is used to reverse the second part of a line, and cross the reversed parts between the halves. +void OVERLOAD revCrossLine(local T2* lds2, T2 *u) { + u32 me = get_local_id(0); + u32 lowMe = me % G_H; + u32 revLowMe = G_H - 1 - lowMe; + + if (SHUFL_BYTES_H >= 8) { + local T2 *ldsOut = lds2; + local T2 *ldsIn = lds2; + if (me < G_H) ldsOut += SMALL_HEIGHT * SHUFL_BYTES_H / sizeof(T2); // Crossing LDS halves + else ldsIn += SMALL_HEIGHT * SHUFL_BYTES_H / sizeof(T2); // Staying within LDS halves (just like shufl) + bar(); // we need a full bar because we're crossing halves + for (u32 i = 0; i < NH/2; ++i) { ldsOut[G_H * (NH/2 - 1 - i) + revLowMe] = u[i + NH/2]; } + bar(); // we need a full bar because we just crossed halves. LDS reads are compatible with future shufl calls. 
+ for (u32 i = 0; i < NH/2; ++i) { u[i + NH/2] = ldsIn[G_H * i + lowMe]; } + } + + else if (SHUFL_BYTES_H == 4) { + local T *ldsOut = (local T *) lds2; + local T *ldsIn = (local T *) lds2; + if (me < G_H) ldsOut += SMALL_HEIGHT * SHUFL_BYTES_H / sizeof(T); + else ldsIn += SMALL_HEIGHT * SHUFL_BYTES_H / sizeof(T); + bar(); // we need a full bar because we're crossing halves + for (u32 i = 0; i < NH/2; ++i) { ldsOut[G_H * (NH/2 - 1 - i) + revLowMe] = u[i + NH/2].x; } + bar(); // we need a full bar because we just crossed halves + for (u32 i = 0; i < NH/2; ++i) { u[i + NH/2].x = ldsIn[G_H * i + lowMe]; } + bar(); // we need a full bar because we're crossing halves + for (u32 i = 0; i < NH/2; ++i) { ldsOut[G_H * (NH/2 - 1 - i) + revLowMe] = u[i + NH/2].y; } + bar(); // we need a full bar because we just crossed halves. LDS reads are compatible with future shufl calls. + for (u32 i = 0; i < NH/2; ++i) { u[i + NH/2].y = ldsIn[G_H * i + lowMe]; } + } +} + +#if 0 // Unused + // Somewhat similar to reverseLine. // The u values are in threads < G_H, the v values to reverse in threads >= G_H. // Whereas reverseLine leaves u values alone. This reverseLine moves u values around @@ -119,7 +216,7 @@ void OVERLOAD reverse2(local T2 *lds, T2 *u) { void OVERLOAD reverseLine2(local T2 *lds, T2 *u) { u32 me = get_local_id(0); -// NOTE: It is important that this routine use lds memory in coordination with shufl2. Failure to do so would require an +// NOTE: It is important that this routine use lds memory in coordination with shufl. Failure to do so would require an // unqualified bar() call here. Specifically, the u values are stored in the upper half of lds memory (SMALL_HEIGHT T2 values). // The v values are stored in the lower half of lds memory (the next SMALL_HEIGHT T2 values). 
@@ -191,89 +288,107 @@ void OVERLOAD unreverseLine2(local T2 *lds, T2 *u) { #endif +#endif + /**************************************************************************/ /* Similar to above, but for an FFT based on FP32 */ /**************************************************************************/ -#if FFT_FP32 +#if FFT_FP32 | NTT_GF31 void OVERLOAD reverse(u32 WG, local F2 *lds, F2 *u, bool bump) { u32 me = get_local_id(0); u32 revMe = WG - 1 - me + bump; - bar(); - + if (SHUFL_BYTES_H >= 4) { + bar(WG); #if NH == 8 - lds[revMe + 0 * WG] = u[3]; - lds[revMe + 1 * WG] = u[2]; - lds[revMe + 2 * WG] = u[1]; - lds[bump ? ((revMe + 3 * WG) % (4 * WG)) : (revMe + 3 * WG)] = u[0]; + lds[revMe + 0 * WG] = u[3]; + lds[revMe + 1 * WG] = u[2]; + lds[revMe + 2 * WG] = u[1]; + lds[bump ? ((revMe + 3 * WG) % (4 * WG)) : (revMe + 3 * WG)] = u[0]; #elif NH == 4 - lds[revMe + 0 * WG] = u[1]; - lds[bump ? ((revMe + WG) % (2 * WG)) : (revMe + WG)] = u[0]; -#else -#error + lds[revMe + 0 * WG] = u[1]; + lds[bump ? ((revMe + WG) % (2 * WG)) : (revMe + WG)] = u[0]; #endif - - bar(); - for (i32 i = 0; i < NH/2; ++i) { u[i] = lds[i * WG + me]; } + bar(WG); + for (i32 i = 0; i < NH/2; ++i) { u[i] = lds[i * WG + me]; } + } } -void OVERLOAD reverseLine(u32 WG, local F2 *lds2, F2 *u) { +void OVERLOAD reverseLine(u32 WG, local F2 *lds, F2 *u) { u32 me = get_local_id(0); u32 revMe = WG - 1 - me; - local F2 *lds = lds2 + revMe; - bar(); - for (u32 i = 0; i < NH; ++i) { lds[WG * (NH - 1 - i)] = u[i]; } - - lds = lds2 + me; - bar(); - for (u32 i = 0; i < NH; ++i) { u[i] = lds[WG * i]; } -} - -// This is used to reverse the second part of a line, and cross the reversed parts between the halves. 
-void OVERLOAD revCrossLine(u32 WG, local F2* lds2, F2 *u, u32 n, bool writeSecondHalf) { - u32 me = get_local_id(0); - u32 lowMe = me % WG; - - u32 revLowMe = WG - 1 - lowMe; - - for (u32 i = 0; i < n; ++i) { lds2[WG * n * writeSecondHalf + WG * (n - 1 - i) + revLowMe] = u[i]; } - - bar(); // we need a full bar because we're crossing halves + if (SHUFL_BYTES_H >= 8) { + local F2 *ldsOut = lds + revMe; + local F2 *ldsIn = lds + me; + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = u[i]; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { u[i] = ldsIn[WG * i]; } + } - for (u32 i = 0; i < n; ++i) { u[i] = lds2[WG * n * !writeSecondHalf + WG * i + lowMe]; } + else if (SHUFL_BYTES_H == 4) { + local F *ldsOut = (local F *) lds + revMe; + local F *ldsIn = (local F *) lds + me; + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = u[i].x; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { u[i].x = ldsIn[WG * i]; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { ldsOut[WG * (NH - 1 - i)] = u[i].y; } + bar(WG); + for (u32 i = 0; i < NH; ++i) { u[i].y = ldsIn[WG * i]; } + } } // -// These versions are for the kernel(s) that uses a double-wide workgroup (u in half the workgroup, v in the other half) +// These versions are for the kernel(s) that use a double-wide workgroup (u in half the workgroup, v in the other half) // void OVERLOAD reverse2(local F2 *lds, F2 *u) { u32 me = get_local_id(0); + u32 lowMe = me % G_H; + + if (SHUFL_BYTES_H >= 4) { + if (me >= G_H) lds += SMALL_HEIGHT * SHUFL_BYTES_H / sizeof(F2); + // For NH=8, u[0] to u[3] are left unchanged. Write to lds: + // u[7]rev u[6]rev u[5]rev u[4]rev + // v[7]rev v[6]rev v[5]rev v[4]rev + bar(G_H); + for (u32 i = 0; i < NH/2; ++i) { lds[((NH/2 - i) * G_H - (me >= G_H ? 
1 : 0) - lowMe) % (NH/2 * G_H)] = u[NH/2 + i]; } + // For NH=8, read from lds into u[i]: + // u[4] = u[7]rev v[7]rev + // u[5] = u[6]rev v[6]rev + // u[6] = u[5]rev v[5]rev + // u[7] = u[4]rev v[4]rev + bar(G_H); + for (u32 i = 0; i < NH/2; ++i) { u[NH/2 + i] = lds[i * G_H + lowMe]; } + } +} - // For NH=8, u[0] to u[3] are left unchanged. Write to lds: - // u[7]rev u[6]rev - // u[5]rev u[4]rev - // v[7]rev v[6]rev - // v[5]rev v[4]rev - bar(); - for (u32 i = 0; i < NH / 2; ++i) { - u32 j = (i * G_H + me % G_H); - lds[me < G_H ? ((NH/2)*G_H - j) % ((NH/2)*G_H) : NH*G_H-1 - j] = u[NH/2 + i]; +// This is used to reverse the second part of a line, and cross the reversed parts between the halves. +void OVERLOAD revCrossLine(local F2* lds2, F2 *u) { + u32 me = get_local_id(0); + u32 lowMe = me % G_H; + u32 revLowMe = G_H - 1 - lowMe; + + if (SHUFL_BYTES_H >= 4) { + local F2 *ldsOut = lds2; + local F2 *ldsIn = lds2; + if (me < G_H) ldsOut += SMALL_HEIGHT * SHUFL_BYTES_H / sizeof(F2); + else ldsIn += SMALL_HEIGHT * SHUFL_BYTES_H / sizeof(F2); + bar(); // we need a full bar because we're crossing halves + for (u32 i = 0; i < NH/2; ++i) { ldsOut[G_H * (NH/2 - 1 - i) + revLowMe] = u[i + NH/2]; } + bar(); // we need a full bar because we just crossed halves. LDS reads are compatible with future shufl calls. + for (u32 i = 0; i < NH/2; ++i) { u[i + NH/2] = ldsIn[G_H * i + lowMe]; } } - // For NH=8, read from lds into u[i]: - // u[4] = u[7]rev v[7]rev - // u[5] = u[6]rev v[6]rev - // u[6] = u[5]rev v[5]rev - // u[7] = u[4]rev v[4]rev - bar(); - lds += me % G_H + (me / G_H) * NH/2 * G_H; - for (u32 i = 0; i < NH / 2; ++i) { u[NH/2 + i] = lds[i * G_H]; } } +#if 0 // Unused + // Somewhat similar to reverseLine. // The u values are in threads < G_H, the v values to reverse in threads >= G_H. // Whereas reverseLine leaves u values alone. 
This reverseLine moves u values around @@ -357,6 +472,8 @@ void OVERLOAD unreverseLine2(local F2 *lds, F2 *u) { #endif +#endif + /**************************************************************************/ /* Similar to above, but for an NTT based on GF(M31^2) */ @@ -365,161 +482,32 @@ void OVERLOAD unreverseLine2(local F2 *lds, F2 *u) { #if NTT_GF31 void OVERLOAD reverse(u32 WG, local GF31 *lds, GF31 *u, bool bump) { - u32 me = get_local_id(0); - u32 revMe = WG - 1 - me + bump; - - bar(); - -#if NH == 8 - lds[revMe + 0 * WG] = u[3]; - lds[revMe + 1 * WG] = u[2]; - lds[revMe + 2 * WG] = u[1]; - lds[bump ? ((revMe + 3 * WG) % (4 * WG)) : (revMe + 3 * WG)] = u[0]; -#elif NH == 4 - lds[revMe + 0 * WG] = u[1]; - lds[bump ? ((revMe + WG) % (2 * WG)) : (revMe + WG)] = u[0]; -#else -#error -#endif - - bar(); - for (i32 i = 0; i < NH/2; ++i) { u[i] = lds[i * WG + me]; } + reverse(WG, (local F2 *) lds, (F2 *) u, bump); } -void OVERLOAD reverseLine(u32 WG, local GF31 *lds2, GF31 *u) { - u32 me = get_local_id(0); - u32 revMe = WG - 1 - me; - - local GF31 *lds = lds2 + revMe; - bar(); - for (u32 i = 0; i < NH; ++i) { lds[WG * (NH - 1 - i)] = u[i]; } - - lds = lds2 + me; - bar(); - for (u32 i = 0; i < NH; ++i) { u[i] = lds[WG * i]; } +void OVERLOAD reverseLine(u32 WG, local GF31 *lds, GF31 *u) { + reverseLine(WG, (local F2 *) lds, (F2 *) u); } -// This is used to reverse the second part of a line, and cross the reversed parts between the halves. 
-void OVERLOAD revCrossLine(u32 WG, local GF31* lds2, GF31 *u, u32 n, bool writeSecondHalf) { - u32 me = get_local_id(0); - u32 lowMe = me % WG; - - u32 revLowMe = WG - 1 - lowMe; - - for (u32 i = 0; i < n; ++i) { lds2[WG * n * writeSecondHalf + WG * (n - 1 - i) + revLowMe] = u[i]; } - - bar(); // we need a full bar because we're crossing halves - - for (u32 i = 0; i < n; ++i) { u[i] = lds2[WG * n * !writeSecondHalf + WG * i + lowMe]; } -} - -// -// These versions are for the kernel(s) that uses a double-wide workgroup (u in half the workgroup, v in the other half) -// - void OVERLOAD reverse2(local GF31 *lds, GF31 *u) { - u32 me = get_local_id(0); - - // For NH=8, u[0] to u[3] are left unchanged. Write to lds: - // u[7]rev u[6]rev - // u[5]rev u[4]rev - // v[7]rev v[6]rev - // v[5]rev v[4]rev - bar(); - for (u32 i = 0; i < NH / 2; ++i) { - u32 j = (i * G_H + me % G_H); - lds[me < G_H ? ((NH/2)*G_H - j) % ((NH/2)*G_H) : NH*G_H-1 - j] = u[NH/2 + i]; - } - // For NH=8, read from lds into u[i]: - // u[4] = u[7]rev v[7]rev - // u[5] = u[6]rev v[6]rev - // u[6] = u[5]rev v[5]rev - // u[7] = u[4]rev v[4]rev - bar(); - lds += me % G_H + (me / G_H) * NH/2 * G_H; - for (u32 i = 0; i < NH / 2; ++i) { u[NH/2 + i] = lds[i * G_H]; } + reverse2((local F2 *) lds, (F2 *) u); } -// Somewhat similar to reverseLine. -// The u values are in threads < G_H, the v values to reverse in threads >= G_H. -// Whereas reverseLine leaves u values alone. This reverseLine moves u values around -// so that pairSq2 can easily operate on pairs. This means for NH = 4, web output: -// u[0] u[1] // Returned in u[0] -// u[2] u[3] // Returned in u[1] -// v[3]rev v[2]rev // Returned in u[2] -// v[1]rev v[0]rev // Returned in u[3] -void OVERLOAD reverseLine2(local GF31 *lds, GF31 *u) { - u32 me = get_local_id(0); - -// NOTE: It is important that this routine use lds memory in coordination with shufl2. Failure to do so would require an -// unqualified bar() call here. 
Specifically, the u values are stored in the upper half of lds memory (SMALL_HEIGHT GF31 values). -// The v values are stored in the lower half of lds memory (the next SMALL_HEIGHT GF31 values). - - if (G_H > WAVEFRONT) bar(); - -// For NH=4, the lds indices (where to write each incoming u[i] which has v[i] in the upper threads) looks like this: -// 0..GH-1 +0*G_H GH-1..0 +7*G_H -// 0..GH-1 +1*G_H GH-1..0 +6*G_H -// 0..GH-1 +2*G_H GH-1..0 +5*G_H -// 0..GH-1 +3*G_H GH-1..0 +4*G_H -// That means saving to lds using index: me < G_H ? me % G_H + i * G_H : 8*G_H-1 - me % G_H - i * G_H - -#if 1 - local GF31 *ldsOut = lds + (me < G_H ? me % G_H : (NH*2)*G_H-1 - me % G_H); - i32 ldsOutInc = (me < G_H) ? G_H : -G_H; - for (u32 i = 0; i < NH; ++i, ldsOut += ldsOutInc) { *ldsOut = u[i]; } +void OVERLOAD revCrossLine(local GF31* lds, GF31 *u) { + revCrossLine((local F2 *) lds, (F2 *) u); +} - lds += me; - bar(); - for (u32 i = 0; i < NH; ++i) { u[i] = lds[i * 2*G_H]; } -#else - local Z61 *ldsOut = (local Z61 *) lds + (me < G_H ? me % G_H : (NH*2)*G_H-1 - me % G_H); - i32 ldsOutInc = (me < G_H) ? G_H : -G_H; - for (u32 i = 0; i < NH; ++i, ldsOut += ldsOutInc) { ldsOut[0] = u[i].x; ldsOut[NH*2*G_H] = u[i].y; } +#if 0 // Unused - local ZF61 *ldsIn = (local T *) lds + me; - bar(); - for (u32 i = 0; i < NH; ++i) { u[i].x = ldsIn[i * 2*G_H]; u[i].y = ldsIn[NH*2*G_H + i * 2*G_H]; } -#endif +void OVERLOAD reverseLine2(local GF31 *lds, GF31 *u) { + reverseLine2((local F2 *) lds, (F2 *) u); } -// Undo a reverseLine2 void OVERLOAD unreverseLine2(local GF31 *lds, GF31 *u) { - u32 me = get_local_id(0); - -// NOTE: It is important that this routine use lds memory in coordination with reverseLine2 and shufl2. By initially -// writing to the lds locations that reverseLine2 read from we do not need an initial bar() call here. 
Also, by reading -// from the lds locations that shufl2 will use (u values in the upper half of lds memory, v values in the lower half of -// lds memory) we can issue a qualified bar() call before calling FFT_HEIGHT2. - -#if 1 - local GF31 *ldsOut = lds + me; - for (u32 i = 0; i < NH; ++i) { ldsOut[i * 2*G_H] = u[i]; } - -// For NH=4, the lds indices (where to read each outgoing u[i] which has v[i] in the upper threads) looks like this: -// 0..GH-1 +0*G_H GH-1..0 +7*G_H -// 0..GH-1 +1*G_H GH-1..0 +6*G_H -// 0..GH-1 +2*G_H GH-1..0 +5*G_H -// 0..GH-1 +3*G_H GH-1..0 +4*G_H - lds += (me < G_H) ? me % G_H : (NH*2)*G_H-1 - me % G_H; - i32 ldsInc = (me < G_H) ? G_H : -G_H; - bar(); - for (u32 i = 0; i < NH; ++i, lds += ldsInc) { u[i] = *lds; } -#else - local Z61 *ldsOut = (local T *) lds + me; - for (u32 i = 0; i < NH; ++i) { ldsOut[i * 2*G_H] = u[i].x; ldsOut[NH*2*G_H + i * 2*G_H] = u[i].y; } + unreverseLine2((local F2 *) lds, (F2 *) u); +} -// For NH=4, the lds indices (where to read each outgoing u[i] which has v[i] in the upper threads) looks like this: -// 0..GH-1 +0*G_H GH-1..0 +7*G_H -// 0..GH-1 +1*G_H GH-1..0 +6*G_H -// 0..GH-1 +2*G_H GH-1..0 +5*G_H -// 0..GH-1 +3*G_H GH-1..0 +4*G_H - local Z61 *ldsIn = (local T *) lds + ((me < G_H) ? me % G_H : (NH*2)*G_H-1 - me % G_H); - i32 ldsInc = (me < G_H) ? G_H : -G_H; - bar(); - for (u32 i = 0; i < NH; ++i, ldsIn += ldsInc) { u[i].x = ldsIn[0]; u[i].y = ldsIn[NH*2*G_H]; } #endif -} #endif @@ -531,160 +519,31 @@ void OVERLOAD unreverseLine2(local GF31 *lds, GF31 *u) { #if NTT_GF61 void OVERLOAD reverse(u32 WG, local GF61 *lds, GF61 *u, bool bump) { - u32 me = get_local_id(0); - u32 revMe = WG - 1 - me + bump; - - bar(); - -#if NH == 8 - lds[revMe + 0 * WG] = u[3]; - lds[revMe + 1 * WG] = u[2]; - lds[revMe + 2 * WG] = u[1]; - lds[bump ? ((revMe + 3 * WG) % (4 * WG)) : (revMe + 3 * WG)] = u[0]; -#elif NH == 4 - lds[revMe + 0 * WG] = u[1]; - lds[bump ? 
((revMe + WG) % (2 * WG)) : (revMe + WG)] = u[0]; -#else -#error -#endif - - bar(); - for (i32 i = 0; i < NH/2; ++i) { u[i] = lds[i * WG + me]; } + reverse(WG, (local T2 *) lds, (T2 *) u, bump); } -void OVERLOAD reverseLine(u32 WG, local GF61 *lds2, GF61 *u) { - u32 me = get_local_id(0); - u32 revMe = WG - 1 - me; - - local GF61 *lds = lds2 + revMe; - bar(); - for (u32 i = 0; i < NH; ++i) { lds[WG * (NH - 1 - i)] = u[i]; } - - lds = lds2 + me; - bar(); - for (u32 i = 0; i < NH; ++i) { u[i] = lds[WG * i]; } +void OVERLOAD reverseLine(u32 WG, local GF61 *lds, GF61 *u) { + reverseLine(WG, (local T2 *) lds, (T2 *) u); } -// This is used to reverse the second part of a line, and cross the reversed parts between the halves. -void OVERLOAD revCrossLine(u32 WG, local GF61* lds2, GF61 *u, u32 n, bool writeSecondHalf) { - u32 me = get_local_id(0); - u32 lowMe = me % WG; - - u32 revLowMe = WG - 1 - lowMe; - - for (u32 i = 0; i < n; ++i) { lds2[WG * n * writeSecondHalf + WG * (n - 1 - i) + revLowMe] = u[i]; } - - bar(); // we need a full bar because we're crossing halves - - for (u32 i = 0; i < n; ++i) { u[i] = lds2[WG * n * !writeSecondHalf + WG * i + lowMe]; } -} - -// -// These versions are for the kernel(s) that uses a double-wide workgroup (u in half the workgroup, v in the other half) -// - void OVERLOAD reverse2(local GF61 *lds, GF61 *u) { - u32 me = get_local_id(0); - - // For NH=8, u[0] to u[3] are left unchanged. Write to lds: - // u[7]rev u[6]rev - // u[5]rev u[4]rev - // v[7]rev v[6]rev - // v[5]rev v[4]rev - bar(); - for (u32 i = 0; i < NH / 2; ++i) { - u32 j = (i * G_H + me % G_H); - lds[me < G_H ? 
((NH/2)*G_H - j) % ((NH/2)*G_H) : NH*G_H-1 - j] = u[NH/2 + i]; - } - // For NH=8, read from lds into u[i]: - // u[4] = u[7]rev v[7]rev - // u[5] = u[6]rev v[6]rev - // u[6] = u[5]rev v[5]rev - // u[7] = u[4]rev v[4]rev - bar(); - lds += me % G_H + (me / G_H) * NH/2 * G_H; - for (u32 i = 0; i < NH / 2; ++i) { u[NH/2 + i] = lds[i * G_H]; } + reverse2((local T2 *) lds, (T2 *) u); } -// Somewhat similar to reverseLine. -// The u values are in threads < G_H, the v values to reverse in threads >= G_H. -// Whereas reverseLine leaves u values alone. This reverseLine moves u values around -// so that pairSq2 can easily operate on pairs. This means for NH = 4, web output: -// u[0] u[1] // Returned in u[0] -// u[2] u[3] // Returned in u[1] -// v[3]rev v[2]rev // Returned in u[2] -// v[1]rev v[0]rev // Returned in u[3] -void OVERLOAD reverseLine2(local GF61 *lds, GF61 *u) { - u32 me = get_local_id(0); - -// NOTE: It is important that this routine use lds memory in coordination with shufl2. Failure to do so would require an -// unqualified bar() call here. Specifically, the u values are stored in the upper half of lds memory (SMALL_HEIGHT GF61 values). -// The v values are stored in the lower half of lds memory (the next SMALL_HEIGHT GF61 values). - - if (G_H > WAVEFRONT) bar(); - -// For NH=4, the lds indices (where to write each incoming u[i] which has v[i] in the upper threads) looks like this: -// 0..GH-1 +0*G_H GH-1..0 +7*G_H -// 0..GH-1 +1*G_H GH-1..0 +6*G_H -// 0..GH-1 +2*G_H GH-1..0 +5*G_H -// 0..GH-1 +3*G_H GH-1..0 +4*G_H -// That means saving to lds using index: me < G_H ? me % G_H + i * G_H : 8*G_H-1 - me % G_H - i * G_H - -#if 1 - local GF61 *ldsOut = lds + (me < G_H ? me % G_H : (NH*2)*G_H-1 - me % G_H); - i32 ldsOutInc = (me < G_H) ? 
G_H : -G_H; - for (u32 i = 0; i < NH; ++i, ldsOut += ldsOutInc) { *ldsOut = u[i]; } +void OVERLOAD revCrossLine(local GF61* lds, GF61 *u) { + revCrossLine((local T2 *) lds, (T2 *) u); +} - lds += me; - bar(); - for (u32 i = 0; i < NH; ++i) { u[i] = lds[i * 2*G_H]; } -#else - local Z61 *ldsOut = (local Z61 *) lds + (me < G_H ? me % G_H : (NH*2)*G_H-1 - me % G_H); - i32 ldsOutInc = (me < G_H) ? G_H : -G_H; - for (u32 i = 0; i < NH; ++i, ldsOut += ldsOutInc) { ldsOut[0] = u[i].x; ldsOut[NH*2*G_H] = u[i].y; } +#if 0 // Unused - local ZF61 *ldsIn = (local T *) lds + me; - bar(); - for (u32 i = 0; i < NH; ++i) { u[i].x = ldsIn[i * 2*G_H]; u[i].y = ldsIn[NH*2*G_H + i * 2*G_H]; } -#endif +void OVERLOAD reverseLine2(local GF61 *lds, GF61 *u) { + reverseLine2((local T2 *) lds, (T2 *) u); } -// Undo a reverseLine2 void OVERLOAD unreverseLine2(local GF61 *lds, GF61 *u) { - u32 me = get_local_id(0); - -// NOTE: It is important that this routine use lds memory in coordination with reverseLine2 and shufl2. By initially -// writing to the lds locations that reverseLine2 read from we do not need an initial bar() call here. Also, by reading -// from the lds locations that shufl2 will use (u values in the upper half of lds memory, v values in the lower half of -// lds memory) we can issue a qualified bar() call before calling FFT_HEIGHT2. - -#if 1 - local GF61 *ldsOut = lds + me; - for (u32 i = 0; i < NH; ++i) { ldsOut[i * 2*G_H] = u[i]; } - -// For NH=4, the lds indices (where to read each outgoing u[i] which has v[i] in the upper threads) looks like this: -// 0..GH-1 +0*G_H GH-1..0 +7*G_H -// 0..GH-1 +1*G_H GH-1..0 +6*G_H -// 0..GH-1 +2*G_H GH-1..0 +5*G_H -// 0..GH-1 +3*G_H GH-1..0 +4*G_H - lds += (me < G_H) ? me % G_H : (NH*2)*G_H-1 - me % G_H; - i32 ldsInc = (me < G_H) ? 
G_H : -G_H; - bar(); - for (u32 i = 0; i < NH; ++i, lds += ldsInc) { u[i] = *lds; } -#else - local Z61 *ldsOut = (local T *) lds + me; - for (u32 i = 0; i < NH; ++i) { ldsOut[i * 2*G_H] = u[i].x; ldsOut[NH*2*G_H + i * 2*G_H] = u[i].y; } + unreverseLine2((local T2 *) lds, (T2 *) u); +} -// For NH=4, the lds indices (where to read each outgoing u[i] which has v[i] in the upper threads) looks like this: -// 0..GH-1 +0*G_H GH-1..0 +7*G_H -// 0..GH-1 +1*G_H GH-1..0 +6*G_H -// 0..GH-1 +2*G_H GH-1..0 +5*G_H -// 0..GH-1 +3*G_H GH-1..0 +4*G_H - local Z61 *ldsIn = (local T *) lds + ((me < G_H) ? me % G_H : (NH*2)*G_H-1 - me % G_H); - i32 ldsInc = (me < G_H) ? G_H : -G_H; - bar(); - for (u32 i = 0; i < NH; ++i, ldsIn += ldsInc) { u[i].x = ldsIn[0]; u[i].y = ldsIn[NH*2*G_H]; } #endif -} #endif diff --git a/src/tune.cpp b/src/tune.cpp index 4d613d0f..ae376054 100644 --- a/src/tune.cpp +++ b/src/tune.cpp @@ -911,7 +911,7 @@ void Tune::tune() { } // Find best BIGLIT setting - if (time_FFTs) { + if (0 && time_FFTs) { // Deprecated FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_biglit = 0; @@ -954,7 +954,7 @@ void Tune::tune() { config.write("\n -log 1000000\n"); } if (args->workers < 2) { - config.write("\n# Running two workers sometimes gives better throughput. Autoprimenet will need to create up a second worktodo file."); + config.write("\n# Running two workers sometimes gives better throughput. AutoPrimenet will need to create a second worktodo file."); config.write("\n# -workers 2\n"); config.write("\n# Changing TAIL_KERNELS to 3 when running two workers may be better."); config.write("\n# -use TAIL_KERNELS=3\n"); From 08036f356c6b6a0fa8c28d37880426ed649e939c Mon Sep 17 00:00:00 2001 From: george Date: Thu, 12 Mar 2026 02:00:30 +0000 Subject: [PATCH 9/9] Testing indicates WMUL=2 should be the default. RTX4xxx and RTX5xxx GPUs benefit from L2STORE and LULOAD. Added support for those options officially.
Since FAST_BARRIER seems to now work on nVidia, the option is now tuned. --- src/Gpu.cpp | 4 +++- src/cl/base.cl | 8 +++++++ src/tune.cpp | 63 +++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 2 deletions(-) diff --git a/src/Gpu.cpp b/src/Gpu.cpp index 84cc4434..6d00c0fd 100644 --- a/src/Gpu.cpp +++ b/src/Gpu.cpp @@ -228,7 +228,7 @@ string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector< // Default value for -use options that must also be parsed in C++ code tail_single_wide = 0, tail_single_kernel = 1; // Default tailSquare is double-wide in one kernel in_place = 0; // Default is not in-place - wmul = 1; // Default is carryFused processes one workgroup at a time + wmul = 2; // Default is carryFused processes two lines at a time pad_size = isAmdGpu(id) ? 256 : 0; // Default is 256 bytes for AMD, 0 for others // Validate -use options @@ -263,6 +263,8 @@ string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector< "TABMUL_CHAIN32", "TABMUL_CHAIN61", "MODM31", + "ENABLE_L2STORE", + "ENABLE_LULOAD", "WMUL" }); if (!isValid) { diff --git a/src/cl/base.cl b/src/cl/base.cl index f252cfe1..1765fdfe 100644 --- a/src/cl/base.cl +++ b/src/cl/base.cl @@ -189,6 +189,14 @@ G_H "group height" == SMALL_HEIGHT / NH #define ZEROHACK_H 1 #endif +#if !defined(ENABLE_L2STORE) +#define ENABLE_L2STORE 1 +#endif + +#if !defined(ENABLE_LULOAD) +#define ENABLE_LULOAD 1 +#endif + // Expected defines: EXP the exponent. // WIDTH, SMALL_HEIGHT, MIDDLE. 
diff --git a/src/tune.cpp b/src/tune.cpp index ae376054..76474bcb 100644 --- a/src/tune.cpp +++ b/src/tune.cpp @@ -343,6 +343,7 @@ void Tune::tune() { // There are some options and variants that are different based on GPU manufacturer bool AMDGPU = isAmdGpu(q->context->deviceId()); + bool NVIDIAGPU = isNvidiaGpu(q->context->deviceId()); bool tune_config = 1; bool time_FFTs = 0; @@ -601,7 +602,7 @@ void Tune::tune() { } // Find best FAST_BARRIER setting - if (AMDGPU) { + if (1 /*AMDGPU*/) { // FAST_BARRIER now works for nVidia GPUs too (from what I've seen) FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; u64 exponent = primes.prevPrime(fft.maxExp()); u32 best_fast_barrier = 0; @@ -910,6 +911,66 @@ void Tune::tune() { args->flags["ZEROHACK_H"] = to_string(best_zerohack_h); } + // Find best WMUL setting + if (1) { + FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; + u64 exponent = primes.prevPrime(fft.maxExp()); + u32 best_wmul = 0; + u32 current_wmul = args->value("WMUL", 2); + double best_cost = -1.0; + double current_cost = -1.0; + for (u32 wmul : {1, 2, 4}) { + args->flags["WMUL"] = to_string(wmul); + double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP(quick); + log("Time for %12s using WMUL=%u is %6.1f\n", fft.spec().c_str(), wmul, cost); + if (wmul == current_wmul) current_cost = cost; + if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_wmul = wmul; } + } + log("Best WMUL is %u. 
Default WMUL is 2.\n", best_wmul); + configsUpdate(current_cost, best_cost, 0.003, "WMUL", best_wmul, newConfigKeyVals, suggestedConfigKeyVals); + args->flags["WMUL"] = to_string(best_wmul); + } + + // Find best ENABLE_L2STORE setting + if (NVIDIAGPU) { + FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; + u64 exponent = primes.prevPrime(fft.maxExp()); + u32 best_enable_l2store = 0; + u32 current_enable_l2store = args->value("ENABLE_L2STORE", 2); + double best_cost = -1.0; + double current_cost = -1.0; + for (u32 enable_l2store : {0, 1}) { + args->flags["ENABLE_L2STORE"] = to_string(enable_l2store); + double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP(quick); + log("Time for %12s using ENABLE_L2STORE=%u is %6.1f\n", fft.spec().c_str(), enable_l2store, cost); + if (enable_l2store == current_enable_l2store) current_cost = cost; + if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_enable_l2store = enable_l2store; } + } + log("Best ENABLE_L2STORE is %u. Default ENABLE_L2STORE is 1.\n", best_enable_l2store); + configsUpdate(current_cost, best_cost, 0.003, "ENABLE_L2STORE", best_enable_l2store, newConfigKeyVals, suggestedConfigKeyVals); + args->flags["ENABLE_L2STORE"] = to_string(best_enable_l2store); + } + + // Find best ENABLE_LULOAD setting + if (NVIDIAGPU) { + FFTConfig fft{*defaultShape, variant, CARRY_AUTO}; + u64 exponent = primes.prevPrime(fft.maxExp()); + u32 best_enable_luload = 0; + u32 current_enable_luload = args->value("ENABLE_LULOAD", 2); + double best_cost = -1.0; + double current_cost = -1.0; + for (u32 enable_luload : {0, 1}) { + args->flags["ENABLE_LULOAD"] = to_string(enable_luload); + double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP(quick); + log("Time for %12s using ENABLE_LULOAD=%u is %6.1f\n", fft.spec().c_str(), enable_luload, cost); + if (enable_luload == current_enable_luload) current_cost = cost; + if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_enable_luload = 
enable_luload; } + } + log("Best ENABLE_LULOAD is %u. Default ENABLE_LULOAD is 1.\n", best_enable_luload); + configsUpdate(current_cost, best_cost, 0.003, "ENABLE_LULOAD", best_enable_luload, newConfigKeyVals, suggestedConfigKeyVals); + args->flags["ENABLE_LULOAD"] = to_string(best_enable_luload); + } + // Find best BIGLIT setting if (0 && time_FFTs) { // Deprecated FFTConfig fft{*defaultShape, variant, CARRY_AUTO};