diff --git a/framework/audio/common/CMakeLists.txt b/framework/audio/common/CMakeLists.txt
index 14ff38cce5..ca9237da11 100644
--- a/framework/audio/common/CMakeLists.txt
+++ b/framework/audio/common/CMakeLists.txt
@@ -33,6 +33,13 @@ target_sources(muse_audio_common PRIVATE
     iaudiothreadsecurer.h
     audiothreadsecurer.cpp
     audiothreadsecurer.h
+    audioworkgroup.h
+    audioworkgroup.cpp
+    audiotaskscheduler.h
+    iaudiotaskscheduler.h
+    realtimethreadpool.h
+    concurrentqueue.h
+    lightweightsemaphore.h
     workmode.cpp
     workmode.h
     alignmentbuffer.h
diff --git a/framework/audio/common/audiotaskscheduler.h b/framework/audio/common/audiotaskscheduler.h
new file mode 100644
index 0000000000..dac2c2716f
--- /dev/null
+++ b/framework/audio/common/audiotaskscheduler.h
@@ -0,0 +1,97 @@
+/*
+ * SPDX-License-Identifier: GPL-3.0-only
+ * MuseScore-CLA-applies
+ *
+ * MuseScore
+ * Music Composition & Notation
+ *
+ * Copyright (C) 2026 MuseScore Limited and others
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "audioworkgroup.h"
+#include "realtimethreadpool.h"
+#include "iaudiodriver.h"
+#include "iaudiotaskscheduler.h"
+#include "audiosanitizer.h"
+#include "global/async/asyncable.h"
+#include "global/log.h"
+#include <mutex>
+
+namespace muse::audio {
+class AudioTaskScheduler : public IAudioTaskScheduler, public muse::async::Asyncable
+{
+    constexpr static const char* threadpoolName = "audio_realtime_thread";
+public:
+    void submitRealtimeTasksAndWait(const std::vector<Task>& tasks) override
+    {
+        std::lock_guard lock(m_threadPoolMutex);
+        for (const auto& task : tasks) {
+            IF_ASSERT_FAILED(m_threadPool->enqueue(task)) {
+                task();
+            }
+        }
+        m_threadPool->participateAndWait();
+    }
+
+    void setAudioDriver(const IAudioDriverPtr& audioDriver)
+    {
+        if (!audioDriver) {
+            return;
+        }
+        setWorkgroup(audioDriver->getAudioWorkGroup());
+        audioDriver->currentWorkgroupChanged().onNotify(
+            this, [this, audioDriverWeak = std::weak_ptr<IAudioDriver>(audioDriver)]() {
+            if (auto audioDriver = audioDriverWeak.lock()) {
+                setWorkgroup(audioDriver->getAudioWorkGroup());
+            }
+        });
+    }
+
+private:
+
+    void setWorkgroup(const AudioWorkGroup& workGroup)
+    {
+        std::lock_guard lock(m_threadPoolMutex);
+        ensureThreadPoolSize(workGroup);
+        m_threadPool->setAudioWorkgroup(workGroup);
+    }
+
+    int getIdealThreadCount(AudioWorkGroup workGroup) const
+    {
+        int bestThreadHint = std::thread::hardware_concurrency();
+        if (workGroup.getProvider() != nullptr) {
+            bestThreadHint = workGroup.getMaxParallelThreadCount();
+        }
+        constexpr int hardwareToRealtimeRatio = 2; // This is a heuristic value. The optimal value may vary depending on the workload and system.
+        return bestThreadHint > 0 ? static_cast<int>(bestThreadHint / hardwareToRealtimeRatio) : 1;
+    }
+
+    void ensureThreadPoolSize(const AudioWorkGroup& currentWorkGroup)
+    {
+        auto idealWorkerCount = getIdealThreadCount(currentWorkGroup);
+        std::lock_guard lock(m_threadPoolMutex);
+        if (m_threadPool->getNumberOfWorkers() != idealWorkerCount) {
+            m_threadPool = std::make_unique<RealtimeThreadPool>(threadpoolName, idealWorkerCount);
+            AudioSanitizer::setMixerThreads(m_threadPool->threadIdSet());
+        }
+    }
+
+    std::unique_ptr<RealtimeThreadPool> m_threadPool{ std::make_unique<RealtimeThreadPool>(threadpoolName, getIdealThreadCount({})) };
+
+    std::recursive_mutex m_threadPoolMutex;
+};
+}
diff --git a/framework/audio/common/audioworkgroup.cpp b/framework/audio/common/audioworkgroup.cpp
new file mode 100644
index 0000000000..2ccd009822
--- /dev/null
+++ b/framework/audio/common/audioworkgroup.cpp
@@ -0,0 +1,244 @@
+/*
+ * SPDX-License-Identifier: GPL-3.0-only
+ * MuseScore-CLA-applies
+ *
+ * MuseScore
+ * Music Composition & Notation
+ *
+ * Copyright (C) 2026 MuseScore Limited and others
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include "audioworkgroup.h"
+
+#include <memory>
+#include <utility>
+
+#ifdef __APPLE__
+#include <os/object.h>
+#include <os/workgroup.h>
+#else
+#include <thread>
+#endif
+
+namespace muse::audio {
+#ifdef __APPLE__
+class AudioWorkgroupTokenProvider
+{
+public:
+    AudioWorkgroupTokenProvider(os_workgroup_t workgroup)
+        : m_workgroup(workgroup)
+    {
+        if (!m_workgroup) {
+            return;
+        }
+
+        if (__builtin_available(macOS 11.0, *)) {
+            os_retain(m_workgroup);
+            auto status = os_workgroup_join(m_workgroup, &m_joinToken);
+            if (status == 0) {
+                return;
+            }
+
+            os_release(m_workgroup);
+            m_workgroup = nullptr;
+            m_joinToken = {};
+            return;
+        }
+
+        m_workgroup = nullptr;
+    }
+
+    bool isAttachedTo(os_workgroup_t wg) const { return m_workgroup == wg; }
+    bool isValid() const { return m_workgroup != nullptr; }
+
+    AudioWorkgroupTokenProvider(const AudioWorkgroupTokenProvider& other) = delete;
+
+    AudioWorkgroupTokenProvider& operator=(const AudioWorkgroupTokenProvider&) = delete;
+
+    AudioWorkgroupTokenProvider(AudioWorkgroupTokenProvider&& other) noexcept
+        : m_workgroup(other.m_workgroup), m_joinToken(other.m_joinToken)
+    {
+        other.m_workgroup = nullptr;
+        other.m_joinToken = {};
+    }
+
+    ~AudioWorkgroupTokenProvider()
+    {
+        leave();
+    }
+
+private:
+    void leave() noexcept
+    {
+        if (!m_workgroup) {
+            return;
+        }
+
+        if (__builtin_available(macOS 11.0, *)) {
+            os_workgroup_leave(m_workgroup, &m_joinToken);
+        }
+        os_release(m_workgroup);
+        m_workgroup = nullptr;
+        m_joinToken = {};
+    }
+
+    os_workgroup_t m_workgroup;
+    os_workgroup_join_token_s m_joinToken;
+};
+
+class AudioWorkgroupProvider
+{
+public:
+    explicit AudioWorkgroupProvider(os_workgroup_t wg)
+        : m_workgroup(wg)
+    {
+        os_retain(m_workgroup);
+    }
+
+    ~AudioWorkgroupProvider()
+    {
+        if (m_workgroup) {
+            os_release(m_workgroup);
+        }
+    }
+
+    AudioWorkgroupProvider(const AudioWorkgroupProvider& other)
+        : m_workgroup(other.m_workgroup)
+    {
+        os_retain(m_workgroup);
+    }
+
+    AudioWorkgroupProvider& operator=(const AudioWorkgroupProvider& other)
+    {
+        if (this != &other) {
+            os_retain(other.m_workgroup);
+            if (m_workgroup) {
+                os_release(m_workgroup);
+            }
+            m_workgroup = other.m_workgroup;
+        }
+        return *this;
+    }
+
+    AudioWorkgroupProvider(AudioWorkgroupProvider&& other) noexcept
+        : m_workgroup(other.m_workgroup)
+    {
+        other.m_workgroup = nullptr;
+    }
+
+    bool join(AudioWorkgroupToken& tokenProvider) const
+    {
+        if (auto existingProvider = AudioWorkGroup::providerFor(tokenProvider);
+            existingProvider != nullptr && existingProvider->isAttachedTo(m_workgroup)) {
+            return true;
+        }
+        AudioWorkGroup::resetProviderFor(tokenProvider);
+        AudioWorkgroupTokenProvider provider(m_workgroup);
+        if (!provider.isValid()) {
+            return false;
+        }
+
+        AudioWorkGroup::setProviderFor(tokenProvider, [provider = std::move(provider)]() {
+            return &provider;
+        });
+        return true;
+    }
+
+    size_t getMaxParallelThreadCount() const
+    {
+        if (__builtin_available(macOS 11.0, *)) {
+            return (size_t)os_workgroup_max_parallel_threads(m_workgroup, nullptr);
+        }
+        return 0;
+    }
+
+private:
+
+    os_workgroup_t m_workgroup;
+};
+
+bool AudioWorkGroup::join(AudioWorkgroupToken& token)
+{
+    auto provider = getProvider();
+    if (!provider) {
+        return false;
+    }
+    return provider->join(token);
+}
+
+AudioWorkGroup makeAudioWorkgroup(void* opaqueHandle)
+{
+    if (opaqueHandle == nullptr) {
+        return {};
+    }
+
+    os_workgroup_t handle = reinterpret_cast<os_workgroup_t>(opaqueHandle);
+
+    return AudioWorkGroup { std::make_unique<AudioWorkgroupProvider>(handle) };
+}
+
+size_t AudioWorkGroup::getMaxParallelThreadCount() const
+{
+    if (auto provider = getProvider(); provider) {
+        return provider->getMaxParallelThreadCount();
+    }
+    return 0;
+}
+
+#else
+
+class AudioWorkgroupProvider
+{
+};
+
+size_t AudioWorkGroup::getMaxParallelThreadCount() const
+{
+    return std::thread::hardware_concurrency();
+}
+
+bool AudioWorkGroup::join(AudioWorkgroupToken&)
+{
+    return false;
+}
+
+AudioWorkGroup makeAudioWorkgroup(void*)
+{
+    return {};
+}
+
+#endif
+
+AudioWorkGroup::AudioWorkGroup() = default;
+
+AudioWorkGroup::AudioWorkGroup(std::unique_ptr<AudioWorkgroupProvider> provider)
+    : m_provider(std::move(provider)) {}
+
+AudioWorkGroup::~AudioWorkGroup() = default;
+
+AudioWorkGroup::AudioWorkGroup(const AudioWorkGroup& other)
+    : m_provider(other.m_provider ? std::make_unique<AudioWorkgroupProvider>(*other.m_provider) : nullptr) {}
+
+AudioWorkGroup::AudioWorkGroup(AudioWorkGroup&& other) noexcept = default;
+
+AudioWorkGroup& AudioWorkGroup::operator=(const AudioWorkGroup& other)
+{
+    if (this != &other) {
+        m_provider = other.m_provider ? std::make_unique<AudioWorkgroupProvider>(*other.m_provider) : nullptr;
+    }
+    return *this;
+}
+
+AudioWorkGroup& AudioWorkGroup::operator=(AudioWorkGroup&& other) noexcept = default;
+} // namespace muse::audio
diff --git a/framework/audio/common/audioworkgroup.h b/framework/audio/common/audioworkgroup.h
new file mode 100644
index 0000000000..6f19b002cf
--- /dev/null
+++ b/framework/audio/common/audioworkgroup.h
@@ -0,0 +1,147 @@
+/*
+ * SPDX-License-Identifier: GPL-3.0-only
+ * MuseScore-CLA-applies
+ *
+ * MuseScore
+ * Music Composition & Notation
+ *
+ * Copyright (C) 2026 MuseScore Limited and others
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <thread>
+#include <utility>
+
+#include "global/functional/inplace_function_mv.h"
+
+namespace muse::audio {
+class AudioWorkgroupTokenProvider;
+
+using ErasedAudioWorkgroupTokenProvider = muse::functional::MoveOnlyInplaceFunction<const AudioWorkgroupTokenProvider* (), 64>;
+
+class AudioWorkgroupToken
+{
+public:
+    AudioWorkgroupToken() = default;
+    AudioWorkgroupToken(const AudioWorkgroupToken&) = delete;
+    AudioWorkgroupToken& operator=(const AudioWorkgroupToken&) = delete;
+    AudioWorkgroupToken(AudioWorkgroupToken&& other) noexcept
+    {
+        moveFrom(std::move(other));
+    }
+
+    AudioWorkgroupToken& operator=(AudioWorkgroupToken&& other) noexcept
+    {
+        if (this != &other) {
+            reset();
+            moveFrom(std::move(other));
+        }
+
+        return *this;
+    }
+
+    ~AudioWorkgroupToken()
+    {
+        reset();
+    }
+
+private:
+    friend class AudioWorkGroup;
+
+    explicit AudioWorkgroupToken(ErasedAudioWorkgroupTokenProvider provider)
+    {
+        setProvider(std::move(provider));
+    }
+
+    const AudioWorkgroupTokenProvider* provider() const
+    {
+        assertCurrentThread();
+        return m_provider ? m_provider() : nullptr;
+    }
+
+    void setProvider(ErasedAudioWorkgroupTokenProvider workgroup)
+    {
+        reset();
+        m_provider = std::move(workgroup);
+        if (m_provider) {
+            m_threadId = std::this_thread::get_id();
+        }
+    }
+
+    void reset()
+    {
+        assertCurrentThread();
+        m_provider = nullptr;
+        m_threadId = {};
+    }
+
+    void moveFrom(AudioWorkgroupToken&& other) noexcept
+    {
+        other.assertCurrentThread();
+        m_provider = std::move(other.m_provider);
+        m_threadId = other.m_threadId;
+        other.m_threadId = {};
+    }
+
+    void assertCurrentThread() const
+    {
+        assert((!m_provider || m_threadId == std::this_thread::get_id())
+               && "AudioWorkgroupToken must be used on the thread that joined the audio workgroup");
+    }
+
+    ErasedAudioWorkgroupTokenProvider m_provider;
+    std::thread::id m_threadId;
+};
+
+class AudioWorkgroupProvider;
+
+class AudioWorkGroup
+{
+public:
+    AudioWorkGroup();
+    AudioWorkGroup(const AudioWorkGroup&);
+    AudioWorkGroup(AudioWorkGroup&&) noexcept;
+    AudioWorkGroup& operator=(const AudioWorkGroup&);
+    AudioWorkGroup& operator=(AudioWorkGroup&&) noexcept;
+    ~AudioWorkGroup();
+
+    bool join(AudioWorkgroupToken& token);
+
+    const AudioWorkgroupProvider* getProvider() const { return m_provider.get(); }
+
+    size_t getMaxParallelThreadCount() const;
+
+private:
+    friend class AudioWorkgroupProvider;
+    friend AudioWorkGroup makeAudioWorkgroup(void* opaqueHandle);
+
+    explicit AudioWorkGroup(std::unique_ptr<AudioWorkgroupProvider> provider);
+
+    static const AudioWorkgroupTokenProvider* providerFor(const AudioWorkgroupToken& token) { return token.provider(); }
+    static void setProviderFor(AudioWorkgroupToken& token, ErasedAudioWorkgroupTokenProvider provider)
+    {
+        token.setProvider(std::move(provider));
+    }
+
+    static void resetProviderFor(AudioWorkgroupToken& token) { token.reset(); }
+
+    std::unique_ptr<AudioWorkgroupProvider> m_provider;
+};
+
+AudioWorkGroup makeAudioWorkgroup(void* opaqueHandle);
+} // namespace muse::audio
diff --git a/framework/audio/common/concurrentqueue.h b/framework/audio/common/concurrentqueue.h
new file mode 100644
index 0000000000..8b64d63cc3
--- /dev/null
+++ b/framework/audio/common/concurrentqueue.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#if defined(__MACH__)
+#include <mach/machine/vm_types.h>
+
+#pragma push_macro("integer_t")
+#define integer_t ::integer_t
+#endif
+
+#include "../thirdparty/moodycamel/blockingconcurrentqueue.h"
+
+#if defined(__MACH__)
+#pragma pop_macro("integer_t")
+#endif
+
+namespace muse {
+template<typename T,
+         typename Traits = moodycamel::ConcurrentQueueDefaultTraits>
+using BlockingConcurrentQueue = moodycamel::BlockingConcurrentQueue<T, Traits>;
+}
diff --git a/framework/audio/common/iaudiotaskscheduler.h b/framework/audio/common/iaudiotaskscheduler.h
new file mode 100644
index 0000000000..29686b6690
--- /dev/null
+++ b/framework/audio/common/iaudiotaskscheduler.h
@@ -0,0 +1,40 @@
+/*
+ * SPDX-License-Identifier: GPL-3.0-only
+ * MuseScore-CLA-applies
+ *
+ * MuseScore
+ * Music Composition & Notation
+ *
+ * Copyright (C) 2026 MuseScore Limited and others
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "global/modularity/imoduleinterface.h"
+#include "global/functional/inplace_function.h"
+#include <memory>
+#include <vector>
+
+namespace muse::audio {
+class IAudioTaskScheduler : MODULE_GLOBAL_INTERFACE
+{
+    INTERFACE_ID(IAudioTaskScheduler)
+public:
+    using Task = muse::functional::inplace_function<void (), 64>;
+    virtual void submitRealtimeTasksAndWait(const std::vector<Task>& tasks) = 0;
+};
+
+using IAudioTaskSchedulerPtr = std::shared_ptr<IAudioTaskScheduler>;
+}
diff --git a/framework/audio/common/lightweightsemaphore.h b/framework/audio/common/lightweightsemaphore.h
new file mode 100644
index 0000000000..0d0557d1a6
--- /dev/null
+++ b/framework/audio/common/lightweightsemaphore.h
@@ -0,0 +1,7 @@
+#pragma once
+
+#include "../thirdparty/moodycamel/lightweightsemaphore.h"
+
+namespace muse {
+using LightweightSemaphore = moodycamel::LightweightSemaphore;
+}
diff --git a/framework/audio/common/realtimethreadpool.h b/framework/audio/common/realtimethreadpool.h
new file mode 100644
index 0000000000..6605d0e887
--- /dev/null
+++ b/framework/audio/common/realtimethreadpool.h
@@ -0,0 +1,170 @@
+#pragma once
+
+#include "audioworkgroup.h"
+#include "concurrency/threadutils.h"
+#include "runtime.h"
+#include "concurrentqueue.h"
+#include "global/functional/inplace_function.h"
+#include "lightweightsemaphore.h"
+#include "thirdparty/kors_logger/src/log_base.h"
+
+#include <atomic>
+#include <exception>
+#include <memory>
+#include <mutex>
+#include <set>
+#include <string>
+#include <thread>
+#include <vector>
+
+namespace muse::audio {
+class RealtimeThreadPool
+{
+    struct InflightSemaphoreRelease {
+        explicit InflightSemaphoreRelease(muse::LightweightSemaphore& semaphore)
+            : m_semaphore(semaphore) {}
+
+        InflightSemaphoreRelease(const InflightSemaphoreRelease&) = delete;
+        InflightSemaphoreRelease& operator=(const InflightSemaphoreRelease&) = delete;
+
+        ~InflightSemaphoreRelease() noexcept
+        {
+            m_semaphore.signal();
+        }
+
+        muse::LightweightSemaphore& m_semaphore;
+    };
+
+public:
+    static constexpr int maxTaskCount = 10000;
+    using Task = muse::functional::inplace_function<void (), 64>;
+    RealtimeThreadPool(
+        std::string name,
+        int num_of_workers = std::thread::hardware_concurrency())
+    {
+        num_of_workers = std::max(0, num_of_workers);
+        m_workers.reserve(num_of_workers);
+        try {
+            for (size_t i = 0; i < static_cast<size_t>(num_of_workers); ++i) {
+                auto worker = std::make_unique<Worker>();
+                Worker* workerPtr = worker.get();
+                const size_t workerIndex = i;
+                worker->m_thread = std::make_unique<std::thread>(
+                    [this, workerPtr, workerIndex, name] {
+                    muse::runtime::setThreadName(name + "_" + std::to_string(workerIndex));
+                    AudioWorkgroupToken workgroupToken;
+                    for (;;) {
+                        Task task;
+
+                        m_queue.wait_dequeue(task);
+                        {
+                            std::lock_guard lock(workerPtr->m_workgroupMutex);
+                            workerPtr->m_workgroup.join(workgroupToken);
+                        }
+
+                        if (this->m_should_stop) {
+                            return;
+                        }
+
+                        InflightSemaphoreRelease release(m_inflightSemaphore);
+                        task();
+                    }
+                });
+                m_workers.push_back(std::move(worker));
+                muse::setThreadPriority(*m_workers.back()->m_thread, ThreadPriority::High);
+            }
+        } catch (...) {
+            terminate();
+            throw;
+        }
+    }
+
+    void setAudioWorkgroup(muse::audio::AudioWorkGroup audioworkgroup)
+    {
+        for (auto& worker : m_workers) {
+            std::lock_guard lock(worker->m_workgroupMutex);
+            worker->m_workgroup = audioworkgroup;
+        }
+    }
+
+    bool enqueue(const Task& func)
+    {
+        m_inflightSemaphore.wait();
+        if (!m_queue.enqueue(func)) {
+            m_inflightSemaphore.signal();
+            return false;
+        }
+        return true;
+    }
+
+    void participateAndWait()
+    {
+        Task task;
+        while (m_queue.try_dequeue(task)) {
+            InflightSemaphoreRelease release(m_inflightSemaphore);
+            task();
+        }
+        waitUntilFinished();
+        m_inflightSemaphore.signal(maxTaskCount);
+    }
+
+    std::set<std::thread::id> threadIdSet() const
+    {
+        std::set<std::thread::id> result;
+
+        for (const auto& worker : m_workers) {
+            result.insert(worker->m_thread->get_id());
+        }
+
+        return result;
+    }
+
+    int getNumberOfWorkers() const
+    {
+        return static_cast<int>(m_workers.size());
+    }
+
+    ~RealtimeThreadPool()
+    {
+        terminate();
+    }
+
+private:
+    void waitUntilFinished()
+    {
+        auto actuallyAwaited = m_inflightSemaphore.waitMany(maxTaskCount);
+        for (size_t i = 0;
+             i < static_cast<size_t>(maxTaskCount - actuallyAwaited); ++i) {
+            m_inflightSemaphore.wait();
+        }
+    }
+
+    void terminate()
+    {
+        m_should_stop = true;
+        for (size_t i = 0; i < m_workers.size(); ++i) {
+            IF_ASSERT_FAILED(m_queue.enqueue([] {})) {
+                // this is _extremely_ unlikely to fail. But if it does the threads would hang indefinitely.
+                std::terminate();
+            }
+        }
+
+        for (auto& worker : m_workers) {
+            if (worker->m_thread && worker->m_thread->joinable()) {
+                worker->m_thread->join();
+            }
+        }
+    }
+
+    struct Worker {
+        std::unique_ptr<std::thread> m_thread;
+        AudioWorkGroup m_workgroup;
+        std::mutex m_workgroupMutex;
+    };
+
+    muse::BlockingConcurrentQueue<Task > m_queue;
+    std::vector<std::unique_ptr<Worker> > m_workers;
+    muse::LightweightSemaphore m_inflightSemaphore { maxTaskCount };
+    std::atomic<bool> m_should_stop{ false };
+}; // namespace muse::audio
+} // namespace muse::audio
diff --git a/framework/audio/driver/CMakeLists.txt b/framework/audio/driver/CMakeLists.txt
index 950c7086bc..6bc44af87e 100644
--- a/framework/audio/driver/CMakeLists.txt
+++ b/framework/audio/driver/CMakeLists.txt
@@ -85,10 +85,13 @@ elseif(OS_IS_MAC)
     target_sources(muse_audio_driver PRIVATE
         platform/osx/osxaudiodriver.mm
         platform/osx/osxaudiodriver.h
+        platform/osx/osxdirectaudiodriver.mm
+        platform/osx/osxdirectaudiodriver.h
     )
-
+        
     set_source_files_properties(
         platform/osx/osxaudiodriver.mm
+        platform/osx/osxdirectaudiodriver.mm
         PROPERTIES
         SKIP_UNITY_BUILD_INCLUSION ON
         SKIP_PRECOMPILE_HEADERS ON
diff --git a/framework/audio/driver/platform/osx/osxdirectaudiodriver.h b/framework/audio/driver/platform/osx/osxdirectaudiodriver.h
new file mode 100644
index 0000000000..215d3bedd8
--- /dev/null
+++ b/framework/audio/driver/platform/osx/osxdirectaudiodriver.h
@@ -0,0 +1,93 @@
+/*
+ * SPDX-License-Identifier: GPL-3.0-only
+ * MuseScore-CLA-applies
+ *
+ * MuseScore
+ * Music Composition & Notation
+ *
+ * Copyright (C) 2026 MuseScore Limited and others
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+#ifndef MUSE_AUDIO_OSXDIRECTAUDIODRIVER_H
+#define MUSE_AUDIO_OSXDIRECTAUDIODRIVER_H
+
+#include <map>
+#include <memory>
+#include <mutex>
+#include <optional>
+
+#include <MacTypes.h>
+
+#include "common/audioworkgroup.h"
+#include "iaudiodriver.h"
+
+struct AudioTimeStamp;
+struct AudioQueueBuffer;
+struct OpaqueAudioQueue;
+
+namespace muse::audio {
+class OSXDirectAudioDriver : public IAudioDriver
+{
+public:
+    OSXDirectAudioDriver();
+    ~OSXDirectAudioDriver();
+
+    void init() override;
+
+    std::string name() const override;
+
+    AudioDeviceID defaultDevice() const override;
+
+    bool open(const Spec& spec, Spec* activeSpec) override;
+    void close() override;
+    bool isOpened() const override;
+
+    const Spec& activeSpec() const override;
+    async::Channel<Spec> activeSpecChanged() const override;
+
+    AudioDeviceList availableOutputDevices() const override;
+    async::Notification availableOutputDevicesChanged() const override;
+    void updateDeviceMap();
+
+    std::vector<samples_t> availableOutputDeviceBufferSizes() const override;
+    std::vector<sample_rate_t> availableOutputDeviceSampleRates() const override;
+
+    AudioWorkGroup getAudioWorkGroup() const override;
+    async::Notification currentWorkgroupChanged() const override;
+
+    struct Data;
+
+private:
+    static void logError(const std::string message, OSStatus error);
+
+    void initDeviceMapListener();
+    void removeDeviceMapListener();
+    void doClose();
+
+    std::optional<int> getAudioDeviceId(const AudioDeviceID& deviceId) const;
+
+    UInt32 osxDeviceId() const;
+
+    std::unique_ptr<Data> m_data = nullptr;
+    async::Channel<Spec> m_activeSpecChanged;
+    std::map<unsigned int, std::string> m_outputDevices = {}, m_inputDevices = {};
+    mutable std::mutex m_devicesMutex;
+    async::Notification m_availableOutputDevicesChanged;
+    async::Notification m_currentWorkgroupChanged;
+
+    AudioWorkGroup m_audioWorkGroup;
+    bool m_deviceMapListenerRegistered = false;
+};
+}
+#endif // MUSE_AUDIO_OSXDIRECTAUDIODRIVER_H
diff --git a/framework/audio/driver/platform/osx/osxdirectaudiodriver.mm b/framework/audio/driver/platform/osx/osxdirectaudiodriver.mm
new file mode 100644
index 0000000000..fa2423569d
--- /dev/null
+++ b/framework/audio/driver/platform/osx/osxdirectaudiodriver.mm
@@ -0,0 +1,1004 @@
+/*
+ * SPDX-License-Identifier: GPL-3.0-only
+ * MuseScore-CLA-applies
+ *
+ * MuseScore
+ * Music Composition & Notation
+ *
+ * Copyright (C) 2026 MuseScore Limited and others
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include "osxdirectaudiodriver.h"
+
+#include <CoreAudio/AudioHardware.h>
+#include <CoreAudio/AudioHardwareBase.h>
+#include <CoreAudioTypes/CoreAudioBaseTypes.h>
+#include <MacTypes.h>
+#include <QtCore/qsemaphore.h>
+#include <Security/cssmconfig.h>
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <mutex>
+
+#include <AudioToolbox/AudioToolbox.h>
+#include <optional>
+#include <string_view>
+
+#include "common/audiotypes.h"
+#include "common/audioworkgroup.h"
+#include "translation.h"
+#include "log.h"
+
+typedef AudioDeviceID OSXAudioDeviceID;
+
+using namespace muse;
+using namespace muse::audio;
+
+struct ChannelBufferDetails {
+    int streamNumber;
+    unsigned int hardwareChannelNumber;
+    unsigned int dataOffsetSamples;
+    unsigned int dataStrideSamples;
+};
+
+struct OSXDirectAudioDriver::Data {
+    Spec format;
+    AudioDeviceIOProcID procId{};
+    bool canBeDirectlyMapped = false;
+
+    OSXAudioDeviceID deviceId{};
+
+    std::atomic<bool> stopPending{ false };
+    std::atomic<bool> stopped{ false };
+
+    std::vector<ChannelBufferDetails> channelBufferOutputDetails;
+    std::vector<float> outBuffer;
+
+    Data& operator=(const Data& other)
+    {
+        format = other.format;
+        procId = other.procId;
+        canBeDirectlyMapped = other.canBeDirectlyMapped;
+        deviceId = other.deviceId;
+        stopPending = other.stopPending.load();
+        stopped = other.stopped.load();
+        channelBufferOutputDetails = other.channelBufferOutputDetails;
+        outBuffer = other.outBuffer;
+        return *this;
+    }
+
+    void clear()
+    {
+        *this = Data();
+    }
+};
+namespace muse::audio {
+AudioWorkGroup makeAudioWorkgroup(void* opaqueHandle);
+AudioWorkGroup OSXDirectAudioDriver::getAudioWorkGroup() const
+{
+    return m_audioWorkGroup;
+}
+
+async::Notification OSXDirectAudioDriver::currentWorkgroupChanged() const
+{
+    return m_currentWorkgroupChanged;
+}
+} // namespace muse::audio
+
+OSXDirectAudioDriver::OSXDirectAudioDriver()
+    : m_data(std::make_unique<Data>())
+{
+    initDeviceMapListener();
+    updateDeviceMap();
+}
+
+OSXDirectAudioDriver::~OSXDirectAudioDriver()
+{
+    removeDeviceMapListener();
+    doClose();
+}
+
+void OSXDirectAudioDriver::init()
+{
+}
+
+std::string OSXDirectAudioDriver::name() const
+{
+    return "OSX";
+}
+
+muse::audio::AudioDeviceID OSXDirectAudioDriver::defaultDevice() const
+{
+    return DEFAULT_DEVICE_ID;
+}
+
+static uint32_t outputBufferSamplesPerChannel(const AudioBuffer& buffer)
+{
+    if (buffer.mNumberChannels == 0) {
+        return 0;
+    }
+
+    return buffer.mDataByteSize / (buffer.mNumberChannels * sizeof(float));
+}
+
+static uint32_t callbackSamplesPerChannel(const AudioBufferList* outputData,
+                                          const std::vector<ChannelBufferDetails>& details)
+{
+    std::optional<uint32_t> result;
+
+    for (const ChannelBufferDetails& detail : details) {
+        if (detail.streamNumber < 0 || static_cast<UInt32>(detail.streamNumber) >= outputData->mNumberBuffers) {
+            continue;
+        }
+
+        const AudioBuffer& outputBuffer = outputData->mBuffers[detail.streamNumber];
+        const uint32_t frames = outputBufferSamplesPerChannel(outputBuffer);
+        if (frames == 0) {
+            continue;
+        }
+
+        result = result.has_value() ? std::min(result.value(), frames) : frames;
+    }
+
+    return result.value_or(0);
+}
+
+static std::optional<uint32_t> getSameRequestedSampleCount(const AudioBufferList* outOutputData,
+                                                           const std::vector<ChannelBufferDetails>& details)
+{
+    std::optional<uint32_t> sampleCount;
+
+    for (const ChannelBufferDetails& detail : details) {
+        if (detail.streamNumber < 0 || static_cast<UInt32>(detail.streamNumber) >= outOutputData->mNumberBuffers) {
+            continue;
+        }
+
+        const AudioBuffer& outputBuffer = outOutputData->mBuffers[detail.streamNumber];
+        const uint32_t frames = outputBufferSamplesPerChannel(outputBuffer);
+        if (frames == 0) {
+            continue;
+        }
+
+        if (!sampleCount.has_value()) {
+            sampleCount = frames;
+        } else if (sampleCount.value() != frames) {
+            return std::nullopt;
+        }
+    }
+
+    return sampleCount;
+}
+
+//TODO: make use of timing parameters
+static int coreAudioIOProc(AudioObjectID /* inDevice*/,
+                           const AudioTimeStamp* /* inNow */,
+                           const AudioBufferList* /* inInputData */,
+                           const AudioTimeStamp* /*  inInputTime */,
+                           AudioBufferList* outOutputData,
+                           const AudioTimeStamp* /* inOutputTime */,
+                           void* __nullable inClientData)
+{
+    auto* data = reinterpret_cast<OSXDirectAudioDriver::Data*>(inClientData);
+    if (data->stopPending) {
+        AudioDeviceStop(data->deviceId, data->procId);
+        data->stopPending = false;
+        data->stopped = true;
+        return noErr;
+    }
+    if (data->channelBufferOutputDetails.empty()) {
+        return noErr;
+    }
+
+    uint32_t samplesPerChannel = callbackSamplesPerChannel(outOutputData, data->channelBufferOutputDetails);
+    if (samplesPerChannel == 0) {
+        return noErr;
+    }
+
+    const uint32_t callbackDataStride = data->format.output.audioChannelCount;
+    int dataSize = static_cast<int>(samplesPerChannel * callbackDataStride * sizeof(float));
+    if (data->canBeDirectlyMapped && getSameRequestedSampleCount(outOutputData,
+                                                                 data->channelBufferOutputDetails).value_or(0) == samplesPerChannel) {
+        data->format.callback(
+            (uint8_t*)outOutputData
+            ->mBuffers[data->channelBufferOutputDetails[0].streamNumber]
+            .mData,
+            dataSize);
+        return 0;
+    }
+
+    if (data->outBuffer.size() < static_cast<size_t>(samplesPerChannel) * callbackDataStride) {
+        samplesPerChannel = static_cast<uint32_t>(data->outBuffer.size() / callbackDataStride);
+        dataSize = static_cast<int>(samplesPerChannel * callbackDataStride * sizeof(float));
+    }
+
+    data->format.callback((uint8_t*)data->outBuffer.data(), dataSize);
+
+    for (int ch = 0; ch < (int)data->channelBufferOutputDetails.size(); ++ch) {
+        const auto& details = data->channelBufferOutputDetails[ch];
+        auto& outputBuffer = outOutputData->mBuffers[details.streamNumber];
+        int stride = details.dataStrideSamples;
+        if (stride == 0) {
+            continue;
+        }
+
+        const uint64_t requiredSamples = details.dataOffsetSamples
+                                         + static_cast<uint64_t>(samplesPerChannel - 1) * stride
+                                         + 1;
+        if (outputBuffer.mDataByteSize < (requiredSamples * sizeof(float))) {
+            // this is very unexpected. Should always be the same as samplesPerChannel
+            continue;
+        }
+
+        float* src = data->outBuffer.data() + ch;
+        float* dest = static_cast<float*>(outputBuffer.mData)
+                      + details.dataOffsetSamples;
+        for (int j = samplesPerChannel; --j >= 0;) {
+            *dest = *src;
+            dest += stride;
+            src += callbackDataStride;
+        }
+    }
+    return 0;
+}
+
+std::optional<std::array<uint32_t, 2> > getPreferredStereoHardwareChannels(OSXAudioDeviceID deviceId)
+{
+    AudioObjectPropertyAddress addr{
+        kAudioDevicePropertyPreferredChannelsForStereo,
+        kAudioDevicePropertyScopeOutput, kAudioObjectPropertyElementMaster };
+    UInt32 stereo[2] = {};
+    UInt32 size = sizeof(stereo);
+
+    if (AudioObjectGetPropertyData(deviceId, &addr, 0, nullptr, &size, stereo) != noErr) {
+        return std::nullopt;
+    }
+    return std::array<uint32_t, 2> { stereo[0], stereo[1] };
+}
+
+bool filterAndReorderStereoHardwareChannels(const OSXAudioDeviceID& deviceId,
+                                            std::vector<ChannelBufferDetails>& details)
+{
+    auto stereoChannels = getPreferredStereoHardwareChannels(deviceId);
+    if (!stereoChannels.has_value()) {
+        return false;
+    }
+    details.erase(
+        std::remove_if(details.begin(), details.end(),
+                       [&stereoChannels](const ChannelBufferDetails& details) {
+        return details.hardwareChannelNumber
+               != stereoChannels->at(0)
+               && details.hardwareChannelNumber
+               != stereoChannels->at(1);
+    }),
+        details.end());
+    std::sort(details.begin(), details.end(),
+              [&stereoChannels](const ChannelBufferDetails& a,
+                                const ChannelBufferDetails& b) {
+        auto aIndex = (a.hardwareChannelNumber
+                       == stereoChannels->at(0))
+                      ? 0
+                      : 1;
+        auto bIndex = (b.hardwareChannelNumber
+                       == stereoChannels->at(0))
+                      ? 0
+                      : 1;
+        return aIndex < bIndex;
+    });
+    return true;
+}
+
+std::vector<ChannelBufferDetails> getChannelBufferDetails(const OSXAudioDeviceID& deviceId,
+                                                          const std::function<void(std::string, OSStatus)>& onError)
+{
+    std::vector<ChannelBufferDetails> result;
+
+    UInt32 size{};
+    AudioObjectPropertyAddress address = { kAudioDevicePropertyStreamConfiguration,
+                                           kAudioDevicePropertyScopeOutput,
+                                           kAudioObjectPropertyElementMaster };
+    if (auto status = AudioObjectGetPropertyDataSize(deviceId, &address, 0, nullptr, &size); status != noErr) {
+        onError("Failed to get device " + std::to_string(deviceId) + " stream configuration" + ", err: " + std::to_string(status), status);
+        return result;
+    }
+
+    std::vector<uint8_t> buf(size);
+
+    auto status = AudioObjectGetPropertyData(deviceId, &address, 0, nullptr,
+                                             &size, buf.data());
+    if (status != noErr) {
+        onError("Failed to get device " + std::to_string(deviceId) + " stream configuration" + ", err: " + std::to_string(status), status);
+        return result;
+    }
+
+    AudioBufferList* bufList = reinterpret_cast<AudioBufferList*>(buf.data());
+
+    const int numStreams = static_cast<int>(bufList->mNumberBuffers);
+    unsigned int hardwareChannelNumber = 1;
+
+    for (int i = 0; i < numStreams; ++i) {
+        auto& b = bufList->mBuffers[i];
+        for (unsigned int j = 0; j < b.mNumberChannels; ++j) {
+            ChannelBufferDetails details {
+                .streamNumber = i,
+                .hardwareChannelNumber = hardwareChannelNumber++,
+                .dataOffsetSamples = j,
+                .dataStrideSamples
+                    =b.mNumberChannels };
+            result.push_back(details);
+        }
+    }
+
+    return result;
+}
+
+static bool isSupportedCallbackFormat(const AudioStreamBasicDescription& format)
+{
+    const bool isNativeEndian = (format.mFormatFlags & kAudioFormatFlagIsBigEndian) == kAudioFormatFlagsNativeEndian;
+    const bool isNativeFloatPacked = (format.mFormatFlags & kAudioFormatFlagsNativeFloatPacked) == kAudioFormatFlagsNativeFloatPacked;
+
+    return format.mFormatID == kAudioFormatLinearPCM
+           && isNativeEndian
+           && isNativeFloatPacked
+           && format.mBitsPerChannel == sizeof(float) * 8
+           && format.mFramesPerPacket == 1;
+}
+
+static bool validateOutputStreamFormats(OSXAudioDeviceID deviceId,
+                                        const std::vector<ChannelBufferDetails>& details,
+                                        const std::function<void(std::string, OSStatus)>& logError)
+{
+    AudioObjectPropertyAddress streamsAddress {
+        kAudioDevicePropertyStreams,
+        kAudioDevicePropertyScopeOutput,
+        kAudioObjectPropertyElementMaster
+    };
+
+    UInt32 streamsSize = 0;
+    OSStatus status = AudioObjectGetPropertyDataSize(deviceId, &streamsAddress, 0, nullptr, &streamsSize);
+    if (status != noErr) {
+        logError("Failed to get device " + std::to_string(deviceId) + " output streams size, err: ", status);
+        return false;
+    }
+
+    std::vector<AudioStreamID> streamIds(streamsSize / sizeof(AudioStreamID));
+    status = AudioObjectGetPropertyData(deviceId, &streamsAddress, 0, nullptr, &streamsSize, streamIds.data());
+    if (status != noErr) {
+        logError("Failed to get device " + std::to_string(deviceId) + " output streams, err: ", status);
+        return false;
+    }
+
+    std::vector<int> checkedStreams;
+    for (const ChannelBufferDetails& detail : details) {
+        if (std::find(checkedStreams.cbegin(), checkedStreams.cend(), detail.streamNumber) != checkedStreams.cend()) {
+            continue;
+        }
+
+        if (detail.streamNumber < 0 || static_cast<size_t>(detail.streamNumber) >= streamIds.size()) {
+            LOGE() << "CoreAudio output stream index " << detail.streamNumber
+                   << " is outside stream list for device " << deviceId;
+            return false;
+        }
+
+        AudioStreamBasicDescription format {};
+        UInt32 formatSize = sizeof(format);
+        AudioObjectPropertyAddress formatAddress {
+            kAudioStreamPropertyVirtualFormat,
+            kAudioObjectPropertyScopeGlobal,
+            kAudioObjectPropertyElementMaster
+        };
+
+        status = AudioObjectGetPropertyData(streamIds[detail.streamNumber], &formatAddress, 0, nullptr, &formatSize, &format);
+        if (status != noErr) {
+            logError("Failed to get device " + std::to_string(deviceId) + " output stream format, err: ", status);
+            return false;
+        }
+
+        if (!isSupportedCallbackFormat(format)) {
+            LOGE() << "Unsupported CoreAudio output stream format for device " << deviceId
+                   << ", stream " << detail.streamNumber
+                   << ": expected native packed 32-bit float PCM";
+            return false;
+        }
+
+        checkedStreams.push_back(detail.streamNumber);
+    }
+
+    return true;
+}
+
+inline static bool canBeDirectlyMapped(const std::vector<ChannelBufferDetails>& details,
+                                       uint32_t callbackDataStride)
+{
+    std::optional<int> firstStreamNumber;
+    std::optional<uint32_t> expectedOffset;
+    for (int channel = 0; channel < (int)details.size(); ++channel) {
+        const auto& detail = details[channel];
+        if (detail.dataStrideSamples != callbackDataStride) {
+            return false;
+        }
+        if (!firstStreamNumber.has_value()) {
+            firstStreamNumber = detail.streamNumber;
+        }
+        if (firstStreamNumber.value() != detail.streamNumber) {
+            return false;
+        }
+        if (!expectedOffset.has_value()) {
+            expectedOffset = detail.dataOffsetSamples;
+        }
+        if (expectedOffset.value() != detail.dataOffsetSamples) {
+            return false;
+        }
+        *expectedOffset += 1;
+    }
+    return true;
+}
+
+static std::vector<ChannelBufferDetails> getFittingChannelStreamsFromDevice(OSXAudioDeviceID deviceId, int channelCount,
+                                                                            const std::function<void(std::string, OSStatus)>& logError)
+{
+    auto outputStreamInfos = getChannelBufferDetails(deviceId, logError);
+    if (outputStreamInfos.size()
+        < (unsigned int)channelCount) {
+        logError("Not enough channels are available with current device", noErr);
+        return {};
+    }
+    if (channelCount == 2) {
+        filterAndReorderStereoHardwareChannels(deviceId, outputStreamInfos);
+    }
+    if (outputStreamInfos.size() < (unsigned int)channelCount) {
+        logError("Not enough channels are available with current device after filtering for stereo preference", noErr);
+        return {};
+    }
+    if (outputStreamInfos.size()
+        > (unsigned int)channelCount) {
+        // if there are more hardware channels than requested, we just use the first ones. This is a fallback if filterStereoHardwareChannels was not successful
+        outputStreamInfos.resize(channelCount);
+    }
+    return outputStreamInfos;
+}
+
+static std::optional<Float64> closestAvailableSampleRate(OSXAudioDeviceID deviceId, Float64 requested,
+                                                         const std::function<void(std::string, OSStatus)>& onError)
+{
+    AudioObjectPropertyAddress address {
+        kAudioDevicePropertyAvailableNominalSampleRates,
+        kAudioObjectPropertyScopeGlobal,
+        kAudioObjectPropertyElementMaster
+    };
+
+    UInt32 size = 0;
+    if (auto status = AudioObjectGetPropertyDataSize(deviceId, &address, 0, nullptr, &size); status != noErr || size == 0) {
+        onError("Failed to get device " + std::to_string(deviceId) + " available sample rates" + ", err: " + std::to_string(status),
+                status);
+        return std::nullopt;
+    }
+
+    std::vector<AudioValueRange> ranges(size / sizeof(AudioValueRange));
+    if (auto status = AudioObjectGetPropertyData(deviceId, &address, 0, nullptr, &size, ranges.data()); status != noErr) {
+        onError("Failed to get device " + std::to_string(deviceId) + " available sample rates" + ", err: " + std::to_string(status),
+                status);
+        return std::nullopt;
+    }
+
+    std::optional<Float64> best;
+    Float64 bestDistance = std::numeric_limits<Float64>::max();
+
+    for (const AudioValueRange& range : ranges) {
+        Float64 candidate = requested;
+
+        if (requested < range.mMinimum) {
+            candidate = range.mMinimum;
+        } else if (requested > range.mMaximum) {
+            candidate = range.mMaximum;
+        }
+
+        Float64 distance = std::abs(candidate - requested);
+        if (distance < bestDistance) {
+            best = candidate;
+            bestDistance = distance;
+        }
+    }
+
+    return best;
+}
+
+static std::optional<OutputSpec> prepareDeviceWithOutputSpec(OSXAudioDeviceID deviceId, const OutputSpec& spec,
+                                                             const std::function<void(std::string, OSStatus)>& logError)
+{
+    AudioObjectPropertyAddress requestedSampleRate{
+        kAudioDevicePropertyNominalSampleRate, kAudioObjectPropertyScopeGlobal,
+        kAudioObjectPropertyElementMaster };
+
+    Float64 currentSampleRate = 0.0;
+    UInt32 sampleRateSize = sizeof(currentSampleRate);
+
+    OSStatus result
+        =AudioObjectGetPropertyData(deviceId, &requestedSampleRate, 0, nullptr,
+                                    &sampleRateSize, &currentSampleRate);
+
+    if (result != noErr) {
+        logError("Failed to get current sample rate for device " + std::to_string(deviceId) + ", err: ", result);
+        return std::nullopt;
+    }
+
+    Float64 sampleRate = closestAvailableSampleRate(deviceId, spec.sampleRate, logError)
+                         .value_or(currentSampleRate);
+
+    if (std::abs(currentSampleRate - sampleRate) > 0.5) {
+        result = AudioObjectSetPropertyData(
+            deviceId,
+            &requestedSampleRate,
+            0,
+            nullptr,
+            sizeof(sampleRate),
+            &sampleRate);
+
+        if (result != noErr) {
+            sampleRate = currentSampleRate;
+        }
+    } else {
+        sampleRate = currentSampleRate;
+    }
+
+    AudioValueRange bufferSizeRange = { 0, 0 };
+    UInt32 bufferSizeRangeSize = sizeof(AudioValueRange);
+    AudioObjectPropertyAddress bufferSizeRangeAddress = {
+        .mSelector = kAudioDevicePropertyBufferFrameSizeRange,
+        .mScope = kAudioObjectPropertyScopeGlobal,
+        .mElement = kAudioObjectPropertyElementMaster
+    };
+
+    result = AudioObjectGetPropertyData(deviceId, &bufferSizeRangeAddress, 0, 0, &bufferSizeRangeSize, &bufferSizeRange);
+    if (result != noErr) {
+        logError("Failed to create Audio Queue Output, err: ", result);
+        return std::nullopt;
+    }
+
+    samples_t minBufferSize = static_cast<samples_t>(bufferSizeRange.mMinimum);
+    samples_t maxBufferSize = static_cast<samples_t>(bufferSizeRange.mMaximum);
+    UInt32 bufferSizeOut = std::min(maxBufferSize, std::max(minBufferSize, spec.samplesPerChannel));
+
+    AudioObjectPropertyAddress preferredBufferSizeAddress = {
+        .mSelector = kAudioDevicePropertyBufferFrameSize,
+        .mScope = kAudioObjectPropertyScopeGlobal,
+        .mElement = kAudioObjectPropertyElementMaster
+    };
+
+    result = AudioObjectSetPropertyData(deviceId, &preferredBufferSizeAddress, 0, 0, sizeof(bufferSizeOut), (void*)&bufferSizeOut);
+    if (result != noErr) {
+        logError("Failed to create Audio Queue Output, err: ", result);
+        return std::nullopt;
+    }
+
+    UInt32 actualBufferSizeOut = bufferSizeOut;
+    UInt32 actualBufferSizeOutSize = sizeof(actualBufferSizeOut);
+    result = AudioObjectGetPropertyData(deviceId, &preferredBufferSizeAddress, 0, 0, &actualBufferSizeOutSize, &actualBufferSizeOut);
+    if (result != noErr) {
+        logError("Failed to get Audio Device bufferFrameSize, err: ", result);
+        return std::nullopt;
+    }
+    bufferSizeOut = actualBufferSizeOut;
+
+    return OutputSpec {
+        .sampleRate = (uint64_t)sampleRate, .samplesPerChannel = bufferSizeOut, .audioChannelCount = spec.audioChannelCount
+    };
+}
+
+AudioWorkGroup createAudioWorkgroup(OSXAudioDeviceID deviceId)
+{
+    AudioObjectPropertyAddress pa;
+    pa.mSelector = kAudioDevicePropertyIOThreadOSWorkgroup;
+    pa.mScope = kAudioObjectPropertyScopeWildcard;
+    pa.mElement = kAudioObjectPropertyElementMaster;
+    os_workgroup_t workgroup;
+    uint32_t workgroupSize = sizeof(workgroup);
+    if (AudioObjectGetPropertyData(deviceId, &pa, 0, nullptr, &workgroupSize,
+                                   &workgroup) != noErr) {
+        return {};
+    }
+
+    return makeAudioWorkgroup(workgroup);
+}
+
+bool OSXDirectAudioDriver::open(const Spec& spec, Spec* activeSpec)
+{
+    if (isOpened()) {
+        return false;
+    }
+
+    m_data->clear();
+
+    auto deviceId = getAudioDeviceId(spec.deviceId);
+    if (!deviceId) {
+        logError("Failed to find device " + spec.deviceId, noErr);
+        return false;
+    }
+    auto bestOutputStreamInfos = getFittingChannelStreamsFromDevice(
+        *deviceId, spec.output.audioChannelCount, &logError);
+    if (bestOutputStreamInfos.empty()) {
+        return false;
+    }
+
+    IF_ASSERT_FAILED_X(validateOutputStreamFormats(*deviceId, bestOutputStreamInfos, &logError),
+                       "CoreAudio output stream format must be native packed 32-bit float PCM") {
+        return false;
+    }
+
+    auto actualOutputSpec = prepareDeviceWithOutputSpec(*deviceId, spec.output, &logError);
+    if (!actualOutputSpec) {
+        return false;
+    }
+
+    m_data->canBeDirectlyMapped = canBeDirectlyMapped(bestOutputStreamInfos,
+                                                      spec.output.audioChannelCount);
+    m_data->format = spec;
+    m_data->format.output = actualOutputSpec.value();
+    m_data->format.deviceId = QString::number(*deviceId).toStdString();
+    m_data->channelBufferOutputDetails = std::move(bestOutputStreamInfos);
+    m_data->outBuffer.resize(m_data->format.output.samplesPerChannel
+                             * m_data->format.output.audioChannelCount);
+    m_data->deviceId = *deviceId;
+
+    auto result = AudioDeviceCreateIOProcID(*deviceId, coreAudioIOProc, m_data.get(),
+                                            &m_data->procId);
+    if (result != noErr) {
+        m_data->clear();
+        logError("Failed to create Audio Device IO Proc, err: ", result);
+        return false;
+    }
+
+    result = AudioDeviceStart(*deviceId, m_data->procId);
+    if (result != noErr) {
+        AudioDeviceDestroyIOProcID(*deviceId, m_data->procId);
+        m_data->clear();
+        logError("Failed to start Audio Device, err: ", result);
+        return false;
+    }
+
+    m_audioWorkGroup = createAudioWorkgroup(*deviceId);
+    m_currentWorkgroupChanged.notify();
+
+    if (activeSpec) {
+        *activeSpec = m_data->format;
+    }
+
+    m_activeSpecChanged.send(m_data->format);
+
+    LOGI() << "Connected to " << m_data->format.deviceId
+           << " with bufferSize " << m_data->format.output.samplesPerChannel
+           << ", sampleRate " << m_data->format.output.sampleRate;
+
+    return true;
+}
+
+void OSXDirectAudioDriver::close()
+{
+    doClose();
+}
+
+void OSXDirectAudioDriver::doClose()
+{
+    if (!isOpened()) {
+        return;
+    }
+
+    m_data->stopPending = true;
+    // we spin while we wait for the callback to stop the device. That way we can be sure that data will no longer be used by the callback
+    for (int i = 0; i < 100 && !m_data->stopped; ++i) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+    }
+
+    if (!m_data->stopped) {
+        [[maybe_unused]] auto _leaked = m_data.release(); // we let it leak since it might still be accessed by the callback
+        m_data = std::make_unique<Data>();
+    } else {
+        AudioDeviceDestroyIOProcID(m_data->deviceId, m_data->procId);
+        m_data->clear();
+    }
+    m_audioWorkGroup = {};
+    m_currentWorkgroupChanged.notify();
+}
+
+bool OSXDirectAudioDriver::isOpened() const
+{
+    return m_data->procId != nullptr;
+}
+
+const OSXDirectAudioDriver::Spec& OSXDirectAudioDriver::activeSpec() const
+{
+    return m_data->format;
+}
+
+async::Channel<OSXDirectAudioDriver::Spec> OSXDirectAudioDriver::activeSpecChanged() const
+{
+    return m_activeSpecChanged;
+}
+
+AudioDeviceList OSXDirectAudioDriver::availableOutputDevices() const
+{
+    std::lock_guard lock(m_devicesMutex);
+
+    AudioDeviceList deviceList;
+    deviceList.push_back({ DEFAULT_DEVICE_ID, muse::trc("audio", "System default") });
+
+    for (auto& device : m_outputDevices) {
+        AudioDevice deviceInfo;
+        deviceInfo.id = QString::number(device.first).toStdString();
+        deviceInfo.name = device.second;
+
+        deviceList.push_back(deviceInfo);
+    }
+
+    return deviceList;
+}
+
+async::Notification OSXDirectAudioDriver::availableOutputDevicesChanged() const
+{
+    return m_availableOutputDevicesChanged;
+}
+
+void OSXDirectAudioDriver::updateDeviceMap()
+{
+    std::lock_guard lock(m_devicesMutex);
+
+    UInt32 propertySize;
+    OSStatus result;
+    std::vector<AudioObjectID> audioObjects = {};
+    m_outputDevices.clear();
+    m_inputDevices.clear();
+
+    AudioObjectPropertyAddress devicesPropertyAddress = {
+        .mSelector = kAudioHardwarePropertyDevices,
+        .mScope = kAudioObjectPropertyScopeGlobal,
+        .mElement = kAudioObjectPropertyElementMaster,
+    };
+
+    AudioObjectPropertyAddress namePropertyAddress = {
+        .mSelector = kAudioDevicePropertyDeviceNameCFString,
+        .mScope = kAudioObjectPropertyScopeGlobal,
+        .mElement = kAudioObjectPropertyElementMaster,
+    };
+
+    auto getStreamsCount
+        = [](const AudioObjectID& id, const AudioObjectPropertyScope& scope, const std::string& deviceName) -> unsigned int {
+        AudioObjectPropertyAddress propertyAddress = {
+            .mSelector  = kAudioDevicePropertyStreamConfiguration,
+            .mScope     = scope,
+            .mElement   = kAudioObjectPropertyElementWildcard
+        };
+        UInt32 propertySize = 0;
+        OSStatus result = AudioObjectGetPropertyDataSize(id, &propertyAddress, 0, NULL, &propertySize);
+        if (result != noErr) {
+            logError("Failed to get device's (" + deviceName + ") streams size, err: ", result);
+            return 0;
+        }
+
+        auto freeBufferList = [](AudioBufferList* list) { free(list); };
+        std::unique_ptr<AudioBufferList, decltype(freeBufferList)> bufferList(reinterpret_cast<AudioBufferList*>(malloc(propertySize)),
+                                                                              freeBufferList);
+        result = AudioObjectGetPropertyData(id, &propertyAddress, 0, NULL, &propertySize, bufferList.get());
+        if (result != noErr) {
+            logError("Failed to get device's (" + deviceName + ") streams, err: ", result);
+            return 0;
+        }
+
+        return bufferList->mNumberBuffers;
+    };
+
+    result = AudioObjectGetPropertyDataSize(kAudioObjectSystemObject, &devicesPropertyAddress, 0, NULL, &propertySize);
+    if (result != noErr) {
+        logError("Failed to get devices count, err: ", result);
+        return;
+    }
+
+    audioObjects.resize(propertySize / sizeof(OSXAudioDeviceID));
+    result = AudioObjectGetPropertyData(kAudioObjectSystemObject, &devicesPropertyAddress, 0, NULL, &propertySize, audioObjects.data());
+    if (result != noErr) {
+        logError("Failed to get devices list, err: ", result);
+        return;
+    }
+
+    for (auto&& deviceId : audioObjects) {
+        CFStringRef nameRef;
+        propertySize = sizeof(nameRef);
+
+        result = AudioObjectGetPropertyData(deviceId, &namePropertyAddress, 0, NULL, &propertySize, &nameRef);
+        if (result != noErr) {
+            logError("Failed to get device's name, err: ", result);
+            continue;
+        }
+
+        NSString* nsString = (NSString*)nameRef;
+        std::string deviceName = [nsString UTF8String];
+
+        if (getStreamsCount(deviceId, kAudioObjectPropertyScopeOutput, deviceName) > 0) {
+            m_outputDevices[deviceId] = deviceName;
+        }
+
+        if (getStreamsCount(deviceId, kAudioObjectPropertyScopeInput, deviceName) > 0) {
+            m_inputDevices[deviceId] = deviceName;
+        }
+
+        CFRelease(nameRef);
+    }
+    m_availableOutputDevicesChanged.notify();
+}
+
+std::vector<samples_t> OSXDirectAudioDriver::availableOutputDeviceBufferSizes() const
+{
+    OSXAudioDeviceID osxDeviceId = this->osxDeviceId();
+    AudioObjectPropertyAddress bufferFrameSizePropertyAddress = {
+        .mSelector = kAudioDevicePropertyBufferFrameSizeRange,
+        .mScope = kAudioObjectPropertyScopeGlobal,
+        .mElement = kAudioObjectPropertyElementMaster
+    };
+
+    AudioValueRange range = { 0, 0 };
+    UInt32 dataSize = sizeof(AudioValueRange);
+    OSStatus rangeResult = AudioObjectGetPropertyData(osxDeviceId, &bufferFrameSizePropertyAddress, 0, NULL, &dataSize, &range);
+    if (rangeResult != noErr) {
+        logError("Failed to get device " + m_data->format.deviceId + " bufferFrameSize, err: ", rangeResult);
+        return {};
+    }
+
+    samples_t minimum = std::max(static_cast<samples_t>(range.mMinimum), MINIMUM_BUFFER_SIZE);
+    samples_t maximum = std::min(static_cast<samples_t>(range.mMaximum), MAXIMUM_BUFFER_SIZE);
+
+    std::vector<samples_t> result;
+    for (samples_t bufferSize = maximum; bufferSize >= minimum;) {
+        result.push_back(bufferSize);
+        bufferSize /= 2;
+    }
+
+    std::sort(result.begin(), result.end());
+
+    return result;
+}
+
+//TODO: replace hardcoded values with truth
+std::vector<sample_rate_t> OSXDirectAudioDriver::availableOutputDeviceSampleRates() const
+{
+    return {
+        44100,
+        48000,
+        88200,
+        96000,
+    };
+}
+
+static std::optional<OSXAudioDeviceID> getDefaultDeviceId(const std::function<void(std::string, OSStatus)>& logError)
+{
+    OSXAudioDeviceID osxDeviceId = kAudioObjectUnknown;
+    UInt32 deviceIdSize = sizeof(osxDeviceId);
+
+    AudioObjectPropertyAddress deviceNamePropertyAddress = {
+        .mSelector = kAudioHardwarePropertyDefaultOutputDevice,
+        .mScope = kAudioDevicePropertyScopeOutput,
+        .mElement = kAudioObjectPropertyElementMaster
+    };
+
+    OSStatus result = AudioObjectGetPropertyData(kAudioObjectSystemObject, &deviceNamePropertyAddress, 0, 0, &deviceIdSize, &osxDeviceId);
+    if (result != noErr) {
+        logError("Failed to get default device ID, err: ", result);
+        return std::nullopt;
+    }
+
+    return osxDeviceId;
+}
+
+UInt32 OSXDirectAudioDriver::osxDeviceId() const
+{
+    AudioDeviceID deviceId = m_data->format.deviceId;
+    if (deviceId == DEFAULT_DEVICE_ID) {
+        auto defaultDeviceId = getDefaultDeviceId(&logError);
+        if (!defaultDeviceId) {
+            logError("Failed to get default device ID, err: ", noErr);
+            return kAudioObjectUnknown;
+        }
+        return *defaultDeviceId;
+    }
+
+    return QString::fromStdString(deviceId).toInt();
+}
+
+void OSXDirectAudioDriver::logError(const std::string message, OSStatus error)
+{
+    if (error == noErr) {
+        return;
+    }
+
+    char errorString[5];
+
+    UInt32 errorBigEndian = CFSwapInt32HostToBig(error);
+    errorString[0] = errorBigEndian & 0xFF;
+    errorString[1] = (errorBigEndian >> 8) & 0xFF;
+    errorString[2] = (errorBigEndian >> 16) & 0xFF;
+    errorString[3] = (errorBigEndian >> 24) & 0xFF;
+    errorString[4] = '\0';
+    if (isprint(errorString[0]) && isprint(errorString[1]) && isprint(errorString[2]) && isprint(errorString[3])) {
+        LOGE() << message << errorString << "(" << error << ")";
+    } else {
+        LOGE() << message << error;
+    }
+}
+
+static OSStatus onDeviceListChanged(AudioObjectID inObjectID, UInt32 inNumberAddresses, const AudioObjectPropertyAddress* inAddresses,
+                                    void* inClientData)
+{
+    UNUSED(inObjectID);
+    UNUSED(inNumberAddresses);
+    UNUSED(inAddresses);
+    auto driver = reinterpret_cast<OSXDirectAudioDriver*>(inClientData);
+    driver->updateDeviceMap();
+
+    return noErr;
+}
+
+void OSXDirectAudioDriver::initDeviceMapListener()
+{
+    AudioObjectPropertyAddress propertyAddress;
+    propertyAddress.mSelector = kAudioHardwarePropertyDevices;
+    propertyAddress.mScope = kAudioObjectPropertyScopeGlobal;
+    propertyAddress.mElement = kAudioObjectPropertyElementMaster;
+
+    auto result = AudioObjectAddPropertyListener(kAudioObjectSystemObject, &propertyAddress, &onDeviceListChanged, this);
+    if (result != noErr) {
+        logError("Failed to add devices list listener, err: ", result);
+        return;
+    }
+
+    m_deviceMapListenerRegistered = true;
+}
+
+void OSXDirectAudioDriver::removeDeviceMapListener()
+{
+    if (!m_deviceMapListenerRegistered) {
+        return;
+    }
+
+    AudioObjectPropertyAddress propertyAddress;
+    propertyAddress.mSelector = kAudioHardwarePropertyDevices;
+    propertyAddress.mScope = kAudioObjectPropertyScopeGlobal;
+    propertyAddress.mElement = kAudioObjectPropertyElementMaster;
+
+    auto result = AudioObjectRemovePropertyListener(kAudioObjectSystemObject, &propertyAddress, &onDeviceListChanged, this);
+    if (result != noErr) {
+        logError("Failed to remove devices list listener, err: ", result);
+        return;
+    }
+
+    m_deviceMapListenerRegistered = false;
+}
+
+std::optional<int> muse::audio::OSXDirectAudioDriver::getAudioDeviceId(
+    const AudioDeviceID& deviceId) const
+{
+    if (deviceId.empty() || deviceId == DEFAULT_DEVICE_ID) {
+        return getDefaultDeviceId(&logError); //default device used
+    }
+
+    std::lock_guard lock(m_devicesMutex);
+
+    uint deviceIdInt = QString::fromStdString(deviceId).toInt();
+    auto index = std::find_if(m_outputDevices.begin(), m_outputDevices.end(), [&deviceIdInt](auto& d) {
+        return d.first == deviceIdInt;
+    });
+
+    if (index == m_outputDevices.end()) {
+        return std::nullopt;
+    }
+    return index->first;
+}
diff --git a/framework/audio/engine/internal/mixer.cpp b/framework/audio/engine/internal/mixer.cpp
index 29a2695fea..a7c4758b5b 100644
--- a/framework/audio/engine/internal/mixer.cpp
+++ b/framework/audio/engine/internal/mixer.cpp
@@ -26,12 +26,6 @@
 #include "audio/common/audiosanitizer.h"
 #include "audio/common/audioerrors.h"
 
-#include "muse_framework_config.h"
-
-#ifdef MUSE_THREADS_SUPPORT
-#include "concurrency/taskscheduler.h"
-#endif
-
 #include "log.h"
 
 using namespace muse;
@@ -44,22 +38,11 @@ constexpr size_t MIN_TRACK_COUNT_FOR_MULTITHREADING = 2;
 Mixer::~Mixer()
 {
     ONLY_AUDIO_MAIN_OR_ENGINE_THREAD;
-    delete m_taskScheduler;
 }
 
 void Mixer::init()
 {
     ONLY_AUDIO_ENGINE_THREAD;
-
-#ifdef MUSE_THREADS_SUPPORT
-    m_taskScheduler = new TaskScheduler();
-
-    if (!m_taskScheduler->setThreadsPriority(ThreadPriority::High)) {
-        LOGE() << "Unable to change audio threads priority";
-    }
-
-    AudioSanitizer::setMixerThreads(m_taskScheduler->threadIdSet());
-#endif
 }
 
 Ret Mixer::addTrack(TrackChainPtr trackChain, const AuxSendsParams& auxSends)
@@ -72,6 +55,7 @@ Ret Mixer::addTrack(TrackChainPtr trackChain, const AuxSendsParams& auxSends)
     trackData.buffer.resize(outBufferSize);
 
     m_tracks.emplace_back(std::move(trackData));
+    m_trackTasks.reserve(m_tracks.size());
 
     setAuxSends(trackData.trackId, auxSends);
 
@@ -179,7 +163,8 @@ void Mixer::process(float* outBuffer, samples_t samplesPerChannel)
     processAuxChannels(outBuffer, samplesPerChannel);
 }
 
-void Mixer::processTrackChannels(size_t outBufferSize, size_t samplesPerChannel)
+void Mixer::processTrackChannels(size_t outBufferSize,
+                                 size_t samplesPerChannel)
 {
     auto processChannel = [outBufferSize, samplesPerChannel](TrackData& trackData) {
         IF_ASSERT_FAILED(trackData.chain) {
@@ -199,8 +184,7 @@ void Mixer::processTrackChannels(size_t outBufferSize, size_t samplesPerChannel)
 
 #ifdef MUSE_THREADS_SUPPORT
     if (useMultithreading()) {
-        std::vector<std::future<void> > futures;
-
+        m_trackTasks.clear();
         for (auto& t : m_tracks) {
             t.processed = false;
 
@@ -208,13 +192,11 @@ void Mixer::processTrackChannels(size_t outBufferSize, size_t samplesPerChannel)
                 continue;
             }
 
-            std::future<void> future = m_taskScheduler->submit(processChannel, std::ref(t));
-            futures.emplace_back(std::move(future));
-        }
-
-        for (auto& f : futures) {
-            f.wait();
+            m_trackTasks.emplace_back([tPtr = &t, processChannel] {
+                processChannel(*tPtr);
+            });
         }
+        audioTaskScheduler()->submitRealtimeTasksAndWait(m_trackTasks);
     } else
 #endif
     {
diff --git a/framework/audio/engine/internal/mixer.h b/framework/audio/engine/internal/mixer.h
index 535662df49..a5f606be51 100644
--- a/framework/audio/engine/internal/mixer.h
+++ b/framework/audio/engine/internal/mixer.h
@@ -24,6 +24,7 @@
 
 #include <memory>
 
+#include "common/iaudiotaskscheduler.h"
 #include "global/modularity/ioc.h"
 #include "global/async/asyncable.h"
 
@@ -35,6 +36,7 @@
 #include "nodes/signalnode.h"
 #include "nodes/trackchain.h"
 
+#include "muse_framework_config.h"
 namespace muse {
 class TaskScheduler;
 }
@@ -50,6 +52,9 @@ namespace muse::audio::engine {
 class Mixer : public AudioNode<ContextMixerTag>, public async::Asyncable
 {
     GlobalInject<IAudioFactory> audioFactory;
+#ifdef MUSE_THREADS_SUPPORT
+    GlobalInject<IAudioTaskScheduler> audioTaskScheduler;
+#endif
 
 public:
     ~Mixer() override;
@@ -84,8 +89,6 @@ class Mixer : public AudioNode<ContextMixerTag>, public async::Asyncable
 
     bool useMultithreading() const;
 
-    TaskScheduler* m_taskScheduler = nullptr;
-
     struct TrackData {
         TrackId trackId;
         TrackChainPtr chain;
@@ -97,6 +100,8 @@ class Mixer : public AudioNode<ContextMixerTag>, public async::Asyncable
     std::vector<TrackData> m_auxTracks;
     std::map<TrackId, AuxSendsParams> m_auxSends;
 
+    std::vector<IAudioTaskScheduler::Task> m_trackTasks;
+
     size_t m_nonMutedTrackCount = 0;
     std::unordered_set<TrackId> m_tracksToProcessWhenIdle;
 };
diff --git a/framework/audio/iaudiodriver.h b/framework/audio/iaudiodriver.h
index 8076a9de07..9d81dc0875 100644
--- a/framework/audio/iaudiodriver.h
+++ b/framework/audio/iaudiodriver.h
@@ -27,6 +27,7 @@
 #include <functional>
 #include <memory>
 
+#include "common/audioworkgroup.h"
 #include "global/async/notification.h"
 
 #include "audio/common/audiotypes.h"
@@ -52,6 +53,8 @@ class IAudioDriver
     virtual std::string name() const = 0;
 
     virtual AudioDeviceID defaultDevice() const = 0;
+    virtual AudioWorkGroup getAudioWorkGroup() const { return {}; }
+    virtual async::Notification currentWorkgroupChanged() const { return {}; }
 
     virtual bool open(const Spec& spec, Spec* activeSpec) = 0;
     virtual void close() = 0;
diff --git a/framework/audio/main/audiomodule.cpp b/framework/audio/main/audiomodule.cpp
index 155e5f2123..ec781e295d 100644
--- a/framework/audio/main/audiomodule.cpp
+++ b/framework/audio/main/audiomodule.cpp
@@ -21,6 +21,7 @@
  */
 #include "audiomodule.h"
 
+#include "common/iaudiotaskscheduler.h"
 #include "ui/iuiactionsregister.h"
 #include "global/modularity/ioc.h"
 
@@ -87,6 +88,7 @@ void AudioModule::registerExports()
     globalIoc()->registerExport<IAudioThreadSecurer>(mname, std::make_shared<AudioThreadSecurer>());
     globalIoc()->registerExport<rpc::IRpcChannel>(mname, m_rpcChannel);
     globalIoc()->registerExport<IAudioDriverController>(mname, m_audioDriverController);
+    globalIoc()->registerExport<IAudioTaskScheduler>(mname, m_audioDriverController->getAudioTaskScheduler());
     globalIoc()->registerExport<ISoundFontController>(mname, m_soundFontController);
     globalIoc()->registerExport<IStartAudioController>(mname, m_startAudioController);
 }
diff --git a/framework/audio/main/internal/audiodrivercontroller.cpp b/framework/audio/main/internal/audiodrivercontroller.cpp
index 6a3996eaf2..f68b80bef4 100644
--- a/framework/audio/main/internal/audiodrivercontroller.cpp
+++ b/framework/audio/main/internal/audiodrivercontroller.cpp
@@ -22,6 +22,8 @@
 
 #include "audiodrivercontroller.h"
 
+#include "common/audiotaskscheduler.h"
+#include "common/iaudiotaskscheduler.h"
 #include "global/async/async.h"
 
 #include "muse_framework_config.h"
@@ -47,6 +49,7 @@
 
 #ifdef Q_OS_MACOS
 #include "audio/driver/platform/osx/osxaudiodriver.h"
+#include "audio/driver/platform/osx/osxdirectaudiodriver.h"
 #endif
 
 #ifdef Q_OS_WASM
@@ -59,6 +62,11 @@ using namespace muse;
 using namespace muse::audio;
 using namespace muse::audio::rpc;
 
+AudioDriverController::AudioDriverController()
+    : m_audioTaskScheduler(std::make_shared<AudioTaskScheduler>())
+{
+}
+
 IAudioDriverPtr AudioDriverController::createDriver(const std::string& name) const
 {
 #ifdef MUSE_MODULE_AUDIO_JACK
@@ -105,7 +113,9 @@ IAudioDriverPtr AudioDriverController::createDriver(const std::string& name) con
 #endif
 
 #ifdef Q_OS_MACOS
-    UNUSED(name);
+    if (name == "CoreAudioDirect") {
+        return std::shared_ptr<IAudioDriver>(new OSXDirectAudioDriver());
+    }
     return std::shared_ptr<IAudioDriver>(new OSXAudioDriver());
 #endif
 
@@ -147,6 +157,7 @@ std::vector<std::string> AudioDriverController::availableAudioDrivers() const
 
 #ifdef Q_OS_MACOS
     names.push_back("CoreAudio");
+    names.push_back("CoreAudioDirect");
     return names;
 #endif
 
@@ -188,6 +199,11 @@ void AudioDriverController::setNewDriver(IAudioDriverPtr newDriver)
             });
         });
     }
+
+    auto audioWorkgroupSource = std::dynamic_pointer_cast<AudioTaskScheduler>(m_audioTaskScheduler);
+    if (audioWorkgroupSource) {
+        audioWorkgroupSource->setAudioDriver(m_audioDriver);
+    }
 }
 
 IAudioDriver::Spec AudioDriverController::defaultSpec() const
@@ -463,3 +479,8 @@ async::Notification AudioDriverController::outputDeviceSampleRateChanged() const
 {
     return m_outputDeviceSampleRateChanged;
 }
+
+IAudioTaskSchedulerPtr muse::audio::AudioDriverController::getAudioTaskScheduler() const
+{
+    return m_audioTaskScheduler;
+}
diff --git a/framework/audio/main/internal/audiodrivercontroller.h b/framework/audio/main/internal/audiodrivercontroller.h
index f6628bd9c3..57a7596817 100644
--- a/framework/audio/main/internal/audiodrivercontroller.h
+++ b/framework/audio/main/internal/audiodrivercontroller.h
@@ -22,6 +22,7 @@
 
 #pragma once
 
+#include "common/iaudiotaskscheduler.h"
 #include "global/async/asyncable.h"
 
 #include "audio/iaudiodrivercontroller.h"
@@ -37,6 +38,7 @@ class AudioDriverController : public IAudioDriverController, public async::Async
     GlobalInject<rpc::IRpcChannel> rpcChannel;
 
 public:
+    AudioDriverController();
 
     std::vector<std::string> availableAudioDrivers() const override;
 
@@ -67,6 +69,8 @@ class AudioDriverController : public IAudioDriverController, public async::Async
     void changeSampleRate(sample_rate_t sampleRate) override;
     async::Notification outputDeviceSampleRateChanged() const override;
 
+    IAudioTaskSchedulerPtr getAudioTaskScheduler() const;
+
 private:
     IAudioDriverPtr createDriver(const std::string& name) const;
     void setNewDriver(IAudioDriverPtr newDriver);
@@ -77,6 +81,7 @@ class AudioDriverController : public IAudioDriverController, public async::Async
     void updateOutputSpec();
 
     IAudioDriver::Callback m_callback;
+    IAudioTaskSchedulerPtr m_audioTaskScheduler;
     IAudioDriverPtr m_audioDriver;
     async::Notification m_currentAudioDriverChanged;
     async::Notification m_availableOutputDevicesChanged;
diff --git a/framework/audio/thirdparty/moodycamel/blockingconcurrentqueue.h b/framework/audio/thirdparty/moodycamel/blockingconcurrentqueue.h
new file mode 100644
index 0000000000..ec4d8cfbb8
--- /dev/null
+++ b/framework/audio/thirdparty/moodycamel/blockingconcurrentqueue.h
@@ -0,0 +1,581 @@
+// Provides an efficient blocking version of moodycamel::ConcurrentQueue.
+// ©2015-2020 Cameron Desrochers. Distributed under the terms of the simplified
+// BSD license, available at the top of concurrentqueue.h.
+// Also dual-licensed under the Boost Software License (see LICENSE.md)
+// Uses Jeff Preshing's semaphore implementation (under the terms of its
+// separate zlib license, see lightweightsemaphore.h).
+
+#pragma once
+
+#include "concurrentqueue.h"
+#include "lightweightsemaphore.h"
+
+#include <type_traits>
+#include <cerrno>
+#include <memory>
+#include <chrono>
+#include <ctime>
+
+namespace moodycamel {
+// This is a blocking version of the queue. It has an almost identical interface to
+// the normal non-blocking version, with the addition of various wait_dequeue() methods
+// and the removal of producer-specific dequeue methods.
+template<typename T, typename Traits = ConcurrentQueueDefaultTraits>
+class BlockingConcurrentQueue
+{
+private:
+    typedef ::moodycamel::ConcurrentQueue<T, Traits> ConcurrentQueue;
+    typedef ::moodycamel::LightweightSemaphore LightweightSemaphore;
+
+public:
+    typedef typename ConcurrentQueue::producer_token_t producer_token_t;
+    typedef typename ConcurrentQueue::consumer_token_t consumer_token_t;
+
+    typedef typename ConcurrentQueue::index_t index_t;
+    typedef typename ConcurrentQueue::size_t size_t;
+    typedef typename std::make_signed<size_t>::type ssize_t;
+
+    static const size_t BLOCK_SIZE = ConcurrentQueue::BLOCK_SIZE;
+    static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = ConcurrentQueue::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD;
+    static const size_t EXPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::EXPLICIT_INITIAL_INDEX_SIZE;
+    static const size_t IMPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::IMPLICIT_INITIAL_INDEX_SIZE;
+    static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = ConcurrentQueue::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE;
+    static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE
+        = ConcurrentQueue::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE;
+    static const size_t MAX_SUBQUEUE_SIZE = ConcurrentQueue::MAX_SUBQUEUE_SIZE;
+
+public:
+    // Creates a queue with at least `capacity` element slots; note that the
+    // actual number of elements that can be inserted without additional memory
+    // allocation depends on the number of producers and the block size (e.g. if
+    // the block size is equal to `capacity`, only a single block will be allocated
+    // up-front, which means only a single producer will be able to enqueue elements
+    // without an extra allocation -- blocks aren't shared between producers).
+    // This method is not thread safe -- it is up to the user to ensure that the
+    // queue is fully constructed before it starts being used by other threads (this
+    // includes making the memory effects of construction visible, possibly with a
+    // memory barrier).
+    explicit BlockingConcurrentQueue(size_t capacity = 6* BLOCK_SIZE)
+        : inner(capacity),
+        sema(create<LightweightSemaphore, ssize_t, int>(0, (int)Traits::MAX_SEMA_SPINS),
+             &BlockingConcurrentQueue::template destroy<LightweightSemaphore>)
+    {
+        assert(reinterpret_cast<ConcurrentQueue*>((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner
+               && "BlockingConcurrentQueue must have ConcurrentQueue as its first member");
+        if (!sema) {
+            MOODYCAMEL_THROW(std::bad_alloc());
+        }
+    }
+
+    BlockingConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers)
+        : inner(minCapacity, maxExplicitProducers, maxImplicitProducers),
+        sema(create<LightweightSemaphore, ssize_t, int>(0, (int)Traits::MAX_SEMA_SPINS),
+             &BlockingConcurrentQueue::template destroy<LightweightSemaphore>)
+    {
+        assert(reinterpret_cast<ConcurrentQueue*>((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner
+               && "BlockingConcurrentQueue must have ConcurrentQueue as its first member");
+        if (!sema) {
+            MOODYCAMEL_THROW(std::bad_alloc());
+        }
+    }
+
+    // Disable copying and copy assignment
+    BlockingConcurrentQueue(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
+    BlockingConcurrentQueue& operator=(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
+
+    // Moving is supported, but note that it is *not* a thread-safe operation.
+    // Nobody can use the queue while it's being moved, and the memory effects
+    // of that move must be propagated to other threads before they can use it.
+    // Note: When a queue is moved, its tokens are still valid but can only be
+    // used with the destination queue (i.e. semantically they are moved along
+    // with the queue itself).
+    BlockingConcurrentQueue(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
+        : inner(std::move(other.inner)), sema(std::move(other.sema))
+    { }
+
+    inline BlockingConcurrentQueue& operator=(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
+    {
+        return swap_internal(other);
+    }
+
+    // Swaps this queue's state with the other's. Not thread-safe.
+    // Swapping two queues does not invalidate their tokens, however
+    // the tokens that were created for one queue must be used with
+    // only the swapped queue (i.e. the tokens are tied to the
+    // queue's movable state, not the object itself).
+    inline void swap(BlockingConcurrentQueue& other) MOODYCAMEL_NOEXCEPT
+    {
+        swap_internal(other);
+    }
+
+private:
+    BlockingConcurrentQueue& swap_internal(BlockingConcurrentQueue& other)
+    {
+        if (this == &other) {
+            return *this;
+        }
+
+        inner.swap(other.inner);
+        sema.swap(other.sema);
+        return *this;
+    }
+
+public:
+    // Enqueues a single item (by copying it).
+    // Allocates memory if required. Only fails if memory allocation fails (or implicit
+    // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
+    // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Thread-safe.
+    inline bool enqueue(T const& item)
+    {
+        if ((details::likely)(inner.enqueue(item))) {
+            sema->signal();
+            return true;
+        }
+        return false;
+    }
+
+    // Enqueues a single item (by moving it, if possible).
+    // Allocates memory if required. Only fails if memory allocation fails (or implicit
+    // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
+    // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Thread-safe.
+    inline bool enqueue(T&& item)
+    {
+        if ((details::likely)(inner.enqueue(std::move(item)))) {
+            sema->signal();
+            return true;
+        }
+        return false;
+    }
+
+    // Enqueues a single item (by copying it) using an explicit producer token.
+    // Allocates memory if required. Only fails if memory allocation fails (or
+    // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Thread-safe.
+    inline bool enqueue(producer_token_t const& token, T const& item)
+    {
+        if ((details::likely)(inner.enqueue(token, item))) {
+            sema->signal();
+            return true;
+        }
+        return false;
+    }
+
+    // Enqueues a single item (by moving it, if possible) using an explicit producer token.
+    // Allocates memory if required. Only fails if memory allocation fails (or
+    // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Thread-safe.
+    inline bool enqueue(producer_token_t const& token, T&& item)
+    {
+        if ((details::likely)(inner.enqueue(token, std::move(item)))) {
+            sema->signal();
+            return true;
+        }
+        return false;
+    }
+
+    // Enqueues several items.
+    // Allocates memory if required. Only fails if memory allocation fails (or
+    // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
+    // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Note: Use std::make_move_iterator if the elements should be moved instead of copied.
+    // Thread-safe.
+    template<typename It>
+    inline bool enqueue_bulk(It itemFirst, size_t count)
+    {
+        if ((details::likely)(inner.enqueue_bulk(std::forward<It>(itemFirst), count))) {
+            sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count);
+            return true;
+        }
+        return false;
+    }
+
+    // Enqueues several items using an explicit producer token.
+    // Allocates memory if required. Only fails if memory allocation fails
+    // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Note: Use std::make_move_iterator if the elements should be moved
+    // instead of copied.
+    // Thread-safe.
+    template<typename It>
+    inline bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+    {
+        if ((details::likely)(inner.enqueue_bulk(token, std::forward<It>(itemFirst), count))) {
+            sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count);
+            return true;
+        }
+        return false;
+    }
+
+    // Enqueues a single item (by copying it).
+    // Does not allocate memory. Fails if not enough room to enqueue (or implicit
+    // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
+    // is 0).
+    // Thread-safe.
+    inline bool try_enqueue(T const& item)
+    {
+        if (inner.try_enqueue(item)) {
+            sema->signal();
+            return true;
+        }
+        return false;
+    }
+
+    // Enqueues a single item (by moving it, if possible).
+    // Does not allocate memory (except for one-time implicit producer).
+    // Fails if not enough room to enqueue (or implicit production is
+    // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+    // Thread-safe.
+    inline bool try_enqueue(T&& item)
+    {
+        if (inner.try_enqueue(std::move(item))) {
+            sema->signal();
+            return true;
+        }
+        return false;
+    }
+
+    // Enqueues a single item (by copying it) using an explicit producer token.
+    // Does not allocate memory. Fails if not enough room to enqueue.
+    // Thread-safe.
+    inline bool try_enqueue(producer_token_t const& token, T const& item)
+    {
+        if (inner.try_enqueue(token, item)) {
+            sema->signal();
+            return true;
+        }
+        return false;
+    }
+
+    // Enqueues a single item (by moving it, if possible) using an explicit producer token.
+    // Does not allocate memory. Fails if not enough room to enqueue.
+    // Thread-safe.
+    inline bool try_enqueue(producer_token_t const& token, T&& item)
+    {
+        if (inner.try_enqueue(token, std::move(item))) {
+            sema->signal();
+            return true;
+        }
+        return false;
+    }
+
+    // Enqueues several items.
+    // Does not allocate memory (except for one-time implicit producer).
+    // Fails if not enough room to enqueue (or implicit production is
+    // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+    // Note: Use std::make_move_iterator if the elements should be moved
+    // instead of copied.
+    // Thread-safe.
+    template<typename It>
+    inline bool try_enqueue_bulk(It itemFirst, size_t count)
+    {
+        if (inner.try_enqueue_bulk(std::forward<It>(itemFirst), count)) {
+            sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count);
+            return true;
+        }
+        return false;
+    }
+
+    // Enqueues several items using an explicit producer token.
+    // Does not allocate memory. Fails if not enough room to enqueue.
+    // Note: Use std::make_move_iterator if the elements should be moved
+    // instead of copied.
+    // Thread-safe.
+    template<typename It>
+    inline bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+    {
+        if (inner.try_enqueue_bulk(token, std::forward<It>(itemFirst), count)) {
+            sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count);
+            return true;
+        }
+        return false;
+    }
+
+    // Attempts to dequeue from the queue.
+    // Returns false if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
+    template<typename U>
+    inline bool try_dequeue(U& item)
+    {
+        if (sema->tryWait()) {
+            while (!inner.try_dequeue(item)) {
+                continue;
+            }
+            return true;
+        }
+        return false;
+    }
+
+    // Attempts to dequeue from the queue using an explicit consumer token.
+    // Returns false if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
+    template<typename U>
+    inline bool try_dequeue(consumer_token_t& token, U& item)
+    {
+        if (sema->tryWait()) {
+            while (!inner.try_dequeue(token, item)) {
+                continue;
+            }
+            return true;
+        }
+        return false;
+    }
+
+    // Attempts to dequeue several elements from the queue.
+    // Returns the number of items actually dequeued.
+    // Returns 0 if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
+    template<typename It>
+    inline size_t try_dequeue_bulk(It itemFirst, size_t max)
+    {
+        size_t count = 0;
+        max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max);
+        while (count != max) {
+            count += inner.template try_dequeue_bulk<It&>(itemFirst, max - count);
+        }
+        return count;
+    }
+
+    // Attempts to dequeue several elements from the queue using an explicit consumer token.
+    // Returns the number of items actually dequeued.
+    // Returns 0 if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
+    template<typename It>
+    inline size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max)
+    {
+        size_t count = 0;
+        max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max);
+        while (count != max) {
+            count += inner.template try_dequeue_bulk<It&>(token, itemFirst, max - count);
+        }
+        return count;
+    }
+
+    // Blocks the current thread until there's something to dequeue, then
+    // dequeues it.
+    // Never allocates. Thread-safe.
+    template<typename U>
+    inline void wait_dequeue(U& item)
+    {
+        while (!sema->wait()) {
+            continue;
+        }
+        while (!inner.try_dequeue(item)) {
+            continue;
+        }
+    }
+
+    // Blocks the current thread until either there's something to dequeue
+    // or the timeout (specified in microseconds) expires. Returns false
+    // without setting `item` if the timeout expires, otherwise assigns
+    // to `item` and returns true.
+    // Using a negative timeout indicates an indefinite timeout,
+    // and is thus functionally equivalent to calling wait_dequeue.
+    // Never allocates. Thread-safe.
+    template<typename U>
+    inline bool wait_dequeue_timed(U& item, std::int64_t timeout_usecs)
+    {
+        if (!sema->wait(timeout_usecs)) {
+            return false;
+        }
+        while (!inner.try_dequeue(item)) {
+            continue;
+        }
+        return true;
+    }
+
+    // Blocks the current thread until either there's something to dequeue
+    // or the timeout expires. Returns false without setting `item` if the
+    // timeout expires, otherwise assigns to `item` and returns true.
+    // Never allocates. Thread-safe.
+    template<typename U, typename Rep, typename Period>
+    inline bool wait_dequeue_timed(U& item, std::chrono::duration<Rep, Period> const& timeout)
+    {
+        return wait_dequeue_timed(item, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());
+    }
+
+    // Blocks the current thread until there's something to dequeue, then
+    // dequeues it using an explicit consumer token.
+    // Never allocates. Thread-safe.
+    template<typename U>
+    inline void wait_dequeue(consumer_token_t& token, U& item)
+    {
+        while (!sema->wait()) {
+            continue;
+        }
+        while (!inner.try_dequeue(token, item)) {
+            continue;
+        }
+    }
+
+    // Blocks the current thread until either there's something to dequeue
+    // or the timeout (specified in microseconds) expires. Returns false
+    // without setting `item` if the timeout expires, otherwise assigns
+    // to `item` and returns true.
+    // Using a negative timeout indicates an indefinite timeout,
+    // and is thus functionally equivalent to calling wait_dequeue.
+    // Never allocates. Thread-safe.
+    template<typename U>
+    inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::int64_t timeout_usecs)
+    {
+        if (!sema->wait(timeout_usecs)) {
+            return false;
+        }
+        while (!inner.try_dequeue(token, item)) {
+            continue;
+        }
+        return true;
+    }
+
+    // Blocks the current thread until either there's something to dequeue
+    // or the timeout expires. Returns false without setting `item` if the
+    // timeout expires, otherwise assigns to `item` and returns true.
+    // Never allocates. Thread-safe.
+    template<typename U, typename Rep, typename Period>
+    inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::chrono::duration<Rep, Period> const& timeout)
+    {
+        return wait_dequeue_timed(token, item, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());
+    }
+
+    // Attempts to dequeue several elements from the queue.
+    // Returns the number of items actually dequeued, which will
+    // always be at least one (this method blocks until the queue
+    // is non-empty) and at most max.
+    // Never allocates. Thread-safe.
+    template<typename It>
+    inline size_t wait_dequeue_bulk(It itemFirst, size_t max)
+    {
+        size_t count = 0;
+        max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max);
+        while (count != max) {
+            count += inner.template try_dequeue_bulk<It&>(itemFirst, max - count);
+        }
+        return count;
+    }
+
+    // Attempts to dequeue several elements from the queue.
+    // Returns the number of items actually dequeued, which can
+    // be 0 if the timeout expires while waiting for elements,
+    // and at most max.
+    // Using a negative timeout indicates an indefinite timeout,
+    // and is thus functionally equivalent to calling wait_dequeue_bulk.
+    // Never allocates. Thread-safe.
+    template<typename It>
+    inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::int64_t timeout_usecs)
+    {
+        size_t count = 0;
+        max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs);
+        while (count != max) {
+            count += inner.template try_dequeue_bulk<It&>(itemFirst, max - count);
+        }
+        return count;
+    }
+
+    // Attempts to dequeue several elements from the queue.
+    // Returns the number of items actually dequeued, which can
+    // be 0 if the timeout expires while waiting for elements,
+    // and at most max.
+    // Never allocates. Thread-safe.
+    template<typename It, typename Rep, typename Period>
+    inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::chrono::duration<Rep, Period> const& timeout)
+    {
+        return wait_dequeue_bulk_timed<It&>(itemFirst, max, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());
+    }
+
+    // Attempts to dequeue several elements from the queue using an explicit consumer token.
+    // Returns the number of items actually dequeued, which will
+    // always be at least one (this method blocks until the queue
+    // is non-empty) and at most max.
+    // Never allocates. Thread-safe.
+    template<typename It>
+    inline size_t wait_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max)
+    {
+        size_t count = 0;
+        max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max);
+        while (count != max) {
+            count += inner.template try_dequeue_bulk<It&>(token, itemFirst, max - count);
+        }
+        return count;
+    }
+
+    // Attempts to dequeue several elements from the queue using an explicit consumer token.
+    // Returns the number of items actually dequeued, which can
+    // be 0 if the timeout expires while waiting for elements,
+    // and at most max.
+    // Using a negative timeout indicates an indefinite timeout,
+    // and is thus functionally equivalent to calling wait_dequeue_bulk.
+    // Never allocates. Thread-safe.
+    template<typename It>
+    inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::int64_t timeout_usecs)
+    {
+        size_t count = 0;
+        max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs);
+        while (count != max) {
+            count += inner.template try_dequeue_bulk<It&>(token, itemFirst, max - count);
+        }
+        return count;
+    }
+
+    // Attempts to dequeue several elements from the queue using an explicit consumer token.
+    // Returns the number of items actually dequeued, which can
+    // be 0 if the timeout expires while waiting for elements,
+    // and at most max.
+    // Never allocates. Thread-safe.
+    template<typename It, typename Rep, typename Period>
+    inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max,
+                                          std::chrono::duration<Rep, Period> const& timeout)
+    {
+        return wait_dequeue_bulk_timed<It&>(token, itemFirst, max, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());
+    }
+
+    // Returns an estimate of the total number of elements currently in the queue. This
+    // estimate is only accurate if the queue has completely stabilized before it is called
+    // (i.e. all enqueue and dequeue operations have completed and their memory effects are
+    // visible on the calling thread, and no further operations start while this method is
+    // being called).
+    // Thread-safe.
+    inline size_t size_approx() const
+    {
+        return (size_t)sema->availableApprox();
+    }
+
+    // Returns true if the underlying atomic variables used by
+    // the queue are lock-free (they should be on most platforms).
+    // Thread-safe.
+    static constexpr bool is_lock_free()
+    {
+        return ConcurrentQueue::is_lock_free();
+    }
+
+private:
+    template<typename U, typename A1, typename A2>
+    static inline U* create(A1&& a1, A2&& a2)
+    {
+        void* p = (Traits::malloc)(sizeof(U));
+        return p != nullptr ? new (p) U(std::forward<A1>(a1), std::forward<A2>(a2)) : nullptr;
+    }
+
+    template<typename U>
+    static inline void destroy(U* p)
+    {
+        if (p != nullptr) {
+            p->~U();
+        }
+        (Traits::free)(p);
+    }
+
+private:
+    ConcurrentQueue inner;
+    std::unique_ptr<LightweightSemaphore, void (*)(LightweightSemaphore*)> sema;
+};
+
+template<typename T, typename Traits>
+inline void swap(BlockingConcurrentQueue<T, Traits>& a, BlockingConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT
+{
+    a.swap(b);
+}
+}     // end namespace moodycamel
diff --git a/framework/audio/thirdparty/moodycamel/concurrentqueue.h b/framework/audio/thirdparty/moodycamel/concurrentqueue.h
new file mode 100644
index 0000000000..d758324e0c
--- /dev/null
+++ b/framework/audio/thirdparty/moodycamel/concurrentqueue.h
@@ -0,0 +1,3953 @@
+// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue.
+// An overview, including benchmark results, is provided here:
+//     http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++
+// The full design is also described in excruciating detail at:
+//    http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue
+
+// Simplified BSD license:
+// Copyright (c) 2013-2020, Cameron Desrochers.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice, this list of
+// conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice, this list of
+// conditions and the following disclaimer in the documentation and/or other materials
+// provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Also dual-licensed under the Boost Software License (see LICENSE.md)
+
+#pragma once
+
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
+// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and
+// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings
+// upon assigning any computed values)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+
+#ifdef MCDBGQ_USE_RELACY
+#pragma GCC diagnostic ignored "-Wint-to-pointer-cast"
+#endif
+#endif
+
+#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17)
+// VS2019 with /W4 warns about constant conditional expressions but unless /std=c++17 or higher
+// does not support `if constexpr`, so we have no choice but to simply disable the warning
+#pragma warning(push)
+#pragma warning(disable: 4127)  // conditional expression is constant
+#endif
+
+#if defined(__APPLE__)
+#include "TargetConditionals.h"
+#endif
+
+#ifdef MCDBGQ_USE_RELACY
+#include "relacy/relacy_std.hpp"
+#include "relacy_shims.h"
+// We only use malloc/free anyway, and the delete macro messes up `= delete` method declarations.
+// We'll override the default trait malloc ourselves without a macro.
+#undef new
+#undef delete
+#undef malloc
+#undef free
+#else
+#include <atomic>       // Requires C++11. Sorry VS2010.
+#include <cassert>
+#endif
+#include <cstddef>              // for max_align_t
+#include <cstdint>
+#include <cstdlib>
+#include <type_traits>
+#include <algorithm>
+#include <utility>
+#include <limits>
+#include <climits>            // for CHAR_BIT
+#include <array>
+#include <thread>       // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading
+#include <mutex>        // used for thread exit synchronization
+
+// Platform-specific definitions of a numeric thread ID type and an invalid value
+namespace moodycamel {
+namespace details {
+template<typename thread_id_t> struct thread_id_converter {
+    typedef thread_id_t thread_id_numeric_size_t;
+    typedef thread_id_t thread_id_hash_t;
+    static thread_id_hash_t prehash(thread_id_t const& x) { return x; }
+};
+}
+}
+#if defined(MCDBGQ_USE_RELACY)
+namespace moodycamel {
+namespace details {
+typedef std::uint32_t thread_id_t;
+static const thread_id_t invalid_thread_id  = 0xFFFFFFFFU;
+static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU;
+static inline thread_id_t thread_id() { return rl::thread_index(); }
+}
+}
+#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__)
+// No sense pulling in windows.h in a header, we'll manually declare the function
+// we use and rely on backwards-compatibility for this not to break
+extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void);
+namespace moodycamel {
+namespace details {
+static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows");
+typedef std::uint32_t thread_id_t;
+static const thread_id_t invalid_thread_id  = 0;                        // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx
+static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU;        // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4.
+static inline thread_id_t thread_id() { return static_cast<thread_id_t>(::GetCurrentThreadId()); }
+}
+}
+#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) \
+    || defined(MOODYCAMEL_NO_THREAD_LOCAL)
+namespace moodycamel {
+namespace details {
+static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes");
+
+typedef std::thread::id thread_id_t;
+static const thread_id_t invalid_thread_id;               // Default ctor creates invalid ID
+
+// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's
+// only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't
+// be.
+static inline thread_id_t thread_id() { return std::this_thread::get_id(); }
+
+template<std::size_t> struct thread_id_size { };
+template<> struct thread_id_size<4> {
+    typedef std::uint32_t numeric_t;
+};
+template<> struct thread_id_size<8> {
+    typedef std::uint64_t numeric_t;
+};
+
+template<> struct thread_id_converter<thread_id_t> {
+    typedef thread_id_size<sizeof(thread_id_t)>::numeric_t thread_id_numeric_size_t;
+#ifndef __APPLE__
+    typedef std::size_t thread_id_hash_t;
+#else
+    typedef thread_id_numeric_size_t thread_id_hash_t;
+#endif
+
+    static thread_id_hash_t prehash(thread_id_t const& x)
+    {
+#ifndef __APPLE__
+        return std::hash<std::thread::id>()(x);
+#else
+        return *reinterpret_cast<thread_id_hash_t const*>(&x);
+#endif
+    }
+};
+}
+}
+#else
+// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475
+// In order to get a numeric thread ID in a platform-independent way, we use a thread-local
+// static variable's address as a thread identifier :-)
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+#define MOODYCAMEL_THREADLOCAL __thread
+#elif defined(_MSC_VER)
+#define MOODYCAMEL_THREADLOCAL __declspec(thread)
+#else
+// Assume C++11 compliant compiler
+#define MOODYCAMEL_THREADLOCAL thread_local
+#endif
+namespace moodycamel {
+namespace details {
+typedef std::uintptr_t thread_id_t;
+static const thread_id_t invalid_thread_id  = 0;                  // Address can't be nullptr
+static const thread_id_t invalid_thread_id2 = 1;                  // Member accesses off a null pointer are also generally invalid. Plus it's not aligned.
+inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast<thread_id_t>(&x); }
+}
+}
+#endif
+
+// Constexpr if
+#ifndef MOODYCAMEL_CONSTEXPR_IF
+#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || __cplusplus > 201402L
+#define MOODYCAMEL_CONSTEXPR_IF if constexpr
+#define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]]
+#else
+#define MOODYCAMEL_CONSTEXPR_IF if
+#define MOODYCAMEL_MAYBE_UNUSED
+#endif
+#endif
+
+// Exceptions
+#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED
+#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__))
+#define MOODYCAMEL_EXCEPTIONS_ENABLED
+#endif
+#endif
+#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
+#define MOODYCAMEL_TRY try
+#define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__)
+#define MOODYCAMEL_RETHROW throw
+#define MOODYCAMEL_THROW(expr) throw (expr)
+#else
+#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF(true)
+#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF(false)
+#define MOODYCAMEL_RETHROW
+#define MOODYCAMEL_THROW(expr)
+#endif
+
+#ifndef MOODYCAMEL_NOEXCEPT
+#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED)
+#define MOODYCAMEL_NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true
+#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800
+// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-(
+// We have to assume *all* non-trivial constructors may throw on VS2012!
+#define MOODYCAMEL_NOEXCEPT _NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, \
+                                 expr) (std::is_rvalue_reference<valueType>::value \
+                                        && std::is_move_constructible<type>::value ? std::is_trivially_move_constructible<type>::value \
+                                        : std::is_trivially_copy_constructible<type>::value)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, \
+                                   expr) ((std::is_rvalue_reference<valueType>::value \
+                                           && std::is_move_assignable<type>::value ? std::is_trivially_move_assignable<type>::value \
+                                           || std::is_nothrow_move_assignable<type>::value \
+                                           : std::is_trivially_copy_assignable<type>::value \
+                                           || std::is_nothrow_copy_assignable<type>::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, \
+                                                                                                                        expr))
+#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900
+#define MOODYCAMEL_NOEXCEPT _NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, \
+                                 expr) (std::is_rvalue_reference<valueType>::value \
+                                        && std::is_move_constructible<type>::value ? std::is_trivially_move_constructible<type>::value \
+                                        || std::is_nothrow_move_constructible<type>::value \
+                                        : std::is_trivially_copy_constructible<type>::value \
+                                        || std::is_nothrow_copy_constructible<type>::value)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, \
+                                   expr) ((std::is_rvalue_reference<valueType>::value \
+                                           && std::is_move_assignable<type>::value ? std::is_trivially_move_assignable<type>::value \
+                                           || std::is_nothrow_move_assignable<type>::value \
+                                           : std::is_trivially_copy_assignable<type>::value \
+                                           || std::is_nothrow_copy_assignable<type>::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, \
+                                                                                                                        expr))
+#else
+#define MOODYCAMEL_NOEXCEPT noexcept
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr)
+#endif
+#endif
+
+#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#ifdef MCDBGQ_USE_RELACY
+#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#else
+// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445
+// g++ <=4.7 doesn't support thread_local either.
+// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work
+#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) \
+    && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) \
+    && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__)
+// Assume `thread_local` is fully supported in all other C++11 compilers/platforms
+#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED    // tentatively enabled for now; years ago several users report having problems with it on
+#endif
+#endif
+#endif
+
+// VS2012 doesn't support deleted functions.
+// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called.
+#ifndef MOODYCAMEL_DELETE_FUNCTION
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#define MOODYCAMEL_DELETE_FUNCTION
+#else
+#define MOODYCAMEL_DELETE_FUNCTION = delete
+#endif
+#endif
+
+namespace moodycamel {
+namespace details {
+#ifndef MOODYCAMEL_ALIGNAS
+// VS2013 doesn't support alignas or alignof, and align() requires a constant literal
+#if defined(_MSC_VER) && _MSC_VER <= 1800
+#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment))
+#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj)
+#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) typename details::Vs2013Aligned<std::alignment_of<obj>::value, T>::type
+template<int Align, typename T> struct Vs2013Aligned { };        // default, unsupported alignment
+template<typename T> struct Vs2013Aligned<1, T> {
+    typedef __declspec(align (1)) T type;
+};
+template<typename T> struct Vs2013Aligned<2, T> {
+    typedef __declspec(align (2)) T type;
+};
+template<typename T> struct Vs2013Aligned<4, T> {
+    typedef __declspec(align (4)) T type;
+};
+template<typename T> struct Vs2013Aligned<8, T> {
+    typedef __declspec(align (8)) T type;
+};
+template<typename T> struct Vs2013Aligned<16, T> {
+    typedef __declspec(align (16)) T type;
+};
+template<typename T> struct Vs2013Aligned<32, T> {
+    typedef __declspec(align (32)) T type;
+};
+template<typename T> struct Vs2013Aligned<64, T> {
+    typedef __declspec(align (64)) T type;
+};
+template<typename T> struct Vs2013Aligned<128, T> {
+    typedef __declspec(align (128)) T type;
+};
+template<typename T> struct Vs2013Aligned<256, T> {
+    typedef __declspec(align (256)) T type;
+};
+#else
+template<typename T> struct identity {
+    typedef T type;
+};
+#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment)
+#define MOODYCAMEL_ALIGNOF(obj) alignof(obj)
+#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) alignas(alignof(obj)) typename details::identity<T>::type
+#endif
+#endif
+}
+}
+
+// TSAN can false report races in lock-free code.  To enable TSAN to be used from projects that use this one,
+// we can apply per-function compile-time suppression.
+// See https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer
+#define MOODYCAMEL_NO_TSAN
+#if defined(__has_feature)
+ #if __has_feature(thread_sanitizer)
+  #undef MOODYCAMEL_NO_TSAN
+  #define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread")))
+ #endif // TSAN
+#endif // TSAN
+
+// Compiler-specific likely/unlikely hints
+namespace moodycamel {
+namespace details {
+#if defined(__GNUC__)
+static inline bool(likely)(bool x) {
+    return __builtin_expect((x), true);
+}
+static inline bool(unlikely)(bool x) {
+    return __builtin_expect((x), false);
+}
+#else
+static inline bool(likely)(bool x) {
+    return x;
+}
+static inline bool(unlikely)(bool x) {
+    return x;
+}
+#endif
+}
+}
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+#include "internal/concurrentqueue_internal_debug.h"
+#endif
+
+namespace moodycamel {
+namespace details {
+template<typename T>
+struct const_numeric_max {
+    static_assert(std::is_integral<T>::value, "const_numeric_max can only be used with integers");
+    static const T value = std::numeric_limits<T>::is_signed
+                           ? (static_cast<T>(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast<T>(1)
+                           : static_cast<T>(-1);
+};
+
+#if defined(__GLIBCXX__)
+typedef ::max_align_t std_max_align_t;            // libstdc++ forgot to add it to std:: for a while
+#else
+typedef std::max_align_t std_max_align_t;         // Others (e.g. MSVC) insist it can *only* be accessed via std::
+#endif
+
+// Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting
+// 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64.
+typedef union {
+    std_max_align_t x;
+    long long y;
+    void* z;
+} max_align_t;
+}
+
+// Default traits for the ConcurrentQueue. To change some of the
+// traits without re-implementing all of them, inherit from this
+// struct and shadow the declarations you wish to be different;
+// since the traits are used as a template type parameter, the
+// shadowed declarations will be used where defined, and the defaults
+// otherwise.
+struct ConcurrentQueueDefaultTraits
+{
+    // General-purpose size type. std::size_t is strongly recommended.
+    typedef std::size_t size_t;
+
+    // The type used for the enqueue and dequeue indices. Must be at least as
+    // large as size_t. Should be significantly larger than the number of elements
+    // you expect to hold at once, especially if you have a high turnover rate;
+    // for example, on 32-bit x86, if you expect to have over a hundred million
+    // elements or pump several million elements through your queue in a very
+    // short space of time, using a 32-bit type *may* trigger a race condition.
+    // A 64-bit int type is recommended in that case, and in practice will
+    // prevent a race condition no matter the usage of the queue. Note that
+    // whether the queue is lock-free with a 64-int type depends on the whether
+    // std::atomic<std::uint64_t> is lock-free, which is platform-specific.
+    typedef std::size_t index_t;
+
+    // Internally, all elements are enqueued and dequeued from multi-element
+    // blocks; this is the smallest controllable unit. If you expect few elements
+    // but many producers, a smaller block size should be favoured. For few producers
+    // and/or many elements, a larger block size is preferred. A sane default
+    // is provided. Must be a power of 2.
+    static const size_t BLOCK_SIZE = 32;
+
+    // For explicit producers (i.e. when using a producer token), the block is
+    // checked for being empty by iterating through a list of flags, one per element.
+    // For large block sizes, this is too inefficient, and switching to an atomic
+    // counter-based approach is faster. The switch is made for block sizes strictly
+    // larger than this threshold.
+    static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32;
+
+    // How many full blocks can be expected for a single explicit producer? This should
+    // reflect that number's maximum for optimal performance. Must be a power of 2.
+    static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32;
+
+    // How many full blocks can be expected for a single implicit producer? This should
+    // reflect that number's maximum for optimal performance. Must be a power of 2.
+    static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32;
+
+    // The initial size of the hash table mapping thread IDs to implicit producers.
+    // Note that the hash is resized every time it becomes half full.
+    // Must be a power of two, and either 0 or at least 1. If 0, implicit production
+    // (using the enqueue methods without an explicit producer token) is disabled.
+    static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32;
+
+    // Controls the number of items that an explicit consumer (i.e. one with a token)
+    // must consume before it causes all consumers to rotate and move on to the next
+    // internal queue.
+    static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256;
+
+    // The maximum number of elements (inclusive) that can be enqueued to a sub-queue.
+    // Enqueue operations that would cause this limit to be surpassed will fail. Note
+    // that this limit is enforced at the block level (for performance reasons), i.e.
+    // it's rounded up to the nearest block size.
+    static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max<size_t>::value;
+
+    // The number of times to spin before sleeping when waiting on a semaphore.
+    // Recommended values are on the order of 1000-10000 unless the number of
+    // consumer threads exceeds the number of idle cores (in which case try 0-100).
+    // Only affects instances of the BlockingConcurrentQueue.
+    static const int MAX_SEMA_SPINS = 10000;
+
+    // Whether to recycle dynamically-allocated blocks into an internal free list or
+    // not. If false, only pre-allocated blocks (controlled by the constructor
+    // arguments) will be recycled, and all others will be `free`d back to the heap.
+    // Note that blocks consumed by explicit producers are only freed on destruction
+    // of the queue (not following destruction of the token) regardless of this trait.
+    static const bool RECYCLE_ALLOCATED_BLOCKS = false;
+
+#ifndef MCDBGQ_USE_RELACY
+    // Memory allocation can be customized if needed.
+    // malloc should return nullptr on failure, and handle alignment like std::malloc.
+#if defined(malloc) || defined(free)
+    // Gah, this is 2015, stop defining macros that break standard code already!
+    // Work around malloc/free being special macros:
+    static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); }
+    static inline void WORKAROUND_free(void* ptr) { return free(ptr); }
+    static inline void* (malloc)(size_t size) {
+        return WORKAROUND_malloc(size);
+    }
+    static inline void(free)(void* ptr) {
+        return WORKAROUND_free(ptr);
+    }
+#else
+    static inline void* malloc(size_t size) { return std::malloc(size); }
+    static inline void free(void* ptr) { return std::free(ptr); }
+#endif
+#else
+    // Debug versions when running under the Relacy race detector (ignore
+    // these in user code)
+    static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); }
+    static inline void free(void* ptr) { return rl::rl_free(ptr, $); }
+#endif
+};
+
+// When producing or consuming many elements, the most efficient way is to:
+//    1) Use one of the bulk-operation methods of the queue with a token
+//    2) Failing that, use the bulk-operation methods without a token
+//    3) Failing that, create a token and use that with the single-item methods
+//    4) Failing that, use the single-parameter methods of the queue
+// Having said that, don't create tokens willy-nilly -- ideally there should be
+// a maximum of one token per thread (of each kind).
+struct ProducerToken;
+struct ConsumerToken;
+
+template<typename T, typename Traits> class ConcurrentQueue;
+template<typename T, typename Traits> class BlockingConcurrentQueue;
+class ConcurrentQueueTests;
+
+namespace details {
+struct ConcurrentQueueProducerTypelessBase
+{
+    ConcurrentQueueProducerTypelessBase* next;
+    std::atomic<bool> inactive;
+    ProducerToken* token;
+
+    ConcurrentQueueProducerTypelessBase()
+        : next(nullptr), inactive(false), token(nullptr)
+    {
+    }
+};
+
+template<bool use32> struct _hash_32_or_64 {
+    static inline std::uint32_t hash(std::uint32_t h)
+    {
+        // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
+        // Since the thread ID is already unique, all we really want to do is propagate that
+        // uniqueness evenly across all the bits, so that we can use a subset of the bits while
+        // reducing collisions significantly
+        h ^= h >> 16;
+        h *= 0x85ebca6b;
+        h ^= h >> 13;
+        h *= 0xc2b2ae35;
+        return h ^ (h >> 16);
+    }
+};
+template<> struct _hash_32_or_64<1> {
+    static inline std::uint64_t hash(std::uint64_t h)
+    {
+        h ^= h >> 33;
+        h *= 0xff51afd7ed558ccd;
+        h ^= h >> 33;
+        h *= 0xc4ceb9fe1a85ec53;
+        return h ^ (h >> 33);
+    }
+};
+template<std::size_t size> struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> {  };
+
+static inline size_t hash_thread_id(thread_id_t id)
+{
+    static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values");
+    return static_cast<size_t>(hash_32_or_64<sizeof(thread_id_converter<thread_id_t>::thread_id_hash_t)>::hash(
+                                   thread_id_converter<thread_id_t>::prehash(id)));
+}
+
+template<typename T>
+static inline bool circular_less_than(T a, T b)
+{
+    static_assert(std::is_integral<T>::value && !std::numeric_limits<T>::is_signed,
+                  "circular_less_than is intended to be used only with unsigned integer types");
+    return static_cast<T>(a - b) > static_cast<T>(static_cast<T>(1) << (static_cast<T>(sizeof(T) * CHAR_BIT - 1)));
+    // Note: extra parens around rhs of operator<< is MSVC bug: https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931
+    //       silencing the bug requires #pragma warning(disable: 4554) around the calling code and has no effect when done here.
+}
+
+template<typename U>
+static inline char* align_for(char* ptr)
+{
+    const std::size_t alignment = std::alignment_of<U>::value;
+    return ptr + (alignment - (reinterpret_cast<std::uintptr_t>(ptr) % alignment)) % alignment;
+}
+
+template<typename T>
+static inline T ceil_to_pow_2(T x)
+{
+    static_assert(std::is_integral<T>::value && !std::numeric_limits<T>::is_signed,
+                  "ceil_to_pow_2 is intended to be used only with unsigned integer types");
+
+    // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+    --x;
+    x |= x >> 1;
+    x |= x >> 2;
+    x |= x >> 4;
+    for (std::size_t i = 1; i < sizeof(T); i <<= 1) {
+        x |= x >> (i << 3);
+    }
+    ++x;
+    return x;
+}
+
+template<typename T>
+static inline void swap_relaxed(std::atomic<T>& left, std::atomic<T>& right)
+{
+    T temp = std::move(left.load(std::memory_order_relaxed));
+    left.store(std::move(right.load(std::memory_order_relaxed)), std::memory_order_relaxed);
+    right.store(std::move(temp), std::memory_order_relaxed);
+}
+
+template<typename T>
+static inline T const& nomove(T const& x)
+{
+    return x;
+}
+
+template<bool Enable>
+struct nomove_if
+{
+    template<typename T>
+    static inline T const& eval(T const& x)
+    {
+        return x;
+    }
+};
+
+template<>
+struct nomove_if<false>
+{
+    template<typename U>
+    static inline auto eval(U&& x)
+    -> decltype(std::forward<U>(x))
+    {
+        return std::forward<U>(x);
+    }
+};
+
+template<typename It>
+static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT->decltype(*it)
+{
+    return *it;
+}
+
+#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+template<typename T> struct is_trivially_destructible : std::is_trivially_destructible<T> { };
+#else
+template<typename T> struct is_trivially_destructible : std::has_trivial_destructor<T> { };
+#endif
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#ifdef MCDBGQ_USE_RELACY
+typedef RelacyThreadExitListener ThreadExitListener;
+typedef RelacyThreadExitNotifier ThreadExitNotifier;
+#else
+class ThreadExitNotifier;
+
+struct ThreadExitListener
+{
+    typedef void (*callback_t)(void*);
+    callback_t callback;
+    void* userData;
+
+    ThreadExitListener* next;                   // reserved for use by the ThreadExitNotifier
+    ThreadExitNotifier* chain;                  // reserved for use by the ThreadExitNotifier
+};
+
+class ThreadExitNotifier
+{
+public:
+    static void subscribe(ThreadExitListener* listener)
+    {
+        auto& tlsInst = instance();
+        std::lock_guard<std::mutex> guard(mutex());
+        listener->next = tlsInst.tail;
+        listener->chain = &tlsInst;
+        tlsInst.tail = listener;
+    }
+
+    static void unsubscribe(ThreadExitListener* listener)
+    {
+        std::lock_guard<std::mutex> guard(mutex());
+        if (!listener->chain) {
+            return;              // race with ~ThreadExitNotifier
+        }
+        auto& tlsInst = *listener->chain;
+        listener->chain = nullptr;
+        ThreadExitListener** prev = &tlsInst.tail;
+        for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) {
+            if (ptr == listener) {
+                *prev = ptr->next;
+                break;
+            }
+            prev = &ptr->next;
+        }
+    }
+
+private:
+    ThreadExitNotifier()
+        : tail(nullptr) { }
+    ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION;
+    ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION;
+
+    ~ThreadExitNotifier()
+    {
+        // This thread is about to exit, let everyone know!
+        assert(this == &instance()
+               &&
+               "If this assert fails, you likely have a buggy compiler! Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined.");
+        std::lock_guard<std::mutex> guard(mutex());
+        for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) {
+            ptr->chain = nullptr;
+            ptr->callback(ptr->userData);
+        }
+    }
+
+    // Thread-local
+    static inline ThreadExitNotifier& instance()
+    {
+        static thread_local ThreadExitNotifier notifier;
+        return notifier;
+    }
+
+    static inline std::mutex& mutex()
+    {
+        // Must be static because the ThreadExitNotifier could be destroyed while unsubscribe is called
+        static std::mutex mutex;
+        return mutex;
+    }
+
+private:
+    ThreadExitListener* tail;
+};
+#endif
+#endif
+
+template<typename T> struct static_is_lock_free_num {
+    enum {
+        value = 0
+    };
+};
+template<> struct static_is_lock_free_num<signed char> {
+    enum {
+        value = ATOMIC_CHAR_LOCK_FREE
+    };
+};
+template<> struct static_is_lock_free_num<short> {
+    enum {
+        value = ATOMIC_SHORT_LOCK_FREE
+    };
+};
+template<> struct static_is_lock_free_num<int> {
+    enum {
+        value = ATOMIC_INT_LOCK_FREE
+    };
+};
+template<> struct static_is_lock_free_num<long> {
+    enum {
+        value = ATOMIC_LONG_LOCK_FREE
+    };
+};
+template<> struct static_is_lock_free_num<long long> {
+    enum {
+        value = ATOMIC_LLONG_LOCK_FREE
+    };
+};
+template<typename T> struct static_is_lock_free : static_is_lock_free_num<typename std::make_signed<T>::type> {  };
+template<> struct static_is_lock_free<bool> {
+    enum {
+        value = ATOMIC_BOOL_LOCK_FREE
+    };
+};
+template<typename U> struct static_is_lock_free<U*> {
+    enum {
+        value = ATOMIC_POINTER_LOCK_FREE
+    };
+};
+}
+
+struct ProducerToken
+{
+    template<typename T, typename Traits>
+    explicit ProducerToken(ConcurrentQueue<T, Traits>& queue);
+
+    template<typename T, typename Traits>
+    explicit ProducerToken(BlockingConcurrentQueue<T, Traits>& queue);
+
+    ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
+        : producer(other.producer)
+    {
+        other.producer = nullptr;
+        if (producer != nullptr) {
+            producer->token = this;
+        }
+    }
+
+    inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
+    {
+        swap(other);
+        return *this;
+    }
+
+    void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT
+    {
+        std::swap(producer, other.producer);
+        if (producer != nullptr) {
+            producer->token = this;
+        }
+        if (other.producer != nullptr) {
+            other.producer->token = &other;
+        }
+    }
+
+    // A token is always valid unless:
+    //     1) Memory allocation failed during construction
+    //     2) It was moved via the move constructor
+    //        (Note: assignment does a swap, leaving both potentially valid)
+    //     3) The associated queue was destroyed
+    // Note that if valid() returns true, that only indicates
+    // that the token is valid for use with a specific queue,
+    // but not which one; that's up to the user to track.
+    inline bool valid() const { return producer != nullptr; }
+
+    ~ProducerToken()
+    {
+        if (producer != nullptr) {
+            producer->token = nullptr;
+            producer->inactive.store(true, std::memory_order_release);
+        }
+    }
+
+    // Disable copying and assignment
+    ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+    ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+
+private:
+    template<typename T, typename Traits> friend class ConcurrentQueue;
+    friend class ConcurrentQueueTests;
+
+protected:
+    details::ConcurrentQueueProducerTypelessBase* producer;
+};
+
+struct ConsumerToken
+{
+    template<typename T, typename Traits>
+    explicit ConsumerToken(ConcurrentQueue<T, Traits>& q);
+
+    template<typename T, typename Traits>
+    explicit ConsumerToken(BlockingConcurrentQueue<T, Traits>& q);
+
+    ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT
+        : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset),
+        itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer),
+        desiredProducer(other.desiredProducer)
+    {
+    }
+
+    inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT
+    {
+        swap(other);
+        return *this;
+    }
+
+    void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT
+    {
+        std::swap(initialOffset, other.initialOffset);
+        std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset);
+        std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent);
+        std::swap(currentProducer, other.currentProducer);
+        std::swap(desiredProducer, other.desiredProducer);
+    }
+
+    // Disable copying and assignment
+    ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+    ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+
+private:
+    template<typename T, typename Traits> friend class ConcurrentQueue;
+    friend class ConcurrentQueueTests;
+
+private: // but shared with ConcurrentQueue
+    std::uint32_t initialOffset;
+    std::uint32_t lastKnownGlobalOffset;
+    std::uint32_t itemsConsumedFromCurrent;
+    details::ConcurrentQueueProducerTypelessBase* currentProducer;
+    details::ConcurrentQueueProducerTypelessBase* desiredProducer;
+};
+
+// Need to forward-declare this swap because it's in a namespace.
+// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces
+template<typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a,
+                 typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT;
+
+template<typename T, typename Traits = ConcurrentQueueDefaultTraits>
+class ConcurrentQueue
+{
+public:
+    typedef ::moodycamel::ProducerToken producer_token_t;
+    typedef ::moodycamel::ConsumerToken consumer_token_t;
+
+    typedef typename Traits::index_t index_t;
+    typedef typename Traits::size_t size_t;
+
+    static const size_t BLOCK_SIZE = static_cast<size_t>(Traits::BLOCK_SIZE);
+    static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast<size_t>(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD);
+    static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::EXPLICIT_INITIAL_INDEX_SIZE);
+    static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::IMPLICIT_INITIAL_INDEX_SIZE);
+    static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast<size_t>(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE);
+    static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE
+        = static_cast<std::uint32_t>(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE);
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4307)            // + integral constant overflow (that's what the ternary expression is for!)
+#pragma warning(disable: 4309)            // static_cast: Truncation of constant value
+#endif
+    static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max<size_t>::value - static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE)
+                                             < BLOCK_SIZE) ? details::const_numeric_max<size_t>::value
+                                            : ((static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE
+                                               * BLOCK_SIZE);
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+    static_assert(!std::numeric_limits<size_t>::is_signed && std::is_integral<size_t>::value,
+                  "Traits::size_t must be an unsigned integral type");
+    static_assert(!std::numeric_limits<index_t>::is_signed && std::is_integral<index_t>::value,
+                  "Traits::index_t must be an unsigned integral type");
+    static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t");
+    static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)");
+    static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1)
+                  && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)),
+                  "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)");
+    static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)),
+                  "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
+    static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)),
+                  "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
+    static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
+                  || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)),
+                  "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2");
+    static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1,
+                  "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)");
+
+public:
+    // Creates a queue with at least `capacity` element slots; note that the
+    // actual number of elements that can be inserted without additional memory
+    // allocation depends on the number of producers and the block size (e.g. if
+    // the block size is equal to `capacity`, only a single block will be allocated
+    // up-front, which means only a single producer will be able to enqueue elements
+    // without an extra allocation -- blocks aren't shared between producers).
+    // This method is not thread safe -- it is up to the user to ensure that the
+    // queue is fully constructed before it starts being used by other threads (this
+    // includes making the memory effects of construction visible, possibly with a
+    // memory barrier).
+    explicit ConcurrentQueue(size_t capacity = 32* BLOCK_SIZE)
+        : producerListTail(nullptr),
+        producerCount(0),
+        initialBlockPoolIndex(0),
+        nextExplicitConsumerId(0),
+        globalExplicitConsumerOffset(0)
+    {
+        implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+        populate_initial_implicit_producer_hash();
+        populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1));
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+        // Track all the producers using a fully-resolved typed list for
+        // each kind; this makes it possible to debug them starting from
+        // the root queue object (otherwise wacky casts are needed that
+        // don't compile in the debugger's expression evaluator).
+        explicitProducers.store(nullptr, std::memory_order_relaxed);
+        implicitProducers.store(nullptr, std::memory_order_relaxed);
+#endif
+    }
+
+    // Computes the correct amount of pre-allocated blocks for you based
+    // on the minimum number of elements you want available at any given
+    // time, and the maximum concurrent number of each type of producer.
+    ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers)
+        : producerListTail(nullptr),
+        producerCount(0),
+        initialBlockPoolIndex(0),
+        nextExplicitConsumerId(0),
+        globalExplicitConsumerOffset(0)
+    {
+        implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+        populate_initial_implicit_producer_hash();
+        size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2
+                        * (maxExplicitProducers + maxImplicitProducers);
+        populate_initial_block_list(blocks);
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+        explicitProducers.store(nullptr, std::memory_order_relaxed);
+        implicitProducers.store(nullptr, std::memory_order_relaxed);
+#endif
+    }
+
+    // Note: The queue should not be accessed concurrently while it's
+    // being deleted. It's up to the user to synchronize this.
+    // This method is not thread safe.
+    ~ConcurrentQueue()
+    {
+        // Destroy producers
+        auto ptr = producerListTail.load(std::memory_order_relaxed);
+        while (ptr != nullptr) {
+            auto next = ptr->next_prod();
+            if (ptr->token != nullptr) {
+                ptr->token->producer = nullptr;
+            }
+            destroy(ptr);
+            ptr = next;
+        }
+
+        // Destroy implicit producer hash tables
+        MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) {
+            auto hash = implicitProducerHash.load(std::memory_order_relaxed);
+            while (hash != nullptr) {
+                auto prev = hash->prev;
+                if (prev != nullptr) {                // The last hash is part of this object and was not allocated dynamically
+                    for (size_t i = 0; i != hash->capacity; ++i) {
+                        hash->entries[i].~ImplicitProducerKVP();
+                    }
+                    hash->~ImplicitProducerHash();
+                    (Traits::free)(hash);
+                }
+                hash = prev;
+            }
+        }
+
+        // Destroy global free list
+        auto block = freeList.head_unsafe();
+        while (block != nullptr) {
+            auto next = block->freeListNext.load(std::memory_order_relaxed);
+            if (block->dynamicallyAllocated) {
+                destroy(block);
+            }
+            block = next;
+        }
+
+        // Destroy initial free list
+        destroy_array(initialBlockPool, initialBlockPoolSize);
+    }
+
+    // Disable copying and copy assignment
+    ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
+    ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
+
+    // Moving is supported, but note that it is *not* a thread-safe operation.
+    // Nobody can use the queue while it's being moved, and the memory effects
+    // of that move must be propagated to other threads before they can use it.
+    // Note: When a queue is moved, its tokens are still valid but can only be
+    // used with the destination queue (i.e. semantically they are moved along
+    // with the queue itself).
+    ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
+        : producerListTail(other.producerListTail.load(std::memory_order_relaxed)),
+        producerCount(other.producerCount.load(std::memory_order_relaxed)),
+        initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)),
+        initialBlockPool(other.initialBlockPool),
+        initialBlockPoolSize(other.initialBlockPoolSize),
+        freeList(std::move(other.freeList)),
+        nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)),
+        globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed))
+    {
+        // Move the other one into this, and leave the other one as an empty queue
+        implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+        populate_initial_implicit_producer_hash();
+        swap_implicit_producer_hashes(other);
+
+        other.producerListTail.store(nullptr, std::memory_order_relaxed);
+        other.producerCount.store(0, std::memory_order_relaxed);
+        other.nextExplicitConsumerId.store(0, std::memory_order_relaxed);
+        other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed);
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+        explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed);
+        other.explicitProducers.store(nullptr, std::memory_order_relaxed);
+        implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed);
+        other.implicitProducers.store(nullptr, std::memory_order_relaxed);
+#endif
+
+        other.initialBlockPoolIndex.store(0, std::memory_order_relaxed);
+        other.initialBlockPoolSize = 0;
+        other.initialBlockPool = nullptr;
+
+        reown_producers();
+    }
+
+    inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
+    {
+        return swap_internal(other);
+    }
+
+    // Swaps this queue's state with the other's. Not thread-safe.
+    // Swapping two queues does not invalidate their tokens, however
+    // the tokens that were created for one queue must be used with
+    // only the swapped queue (i.e. the tokens are tied to the
+    // queue's movable state, not the object itself).
+    inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT
+    {
+        swap_internal(other);
+    }
+
+private:
+    ConcurrentQueue& swap_internal(ConcurrentQueue& other)
+    {
+        if (this == &other) {
+            return *this;
+        }
+
+        details::swap_relaxed(producerListTail, other.producerListTail);
+        details::swap_relaxed(producerCount, other.producerCount);
+        details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex);
+        std::swap(initialBlockPool, other.initialBlockPool);
+        std::swap(initialBlockPoolSize, other.initialBlockPoolSize);
+        freeList.swap(other.freeList);
+        details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId);
+        details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset);
+
+        swap_implicit_producer_hashes(other);
+
+        reown_producers();
+        other.reown_producers();
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+        details::swap_relaxed(explicitProducers, other.explicitProducers);
+        details::swap_relaxed(implicitProducers, other.implicitProducers);
+#endif
+
+        return *this;
+    }
+
+public:
+    // Enqueues a single item (by copying it).
+    // Allocates memory if required. Only fails if memory allocation fails (or implicit
+    // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
+    // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Thread-safe.
+    inline bool enqueue(T const& item)
+    {
+        MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
+        else {
+            return inner_enqueue<CanAlloc>(item);
+        }
+    }
+
+    // Enqueues a single item (by moving it, if possible).
+    // Allocates memory if required. Only fails if memory allocation fails (or implicit
+    // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
+    // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Thread-safe.
+    inline bool enqueue(T&& item)
+    {
+        MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
+        else {
+            return inner_enqueue<CanAlloc>(std::move(item));
+        }
+    }
+
+    // Enqueues a single item (by copying it) using an explicit producer token.
+    // Allocates memory if required. Only fails if memory allocation fails (or
+    // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Thread-safe.
+    inline bool enqueue(producer_token_t const& token, T const& item)
+    {
+        return inner_enqueue<CanAlloc>(token, item);
+    }
+
+    // Enqueues a single item (by moving it, if possible) using an explicit producer token.
+    // Allocates memory if required. Only fails if memory allocation fails (or
+    // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Thread-safe.
+    inline bool enqueue(producer_token_t const& token, T&& item)
+    {
+        return inner_enqueue<CanAlloc>(token, std::move(item));
+    }
+
+    // Enqueues several items.
+    // Allocates memory if required. Only fails if memory allocation fails (or
+    // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
+    // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Note: Use std::make_move_iterator if the elements should be moved instead of copied.
+    // Thread-safe.
+    template<typename It>
+    bool enqueue_bulk(It itemFirst, size_t count)
+    {
+        MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
+        else {
+            return inner_enqueue_bulk<CanAlloc>(itemFirst, count);
+        }
+    }
+
+    // Enqueues several items using an explicit producer token.
+    // Allocates memory if required. Only fails if memory allocation fails
+    // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Note: Use std::make_move_iterator if the elements should be moved
+    // instead of copied.
+    // Thread-safe.
+    template<typename It>
+    bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+    {
+        return inner_enqueue_bulk<CanAlloc>(token, itemFirst, count);
+    }
+
+    // Enqueues a single item (by copying it).
+    // Does not allocate memory. Fails if not enough room to enqueue (or implicit
+    // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
+    // is 0).
+    // Thread-safe.
+    inline bool try_enqueue(T const& item)
+    {
+        MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
+        else {
+            return inner_enqueue<CannotAlloc>(item);
+        }
+    }
+
+    // Enqueues a single item (by moving it, if possible).
+    // Does not allocate memory (except for one-time implicit producer).
+    // Fails if not enough room to enqueue (or implicit production is
+    // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+    // Thread-safe.
+    inline bool try_enqueue(T&& item)
+    {
+        MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
+        else {
+            return inner_enqueue<CannotAlloc>(std::move(item));
+        }
+    }
+
+    // Enqueues a single item (by copying it) using an explicit producer token.
+    // Does not allocate memory. Fails if not enough room to enqueue.
+    // Thread-safe.
+    inline bool try_enqueue(producer_token_t const& token, T const& item)
+    {
+        return inner_enqueue<CannotAlloc>(token, item);
+    }
+
+    // Enqueues a single item (by moving it, if possible) using an explicit producer token.
+    // Does not allocate memory. Fails if not enough room to enqueue.
+    // Thread-safe.
+    inline bool try_enqueue(producer_token_t const& token, T&& item)
+    {
+        return inner_enqueue<CannotAlloc>(token, std::move(item));
+    }
+
+    // Enqueues several items.
+    // Does not allocate memory (except for one-time implicit producer).
+    // Fails if not enough room to enqueue (or implicit production is
+    // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+    // Note: Use std::make_move_iterator if the elements should be moved
+    // instead of copied.
+    // Thread-safe.
+    template<typename It>
+    bool try_enqueue_bulk(It itemFirst, size_t count)
+    {
+        MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
+        else {
+            return inner_enqueue_bulk<CannotAlloc>(itemFirst, count);
+        }
+    }
+
+    // Enqueues several items using an explicit producer token.
+    // Does not allocate memory. Fails if not enough room to enqueue.
+    // Note: Use std::make_move_iterator if the elements should be moved
+    // instead of copied.
+    // Thread-safe.
+    template<typename It>
+    bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+    {
+        return inner_enqueue_bulk<CannotAlloc>(token, itemFirst, count);
+    }
+
+    // Attempts to dequeue from the queue.
+    // Returns false if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
+    template<typename U>
+    bool try_dequeue(U& item)
+    {
+        // Instead of simply trying each producer in turn (which could cause needless contention on the first
+        // producer), we score them heuristically.
+        size_t nonEmptyCount = 0;
+        ProducerBase* best = nullptr;
+        size_t bestSize = 0;
+        for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) {
+            auto size = ptr->size_approx();
+            if (size > 0) {
+                if (size > bestSize) {
+                    bestSize = size;
+                    best = ptr;
+                }
+                ++nonEmptyCount;
+            }
+        }
+
+        // If there was at least one non-empty queue but it appears empty at the time
+        // we try to dequeue from it, we need to make sure every queue's been tried
+        if (nonEmptyCount > 0) {
+            if ((details::likely)(best->dequeue(item))) {
+                return true;
+            }
+            for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+                if (ptr != best && ptr->dequeue(item)) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    // Attempts to dequeue from the queue.
+    // Returns false if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // This differs from the try_dequeue(item) method in that this one does
+    // not attempt to reduce contention by interleaving the order that producer
+    // streams are dequeued from. So, using this method can reduce overall throughput
+    // under contention, but will give more predictable results in single-threaded
+    // consumer scenarios. This is mostly only useful for internal unit tests.
+    // Never allocates. Thread-safe.
+    template<typename U>
+    bool try_dequeue_non_interleaved(U& item)
+    {
+        for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+            if (ptr->dequeue(item)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // Attempts to dequeue from the queue using an explicit consumer token.
+    // Returns false if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
+    template<typename U>
+    bool try_dequeue(consumer_token_t& token, U& item)
+    {
+        // The idea is roughly as follows:
+        // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less
+        // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place
+        // If there's no items where you're supposed to be, keep moving until you find a producer with some items
+        // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it
+
+        if (token.desiredProducer == nullptr
+            || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
+            if (!update_current_producer_after_rotation(token)) {
+                return false;
+            }
+        }
+
+        // If there was at least one non-empty queue but it appears empty at the time
+        // we try to dequeue from it, we need to make sure every queue's been tried
+        if (static_cast<ProducerBase*>(token.currentProducer)->dequeue(item)) {
+            if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
+                globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
+            }
+            return true;
+        }
+
+        auto tail = producerListTail.load(std::memory_order_acquire);
+        auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
+        if (ptr == nullptr) {
+            ptr = tail;
+        }
+        while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
+            if (ptr->dequeue(item)) {
+                token.currentProducer = ptr;
+                token.itemsConsumedFromCurrent = 1;
+                return true;
+            }
+            ptr = ptr->next_prod();
+            if (ptr == nullptr) {
+                ptr = tail;
+            }
+        }
+        return false;
+    }
+
+    // Attempts to dequeue several elements from the queue.
+    // Returns the number of items actually dequeued.
+    // Returns 0 if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
+    template<typename It>
+    size_t try_dequeue_bulk(It itemFirst, size_t max)
+    {
+        size_t count = 0;
+        for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+            count += ptr->dequeue_bulk(itemFirst, max - count);
+            if (count == max) {
+                break;
+            }
+        }
+        return count;
+    }
+
+    // Attempts to dequeue several elements from the queue using an explicit consumer token.
+    // Returns the number of items actually dequeued.
+    // Returns 0 if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
+    template<typename It>
+    size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max)
+    {
+        if (token.desiredProducer == nullptr
+            || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
+            if (!update_current_producer_after_rotation(token)) {
+                return 0;
+            }
+        }
+
+        size_t count = static_cast<ProducerBase*>(token.currentProducer)->dequeue_bulk(itemFirst, max);
+        if (count == max) {
+            if ((token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
+                globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
+            }
+            return max;
+        }
+        token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(count);
+        max -= count;
+
+        auto tail = producerListTail.load(std::memory_order_acquire);
+        auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
+        if (ptr == nullptr) {
+            ptr = tail;
+        }
+        while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
+            auto dequeued = ptr->dequeue_bulk(itemFirst, max);
+            count += dequeued;
+            if (dequeued != 0) {
+                token.currentProducer = ptr;
+                token.itemsConsumedFromCurrent = static_cast<std::uint32_t>(dequeued);
+            }
+            if (dequeued == max) {
+                break;
+            }
+            max -= dequeued;
+            ptr = ptr->next_prod();
+            if (ptr == nullptr) {
+                ptr = tail;
+            }
+        }
+        return count;
+    }
+
+    // Attempts to dequeue from a specific producer's inner queue.
+    // If you happen to know which producer you want to dequeue from, this
+    // is significantly faster than using the general-case try_dequeue methods.
+    // Returns false if the producer's queue appeared empty at the time it
+    // was checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
+    template<typename U>
+    inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item)
+    {
+        return static_cast<ExplicitProducer*>(producer.producer)->dequeue(item);
+    }
+
+    // Attempts to dequeue several elements from a specific producer's inner queue.
+    // Returns the number of items actually dequeued.
+    // If you happen to know which producer you want to dequeue from, this
+    // is significantly faster than using the general-case try_dequeue methods.
+    // Returns 0 if the producer's queue appeared empty at the time it
+    // was checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
+    template<typename It>
+    inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max)
+    {
+        return static_cast<ExplicitProducer*>(producer.producer)->dequeue_bulk(itemFirst, max);
+    }
+
+    // Returns an estimate of the total number of elements currently in the queue. This
+    // estimate is only accurate if the queue has completely stabilized before it is called
+    // (i.e. all enqueue and dequeue operations have completed and their memory effects are
+    // visible on the calling thread, and no further operations start while this method is
+    // being called).
+    // Thread-safe.
+    size_t size_approx() const
+    {
+        size_t size = 0;
+        for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+            size += ptr->size_approx();
+        }
+        return size;
+    }
+
+    // Returns true if the underlying atomic variables used by
+    // the queue are lock-free (they should be on most platforms).
+    // Thread-safe.
+    static constexpr bool is_lock_free()
+    {
+        return
+            details::static_is_lock_free<bool>::value == 2
+            && details::static_is_lock_free<size_t>::value == 2
+            && details::static_is_lock_free<std::uint32_t>::value == 2
+            && details::static_is_lock_free<index_t>::value == 2
+            && details::static_is_lock_free<void*>::value == 2
+            && details::static_is_lock_free<typename details::thread_id_converter<details::thread_id_t>::thread_id_numeric_size_t>::value
+            == 2;
+    }
+
+private:
+    friend struct ProducerToken;
+    friend struct ConsumerToken;
+    struct ExplicitProducer;
+    friend struct ExplicitProducer;
+    struct ImplicitProducer;
+    friend struct ImplicitProducer;
+    friend class ConcurrentQueueTests;
+
+    enum AllocationMode {
+        CanAlloc, CannotAlloc
+    };
+
+    ///////////////////////////////
+    // Queue methods
+    ///////////////////////////////
+
+    template<AllocationMode canAlloc, typename U>
+    inline bool inner_enqueue(producer_token_t const& token, U&& element)
+    {
+        return static_cast<ExplicitProducer*>(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue<canAlloc>(
+            std::forward<U>(element));
+    }
+
+    template<AllocationMode canAlloc, typename U>
+    inline bool inner_enqueue(U&& element)
+    {
+        auto producer = get_or_add_implicit_producer();
+        return producer
+               == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue<canAlloc>(std::forward<U>(element));
+    }
+
+    template<AllocationMode canAlloc, typename It>
+    inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+    {
+        return static_cast<ExplicitProducer*>(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk<canAlloc>(itemFirst,
+                                                                                                                                  count);
+    }
+
+    template<AllocationMode canAlloc, typename It>
+    inline bool inner_enqueue_bulk(It itemFirst, size_t count)
+    {
+        auto producer = get_or_add_implicit_producer();
+        return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk<canAlloc>(itemFirst, count);
+    }
+
+    inline bool update_current_producer_after_rotation(consumer_token_t& token)
+    {
+        // Ah, there's been a rotation, figure out where we should be!
+        auto tail = producerListTail.load(std::memory_order_acquire);
+        if (token.desiredProducer == nullptr && tail == nullptr) {
+            return false;
+        }
+        auto prodCount = producerCount.load(std::memory_order_relaxed);
+        auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed);
+        if ((details::unlikely)(token.desiredProducer == nullptr)) {
+            // Aha, first time we're dequeueing anything.
+            // Figure out our local position
+            // Note: offset is from start, not end, but we're traversing from end -- subtract from count first
+            std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount);
+            token.desiredProducer = tail;
+            for (std::uint32_t i = 0; i != offset; ++i) {
+                token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
+                if (token.desiredProducer == nullptr) {
+                    token.desiredProducer = tail;
+                }
+            }
+        }
+
+        std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset;
+        if (delta >= prodCount) {
+            delta = delta % prodCount;
+        }
+        for (std::uint32_t i = 0; i != delta; ++i) {
+            token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
+            if (token.desiredProducer == nullptr) {
+                token.desiredProducer = tail;
+            }
+        }
+
+        token.lastKnownGlobalOffset = globalOffset;
+        token.currentProducer = token.desiredProducer;
+        token.itemsConsumedFromCurrent = 0;
+        return true;
+    }
+
+    ///////////////////////////
+    // Free list
+    ///////////////////////////
+
+    template<typename N>
+    struct FreeListNode
+    {
+        FreeListNode()
+            : freeListRefs(0), freeListNext(nullptr) { }
+
+        std::atomic<std::uint32_t> freeListRefs;
+        std::atomic<N*> freeListNext;
+    };
+
+    // A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but
+    // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly
+    // speedy under low contention.
+    template<typename N>            // N must inherit FreeListNode or have the same fields (and initialization of them)
+    struct FreeList
+    {
+        FreeList()
+            : freeListHead(nullptr) { }
+        FreeList(FreeList&& other)
+            : freeListHead(other.freeListHead.load(std::memory_order_relaxed))
+        {
+            other.freeListHead.store(nullptr, std::memory_order_relaxed);
+        }
+
+        void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); }
+
+        FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION;
+        FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION;
+
+        inline void add(N* node)
+        {
+#ifdef MCDBGQ_NOLOCKFREE_FREELIST
+            debug::DebugLock lock(mutex);
+#endif
+            // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to
+            // set it using a fetch_add
+            if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) {
+                // Oh look! We were the last ones referencing this node, and we know
+                // we want to add it to the free list, so let's do it!
+                add_knowing_refcount_is_zero(node);
+            }
+        }
+
+        inline N* try_get()
+        {
+#ifdef MCDBGQ_NOLOCKFREE_FREELIST
+            debug::DebugLock lock(mutex);
+#endif
+            auto head = freeListHead.load(std::memory_order_acquire);
+            while (head != nullptr) {
+                auto prevHead = head;
+                auto refs = head->freeListRefs.load(std::memory_order_relaxed);
+                if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire,
+                                                                                           std::memory_order_relaxed)) {
+                    head = freeListHead.load(std::memory_order_acquire);
+                    continue;
+                }
+
+                // Good, reference count has been incremented (it wasn't at zero), which means we can read the
+                // next and not worry about it changing between now and the time we do the CAS
+                auto next = head->freeListNext.load(std::memory_order_relaxed);
+                if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) {
+                    // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no
+                    // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on).
+                    assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0);
+
+                    // Decrease refcount twice, once for our ref, and once for the list's ref
+                    head->freeListRefs.fetch_sub(2, std::memory_order_release);
+                    return head;
+                }
+
+                // OK, the head must have changed on us, but we still need to decrease the refcount we increased.
+                // Note that we don't need to release any memory effects, but we do need to ensure that the reference
+                // count decrement happens-after the CAS on the head.
+                refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel);
+                if (refs == SHOULD_BE_ON_FREELIST + 1) {
+                    add_knowing_refcount_is_zero(prevHead);
+                }
+            }
+
+            return nullptr;
+        }
+
+        // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes)
+        N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); }
+
+    private:
+        inline void add_knowing_refcount_is_zero(N* node)
+        {
+            // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run
+            // only one copy of this method per node at a time, i.e. the single thread case), then we know
+            // we can safely change the next pointer of the node; however, once the refcount is back above
+            // zero, then other threads could increase it (happens under heavy contention, when the refcount
+            // goes to zero in between a load and a refcount increment of a node in try_get, then back up to
+            // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS
+            // to add the node to the actual list fails, decrease the refcount and leave the add operation to
+            // the next thread who puts the refcount back at zero (which could be us, hence the loop).
+            auto head = freeListHead.load(std::memory_order_relaxed);
+            while (true) {
+                node->freeListNext.store(head, std::memory_order_relaxed);
+                node->freeListRefs.store(1, std::memory_order_release);
+                if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) {
+                    // Hmm, the add failed, but we can only try again when the refcount goes back to zero
+                    if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_release) == 1) {
+                        continue;
+                    }
+                }
+                return;
+            }
+        }
+
+    private:
+        // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention)
+        std::atomic<N*> freeListHead;
+
+        static const std::uint32_t REFS_MASK = 0x7FFFFFFF;
+        static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000;
+
+#ifdef MCDBGQ_NOLOCKFREE_FREELIST
+        debug::DebugMutex mutex;
+#endif
+    };
+
+    ///////////////////////////
+    // Block
+    ///////////////////////////
+
+    enum InnerQueueContext {
+        implicit_context = 0, explicit_context = 1
+    };
+
+    struct Block
+    {
+        Block()
+            : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), dynamicallyAllocated(true)
+        {
+#ifdef MCDBGQ_TRACKMEM
+            owner = nullptr;
+#endif
+        }
+
+        template<InnerQueueContext context>
+        inline bool is_empty() const
+        {
+            MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+                // Check flags
+                for (size_t i = 0; i < BLOCK_SIZE; ++i) {
+                    if (!emptyFlags[i].load(std::memory_order_relaxed)) {
+                        return false;
+                    }
+                }
+
+                // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set
+                std::atomic_thread_fence(std::memory_order_acquire);
+                return true;
+            } else {
+                // Check counter
+                if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) {
+                    std::atomic_thread_fence(std::memory_order_acquire);
+                    return true;
+                }
+                assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE);
+                return false;
+            }
+        }
+
+        // Returns true if the block is now empty (does not apply in explicit context)
+        template<InnerQueueContext context>
+        inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i)
+        {
+            MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+                // Set flag
+                assert(!emptyFlags[BLOCK_SIZE - 1
+                                   - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1))].load(std::memory_order_relaxed));
+                emptyFlags[BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1))].store(true,
+                                                                                                                 std::memory_order_release);
+                return false;
+            } else {
+                // Increment counter
+                auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_release);
+                assert(prevVal < BLOCK_SIZE);
+                return prevVal == BLOCK_SIZE - 1;
+            }
+        }
+
+        // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0).
+        // Returns true if the block is now empty (does not apply in explicit context).
+        template<InnerQueueContext context>
+        inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count)
+        {
+            MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+                // Set flags
+                std::atomic_thread_fence(std::memory_order_release);
+                i = BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1)) - count + 1;
+                for (size_t j = 0; j != count; ++j) {
+                    assert(!emptyFlags[i + j].load(std::memory_order_relaxed));
+                    emptyFlags[i + j].store(true, std::memory_order_relaxed);
+                }
+                return false;
+            } else {
+                // Increment counter
+                auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_release);
+                assert(prevVal + count <= BLOCK_SIZE);
+                return prevVal + count == BLOCK_SIZE;
+            }
+        }
+
+        template<InnerQueueContext context>
+        inline void set_all_empty()
+        {
+            MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+                // Set all flags
+                for (size_t i = 0; i != BLOCK_SIZE; ++i) {
+                    emptyFlags[i].store(true, std::memory_order_relaxed);
+                }
+            } else {
+                // Reset counter
+                elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed);
+            }
+        }
+
+        template<InnerQueueContext context>
+        inline void reset_empty()
+        {
+            MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+                // Reset flags
+                for (size_t i = 0; i != BLOCK_SIZE; ++i) {
+                    emptyFlags[i].store(false, std::memory_order_relaxed);
+                }
+            } else {
+                // Reset counter
+                elementsCompletelyDequeued.store(0, std::memory_order_relaxed);
+            }
+        }
+
+        inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT
+        {
+            return static_cast<T*>(static_cast<void*>(elements)) + static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1));
+        }
+
+        inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT
+        {
+            return static_cast<T const*>(static_cast<void const*>(elements))
+                   + static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1));
+        }
+
+    private:
+        static_assert(std::alignment_of<T>::value <= sizeof(T),
+                      "The queue does not support types with an alignment greater than their size at this time");
+        MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements;
+    public:
+        Block* next;
+        std::atomic<size_t> elementsCompletelyDequeued;
+        std::atomic<bool> emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? BLOCK_SIZE : 1];
+    public:
+        std::atomic<std::uint32_t> freeListRefs;
+        std::atomic<Block*> freeListNext;
+        bool dynamicallyAllocated;              // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool'
+
+#ifdef MCDBGQ_TRACKMEM
+        void* owner;
+#endif
+    };
+    static_assert(std::alignment_of<Block>::value >= std::alignment_of<T>::value,
+                  "Internal error: Blocks must be at least as aligned as the type they are wrapping");
+
+#ifdef MCDBGQ_TRACKMEM
+public:
+    struct MemStats;
+private:
+#endif
+
+    ///////////////////////////
+    // Producer base
+    ///////////////////////////
+
+    struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase
+    {
+        ProducerBase(ConcurrentQueue* parent_, bool isExplicit_)
+            : tailIndex(0),
+            headIndex(0),
+            dequeueOptimisticCount(0),
+            dequeueOvercommit(0),
+            tailBlock(nullptr),
+            isExplicit(isExplicit_),
+            parent(parent_)
+        {
+        }
+
+        virtual ~ProducerBase() { }
+
+        template<typename U>
+        inline bool dequeue(U& element)
+        {
+            if (isExplicit) {
+                return static_cast<ExplicitProducer*>(this)->dequeue(element);
+            } else {
+                return static_cast<ImplicitProducer*>(this)->dequeue(element);
+            }
+        }
+
+        template<typename It>
+        inline size_t dequeue_bulk(It& itemFirst, size_t max)
+        {
+            if (isExplicit) {
+                return static_cast<ExplicitProducer*>(this)->dequeue_bulk(itemFirst, max);
+            } else {
+                return static_cast<ImplicitProducer*>(this)->dequeue_bulk(itemFirst, max);
+            }
+        }
+
+        inline ProducerBase* next_prod() const { return static_cast<ProducerBase*>(next); }
+
+        inline size_t size_approx() const
+        {
+            auto tail = tailIndex.load(std::memory_order_relaxed);
+            auto head = headIndex.load(std::memory_order_relaxed);
+            return details::circular_less_than(head, tail) ? static_cast<size_t>(tail - head) : 0;
+        }
+
+        inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); }
+    protected:
+        std::atomic<index_t> tailIndex;               // Where to enqueue to next
+        std::atomic<index_t> headIndex;               // Where to dequeue from next
+
+        std::atomic<index_t> dequeueOptimisticCount;
+        std::atomic<index_t> dequeueOvercommit;
+
+        Block* tailBlock;
+
+    public:
+        bool isExplicit;
+        ConcurrentQueue* parent;
+
+    protected:
+#ifdef MCDBGQ_TRACKMEM
+        friend struct MemStats;
+#endif
+    };
+
+    ///////////////////////////
+    // Explicit queue
+    ///////////////////////////
+
+    struct ExplicitProducer : public ProducerBase
+    {
+        explicit ExplicitProducer(ConcurrentQueue* parent_)
+            : ProducerBase(parent_, true),
+            blockIndex(nullptr),
+            pr_blockIndexSlotsUsed(0),
+            pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1),
+            pr_blockIndexFront(0),
+            pr_blockIndexEntries(nullptr),
+            pr_blockIndexRaw(nullptr)
+        {
+            size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1;
+            if (poolBasedIndexSize > pr_blockIndexSize) {
+                pr_blockIndexSize = poolBasedIndexSize;
+            }
+
+            new_block_index(0);                 // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE
+        }
+
+        ~ExplicitProducer()
+        {
+            // Destruct any elements not yet dequeued.
+            // Since we're in the destructor, we can assume all elements
+            // are either completely dequeued or completely not (no halfways).
+            if (this->tailBlock != nullptr) {               // Note this means there must be a block index too
+                // First find the block that's partially dequeued, if any
+                Block* halfDequeuedBlock = nullptr;
+                if ((this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)) != 0) {
+                    // The head's not on a block boundary, meaning a block somewhere is partially dequeued
+                    // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary)
+                    size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1);
+                    while (details::circular_less_than<index_t>(pr_blockIndexEntries[i].base + BLOCK_SIZE,
+                                                                this->headIndex.load(std::memory_order_relaxed))) {
+                        i = (i + 1) & (pr_blockIndexSize - 1);
+                    }
+                    assert(details::circular_less_than<index_t>(pr_blockIndexEntries[i].base,
+                                                                this->headIndex.load(std::memory_order_relaxed)));
+                    halfDequeuedBlock = pr_blockIndexEntries[i].block;
+                }
+
+                // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration)
+                auto block = this->tailBlock;
+                do{
+                    block = block->next;
+                    if (block->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
+                        continue;
+                    }
+
+                    size_t i = 0;               // Offset into block
+                    if (block == halfDequeuedBlock) {
+                        i = static_cast<size_t>(this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1));
+                    }
+
+                    // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index
+                    auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1))
+                                          == 0 ? BLOCK_SIZE : static_cast<size_t>(this->tailIndex.load(std::memory_order_relaxed)
+                                                                                  & static_cast<index_t>(BLOCK_SIZE - 1));
+                    while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) {
+                        (*block)[i++]->~T();
+                    }
+                } while (block != this->tailBlock);
+            }
+
+            // Destroy all blocks that we own
+            if (this->tailBlock != nullptr) {
+                auto block = this->tailBlock;
+                do{
+                    auto nextBlock = block->next;
+                    this->parent->add_block_to_free_list(block);
+                    block = nextBlock;
+                } while (block != this->tailBlock);
+            }
+
+            // Destroy the block indices
+            auto header = static_cast<BlockIndexHeader*>(pr_blockIndexRaw);
+            while (header != nullptr) {
+                auto prev = static_cast<BlockIndexHeader*>(header->prev);
+                header->~BlockIndexHeader();
+                (Traits::free)(header);
+                header = prev;
+            }
+        }
+
+        template<AllocationMode allocMode, typename U>
+        inline bool enqueue(U&& element)
+        {
+            index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+            index_t newTailIndex = 1 + currentTailIndex;
+            if ((currentTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+                // We reached the end of a block, start a new one
+                auto startBlock = this->tailBlock;
+                auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;
+                if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
+                    // We can re-use the block ahead of us, it's empty!
+                    this->tailBlock = this->tailBlock->next;
+                    this->tailBlock->ConcurrentQueue::Block::template reset_empty<explicit_context>();
+
+                    // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the
+                    // last block from it first -- except instead of removing then adding, we can just overwrite).
+                    // Note that there must be a valid block index here, since even if allocation failed in the ctor,
+                    // it would have been re-attempted when adding the first block to the queue; since there is such
+                    // a block, a block index must have been successfully allocated.
+                } else {
+                    // Whatever head value we see here is >= the last value we saw here (relatively),
+                    // and <= its current value. Since we have the most recent tail, the head must be
+                    // <= to it.
+                    auto head = this->headIndex.load(std::memory_order_relaxed);
+                    assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+                    if (!details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE)
+                        || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value
+                            && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) {
+                        // We can't enqueue in another block because there's not enough leeway -- the
+                        // tail could surpass the head by the time the block fills up! (Or we'll exceed
+                        // the size limit, if the second part of the condition was true.)
+                        return false;
+                    }
+                    // We're going to need a new block; check that the block index has room
+                    if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) {
+                        // Hmm, the circular block index is already full -- we'll need
+                        // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if
+                        // the initial allocation failed in the constructor.
+
+                        MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) {
+                            return false;
+                        } else if (!new_block_index(pr_blockIndexSlotsUsed)) {
+                            return false;
+                        }
+                    }
+
+                    // Insert a new block in the circular linked list
+                    auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
+                    if (newBlock == nullptr) {
+                        return false;
+                    }
+#ifdef MCDBGQ_TRACKMEM
+                    newBlock->owner = this;
+#endif
+                    newBlock->ConcurrentQueue::Block::template reset_empty<explicit_context>();
+                    if (this->tailBlock == nullptr) {
+                        newBlock->next = newBlock;
+                    } else {
+                        newBlock->next = this->tailBlock->next;
+                        this->tailBlock->next = newBlock;
+                    }
+                    this->tailBlock = newBlock;
+                    ++pr_blockIndexSlotsUsed;
+                }
+
+                MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
+                    // The constructor may throw. We want the element not to appear in the queue in
+                    // that case (without corrupting the queue):
+                    MOODYCAMEL_TRY {
+                        new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+                    }
+                    MOODYCAMEL_CATCH(...) {
+                        // Revert change to the current block, but leave the new block available
+                        // for next time
+                        pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+                        this->tailBlock = startBlock == nullptr ? this->tailBlock : startBlock;
+                        MOODYCAMEL_RETHROW;
+                    }
+                } else {
+                    (void)startBlock;
+                    (void)originalBlockIndexSlotsUsed;
+                }
+
+                // Add block to block index
+                auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+                entry.base = currentTailIndex;
+                entry.block = this->tailBlock;
+                blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release);
+                pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+
+                MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
+                    this->tailIndex.store(newTailIndex, std::memory_order_release);
+                    return true;
+                }
+            }
+
+            // Enqueue
+            new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+
+            this->tailIndex.store(newTailIndex, std::memory_order_release);
+            return true;
+        }
+
+        template<typename U>
+        bool dequeue(U& element)
+        {
+            auto tail = this->tailIndex.load(std::memory_order_relaxed);
+            auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+            if (details::circular_less_than<index_t>(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) {
+                // Might be something to dequeue, let's give it a try
+
+                // Note that this if is purely for performance purposes in the common case when the queue is
+                // empty and the values are eventually consistent -- we may enter here spuriously.
+
+                // Note that whatever the values of overcommit and tail are, they are not going to change (unless we
+                // change them) and must be the same value at this point (inside the if) as when the if condition was
+                // evaluated.
+
+                // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below.
+                // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in
+                // the fetch_add below will result in a value at least as recent as that (and therefore at least as large).
+                // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all
+                // read-modify-write operations are guaranteed to work on the latest value in the modification order), but
+                // unfortunately that can't be shown to be correct using only the C++11 standard.
+                // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case
+                std::atomic_thread_fence(std::memory_order_acquire);
+
+                // Increment optimistic counter, then check if it went over the boundary
+                auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed);
+
+                // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever
+                // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now
+                // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon
+                // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount.
+                // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently)
+                // overflow; in such a case, though, the logic still holds since the difference between the two is maintained.
+
+                // Note that we reload tail here in case it changed; it will be the same value as before or greater, since
+                // this load is sequenced after (happens after) the earlier load above. This is supported by read-read
+                // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order
+                tail = this->tailIndex.load(std::memory_order_acquire);
+                if ((details::likely)(details::circular_less_than<index_t>(myDequeueCount - overcommit, tail))) {
+                    // Guaranteed to be at least one element to dequeue!
+
+                    // Get the index. Note that since there's guaranteed to be at least one element, this
+                    // will never exceed tail. We need to do an acquire-release fence here since it's possible
+                    // that whatever condition got us to this point was for an earlier enqueued element (that
+                    // we already see the memory effects for), but that by the time we increment somebody else
+                    // has incremented it, and we need to see the memory effects for *that* element, which is
+                    // in such a case is necessarily visible on the thread that incremented it in the first
+                    // place with the more current condition (they must have acquired a tail that is at least
+                    // as recent).
+                    auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel);
+
+                    // Determine which block the element is in
+
+                    auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
+                    auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire);
+
+                    // We need to be careful here about subtracting and dividing because of index wrap-around.
+                    // When an index wraps, we need to preserve the sign of the offset when dividing it by the
+                    // block size (in order to get a correct signed block count offset in all cases):
+                    auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
+                    auto blockBaseIndex = index & ~static_cast<index_t>(BLOCK_SIZE - 1);
+                    auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(blockBaseIndex - headBase)
+                                                      / static_cast<typename std::make_signed<index_t>::type>(BLOCK_SIZE));
+                    auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block;
+
+                    // Dequeue
+                    auto& el = *((*block)[index]);
+                    if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, element = std::move(el))) {
+                        // Make sure the element is still fully dequeued and destroyed even if the assignment
+                        // throws
+                        struct Guard {
+                            Block* block;
+                            index_t index;
+
+                            ~Guard()
+                            {
+                                (*block)[index]->~T();
+                                block->ConcurrentQueue::Block::template set_empty<explicit_context>(index);
+                            }
+                        } guard = { block, index };
+
+                        element = std::move(el);             // NOLINT
+                    } else {
+                        element = std::move(el);             // NOLINT
+                        el.~T();             // NOLINT
+                        block->ConcurrentQueue::Block::template set_empty<explicit_context>(index);
+                    }
+
+                    return true;
+                } else {
+                    // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent
+                    this->dequeueOvercommit.fetch_add(1, std::memory_order_release);                  // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write
+                }
+            }
+
+            return false;
+        }
+
+        template<AllocationMode allocMode, typename It>
+        bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count)
+        {
+            // First, we need to make sure we have enough room to enqueue all of the elements;
+            // this means pre-allocating blocks and putting them in the block index (but only if
+            // all the allocations succeeded).
+            index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+            auto startBlock = this->tailBlock;
+            auto originalBlockIndexFront = pr_blockIndexFront;
+            auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;
+
+            Block* firstAllocatedBlock = nullptr;
+
+            // Figure out how many blocks we'll need to allocate, and do so
+            size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1))
+                                   - ((startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1));
+            index_t currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+            if (blockBaseDiff > 0) {
+                // Allocate as many blocks as possible from ahead
+                while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock
+                       && this->tailBlock->next->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
+                    blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+                    currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+
+                    this->tailBlock = this->tailBlock->next;
+                    firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock;
+
+                    auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+                    entry.base = currentTailIndex;
+                    entry.block = this->tailBlock;
+                    pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+                }
+
+                // Now allocate as many blocks as necessary from the block pool
+                while (blockBaseDiff > 0) {
+                    blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+                    currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+
+                    auto head = this->headIndex.load(std::memory_order_relaxed);
+                    assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+                    bool full = !details::circular_less_than<index_t>(head,
+                                                                      currentTailIndex + BLOCK_SIZE)
+                                || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value
+                                    && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head));
+                    if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) {
+                        MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) {
+                            // Failed to allocate, undo changes (but keep injected blocks)
+                            pr_blockIndexFront = originalBlockIndexFront;
+                            pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+                            this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+                            return false;
+                        } else if (full || !new_block_index(originalBlockIndexSlotsUsed)) {
+                            // Failed to allocate, undo changes (but keep injected blocks)
+                            pr_blockIndexFront = originalBlockIndexFront;
+                            pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+                            this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+                            return false;
+                        }
+
+                        // pr_blockIndexFront is updated inside new_block_index, so we need to
+                        // update our fallback value too (since we keep the new index even if we
+                        // later fail)
+                        originalBlockIndexFront = originalBlockIndexSlotsUsed;
+                    }
+
+                    // Insert a new block in the circular linked list
+                    auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
+                    if (newBlock == nullptr) {
+                        pr_blockIndexFront = originalBlockIndexFront;
+                        pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+                        this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+                        return false;
+                    }
+
+#ifdef MCDBGQ_TRACKMEM
+                    newBlock->owner = this;
+#endif
+                    newBlock->ConcurrentQueue::Block::template set_all_empty<explicit_context>();
+                    if (this->tailBlock == nullptr) {
+                        newBlock->next = newBlock;
+                    } else {
+                        newBlock->next = this->tailBlock->next;
+                        this->tailBlock->next = newBlock;
+                    }
+                    this->tailBlock = newBlock;
+                    firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock;
+
+                    ++pr_blockIndexSlotsUsed;
+
+                    auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+                    entry.base = currentTailIndex;
+                    entry.block = this->tailBlock;
+                    pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+                }
+
+                // Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and
+                // publish the new block index front
+                auto block = firstAllocatedBlock;
+                while (true) {
+                    block->ConcurrentQueue::Block::template reset_empty<explicit_context>();
+                    if (block == this->tailBlock) {
+                        break;
+                    }
+                    block = block->next;
+                }
+
+                MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst),
+                                                                 new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) {
+                    blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1),
+                                                                            std::memory_order_release);
+                }
+            }
+
+            // Enqueue, one block at a time
+            index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
+            currentTailIndex = startTailIndex;
+            auto endBlock = this->tailBlock;
+            this->tailBlock = startBlock;
+            assert((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0);
+            if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) {
+                this->tailBlock = firstAllocatedBlock;
+            }
+            while (true) {
+                index_t stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+                if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
+                    stopIndex = newTailIndex;
+                }
+                MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst),
+                                                                 new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) {
+                    while (currentTailIndex != stopIndex) {
+                        new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
+                    }
+                } else {
+                    MOODYCAMEL_TRY {
+                        while (currentTailIndex != stopIndex) {
+                            // Must use copy constructor even if move constructor is available
+                            // because we may have to revert if there's an exception.
+                            // Sorry about the horrible templated next line, but it was the only way
+                            // to disable moving *at compile time*, which is important because a type
+                            // may only define a (noexcept) move constructor, and so calls to the
+                            // cctor will not compile, even if they are in an if branch that will never
+                            // be executed
+                            new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<!MOODYCAMEL_NOEXCEPT_CTOR(T,
+                                                                                                                      decltype(*itemFirst),
+                                                                                                                      new (static_cast
+                                                                                                                           < T
+                                                                                                                           * > (nullptr)) T(
+                                                                                                                          details::
+                                                                                                                          deref_noexcept(
+                                                                                                                              itemFirst)))>
+                                                                         ::eval(*itemFirst));
+                            ++currentTailIndex;
+                            ++itemFirst;
+                        }
+                    }
+                    MOODYCAMEL_CATCH(...) {
+                        // Oh dear, an exception's been thrown -- destroy the elements that
+                        // were enqueued so far and revert the entire bulk operation (we'll keep
+                        // any allocated blocks in our linked list for later, though).
+                        auto constructedStopIndex = currentTailIndex;
+                        auto lastBlockEnqueued = this->tailBlock;
+
+                        pr_blockIndexFront = originalBlockIndexFront;
+                        pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+                        this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+
+                        if (!details::is_trivially_destructible<T>::value) {
+                            auto block = startBlock;
+                            if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+                                block = firstAllocatedBlock;
+                            }
+                            currentTailIndex = startTailIndex;
+                            while (true) {
+                                stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+                                if (details::circular_less_than<index_t>(constructedStopIndex, stopIndex)) {
+                                    stopIndex = constructedStopIndex;
+                                }
+                                while (currentTailIndex != stopIndex) {
+                                    (*block)[currentTailIndex++]->~T();
+                                }
+                                if (block == lastBlockEnqueued) {
+                                    break;
+                                }
+                                block = block->next;
+                            }
+                        }
+                        MOODYCAMEL_RETHROW;
+                    }
+                }
+
+                if (this->tailBlock == endBlock) {
+                    assert(currentTailIndex == newTailIndex);
+                    break;
+                }
+                this->tailBlock = this->tailBlock->next;
+            }
+
+            MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst),
+                                                              new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) {
+                if (firstAllocatedBlock != nullptr) {
+                    blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1),
+                                                                            std::memory_order_release);
+                }
+            }
+
+            this->tailIndex.store(newTailIndex, std::memory_order_release);
+            return true;
+        }
+
+        template<typename It>
+        size_t dequeue_bulk(It& itemFirst, size_t max)
+        {
+            auto tail = this->tailIndex.load(std::memory_order_relaxed);
+            auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+            auto desiredCount = static_cast<size_t>(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit));
+            if (details::circular_less_than<size_t>(0, desiredCount)) {
+                desiredCount = desiredCount < max ? desiredCount : max;
+                std::atomic_thread_fence(std::memory_order_acquire);
+
+                auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed);
+
+                tail = this->tailIndex.load(std::memory_order_acquire);
+                auto actualCount = static_cast<size_t>(tail - (myDequeueCount - overcommit));
+                if (details::circular_less_than<size_t>(0, actualCount)) {
+                    actualCount = desiredCount < actualCount ? desiredCount : actualCount;
+                    if (actualCount < desiredCount) {
+                        this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release);
+                    }
+
+                    // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this
+                    // will never exceed tail.
+                    auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel);
+
+                    // Determine which block the first element is in
+                    auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
+                    auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire);
+
+                    auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
+                    auto firstBlockBaseIndex = firstIndex & ~static_cast<index_t>(BLOCK_SIZE - 1);
+                    auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(firstBlockBaseIndex
+                                                                                                            - headBase)
+                                                      / static_cast<typename std::make_signed<index_t>::type>(BLOCK_SIZE));
+                    auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1);
+
+                    // Iterate the blocks and dequeue
+                    auto index = firstIndex;
+                    do{
+                        auto firstIndexInBlock = index;
+                        index_t endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+                        endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount),
+                                                                        endIndex) ? firstIndex
+                                   + static_cast<index_t>(actualCount) : endIndex;
+                        auto block = localBlockIndex->entries[indexIndex].block;
+                        if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) {
+                            while (index != endIndex) {
+                                auto& el = *((*block)[index]);
+                                *itemFirst++ = std::move(el);
+                                el.~T();
+                                ++index;
+                            }
+                        } else {
+                            MOODYCAMEL_TRY {
+                                while (index != endIndex) {
+                                    auto& el = *((*block)[index]);
+                                    *itemFirst = std::move(el);
+                                    ++itemFirst;
+                                    el.~T();
+                                    ++index;
+                                }
+                            }
+                            MOODYCAMEL_CATCH(...) {
+                                // It's too late to revert the dequeue, but we can make sure that all
+                                // the dequeued objects are properly destroyed and the block index
+                                // (and empty count) are properly updated before we propagate the exception
+                                do{
+                                    block = localBlockIndex->entries[indexIndex].block;
+                                    while (index != endIndex) {
+                                        (*block)[index++]->~T();
+                                    }
+                                    block->ConcurrentQueue::Block::template set_many_empty<explicit_context>(firstIndexInBlock,
+                                                                                                             static_cast<size_t>(endIndex
+                                                                                                                                 -
+                                                                                                                                 firstIndexInBlock));
+                                    indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);
+
+                                    firstIndexInBlock = index;
+                                    endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+                                    endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount),
+                                                                                    endIndex) ? firstIndex
+                                               + static_cast<index_t>(actualCount) : endIndex;
+                                } while (index != firstIndex + actualCount);
+
+                                MOODYCAMEL_RETHROW;
+                            }
+                        }
+                        block->ConcurrentQueue::Block::template set_many_empty<explicit_context>(firstIndexInBlock,
+                                                                                                 static_cast<size_t>(endIndex
+                                                                                                                     - firstIndexInBlock));
+                        indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);
+                    } while (index != firstIndex + actualCount);
+
+                    return actualCount;
+                } else {
+                    // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent
+                    this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release);
+                }
+            }
+
+            return 0;
+        }
+
+    private:
+        struct BlockIndexEntry
+        {
+            index_t base;
+            Block* block;
+        };
+
+        struct BlockIndexHeader
+        {
+            size_t size;
+            std::atomic<size_t> front;                // Current slot (not next, like pr_blockIndexFront)
+            BlockIndexEntry* entries;
+            void* prev;
+        };
+
+        bool new_block_index(size_t numberOfFilledSlotsToExpose)
+        {
+            auto prevBlockSizeMask = pr_blockIndexSize - 1;
+
+            // Create the new block
+            pr_blockIndexSize <<= 1;
+            auto newRawPtr = static_cast<char*>((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of<BlockIndexEntry>::value - 1
+                                                                 + sizeof(BlockIndexEntry) * pr_blockIndexSize));
+            if (newRawPtr == nullptr) {
+                pr_blockIndexSize >>= 1;                    // Reset to allow graceful retry
+                return false;
+            }
+
+            auto newBlockIndexEntries = reinterpret_cast<BlockIndexEntry*>(details::align_for<BlockIndexEntry>(newRawPtr
+                                                                                                               + sizeof(BlockIndexHeader)));
+
+            // Copy in all the old indices, if any
+            size_t j = 0;
+            if (pr_blockIndexSlotsUsed != 0) {
+                auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask;
+                do{
+                    newBlockIndexEntries[j++] = pr_blockIndexEntries[i];
+                    i = (i + 1) & prevBlockSizeMask;
+                } while (i != pr_blockIndexFront);
+            }
+
+            // Update everything
+            auto header = new (newRawPtr) BlockIndexHeader;
+            header->size = pr_blockIndexSize;
+            header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed);
+            header->entries = newBlockIndexEntries;
+            header->prev = pr_blockIndexRaw;                // we link the new block to the old one so we can free it later
+
+            pr_blockIndexFront = j;
+            pr_blockIndexEntries = newBlockIndexEntries;
+            pr_blockIndexRaw = newRawPtr;
+            blockIndex.store(header, std::memory_order_release);
+
+            return true;
+        }
+
+    private:
+        std::atomic<BlockIndexHeader*> blockIndex;
+
+        // To be used by producer only -- consumer must use the ones in referenced by blockIndex
+        size_t pr_blockIndexSlotsUsed;
+        size_t pr_blockIndexSize;
+        size_t pr_blockIndexFront;              // Next slot (not current)
+        BlockIndexEntry* pr_blockIndexEntries;
+        void* pr_blockIndexRaw;
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+    public:
+        ExplicitProducer* nextExplicitProducer;
+    private:
+#endif
+
+#ifdef MCDBGQ_TRACKMEM
+        friend struct MemStats;
+#endif
+    };
+
+    //////////////////////////////////
+    // Implicit queue
+    //////////////////////////////////
+
+    struct ImplicitProducer : public ProducerBase
+    {
+        ImplicitProducer(ConcurrentQueue* parent_)
+            : ProducerBase(parent_, false),
+            nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE),
+            blockIndex(nullptr)
+        {
+            new_block_index();
+        }
+
+        ~ImplicitProducer()
+        {
+            // Note that since we're in the destructor we can assume that all enqueue/dequeue operations
+            // completed already; this means that all undequeued elements are placed contiguously across
+            // contiguous blocks, and that only the first and last remaining blocks can be only partially
+            // empty (all other remaining blocks must be completely full).
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+            // Unregister ourselves for thread termination notification
+            if (!this->inactive.load(std::memory_order_relaxed)) {
+                details::ThreadExitNotifier::unsubscribe(&threadExitListener);
+            }
+#endif
+
+            // Destroy all remaining elements!
+            auto tail = this->tailIndex.load(std::memory_order_relaxed);
+            auto index = this->headIndex.load(std::memory_order_relaxed);
+            Block* block = nullptr;
+            assert(index == tail || details::circular_less_than(index, tail));
+            bool forceFreeLastBlock = index != tail;              // If we enter the loop, then the last (tail) block will not be freed
+            while (index != tail) {
+                if ((index & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 || block == nullptr) {
+                    if (block != nullptr) {
+                        // Free the old block
+                        this->parent->add_block_to_free_list(block);
+                    }
+
+                    block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed);
+                }
+
+                ((*block)[index])->~T();
+                ++index;
+            }
+            // Even if the queue is empty, there's still one block that's not on the free list
+            // (unless the head index reached the end of it, in which case the tail will be poised
+            // to create a new block).
+            if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast<index_t>(BLOCK_SIZE - 1)) != 0)) {
+                this->parent->add_block_to_free_list(this->tailBlock);
+            }
+
+            // Destroy block index
+            auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);
+            if (localBlockIndex != nullptr) {
+                for (size_t i = 0; i != localBlockIndex->capacity; ++i) {
+                    localBlockIndex->index[i]->~BlockIndexEntry();
+                }
+                do{
+                    auto prev = localBlockIndex->prev;
+                    localBlockIndex->~BlockIndexHeader();
+                    (Traits::free)(localBlockIndex);
+                    localBlockIndex = prev;
+                } while (localBlockIndex != nullptr);
+            }
+        }
+
+        template<AllocationMode allocMode, typename U>
+        inline bool enqueue(U&& element)
+        {
+            index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+            index_t newTailIndex = 1 + currentTailIndex;
+            if ((currentTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+                // We reached the end of a block, start a new one
+                auto head = this->headIndex.load(std::memory_order_relaxed);
+                assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+                if (!details::circular_less_than<index_t>(head,
+                                                          currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) {
+                    return false;
+                }
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+                debug::DebugLock lock(mutex);
+#endif
+                // Find out where we'll be inserting this block in the block index
+                BlockIndexEntry* idxEntry;
+                if (!insert_block_index_entry<allocMode>(idxEntry, currentTailIndex)) {
+                    return false;
+                }
+
+                // Get ahold of a new block
+                auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
+                if (newBlock == nullptr) {
+                    rewind_block_index_tail();
+                    idxEntry->value.store(nullptr, std::memory_order_relaxed);
+                    return false;
+                }
+#ifdef MCDBGQ_TRACKMEM
+                newBlock->owner = this;
+#endif
+                newBlock->ConcurrentQueue::Block::template reset_empty<implicit_context>();
+
+                MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
+                    // May throw, try to insert now before we publish the fact that we have this new block
+                    MOODYCAMEL_TRY {
+                        new ((*newBlock)[currentTailIndex]) T(std::forward<U>(element));
+                    }
+                    MOODYCAMEL_CATCH(...) {
+                        rewind_block_index_tail();
+                        idxEntry->value.store(nullptr, std::memory_order_relaxed);
+                        this->parent->add_block_to_free_list(newBlock);
+                        MOODYCAMEL_RETHROW;
+                    }
+                }
+
+                // Insert the new block into the index
+                idxEntry->value.store(newBlock, std::memory_order_relaxed);
+
+                this->tailBlock = newBlock;
+
+                MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
+                    this->tailIndex.store(newTailIndex, std::memory_order_release);
+                    return true;
+                }
+            }
+
+            // Enqueue
+            new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+
+            this->tailIndex.store(newTailIndex, std::memory_order_release);
+            return true;
+        }
+
+        template<typename U>
+        bool dequeue(U& element)
+        {
+            // See ExplicitProducer::dequeue for rationale and explanation
+            index_t tail = this->tailIndex.load(std::memory_order_relaxed);
+            index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+            if (details::circular_less_than<index_t>(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) {
+                std::atomic_thread_fence(std::memory_order_acquire);
+
+                index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed);
+                tail = this->tailIndex.load(std::memory_order_acquire);
+                if ((details::likely)(details::circular_less_than<index_t>(myDequeueCount - overcommit, tail))) {
+                    index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel);
+
+                    // Determine which block the element is in
+                    auto entry = get_block_index_entry_for_index(index);
+
+                    // Dequeue
+                    auto block = entry->value.load(std::memory_order_relaxed);
+                    auto& el = *((*block)[index]);
+
+                    if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, element = std::move(el))) {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+                        // Note: Acquiring the mutex with every dequeue instead of only when a block
+                        // is released is very sub-optimal, but it is, after all, purely debug code.
+                        debug::DebugLock lock(producer->mutex);
+#endif
+                        struct Guard {
+                            Block* block;
+                            index_t index;
+                            BlockIndexEntry* entry;
+                            ConcurrentQueue* parent;
+
+                            ~Guard()
+                            {
+                                (*block)[index]->~T();
+                                if (block->ConcurrentQueue::Block::template set_empty<implicit_context>(index)) {
+                                    entry->value.store(nullptr, std::memory_order_relaxed);
+                                    parent->add_block_to_free_list(block);
+                                }
+                            }
+                        } guard = { block, index, entry, this->parent };
+
+                        element = std::move(el);             // NOLINT
+                    } else {
+                        element = std::move(el);             // NOLINT
+                        el.~T();             // NOLINT
+
+                        if (block->ConcurrentQueue::Block::template set_empty<implicit_context>(index)) {
+                            {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+                                debug::DebugLock lock(mutex);
+#endif
+                                // Add the block back into the global free pool (and remove from block index)
+                                entry->value.store(nullptr, std::memory_order_relaxed);
+                            }
+                            this->parent->add_block_to_free_list(block);                        // releases the above store
+                        }
+                    }
+
+                    return true;
+                } else {
+                    this->dequeueOvercommit.fetch_add(1, std::memory_order_release);
+                }
+            }
+
+            return false;
+        }
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4706)  // assignment within conditional expression
+#endif
+        template<AllocationMode allocMode, typename It>
+        bool enqueue_bulk(It itemFirst, size_t count)
+        {
+            // First, we need to make sure we have enough room to enqueue all of the elements;
+            // this means pre-allocating blocks and putting them in the block index (but only if
+            // all the allocations succeeded).
+
+            // Note that the tailBlock we start off with may not be owned by us any more;
+            // this happens if it was filled up exactly to the top (setting tailIndex to
+            // the first index of the next block which is not yet allocated), then dequeued
+            // completely (putting it on the free list) before we enqueue again.
+
+            index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+            auto startBlock = this->tailBlock;
+            Block* firstAllocatedBlock = nullptr;
+            auto endBlock = this->tailBlock;
+
+            // Figure out how many blocks we'll need to allocate, and do so
+            size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1))
+                                   - ((startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1));
+            index_t currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+            if (blockBaseDiff > 0) {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+                debug::DebugLock lock(mutex);
+#endif
+                do{
+                    blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+                    currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+
+                    // Find out where we'll be inserting this block in the block index
+                    BlockIndexEntry* idxEntry = nullptr;            // initialization here unnecessary but compiler can't always tell
+                    Block* newBlock;
+                    bool indexInserted = false;
+                    auto head = this->headIndex.load(std::memory_order_relaxed);
+                    assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+                    bool full = !details::circular_less_than<index_t>(head,
+                                                                      currentTailIndex + BLOCK_SIZE)
+                                || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value
+                                    && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head));
+
+                    if (full || !(indexInserted = insert_block_index_entry<allocMode>(idxEntry,
+                                                                                      currentTailIndex))
+                        || (newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>()) == nullptr) {
+                        // Index allocation or block allocation failed; revert any other allocations
+                        // and index insertions done so far for this operation
+                        if (indexInserted) {
+                            rewind_block_index_tail();
+                            idxEntry->value.store(nullptr, std::memory_order_relaxed);
+                        }
+                        currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+                        for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) {
+                            currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+                            idxEntry = get_block_index_entry_for_index(currentTailIndex);
+                            idxEntry->value.store(nullptr, std::memory_order_relaxed);
+                            rewind_block_index_tail();
+                        }
+                        this->parent->add_blocks_to_free_list(firstAllocatedBlock);
+                        this->tailBlock = startBlock;
+
+                        return false;
+                    }
+
+#ifdef MCDBGQ_TRACKMEM
+                    newBlock->owner = this;
+#endif
+                    newBlock->ConcurrentQueue::Block::template reset_empty<implicit_context>();
+                    newBlock->next = nullptr;
+
+                    // Insert the new block into the index
+                    idxEntry->value.store(newBlock, std::memory_order_relaxed);
+
+                    // Store the chain of blocks so that we can undo if later allocations fail,
+                    // and so that we can find the blocks when we do the actual enqueueing
+                    if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) {
+                        assert(this->tailBlock != nullptr);
+                        this->tailBlock->next = newBlock;
+                    }
+                    this->tailBlock = newBlock;
+                    endBlock = newBlock;
+                    firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock;
+                } while (blockBaseDiff > 0);
+            }
+
+            // Enqueue, one block at a time
+            index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
+            currentTailIndex = startTailIndex;
+            this->tailBlock = startBlock;
+            assert((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0);
+            if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) {
+                this->tailBlock = firstAllocatedBlock;
+            }
+            while (true) {
+                index_t stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+                if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
+                    stopIndex = newTailIndex;
+                }
+                MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst),
+                                                                 new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) {
+                    while (currentTailIndex != stopIndex) {
+                        new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
+                    }
+                } else {
+                    MOODYCAMEL_TRY {
+                        while (currentTailIndex != stopIndex) {
+                            new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<!MOODYCAMEL_NOEXCEPT_CTOR(T,
+                                                                                                                      decltype(*itemFirst),
+                                                                                                                      new (static_cast
+                                                                                                                           < T
+                                                                                                                           * > (nullptr)) T(
+                                                                                                                          details::
+                                                                                                                          deref_noexcept(
+                                                                                                                              itemFirst)))>
+                                                                         ::eval(*itemFirst));
+                            ++currentTailIndex;
+                            ++itemFirst;
+                        }
+                    }
+                    MOODYCAMEL_CATCH(...) {
+                        auto constructedStopIndex = currentTailIndex;
+                        auto lastBlockEnqueued = this->tailBlock;
+
+                        if (!details::is_trivially_destructible<T>::value) {
+                            auto block = startBlock;
+                            if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+                                block = firstAllocatedBlock;
+                            }
+                            currentTailIndex = startTailIndex;
+                            while (true) {
+                                stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+                                if (details::circular_less_than<index_t>(constructedStopIndex, stopIndex)) {
+                                    stopIndex = constructedStopIndex;
+                                }
+                                while (currentTailIndex != stopIndex) {
+                                    (*block)[currentTailIndex++]->~T();
+                                }
+                                if (block == lastBlockEnqueued) {
+                                    break;
+                                }
+                                block = block->next;
+                            }
+                        }
+
+                        currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+                        for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) {
+                            currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+                            auto idxEntry = get_block_index_entry_for_index(currentTailIndex);
+                            idxEntry->value.store(nullptr, std::memory_order_relaxed);
+                            rewind_block_index_tail();
+                        }
+                        this->parent->add_blocks_to_free_list(firstAllocatedBlock);
+                        this->tailBlock = startBlock;
+                        MOODYCAMEL_RETHROW;
+                    }
+                }
+
+                if (this->tailBlock == endBlock) {
+                    assert(currentTailIndex == newTailIndex);
+                    break;
+                }
+                this->tailBlock = this->tailBlock->next;
+            }
+            this->tailIndex.store(newTailIndex, std::memory_order_release);
+            return true;
+        }
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+        template<typename It>
+        size_t dequeue_bulk(It& itemFirst, size_t max)
+        {
+            auto tail = this->tailIndex.load(std::memory_order_relaxed);
+            auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+            auto desiredCount = static_cast<size_t>(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit));
+            if (details::circular_less_than<size_t>(0, desiredCount)) {
+                desiredCount = desiredCount < max ? desiredCount : max;
+                std::atomic_thread_fence(std::memory_order_acquire);
+
+                auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed);
+
+                tail = this->tailIndex.load(std::memory_order_acquire);
+                auto actualCount = static_cast<size_t>(tail - (myDequeueCount - overcommit));
+                if (details::circular_less_than<size_t>(0, actualCount)) {
+                    actualCount = desiredCount < actualCount ? desiredCount : actualCount;
+                    if (actualCount < desiredCount) {
+                        this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release);
+                    }
+
+                    // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this
+                    // will never exceed tail.
+                    auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel);
+
+                    // Iterate the blocks and dequeue
+                    auto index = firstIndex;
+                    BlockIndexHeader* localBlockIndex;
+                    auto indexIndex = get_block_index_index_for_index(index, localBlockIndex);
+                    do{
+                        auto blockStartIndex = index;
+                        index_t endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+                        endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount),
+                                                                        endIndex) ? firstIndex
+                                   + static_cast<index_t>(actualCount) : endIndex;
+
+                        auto entry = localBlockIndex->index[indexIndex];
+                        auto block = entry->value.load(std::memory_order_relaxed);
+                        if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) {
+                            while (index != endIndex) {
+                                auto& el = *((*block)[index]);
+                                *itemFirst++ = std::move(el);
+                                el.~T();
+                                ++index;
+                            }
+                        } else {
+                            MOODYCAMEL_TRY {
+                                while (index != endIndex) {
+                                    auto& el = *((*block)[index]);
+                                    *itemFirst = std::move(el);
+                                    ++itemFirst;
+                                    el.~T();
+                                    ++index;
+                                }
+                            }
+                            MOODYCAMEL_CATCH(...) {
+                                do{
+                                    entry = localBlockIndex->index[indexIndex];
+                                    block = entry->value.load(std::memory_order_relaxed);
+                                    while (index != endIndex) {
+                                        (*block)[index++]->~T();
+                                    }
+
+                                    if (block->ConcurrentQueue::Block::template set_many_empty<implicit_context>(blockStartIndex,
+                                                                                                                 static_cast<size_t>(
+                                                                                                                     endIndex
+                                                                                                                     - blockStartIndex))) {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+                                        debug::DebugLock lock(mutex);
+#endif
+                                        entry->value.store(nullptr, std::memory_order_relaxed);
+                                        this->parent->add_block_to_free_list(block);
+                                    }
+                                    indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1);
+
+                                    blockStartIndex = index;
+                                    endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+                                    endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount),
+                                                                                    endIndex) ? firstIndex
+                                               + static_cast<index_t>(actualCount) : endIndex;
+                                } while (index != firstIndex + actualCount);
+
+                                MOODYCAMEL_RETHROW;
+                            }
+                        }
+                        if (block->ConcurrentQueue::Block::template set_many_empty<implicit_context>(blockStartIndex,
+                                                                                                     static_cast<size_t>(endIndex
+                                                                                                                         - blockStartIndex)))
+                        {
+                            {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+                                debug::DebugLock lock(mutex);
+#endif
+                                // Note that the set_many_empty above did a release, meaning that anybody who acquires the block
+                                // we're about to free can use it safely since our writes (and reads!) will have happened-before then.
+                                entry->value.store(nullptr, std::memory_order_relaxed);
+                            }
+                            this->parent->add_block_to_free_list(block);                        // releases the above store
+                        }
+                        indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1);
+                    } while (index != firstIndex + actualCount);
+
+                    return actualCount;
+                } else {
+                    this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release);
+                }
+            }
+
+            return 0;
+        }
+
+    private:
+        // The block size must be > 1, so any number with the low bit set is an invalid block base index
+        static const index_t INVALID_BLOCK_BASE = 1;
+
+        struct BlockIndexEntry
+        {
+            std::atomic<index_t> key;
+            std::atomic<Block*> value;
+        };
+
+        struct BlockIndexHeader
+        {
+            size_t capacity;
+            std::atomic<size_t> tail;
+            BlockIndexEntry* entries;
+            BlockIndexEntry** index;
+            BlockIndexHeader* prev;
+        };
+
+        template<AllocationMode allocMode>
+        inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex)
+        {
+            auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);                  // We're the only writer thread, relaxed is OK
+            if (localBlockIndex == nullptr) {
+                return false;          // this can happen if new_block_index failed in the constructor
+            }
+            size_t newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1);
+            idxEntry = localBlockIndex->index[newTail];
+            if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE
+                || idxEntry->value.load(std::memory_order_relaxed) == nullptr) {
+                idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
+                localBlockIndex->tail.store(newTail, std::memory_order_release);
+                return true;
+            }
+
+            // No room in the old block index, try to allocate another one!
+            MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) {
+                return false;
+            } else if (!new_block_index()) {
+                return false;
+            } else {
+                localBlockIndex = blockIndex.load(std::memory_order_relaxed);
+                newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1);
+                idxEntry = localBlockIndex->index[newTail];
+                assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE);
+                idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
+                localBlockIndex->tail.store(newTail, std::memory_order_release);
+                return true;
+            }
+        }
+
+        inline void rewind_block_index_tail()
+        {
+            auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);
+            localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1),
+                                        std::memory_order_relaxed);
+        }
+
+        inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const
+        {
+            BlockIndexHeader* localBlockIndex;
+            auto idx = get_block_index_index_for_index(index, localBlockIndex);
+            return localBlockIndex->index[idx];
+        }
+
+        inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const
+        {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+            debug::DebugLock lock(mutex);
+#endif
+            index &= ~static_cast<index_t>(BLOCK_SIZE - 1);
+            localBlockIndex = blockIndex.load(std::memory_order_acquire);
+            auto tail = localBlockIndex->tail.load(std::memory_order_acquire);
+            auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed);
+            assert(tailBase != INVALID_BLOCK_BASE);
+            // Note: Must use division instead of shift because the index may wrap around, causing a negative
+            // offset, whose negativity we want to preserve
+            auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(index - tailBase)
+                                              / static_cast<typename std::make_signed<index_t>::type>(BLOCK_SIZE));
+            size_t idx = (tail + offset) & (localBlockIndex->capacity - 1);
+            assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index
+                   && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr);
+            return idx;
+        }
+
+        bool new_block_index()
+        {
+            auto prev = blockIndex.load(std::memory_order_relaxed);
+            size_t prevCapacity = prev == nullptr ? 0 : prev->capacity;
+            auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity;
+            auto raw = static_cast<char*>((Traits::malloc)(
+                                              sizeof(BlockIndexHeader)
+                                              + std::alignment_of<BlockIndexEntry>::value - 1 + sizeof(BlockIndexEntry) * entryCount
+                                              + std::alignment_of<BlockIndexEntry*>::value - 1 + sizeof(BlockIndexEntry*)
+                                              * nextBlockIndexCapacity));
+            if (raw == nullptr) {
+                return false;
+            }
+
+            auto header = new (raw) BlockIndexHeader;
+            auto entries = reinterpret_cast<BlockIndexEntry*>(details::align_for<BlockIndexEntry>(raw + sizeof(BlockIndexHeader)));
+            auto index = reinterpret_cast<BlockIndexEntry**>(details::align_for<BlockIndexEntry*>(reinterpret_cast<char*>(entries)
+                                                                                                  + sizeof(BlockIndexEntry) * entryCount));
+            if (prev != nullptr) {
+                auto prevTail = prev->tail.load(std::memory_order_relaxed);
+                auto prevPos = prevTail;
+                size_t i = 0;
+                do{
+                    prevPos = (prevPos + 1) & (prev->capacity - 1);
+                    index[i++] = prev->index[prevPos];
+                } while (prevPos != prevTail);
+                assert(i == prevCapacity);
+            }
+            for (size_t i = 0; i != entryCount; ++i) {
+                new (entries + i) BlockIndexEntry;
+                entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed);
+                index[prevCapacity + i] = entries + i;
+            }
+            header->prev = prev;
+            header->entries = entries;
+            header->index = index;
+            header->capacity = nextBlockIndexCapacity;
+            header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed);
+
+            blockIndex.store(header, std::memory_order_release);
+
+            nextBlockIndexCapacity <<= 1;
+
+            return true;
+        }
+
+    private:
+        size_t nextBlockIndexCapacity;
+        std::atomic<BlockIndexHeader*> blockIndex;
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+    public:
+        details::ThreadExitListener threadExitListener;
+    private:
+#endif
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+    public:
+        ImplicitProducer* nextImplicitProducer;
+    private:
+#endif
+
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+        mutable debug::DebugMutex mutex;
+#endif
+#ifdef MCDBGQ_TRACKMEM
+        friend struct MemStats;
+#endif
+    };
+
+    //////////////////////////////////
+    // Block pool manipulation
+    //////////////////////////////////
+
+    void populate_initial_block_list(size_t blockCount)
+    {
+        initialBlockPoolSize = blockCount;
+        if (initialBlockPoolSize == 0) {
+            initialBlockPool = nullptr;
+            return;
+        }
+
+        initialBlockPool = create_array<Block>(blockCount);
+        if (initialBlockPool == nullptr) {
+            initialBlockPoolSize = 0;
+        }
+        for (size_t i = 0; i < initialBlockPoolSize; ++i) {
+            initialBlockPool[i].dynamicallyAllocated = false;
+        }
+    }
+
+    inline Block* try_get_block_from_initial_pool()
+    {
+        if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) {
+            return nullptr;
+        }
+
+        auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed);
+
+        return index < initialBlockPoolSize ? (initialBlockPool + index) : nullptr;
+    }
+
+    inline void add_block_to_free_list(Block* block)
+    {
+#ifdef MCDBGQ_TRACKMEM
+        block->owner = nullptr;
+#endif
+        if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) {
+            destroy(block);
+        } else {
+            freeList.add(block);
+        }
+    }
+
+    inline void add_blocks_to_free_list(Block* block)
+    {
+        while (block != nullptr) {
+            auto next = block->next;
+            add_block_to_free_list(block);
+            block = next;
+        }
+    }
+
+    inline Block* try_get_block_from_free_list()
+    {
+        return freeList.try_get();
+    }
+
+    // Gets a free block from one of the memory pools, or allocates a new one (if applicable)
+    template<AllocationMode canAlloc>
+    Block* requisition_block()
+    {
+        auto block = try_get_block_from_initial_pool();
+        if (block != nullptr) {
+            return block;
+        }
+
+        block = try_get_block_from_free_list();
+        if (block != nullptr) {
+            return block;
+        }
+
+        MOODYCAMEL_CONSTEXPR_IF(canAlloc == CanAlloc) {
+            return create<Block>();
+        } else {
+            return nullptr;
+        }
+    }
+
+#ifdef MCDBGQ_TRACKMEM
+public:
+    struct MemStats {
+        size_t allocatedBlocks;
+        size_t usedBlocks;
+        size_t freeBlocks;
+        size_t ownedBlocksExplicit;
+        size_t ownedBlocksImplicit;
+        size_t implicitProducers;
+        size_t explicitProducers;
+        size_t elementsEnqueued;
+        size_t blockClassBytes;
+        size_t queueClassBytes;
+        size_t implicitBlockIndexBytes;
+        size_t explicitBlockIndexBytes;
+
+        friend class ConcurrentQueue;
+
+    private:
+        static MemStats getFor(ConcurrentQueue* q)
+        {
+            MemStats stats = { 0 };
+
+            stats.elementsEnqueued = q->size_approx();
+
+            auto block = q->freeList.head_unsafe();
+            while (block != nullptr) {
+                ++stats.allocatedBlocks;
+                ++stats.freeBlocks;
+                block = block->freeListNext.load(std::memory_order_relaxed);
+            }
+
+            for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+                bool implicit = dynamic_cast<ImplicitProducer*>(ptr) != nullptr;
+                stats.implicitProducers += implicit ? 1 : 0;
+                stats.explicitProducers += implicit ? 0 : 1;
+
+                if (implicit) {
+                    auto prod = static_cast<ImplicitProducer*>(ptr);
+                    stats.queueClassBytes += sizeof(ImplicitProducer);
+                    auto head = prod->headIndex.load(std::memory_order_relaxed);
+                    auto tail = prod->tailIndex.load(std::memory_order_relaxed);
+                    auto hash = prod->blockIndex.load(std::memory_order_relaxed);
+                    if (hash != nullptr) {
+                        for (size_t i = 0; i != hash->capacity; ++i) {
+                            if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE
+                                && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) {
+                                ++stats.allocatedBlocks;
+                                ++stats.ownedBlocksImplicit;
+                            }
+                        }
+                        stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry);
+                        for (; hash != nullptr; hash = hash->prev) {
+                            stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity
+                                                             * sizeof(typename ImplicitProducer::BlockIndexEntry*);
+                        }
+                    }
+                    for (; details::circular_less_than<index_t>(head, tail); head += BLOCK_SIZE) {
+                        //auto block = prod->get_block_index_entry_for_index(head);
+                        ++stats.usedBlocks;
+                    }
+                } else {
+                    auto prod = static_cast<ExplicitProducer*>(ptr);
+                    stats.queueClassBytes += sizeof(ExplicitProducer);
+                    auto tailBlock = prod->tailBlock;
+                    bool wasNonEmpty = false;
+                    if (tailBlock != nullptr) {
+                        auto block = tailBlock;
+                        do{
+                            ++stats.allocatedBlocks;
+                            if (!block->ConcurrentQueue::Block::template is_empty<explicit_context>() || wasNonEmpty) {
+                                ++stats.usedBlocks;
+                                wasNonEmpty = wasNonEmpty || block != tailBlock;
+                            }
+                            ++stats.ownedBlocksExplicit;
+                            block = block->next;
+                        } while (block != tailBlock);
+                    }
+                    auto index = prod->blockIndex.load(std::memory_order_relaxed);
+                    while (index != nullptr) {
+                        stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size
+                                                         * sizeof(typename ExplicitProducer::BlockIndexEntry);
+                        index = static_cast<typename ExplicitProducer::BlockIndexHeader*>(index->prev);
+                    }
+                }
+            }
+
+            auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed)
+                                     >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize
+                                     - q->initialBlockPoolIndex.load(std::memory_order_relaxed);
+            stats.allocatedBlocks += freeOnInitialPool;
+            stats.freeBlocks += freeOnInitialPool;
+
+            stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks;
+            stats.queueClassBytes += sizeof(ConcurrentQueue);
+
+            return stats;
+        }
+    };
+
+    // For debugging only. Not thread-safe.
+    MemStats getMemStats()
+    {
+        return MemStats::getFor(this);
+    }
+
+private:
+    friend struct MemStats;
+#endif
+
+    //////////////////////////////////
+    // Producer list manipulation
+    //////////////////////////////////
+
+    ProducerBase* recycle_or_create_producer(bool isExplicit)
+    {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+        debug::DebugLock lock(implicitProdMutex);
+#endif
+        // Try to re-use one first
+        for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+            if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) {
+                bool expected = true;
+                if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire,
+                                                          std::memory_order_relaxed)) {
+                    // We caught one! It's been marked as activated, the caller can have it
+                    return ptr;
+                }
+            }
+        }
+
+        return add_producer(isExplicit ? static_cast<ProducerBase*>(create<ExplicitProducer>(this)) : create<ImplicitProducer>(this));
+    }
+
+    ProducerBase* add_producer(ProducerBase* producer)
+    {
+        // Handle failed memory allocation
+        if (producer == nullptr) {
+            return nullptr;
+        }
+
+        producerCount.fetch_add(1, std::memory_order_relaxed);
+
+        // Add it to the lock-free list
+        auto prevTail = producerListTail.load(std::memory_order_relaxed);
+        do{
+            producer->next = prevTail;
+        } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed));
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+        if (producer->isExplicit) {
+            auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed);
+            do{
+                static_cast<ExplicitProducer*>(producer)->nextExplicitProducer = prevTailExplicit;
+            } while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast<ExplicitProducer*>(producer),
+                                                              std::memory_order_release, std::memory_order_relaxed));
+        } else {
+            auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed);
+            do{
+                static_cast<ImplicitProducer*>(producer)->nextImplicitProducer = prevTailImplicit;
+            } while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast<ImplicitProducer*>(producer),
+                                                              std::memory_order_release, std::memory_order_relaxed));
+        }
+#endif
+
+        return producer;
+    }
+
+    void reown_producers()
+    {
+        // After another instance is moved-into/swapped-with this one, all the
+        // producers we stole still think their parents are the other queue.
+        // So fix them up!
+        for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) {
+            ptr->parent = this;
+        }
+    }
+
+    //////////////////////////////////
+    // Implicit producer hash
+    //////////////////////////////////
+
+    struct ImplicitProducerKVP
+    {
+        std::atomic<details::thread_id_t> key;
+        ImplicitProducer* value;                // No need for atomicity since it's only read by the thread that sets it in the first place
+
+        ImplicitProducerKVP()
+            : value(nullptr) { }
+
+        ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT
+        {
+            key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed);
+            value = other.value;
+        }
+
+        inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT
+        {
+            swap(other);
+            return *this;
+        }
+
+        inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT
+        {
+            if (this != &other) {
+                details::swap_relaxed(key, other.key);
+                std::swap(value, other.value);
+            }
+        }
+    };
+
+    template<typename XT, typename XTraits>
+    friend void moodycamel::swap(typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&,
+                                 typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT;
+
+    struct ImplicitProducerHash
+    {
+        size_t capacity;
+        ImplicitProducerKVP* entries;
+        ImplicitProducerHash* prev;
+    };
+
+    inline void populate_initial_implicit_producer_hash()
+    {
+        MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) {
+            return;
+        } else {
+            implicitProducerHashCount.store(0, std::memory_order_relaxed);
+            auto hash = &initialImplicitProducerHash;
+            hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE;
+            hash->entries = &initialImplicitProducerHashEntries[0];
+            for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) {
+                initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed);
+            }
+            hash->prev = nullptr;
+            implicitProducerHash.store(hash, std::memory_order_relaxed);
+        }
+    }
+
+    void swap_implicit_producer_hashes(ConcurrentQueue& other)
+    {
+        MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) {
+            return;
+        } else {
+            // Swap (assumes our implicit producer hash is initialized)
+            initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries);
+            initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0];
+            other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0];
+
+            details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount);
+
+            details::swap_relaxed(implicitProducerHash, other.implicitProducerHash);
+            if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) {
+                implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed);
+            } else {
+                ImplicitProducerHash* hash;
+                for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash;
+                     hash = hash->prev) {
+                    continue;
+                }
+                hash->prev = &initialImplicitProducerHash;
+            }
+            if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) {
+                other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed);
+            } else {
+                ImplicitProducerHash* hash;
+                for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash;
+                     hash = hash->prev) {
+                    continue;
+                }
+                hash->prev = &other.initialImplicitProducerHash;
+            }
+        }
+    }
+
+    // Only fails (returns nullptr) if memory allocation fails
+    ImplicitProducer* get_or_add_implicit_producer()
+    {
+        // Note that since the data is essentially thread-local (key is thread ID),
+        // there's a reduced need for fences (memory ordering is already consistent
+        // for any individual thread), except for the current table itself.
+
+        // Start by looking for the thread ID in the current and all previous hash tables.
+        // If it's not found, it must not be in there yet, since this same thread would
+        // have added it previously to one of the tables that we traversed.
+
+        // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table
+
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+        debug::DebugLock lock(implicitProdMutex);
+#endif
+
+        auto id = details::thread_id();
+        auto hashedId = details::hash_thread_id(id);
+
+        auto mainHash = implicitProducerHash.load(std::memory_order_acquire);
+        assert(mainHash != nullptr);      // silence clang-tidy and MSVC warnings (hash cannot be null)
+        for (auto hash = mainHash; hash != nullptr; hash = hash->prev) {
+            // Look for the id in this hash
+            auto index = hashedId;
+            while (true) {                // Not an infinite loop because at least one slot is free in the hash table
+                index &= hash->capacity - 1u;
+
+                auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed);
+                if (probedKey == id) {
+                    // Found it! If we had to search several hashes deep, though, we should lazily add it
+                    // to the current main hash table to avoid the extended search next time.
+                    // Note there's guaranteed to be room in the current hash table since every subsequent
+                    // table implicitly reserves space for all previous tables (there's only one
+                    // implicitProducerHashCount).
+                    auto value = hash->entries[index].value;
+                    if (hash != mainHash) {
+                        index = hashedId;
+                        while (true) {
+                            index &= mainHash->capacity - 1u;
+                            auto empty = details::invalid_thread_id;
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+                            auto reusable = details::invalid_thread_id2;
+                            if (mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_seq_cst,
+                                                                                     std::memory_order_relaxed)
+                                || mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst,
+                                                                                        std::memory_order_relaxed)) {
+#else
+                            if (mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_seq_cst,
+                                                                                     std::memory_order_relaxed)) {
+#endif
+                                mainHash->entries[index].value = value;
+                                break;
+                            }
+                            ++index;
+                        }
+                    }
+
+                    return value;
+                }
+                if (probedKey == details::invalid_thread_id) {
+                    break;                      // Not in this hash table
+                }
+                ++index;
+            }
+        }
+
+        // Insert!
+        auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed);
+        while (true) {
+            // NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
+            if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) {
+                // We've acquired the resize lock, try to allocate a bigger hash table.
+                // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when
+                // we reload implicitProducerHash it must be the most recent version (it only gets changed within this
+                // locked block).
+                mainHash = implicitProducerHash.load(std::memory_order_acquire);
+                if (newCount >= (mainHash->capacity >> 1)) {
+                    size_t newCapacity = mainHash->capacity << 1;
+                    while (newCount >= (newCapacity >> 1)) {
+                        newCapacity <<= 1;
+                    }
+                    auto raw = static_cast<char*>((Traits::malloc)(sizeof(ImplicitProducerHash)
+                                                                   + std::alignment_of<ImplicitProducerKVP>::value - 1
+                                                                   + sizeof(ImplicitProducerKVP) * newCapacity));
+                    if (raw == nullptr) {
+                        // Allocation failed
+                        implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);
+                        implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+                        return nullptr;
+                    }
+
+                    auto newHash = new (raw) ImplicitProducerHash;
+                    newHash->capacity = static_cast<size_t>(newCapacity);
+                    newHash->entries = reinterpret_cast<ImplicitProducerKVP*>(details::align_for<ImplicitProducerKVP>(raw
+                                                                                                                      + sizeof(
+                                                                                                                          ImplicitProducerHash)));
+                    for (size_t i = 0; i != newCapacity; ++i) {
+                        new (newHash->entries + i) ImplicitProducerKVP;
+                        newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed);
+                    }
+                    newHash->prev = mainHash;
+                    implicitProducerHash.store(newHash, std::memory_order_release);
+                    implicitProducerHashResizeInProgress.clear(std::memory_order_release);
+                    mainHash = newHash;
+                } else {
+                    implicitProducerHashResizeInProgress.clear(std::memory_order_release);
+                }
+            }
+
+            // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table
+            // to finish being allocated by another thread (and if we just finished allocating above, the condition will
+            // always be true)
+            if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) {
+                auto producer = static_cast<ImplicitProducer*>(recycle_or_create_producer(false));
+                if (producer == nullptr) {
+                    implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);
+                    return nullptr;
+                }
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+                producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback;
+                producer->threadExitListener.userData = producer;
+                details::ThreadExitNotifier::subscribe(&producer->threadExitListener);
+#endif
+
+                auto index = hashedId;
+                while (true) {
+                    index &= mainHash->capacity - 1u;
+                    auto empty = details::invalid_thread_id;
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+                    auto reusable = details::invalid_thread_id2;
+                    if (mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst,
+                                                                             std::memory_order_relaxed)) {
+                        implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);              // already counted as a used slot
+                        mainHash->entries[index].value = producer;
+                        break;
+                    }
+#endif
+                    if (mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_seq_cst,
+                                                                             std::memory_order_relaxed)) {
+                        mainHash->entries[index].value = producer;
+                        break;
+                    }
+                    ++index;
+                }
+                return producer;
+            }
+
+            // Hmm, the old hash is quite full and somebody else is busy allocating a new one.
+            // We need to wait for the allocating thread to finish (if it succeeds, we add, if not,
+            // we try to allocate ourselves).
+            mainHash = implicitProducerHash.load(std::memory_order_acquire);
+        }
+    }
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+    void implicit_producer_thread_exited(ImplicitProducer* producer)
+    {
+        // Remove from hash
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+        debug::DebugLock lock(implicitProdMutex);
+#endif
+        auto hash = implicitProducerHash.load(std::memory_order_acquire);
+        assert(hash != nullptr);                // The thread exit listener is only registered if we were added to a hash in the first place
+        auto id = details::thread_id();
+        auto hashedId = details::hash_thread_id(id);
+        details::thread_id_t probedKey;
+
+        // We need to traverse all the hashes just in case other threads aren't on the current one yet and are
+        // trying to add an entry thinking there's a free slot (because they reused a producer)
+        for (; hash != nullptr; hash = hash->prev) {
+            auto index = hashedId;
+            do{
+                index &= hash->capacity - 1u;
+                probedKey = id;
+                if (hash->entries[index].key.compare_exchange_strong(probedKey, details::invalid_thread_id2, std::memory_order_seq_cst,
+                                                                     std::memory_order_relaxed)) {
+                    break;
+                }
+                ++index;
+            } while (probedKey != details::invalid_thread_id);                // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place
+        }
+
+        // Mark the queue as being recyclable
+        producer->inactive.store(true, std::memory_order_release);
+    }
+
+    static void implicit_producer_thread_exited_callback(void* userData)
+    {
+        auto producer = static_cast<ImplicitProducer*>(userData);
+        auto queue = producer->parent;
+        queue->implicit_producer_thread_exited(producer);
+    }
+
+#endif
+
+    //////////////////////////////////
+    // Utility functions
+    //////////////////////////////////
+
+    template<typename TAlign>
+    static inline void* aligned_malloc(size_t size)
+    {
+        MOODYCAMEL_CONSTEXPR_IF(std::alignment_of<TAlign>::value <= std::alignment_of<details::max_align_t>::value)
+        return (Traits::malloc)(size);
+        else {
+            size_t alignment = std::alignment_of<TAlign>::value;
+            void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*));
+            if (!raw) {
+                return nullptr;
+            }
+            char* ptr = details::align_for<TAlign>(reinterpret_cast<char*>(raw) + sizeof(void*));
+            *(reinterpret_cast<void**>(ptr) - 1) = raw;
+            return ptr;
+        }
+    }
+
+    template<typename TAlign>
+    static inline void aligned_free(void* ptr)
+    {
+        MOODYCAMEL_CONSTEXPR_IF(std::alignment_of<TAlign>::value <= std::alignment_of<details::max_align_t>::value)
+        return (Traits::free)(ptr);
+        else {
+            (Traits::free)(ptr ? *(reinterpret_cast<void**>(ptr) - 1) : nullptr);
+        }
+    }
+
+    template<typename U>
+    static inline U* create_array(size_t count)
+    {
+        assert(count > 0);
+        U* p = static_cast<U*>(aligned_malloc<U>(sizeof(U) * count));
+        if (p == nullptr) {
+            return nullptr;
+        }
+
+        for (size_t i = 0; i != count; ++i) {
+            new (p + i) U();
+        }
+        return p;
+    }
+
+    template<typename U>
+    static inline void destroy_array(U* p, size_t count)
+    {
+        if (p != nullptr) {
+            assert(count > 0);
+            for (size_t i = count; i != 0;) {
+                (p + --i)->~U();
+            }
+        }
+        aligned_free<U>(p);
+    }
+
+    template<typename U>
+    static inline U* create()
+    {
+        void* p = aligned_malloc<U>(sizeof(U));
+        return p != nullptr ? new (p) U : nullptr;
+    }
+
+    template<typename U, typename A1>
+    static inline U* create(A1&& a1)
+    {
+        void* p = aligned_malloc<U>(sizeof(U));
+        return p != nullptr ? new (p) U(std::forward<A1>(a1)) : nullptr;
+    }
+
+    template<typename U>
+    static inline void destroy(U* p)
+    {
+        if (p != nullptr) {
+            p->~U();
+        }
+        aligned_free<U>(p);
+    }
+
+private:
+    std::atomic<ProducerBase*> producerListTail;
+    std::atomic<std::uint32_t> producerCount;
+
+    std::atomic<size_t> initialBlockPoolIndex;
+    Block* initialBlockPool;
+    size_t initialBlockPoolSize;
+
+#ifndef MCDBGQ_USEDEBUGFREELIST
+    FreeList<Block> freeList;
+#else
+    debug::DebugFreeList<Block> freeList;
+#endif
+
+    std::atomic<ImplicitProducerHash*> implicitProducerHash;
+    std::atomic<size_t> implicitProducerHashCount;          // Number of slots logically used
+    ImplicitProducerHash initialImplicitProducerHash;
+    std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE> initialImplicitProducerHashEntries;
+    std::atomic_flag implicitProducerHashResizeInProgress;
+
+    std::atomic<std::uint32_t> nextExplicitConsumerId;
+    std::atomic<std::uint32_t> globalExplicitConsumerOffset;
+
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+    debug::DebugMutex implicitProdMutex;
+#endif
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+    std::atomic<ExplicitProducer*> explicitProducers;
+    std::atomic<ImplicitProducer*> implicitProducers;
+#endif
+};
+
+template<typename T, typename Traits>
+ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue)
+    : producer(queue.recycle_or_create_producer(true))
+{
+    if (producer != nullptr) {
+        producer->token = this;
+    }
+}
+
+template<typename T, typename Traits>
+ProducerToken::ProducerToken(BlockingConcurrentQueue<T, Traits>& queue)
+    : producer(reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->recycle_or_create_producer(true))
+{
+    if (producer != nullptr) {
+        producer->token = this;
+    }
+}
+
+template<typename T, typename Traits>
+ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits>& queue)
+    : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+{
+    initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
+    lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);
+}
+
+template<typename T, typename Traits>
+ConsumerToken::ConsumerToken(BlockingConcurrentQueue<T, Traits>& queue)
+    : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+{
+    initialOffset = reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
+    lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);
+}
+
+template<typename T, typename Traits>
+inline void swap(ConcurrentQueue<T, Traits>& a, ConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT
+{
+    a.swap(b);
+}
+
+inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT
+{
+    a.swap(b);
+}
+
+inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT
+{
+    a.swap(b);
+}
+
+template<typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a,
+                 typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT
+{
+    a.swap(b);
+}
+}
+
+#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17)
+#pragma warning(pop)
+#endif
+
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
+#pragma GCC diagnostic pop
+#endif
diff --git a/framework/audio/thirdparty/moodycamel/lightweightsemaphore.h b/framework/audio/thirdparty/moodycamel/lightweightsemaphore.h
new file mode 100644
index 0000000000..dcaf945bdf
--- /dev/null
+++ b/framework/audio/thirdparty/moodycamel/lightweightsemaphore.h
@@ -0,0 +1,436 @@
+// Provides an efficient implementation of a semaphore (LightweightSemaphore).
+// This is an extension of Jeff Preshing's sempahore implementation (licensed
+// under the terms of its separate zlib license) that has been adapted and
+// extended by Cameron Desrochers.
+
+#pragma once
+
+#include <cassert>
+#include <cstddef> // For std::size_t
+#include <atomic>
+#include <type_traits> // For std::make_signed<T>
+
+#if defined(_WIN32)
+// Avoid including windows.h in a header; we only need a handful of
+// items, so we'll redeclare them here (this is relatively safe since
+// the API generally has to remain stable between Windows versions).
+// I know this is an ugly hack but it still beats polluting the global
+// namespace with thousands of generic names or adding a .cpp for nothing.
+extern "C" {
+struct _SECURITY_ATTRIBUTES;
+__declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount,
+                                                       const wchar_t* lpName);
+__declspec(dllimport) int __stdcall CloseHandle(void* hObject);
+__declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds);
+__declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount);
+}
+#elif defined(__MACH__)
+#include <mach/mach.h>
+#elif defined(__MVS__)
+#include <zos-semaphore.h>
+#elif defined(__unix__)
+#include <semaphore.h>
+
+#if defined(__GLIBC_PREREQ) && defined(_GNU_SOURCE)
+#if __GLIBC_PREREQ(2, 30)
+#define MOODYCAMEL_LIGHTWEIGHTSEMAPHORE_MONOTONIC
+#endif
+#endif
+#endif
+
+namespace moodycamel {
+namespace details {
+// Code in the mpmc_sema namespace below is an adaptation of Jeff Preshing's
+// portable + lightweight semaphore implementations, originally from
+// https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h
+// LICENSE:
+// Copyright (c) 2015 Jeff Preshing
+//
+// This software is provided 'as-is', without any express or implied
+// warranty. In no event will the authors be held liable for any damages
+// arising from the use of this software.
+//
+// Permission is granted to anyone to use this software for any purpose,
+// including commercial applications, and to alter it and redistribute it
+// freely, subject to the following restrictions:
+//
+// 1. The origin of this software must not be misrepresented; you must not
+//	claim that you wrote the original software. If you use this software
+//	in a product, an acknowledgement in the product documentation would be
+//	appreciated but is not required.
+// 2. Altered source versions must be plainly marked as such, and must not be
+//	misrepresented as being the original software.
+// 3. This notice may not be removed or altered from any source distribution.
+#if defined(_WIN32)
+class Semaphore
+{
+private:
+    void* m_hSema;
+
+    Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
+    Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
+
+public:
+    Semaphore(int initialCount = 0)
+    {
+        assert(initialCount >= 0);
+        const long maxLong = 0x7fffffff;
+        m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr);
+        assert(m_hSema);
+    }
+
+    ~Semaphore()
+    {
+        CloseHandle(m_hSema);
+    }
+
+    bool wait()
+    {
+        const unsigned long infinite = 0xffffffff;
+        return WaitForSingleObject(m_hSema, infinite) == 0;
+    }
+
+    bool try_wait()
+    {
+        return WaitForSingleObject(m_hSema, 0) == 0;
+    }
+
+    bool timed_wait(std::uint64_t usecs)
+    {
+        return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) == 0;
+    }
+
+    void signal(int count = 1)
+    {
+        while (!ReleaseSemaphore(m_hSema, count, nullptr)) {}
+    }
+};
+#elif defined(__MACH__)
+//---------------------------------------------------------
+// Semaphore (Apple iOS and OSX)
+// Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html
+//---------------------------------------------------------
+class Semaphore
+{
+private:
+    semaphore_t m_sema;
+
+    Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
+    Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
+
+public:
+    Semaphore(int initialCount = 0)
+    {
+        assert(initialCount >= 0);
+        kern_return_t rc = semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount);
+        assert(rc == KERN_SUCCESS);
+        (void)rc;
+    }
+
+    ~Semaphore()
+    {
+        semaphore_destroy(mach_task_self(), m_sema);
+    }
+
+    bool wait()
+    {
+        return semaphore_wait(m_sema) == KERN_SUCCESS;
+    }
+
+    bool try_wait()
+    {
+        return timed_wait(0);
+    }
+
+    bool timed_wait(std::uint64_t timeout_usecs)
+    {
+        mach_timespec_t ts;
+        ts.tv_sec = static_cast<unsigned int>(timeout_usecs / 1000000);
+        ts.tv_nsec = static_cast<int>((timeout_usecs % 1000000) * 1000);
+
+        // added in OSX 10.10: https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html
+        kern_return_t rc = semaphore_timedwait(m_sema, ts);
+        return rc == KERN_SUCCESS;
+    }
+
+    void signal()
+    {
+        while (semaphore_signal(m_sema) != KERN_SUCCESS) {}
+    }
+
+    void signal(int count)
+    {
+        while (count-- > 0)
+        {
+            while (semaphore_signal(m_sema) != KERN_SUCCESS) {}
+        }
+    }
+};
+#elif defined(__unix__) || defined(__MVS__)
+//---------------------------------------------------------
+// Semaphore (POSIX, Linux, zOS)
+//---------------------------------------------------------
+class Semaphore
+{
+private:
+    sem_t m_sema;
+
+    Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
+    Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
+
+public:
+    Semaphore(int initialCount = 0)
+    {
+        assert(initialCount >= 0);
+        int rc = sem_init(&m_sema, 0, static_cast<unsigned int>(initialCount));
+        assert(rc == 0);
+        (void)rc;
+    }
+
+    ~Semaphore()
+    {
+        sem_destroy(&m_sema);
+    }
+
+    bool wait()
+    {
+        // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error
+        int rc;
+        do {
+            rc = sem_wait(&m_sema);
+        } while (rc == -1 && errno == EINTR);
+        return rc == 0;
+    }
+
+    bool try_wait()
+    {
+        int rc;
+        do {
+            rc = sem_trywait(&m_sema);
+        } while (rc == -1 && errno == EINTR);
+        return rc == 0;
+    }
+
+    bool timed_wait(std::uint64_t usecs)
+    {
+        struct timespec ts;
+        const int usecs_in_1_sec = 1000000;
+        const int nsecs_in_1_sec = 1000000000;
+#ifdef MOODYCAMEL_LIGHTWEIGHTSEMAPHORE_MONOTONIC
+        clock_gettime(CLOCK_MONOTONIC, &ts);
+#else
+        clock_gettime(CLOCK_REALTIME, &ts);
+#endif
+        ts.tv_sec += (time_t)(usecs / usecs_in_1_sec);
+        ts.tv_nsec += (long)(usecs % usecs_in_1_sec) * 1000;
+        // sem_timedwait bombs if you have more than 1e9 in tv_nsec
+        // so we have to clean things up before passing it in
+        if (ts.tv_nsec >= nsecs_in_1_sec) {
+            ts.tv_nsec -= nsecs_in_1_sec;
+            ++ts.tv_sec;
+        }
+
+        int rc;
+        do {
+#ifdef MOODYCAMEL_LIGHTWEIGHTSEMAPHORE_MONOTONIC
+            rc = sem_clockwait(&m_sema, CLOCK_MONOTONIC, &ts);
+#else
+            rc = sem_timedwait(&m_sema, &ts);
+#endif
+        } while (rc == -1 && errno == EINTR);
+        return rc == 0;
+    }
+
+    void signal()
+    {
+        while (sem_post(&m_sema) == -1) {}
+    }
+
+    void signal(int count)
+    {
+        while (count-- > 0)
+        {
+            while (sem_post(&m_sema) == -1) {}
+        }
+    }
+};
+#else
+#error Unsupported platform! (No semaphore wrapper available)
+#endif
+}     // end namespace details
+
+//---------------------------------------------------------
+// LightweightSemaphore
+//---------------------------------------------------------
+class LightweightSemaphore
+{
+public:
+    typedef std::make_signed<std::size_t>::type ssize_t;
+
+private:
+    std::atomic<ssize_t> m_count;
+    details::Semaphore m_sema;
+    int m_maxSpins;
+
+    bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1)
+    {
+        ssize_t oldCount;
+        int spin = m_maxSpins;
+        while (--spin >= 0)
+        {
+            oldCount = m_count.load(std::memory_order_relaxed);
+            if ((oldCount > 0) && m_count.compare_exchange_strong(oldCount, oldCount - 1, std::memory_order_acquire,
+                                                                  std::memory_order_relaxed)) {
+                return true;
+            }
+            std::atomic_signal_fence(std::memory_order_acquire);         // Prevent the compiler from collapsing the loop.
+        }
+        oldCount = m_count.fetch_sub(1, std::memory_order_acquire);
+        if (oldCount > 0) {
+            return true;
+        }
+        if (timeout_usecs < 0) {
+            if (m_sema.wait()) {
+                return true;
+            }
+        }
+        if (timeout_usecs > 0 && m_sema.timed_wait((std::uint64_t)timeout_usecs)) {
+            return true;
+        }
+        // At this point, we've timed out waiting for the semaphore, but the
+        // count is still decremented indicating we may still be waiting on
+        // it. So we have to re-adjust the count, but only if the semaphore
+        // wasn't signaled enough times for us too since then. If it was, we
+        // need to release the semaphore too.
+        while (true)
+        {
+            oldCount = m_count.load(std::memory_order_acquire);
+            if (oldCount >= 0 && m_sema.try_wait()) {
+                return true;
+            }
+            if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed,
+                                                                std::memory_order_relaxed)) {
+                return false;
+            }
+        }
+    }
+
+    ssize_t waitManyWithPartialSpinning(ssize_t max, std::int64_t timeout_usecs = -1)
+    {
+        assert(max > 0);
+        ssize_t oldCount;
+        int spin = m_maxSpins;
+        while (--spin >= 0)
+        {
+            oldCount = m_count.load(std::memory_order_relaxed);
+            if (oldCount > 0) {
+                ssize_t newCount = oldCount > max ? oldCount - max : 0;
+                if (m_count.compare_exchange_strong(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed)) {
+                    return oldCount - newCount;
+                }
+            }
+            std::atomic_signal_fence(std::memory_order_acquire);
+        }
+        oldCount = m_count.fetch_sub(1, std::memory_order_acquire);
+        if (oldCount <= 0) {
+            if ((timeout_usecs == 0) || (timeout_usecs < 0 && !m_sema.wait())
+                || (timeout_usecs > 0 && !m_sema.timed_wait((std::uint64_t)timeout_usecs))) {
+                while (true)
+                {
+                    oldCount = m_count.load(std::memory_order_acquire);
+                    if (oldCount >= 0 && m_sema.try_wait()) {
+                        break;
+                    }
+                    if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed,
+                                                                        std::memory_order_relaxed)) {
+                        return 0;
+                    }
+                }
+            }
+        }
+        if (max > 1) {
+            return 1 + tryWaitMany(max - 1);
+        }
+        return 1;
+    }
+
+public:
+    LightweightSemaphore(ssize_t initialCount = 0, int maxSpins = 10000)
+        : m_count(initialCount), m_maxSpins(maxSpins)
+    {
+        assert(initialCount >= 0);
+        assert(maxSpins >= 0);
+    }
+
+    bool tryWait()
+    {
+        ssize_t oldCount = m_count.load(std::memory_order_relaxed);
+        while (oldCount > 0)
+        {
+            if (m_count.compare_exchange_weak(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    bool wait()
+    {
+        return tryWait() || waitWithPartialSpinning();
+    }
+
+    bool wait(std::int64_t timeout_usecs)
+    {
+        return tryWait() || waitWithPartialSpinning(timeout_usecs);
+    }
+
+    // Acquires between 0 and (greedily) max, inclusive
+    ssize_t tryWaitMany(ssize_t max)
+    {
+        assert(max >= 0);
+        ssize_t oldCount = m_count.load(std::memory_order_relaxed);
+        while (oldCount > 0)
+        {
+            ssize_t newCount = oldCount > max ? oldCount - max : 0;
+            if (m_count.compare_exchange_weak(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed)) {
+                return oldCount - newCount;
+            }
+        }
+        return 0;
+    }
+
+    // Acquires at least one, and (greedily) at most max
+    ssize_t waitMany(ssize_t max, std::int64_t timeout_usecs)
+    {
+        assert(max >= 0);
+        ssize_t result = tryWaitMany(max);
+        if (result == 0 && max > 0) {
+            result = waitManyWithPartialSpinning(max, timeout_usecs);
+        }
+        return result;
+    }
+
+    ssize_t waitMany(ssize_t max)
+    {
+        ssize_t result = waitMany(max, -1);
+        assert(result > 0);
+        return result;
+    }
+
+    void signal(ssize_t count = 1)
+    {
+        assert(count >= 0);
+        ssize_t oldCount = m_count.fetch_add(count, std::memory_order_release);
+        ssize_t toRelease = -oldCount < count ? -oldCount : count;
+        if (toRelease > 0) {
+            m_sema.signal((int)toRelease);
+        }
+    }
+
+    std::size_t availableApprox() const
+    {
+        ssize_t count = m_count.load(std::memory_order_relaxed);
+        return count > 0 ? static_cast<std::size_t>(count) : 0;
+    }
+};
+}   // end namespace moodycamel
diff --git a/framework/global/CMakeLists.txt b/framework/global/CMakeLists.txt
index be1bc7363a..2b1e27f7de 100644
--- a/framework/global/CMakeLists.txt
+++ b/framework/global/CMakeLists.txt
@@ -40,6 +40,9 @@ target_sources(muse_global PRIVATE
     logremover.cpp
     logremover.h
     profiler.h
+    signpost.h
+    functional/inplace_function.h
+    functional/inplace_function_mv.h
     dataformatter.cpp
     dataformatter.h
     stringutils.cpp
@@ -72,6 +75,8 @@ target_sources(muse_global PRIVATE
     ticker.cpp
     ticker.h
 
+    thirdparty/sg14/inplace_function.h
+
     ${KORS_MODULARITY_SRC}
     modularity/ioccontext.cpp
     modularity/ioc.h
diff --git a/framework/global/functional/inplace_function.h b/framework/global/functional/inplace_function.h
new file mode 100644
index 0000000000..d891d93052
--- /dev/null
+++ b/framework/global/functional/inplace_function.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include "global/thirdparty/sg14/inplace_function.h"
+
+namespace muse::functional {
+template<class Signature, size_t Capacity = stdext::inplace_function_detail::InplaceFunctionDefaultCapacity,
+         size_t Alignment = alignof(stdext::inplace_function_detail::aligned_storage_t<Capacity>)>
+using inplace_function = stdext::inplace_function<Signature, Capacity, Alignment>;
+}
diff --git a/framework/global/functional/inplace_function_mv.h b/framework/global/functional/inplace_function_mv.h
new file mode 100644
index 0000000000..58fe8c67f2
--- /dev/null
+++ b/framework/global/functional/inplace_function_mv.h
@@ -0,0 +1,215 @@
+/*
+ * SPDX-License-Identifier: GPL-3.0-only
+ * MuseScore-CLA-applies
+ *
+ * MuseScore
+ * Music Composition & Notation
+ *
+ * Copyright (C) 2026 MuseScore Limited and others
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include <cstddef>
+#include <functional>
+#include <new>
+#include <type_traits>
+#include <utility>
+
+namespace muse::functional {
+template<typename Signature, size_t Capacity = 32, size_t Alignment = alignof(std::max_align_t)>
+class MoveOnlyInplaceFunction;
+
+namespace detail {
+template<typename>
+struct IsMoveOnlyInplaceFunction : std::false_type {};
+
+template<typename Signature, size_t Capacity, size_t Alignment>
+struct IsMoveOnlyInplaceFunction<MoveOnlyInplaceFunction<Signature, Capacity, Alignment> > : std::true_type {};
+
+template<typename Ret, typename ... Args>
+struct MoveOnlyInplaceFunctionVTable
+{
+    using Invoke = Ret (*)(void*, Args&&...);
+    using Relocate = void (*)(void*, void*) noexcept;
+    using Destroy = void (*)(void*) noexcept;
+
+    Invoke invoke = nullptr;
+    Relocate relocate = nullptr;
+    Destroy destroy = nullptr;
+};
+
+template<typename Callable, typename Ret, typename ... Args>
+Ret invokeMoveOnlyInplaceFunction(void* storage, Args&&... args)
+{
+    if constexpr (std::is_void_v<Ret>) {
+        (*static_cast<Callable*>(storage))(std::forward<Args>(args)...);
+    } else {
+        return (*static_cast<Callable*>(storage))(std::forward<Args>(args)...);
+    }
+}
+
+template<typename Callable>
+void relocateMoveOnlyInplaceFunction(void* destination, void* source) noexcept
+{
+    auto* callable = static_cast<Callable*>(source);
+    new (destination) Callable(std::move(*callable));
+    callable->~Callable();
+}
+
+template<typename Callable>
+void destroyMoveOnlyInplaceFunction(void* storage) noexcept
+{
+    static_cast<Callable*>(storage)->~Callable();
+}
+
+template<typename Callable, typename Ret, typename ... Args>
+const MoveOnlyInplaceFunctionVTable<Ret, Args...>* moveOnlyInplaceFunctionVTable() noexcept
+{
+    static const MoveOnlyInplaceFunctionVTable<Ret, Args...> vtable {
+        &invokeMoveOnlyInplaceFunction<Callable, Ret, Args...>,
+        &relocateMoveOnlyInplaceFunction<Callable>,
+        &destroyMoveOnlyInplaceFunction<Callable>
+    };
+
+    return &vtable;
+}
+} // namespace detail
+
+template<typename Ret, typename ... Args, size_t Capacity, size_t Alignment>
+class MoveOnlyInplaceFunction<Ret(Args...), Capacity, Alignment>
+{
+public:
+    MoveOnlyInplaceFunction() noexcept = default;
+
+    MoveOnlyInplaceFunction(std::nullptr_t) noexcept {}
+
+    MoveOnlyInplaceFunction(const MoveOnlyInplaceFunction&) = delete;
+    MoveOnlyInplaceFunction& operator=(const MoveOnlyInplaceFunction&) = delete;
+
+    MoveOnlyInplaceFunction(MoveOnlyInplaceFunction&& other) noexcept
+    {
+        moveFrom(std::move(other));
+    }
+
+    MoveOnlyInplaceFunction& operator=(MoveOnlyInplaceFunction&& other) noexcept
+    {
+        if (this != &other) {
+            reset();
+            moveFrom(std::move(other));
+        }
+
+        return *this;
+    }
+
+    template<typename Callable,
+             typename Fn = std::decay_t<Callable>,
+             std::enable_if_t<!detail::IsMoveOnlyInplaceFunction<Fn>::value
+                              && std::is_invocable_r_v<Ret, Fn&, Args...>, int> = 0>
+    MoveOnlyInplaceFunction(Callable&& callable)
+    {
+        static_assert(sizeof(Fn) <= Capacity, "Callable is too large for this MoveOnlyInplaceFunction");
+        static_assert(alignof(Fn) <= Alignment, "Callable alignment is too large for this MoveOnlyInplaceFunction");
+        static_assert(std::is_nothrow_move_constructible_v<Fn>,
+                      "MoveOnlyInplaceFunction requires nothrow move-constructible callables");
+
+        m_vtable = detail::moveOnlyInplaceFunctionVTable<Fn, Ret, Args...>();
+        new (storage()) Fn(std::forward<Callable>(callable));
+    }
+
+    template<typename Callable,
+             typename Fn = std::decay_t<Callable>,
+             std::enable_if_t<!detail::IsMoveOnlyInplaceFunction<Fn>::value
+                              && std::is_invocable_r_v<Ret, Fn&, Args...>, int> = 0>
+    MoveOnlyInplaceFunction& operator=(Callable&& callable)
+    {
+        *this = MoveOnlyInplaceFunction(std::forward<Callable>(callable));
+        return *this;
+    }
+
+    MoveOnlyInplaceFunction& operator=(std::nullptr_t) noexcept
+    {
+        reset();
+        return *this;
+    }
+
+    ~MoveOnlyInplaceFunction() noexcept
+    {
+        reset();
+    }
+
+    Ret operator()(Args... args) const
+    {
+        if (!m_vtable) {
+            throw std::bad_function_call();
+        }
+
+        if constexpr (std::is_void_v<Ret>) {
+            m_vtable->invoke(storage(), std::forward<Args>(args)...);
+        } else {
+            return m_vtable->invoke(storage(), std::forward<Args>(args)...);
+        }
+    }
+
+    explicit operator bool() const noexcept
+    {
+        return m_vtable != nullptr;
+    }
+
+    void reset() noexcept
+    {
+        if (!m_vtable) {
+            return;
+        }
+
+        m_vtable->destroy(storage());
+        m_vtable = nullptr;
+    }
+
+private:
+    using VTable = detail::MoveOnlyInplaceFunctionVTable<Ret, Args...>;
+    using Storage = std::aligned_storage_t<Capacity, Alignment>;
+
+    void* storage() const noexcept
+    {
+        return const_cast<Storage*>(&m_storage);
+    }
+
+    void moveFrom(MoveOnlyInplaceFunction&& other) noexcept
+    {
+        if (!other.m_vtable) {
+            return;
+        }
+
+        m_vtable = other.m_vtable;
+        m_vtable->relocate(storage(), other.storage());
+        other.m_vtable = nullptr;
+    }
+
+    const VTable* m_vtable = nullptr;
+    mutable Storage m_storage {};
+};
+
+template<typename Signature, size_t Capacity, size_t Alignment>
+bool operator==(const MoveOnlyInplaceFunction<Signature, Capacity, Alignment>& function, std::nullptr_t) noexcept
+{
+    return !function;
+}
+
+template<typename Signature, size_t Capacity, size_t Alignment>
+bool operator!=(const MoveOnlyInplaceFunction<Signature, Capacity, Alignment>& function, std::nullptr_t) noexcept
+{
+    return bool(function);
+}
+} // namespace muse::functional
diff --git a/framework/global/signpost.h b/framework/global/signpost.h
new file mode 100644
index 0000000000..d0ee0edd89
--- /dev/null
+++ b/framework/global/signpost.h
@@ -0,0 +1,46 @@
+#pragma once
+#ifdef __APPLE__
+#include <os/signpost.h>
+
+#include "global/defer.h"
+#include "muse_framework_config.h"
+
+#define MSS_SIGNPOST_CONCAT_IMPL(x, y) x##y
+#define MSS_SIGNPOST_CONCAT(x, y) MSS_SIGNPOST_CONCAT_IMPL(x, y)
+
+// Call this macro in your source file (at function or namespace scope) to initialize the signpost log.
+#define MSS_SIGNPOST_PREPARE \
+    static os_log_t sn_log = os_log_create(MUSE_APP_NAME_MACHINE_READABLE, OS_LOG_CATEGORY_POINTS_OF_INTEREST);
+
+// Call these macros to begin and end a signpost interval
+#define MSS_SIGNPOST_BEGIN(name)                \
+    auto spid = os_signpost_id_generate(sn_log); \
+    os_signpost_interval_begin(sn_log, spid, name);
+
+#define MSS_SIGNPOST_END(name) os_signpost_interval_end(sn_log, spid, name);
+#define MSS_SIGNPOST_SCOPE(name) MSS_SIGNPOST_SCOPE_IMPL(name, __COUNTER__)
+#define MSS_SIGNPOST_SCOPE_IMPL(name, id)                                       \
+    auto MSS_SIGNPOST_CONCAT(spid_, id) = os_signpost_id_generate(sn_log);      \
+    os_signpost_interval_begin(sn_log, MSS_SIGNPOST_CONCAT(spid_, id), name);   \
+    muse::Defer MSS_SIGNPOST_CONCAT(g_, id)([spid = MSS_SIGNPOST_CONCAT(spid_, id)] { \
+        os_signpost_interval_end(sn_log, spid, name);                           \
+    });
+
+#define MSS_SIGNPOST_FUNCTION MSS_SIGNPOST_FUNCTION_IMPL(__COUNTER__)
+#define MSS_SIGNPOST_FUNCTION_IMPL(id)                                                        \
+    auto MSS_SIGNPOST_CONCAT(spid_, id) = os_signpost_id_generate(sn_log);                     \
+    const char* MSS_SIGNPOST_CONCAT(function_, id) = __PRETTY_FUNCTION__;                      \
+    os_signpost_interval_begin(sn_log, MSS_SIGNPOST_CONCAT(spid_, id), "Function", "%s",      \
+                               MSS_SIGNPOST_CONCAT(function_, id));                            \
+    muse::Defer MSS_SIGNPOST_CONCAT(g_, id)([spid = MSS_SIGNPOST_CONCAT(spid_, id),            \
+                                             function = MSS_SIGNPOST_CONCAT(function_, id)] {   \
+        os_signpost_interval_end(sn_log, spid, "Function", "%s", function);                   \
+    });
+
+#else  // !__APPLE__
+#define MSS_SIGNPOST_PREPARE
+#define MSS_SIGNPOST_BEGIN(name)
+#define MSS_SIGNPOST_END(name)
+#define MSS_SIGNPOST_SCOPE(name)
+#define MSS_SIGNPOST_FUNCTION
+#endif
diff --git a/framework/global/tests/ringqueue_tests.cpp b/framework/global/tests/ringqueue_tests.cpp
index 055f0e608c..3754a05908 100644
--- a/framework/global/tests/ringqueue_tests.cpp
+++ b/framework/global/tests/ringqueue_tests.cpp
@@ -70,6 +70,10 @@ TEST_F(Global_Concurrency_RingQueueTests, FixedSizeQueue)
             if (successCount == 10) {
                 break;
             }
+            if (successCount == 0) {
+                // simple wait until the producer has added its first item
+                std::this_thread::sleep_for(std::chrono::milliseconds(1));
+            }
         }
 
         EXPECT_EQ(successCount, 10);
diff --git a/framework/global/thirdparty/sg14/inplace_function.h b/framework/global/thirdparty/sg14/inplace_function.h
new file mode 100644
index 0000000000..5066d3a246
--- /dev/null
+++ b/framework/global/thirdparty/sg14/inplace_function.h
@@ -0,0 +1,387 @@
+/*
+ * Boost Software License - Version 1.0 - August 17th, 2003
+ *
+ * Permission is hereby granted, free of charge, to any person or organization
+ * obtaining a copy of the software and accompanying documentation covered by
+ * this license (the "Software") to use, reproduce, display, distribute,
+ * execute, and transmit the Software, and to prepare derivative works of the
+ * Software, and to permit third-parties to whom the Software is furnished to
+ * do so, all subject to the following:
+ *
+ * The copyright notices in the Software and this entire statement, including
+ * the above license grant, this restriction and the following disclaimer,
+ * must be included in all copies of the Software, in whole or in part, and
+ * all derivative works of the Software, unless such copies or derivative
+ * works are solely in the form of machine-executable object code generated by
+ * a source language processor.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+ * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+ * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <type_traits>
+#include <utility>
+#include <functional>
+
+#ifndef SG14_INPLACE_FUNCTION_THROW
+#define SG14_INPLACE_FUNCTION_THROW(x) throw (x)
+#endif
+
+namespace stdext {
+namespace inplace_function_detail {
+static constexpr size_t InplaceFunctionDefaultCapacity = 32;
+
+#ifndef SG14_USE_STD_ALIGNED_STORAGE
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61458
+// MSVC 32-bit has the same bug.
+// libc++ and MSVC 64-bit seem to work fine right now, but why run the risk?
+template<size_t Cap>
+union aligned_storage_helper {
+    struct double1 {
+        double a;
+    };
+    struct double4 {
+        double a[4];
+    };
+    template<class T> using maybe = std::conditional_t<(Cap >= sizeof(T)), T, char>;
+    char real_data[Cap];
+    maybe<int> a;
+    maybe<long> b;
+    maybe<long long> c;
+    maybe<void*> d;
+    maybe<void (*)()> e;
+    maybe<double1> f;
+    maybe<double4> g;
+    maybe<long double> h;
+};
+
+template<size_t Cap, size_t Align = alignof(aligned_storage_helper<Cap>)>
+struct aligned_storage {
+    using type = std::aligned_storage_t<Cap, Align>;
+};
+
+template<size_t Cap, size_t Align = alignof(aligned_storage_helper<Cap>)>
+using aligned_storage_t = typename aligned_storage<Cap, Align>::type;
+static_assert(sizeof(aligned_storage_t<sizeof(void*)>) == sizeof(void*), "A");
+static_assert(alignof(aligned_storage_t<sizeof(void*)>) == alignof(void*), "B");
+#else
+using std::aligned_storage;
+using std::aligned_storage_t;
+static_assert(sizeof(std::aligned_storage_t<sizeof(void*)>) == sizeof(void*), "C");
+static_assert(alignof(std::aligned_storage_t<sizeof(void*)>) == alignof(void*), "D");
+#endif
+
+template<class T> struct wrapper
+{
+    using type = T;
+};
+
+template<class R, class ... Args> struct vtable
+{
+    using storage_ptr_t = void*;
+
+    using invoke_ptr_t = R (*)(storage_ptr_t, Args&&...);
+    using process_ptr_t = void (*)(storage_ptr_t, storage_ptr_t);
+    using destructor_ptr_t = void (*)(storage_ptr_t);
+
+    const invoke_ptr_t invoke_ptr;
+    const process_ptr_t copy_ptr;
+    const process_ptr_t relocate_ptr;
+    const destructor_ptr_t destructor_ptr;
+
+    explicit constexpr vtable() noexcept
+        : invoke_ptr{[](storage_ptr_t, Args&&...) -> R
+        { SG14_INPLACE_FUNCTION_THROW(std::bad_function_call()); }
+                     },
+        copy_ptr{ [](storage_ptr_t, storage_ptr_t) -> void {} },
+    relocate_ptr{ [](storage_ptr_t, storage_ptr_t) -> void {} },
+    destructor_ptr{ [](storage_ptr_t) -> void {} }
+    {}
+
+    template<class C> explicit constexpr vtable(wrapper<C>) noexcept
+        : invoke_ptr{[](storage_ptr_t storage_ptr, Args&&... args) -> R
+        {
+            return (*static_cast<C*>(storage_ptr))(
+                static_cast<Args &&>(args)...
+                );
+        }
+                     },
+        copy_ptr{ [](storage_ptr_t dst_ptr, storage_ptr_t src_ptr) -> void
+        { ::new (dst_ptr) C{ (*static_cast<C*>(src_ptr)) }; }
+    },
+    relocate_ptr{ [](storage_ptr_t dst_ptr, storage_ptr_t src_ptr) -> void
+        {
+            ::new (dst_ptr) C{ std::move(*static_cast<C*>(src_ptr)) };
+            static_cast<C*>(src_ptr)->~C();
+        }
+    },
+    destructor_ptr{ [](storage_ptr_t src_ptr) -> void
+        { static_cast<C*>(src_ptr)->~C(); }
+    }
+    {}
+
+    vtable(const vtable&) = delete;
+    vtable(vtable&&) = delete;
+
+    vtable& operator=(const vtable&) = delete;
+    vtable& operator=(vtable&&) = delete;
+
+    ~vtable() = default;
+};
+
+template<class R, class ... Args>
+#if __cplusplus >= 201703L
+inline constexpr
+#endif
+vtable<R, Args...> empty_vtable{};
+
+template<size_t DstCap, size_t DstAlign, size_t SrcCap, size_t SrcAlign>
+struct is_valid_inplace_dst : std::true_type
+{
+    static_assert(DstCap >= SrcCap,
+                  "Can't squeeze larger inplace_function into a smaller one"
+                  );
+
+    static_assert(DstAlign % SrcAlign == 0,
+                  "Incompatible inplace_function alignments"
+                  );
+};
+
+// C++11 MSVC compatible implementation of std::is_invocable_r.
+
+template<class R> void accept(R);
+
+template<class, class R, class F, class ... Args> struct is_invocable_r_impl : std::false_type {};
+
+template<class F, class ... Args> struct is_invocable_r_impl<
+    decltype(std::declval<F>()(std::declval<Args>()...), void()),
+    void,
+    F,
+    Args...
+    > : std::true_type {};
+
+template<class F, class ... Args> struct is_invocable_r_impl<
+    decltype(std::declval<F>()(std::declval<Args>()...), void()),
+    const void,
+    F,
+    Args...
+    > : std::true_type {};
+
+template<class R, class F, class ... Args> struct is_invocable_r_impl<
+    decltype(accept<R>(std::declval<F>()(std::declval<Args>()...))),
+    R,
+    F,
+    Args...
+    > : std::true_type {};
+
+template<class R, class F, class ... Args> using is_invocable_r = is_invocable_r_impl<
+    void,
+    R,
+    F,
+    Args...
+    >;
+} // namespace inplace_function_detail
+
+template<
+    class Signature,
+    size_t Capacity = inplace_function_detail::InplaceFunctionDefaultCapacity,
+    size_t Alignment = alignof(inplace_function_detail::aligned_storage_t<Capacity>)
+    >
+class inplace_function; // unspecified
+
+namespace inplace_function_detail {
+template<class> struct is_inplace_function : std::false_type {};
+template<class Sig, size_t Cap, size_t Align>
+struct is_inplace_function<inplace_function<Sig, Cap, Align> > : std::true_type {};
+} // namespace inplace_function_detail
+
+template<
+    class R,
+    class ... Args,
+    size_t Capacity,
+    size_t Alignment
+    >
+class inplace_function<R(Args...), Capacity, Alignment>
+{
+    using storage_t = inplace_function_detail::aligned_storage_t<Capacity, Alignment>;
+    using vtable_t = inplace_function_detail::vtable<R, Args...>;
+    using vtable_ptr_t = const vtable_t*;
+
+    template<class, size_t, size_t> friend class inplace_function;
+
+public:
+    using capacity = std::integral_constant<size_t, Capacity>;
+    using alignment = std::integral_constant<size_t, Alignment>;
+
+    inplace_function() noexcept
+        : vtable_ptr_{std::addressof(inplace_function_detail::empty_vtable<R, Args...>)}
+    {}
+
+    template<
+        class T,
+        class C = std::decay_t<T>,
+        class = std::enable_if_t<
+            !inplace_function_detail::is_inplace_function<C>::value
+            && inplace_function_detail::is_invocable_r<R, C&, Args...>::value
+            >
+        >
+    inplace_function(T&& closure)
+    {
+        static_assert(std::is_copy_constructible<C>::value,
+                      "inplace_function cannot be constructed from non-copyable type"
+                      );
+
+        static_assert(sizeof(C) <= Capacity,
+                      "inplace_function cannot be constructed from object with this (large) size"
+                      );
+
+        static_assert(Alignment % alignof(C) == 0,
+                      "inplace_function cannot be constructed from object with this (large) alignment"
+                      );
+
+        static const vtable_t vt{ inplace_function_detail::wrapper<C> {} };
+        vtable_ptr_ = std::addressof(vt);
+
+        ::new (std::addressof(storage_)) C{ std::forward<T>(closure) };
+    }
+
+    template<size_t Cap, size_t Align>
+    inplace_function(const inplace_function<R(Args...), Cap, Align>& other)
+        : inplace_function(other.vtable_ptr_, other.vtable_ptr_->copy_ptr, std::addressof(other.storage_))
+    {
+        static_assert(inplace_function_detail::is_valid_inplace_dst<
+                          Capacity, Alignment, Cap, Align
+                          >::value, "conversion not allowed");
+    }
+
+    template<size_t Cap, size_t Align>
+    inplace_function(inplace_function<R(Args...), Cap, Align>&& other) noexcept
+        : inplace_function(other.vtable_ptr_, other.vtable_ptr_->relocate_ptr, std::addressof(other.storage_))
+    {
+        static_assert(inplace_function_detail::is_valid_inplace_dst<
+                          Capacity, Alignment, Cap, Align
+                          >::value, "conversion not allowed");
+
+        other.vtable_ptr_ = std::addressof(inplace_function_detail::empty_vtable<R, Args...>);
+    }
+
+    inplace_function(std::nullptr_t) noexcept
+        : vtable_ptr_{std::addressof(inplace_function_detail::empty_vtable<R, Args...>)}
+    {}
+
+    inplace_function(const inplace_function& other)
+        : vtable_ptr_{other.vtable_ptr_}
+    {
+        vtable_ptr_->copy_ptr(
+            std::addressof(storage_),
+            std::addressof(other.storage_)
+            );
+    }
+
+    inplace_function(inplace_function&& other) noexcept
+        : vtable_ptr_{std::exchange(other.vtable_ptr_, std::addressof(inplace_function_detail::empty_vtable<R, Args...>))}
+    {
+        vtable_ptr_->relocate_ptr(
+            std::addressof(storage_),
+            std::addressof(other.storage_)
+            );
+    }
+
+    inplace_function& operator=(std::nullptr_t) noexcept
+    {
+        vtable_ptr_->destructor_ptr(std::addressof(storage_));
+        vtable_ptr_ = std::addressof(inplace_function_detail::empty_vtable<R, Args...>);
+        return *this;
+    }
+
+    inplace_function& operator=(inplace_function other) noexcept
+    {
+        vtable_ptr_->destructor_ptr(std::addressof(storage_));
+
+        vtable_ptr_ = std::exchange(other.vtable_ptr_, std::addressof(inplace_function_detail::empty_vtable<R, Args...>));
+        vtable_ptr_->relocate_ptr(
+            std::addressof(storage_),
+            std::addressof(other.storage_)
+            );
+        return *this;
+    }
+
+    ~inplace_function()
+    {
+        vtable_ptr_->destructor_ptr(std::addressof(storage_));
+    }
+
+    R operator()(Args... args) const
+    {
+        return vtable_ptr_->invoke_ptr(
+            std::addressof(storage_),
+            std::forward<Args>(args)...
+            );
+    }
+
+    constexpr bool operator==(std::nullptr_t) const noexcept
+    {
+        return !operator bool();
+    }
+
+    constexpr bool operator!=(std::nullptr_t) const noexcept
+    {
+        return operator bool();
+    }
+
+    explicit constexpr operator bool() const noexcept
+    {
+        return vtable_ptr_ != std::addressof(inplace_function_detail::empty_vtable<R, Args...>);
+    }
+
+    void swap(inplace_function& other) noexcept
+    {
+        if (this == std::addressof(other)) {
+            return;
+        }
+
+        storage_t tmp;
+        vtable_ptr_->relocate_ptr(
+            std::addressof(tmp),
+            std::addressof(storage_)
+            );
+
+        other.vtable_ptr_->relocate_ptr(
+            std::addressof(storage_),
+            std::addressof(other.storage_)
+            );
+
+        vtable_ptr_->relocate_ptr(
+            std::addressof(other.storage_),
+            std::addressof(tmp)
+            );
+
+        std::swap(vtable_ptr_, other.vtable_ptr_);
+    }
+
+    friend void swap(inplace_function& lhs, inplace_function& rhs) noexcept
+    {
+        lhs.swap(rhs);
+    }
+
+private:
+    vtable_ptr_t vtable_ptr_;
+    mutable storage_t storage_;
+
+    inplace_function(
+        vtable_ptr_t vtable_ptr,
+        typename vtable_t::process_ptr_t process_ptr,
+        typename vtable_t::storage_ptr_t storage_ptr)
+        : vtable_ptr_{vtable_ptr}
+    {
+        process_ptr(std::addressof(storage_), storage_ptr);
+    }
+};
+} // namespace stdext
diff --git a/framework/stubs/audio/CMakeLists.txt b/framework/stubs/audio/CMakeLists.txt
index 837d3ff9cd..90eea43010 100644
--- a/framework/stubs/audio/CMakeLists.txt
+++ b/framework/stubs/audio/CMakeLists.txt
@@ -37,4 +37,5 @@ target_sources(muse_audio PRIVATE
     audiodriverstub.h
     audiodrivercontrollerstub.cpp
     audiodrivercontrollerstub.h
+    audioworkgroupstub.cpp
 )
diff --git a/framework/stubs/audio/audioworkgroupstub.cpp b/framework/stubs/audio/audioworkgroupstub.cpp
new file mode 100644
index 0000000000..aadcba5a12
--- /dev/null
+++ b/framework/stubs/audio/audioworkgroupstub.cpp
@@ -0,0 +1,70 @@
+/*
+ * SPDX-License-Identifier: GPL-3.0-only
+ * MuseScore-CLA-applies
+ *
+ * MuseScore
+ * Music Composition & Notation
+ *
+ * Copyright (C) 2026 MuseScore Limited and others
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include "audio/common/audioworkgroup.h"
+
+#include <memory>
+#include <thread>
+#include <utility>
+
+namespace muse::audio {
+class AudioWorkgroupProvider
+{
+};
+
+AudioWorkGroup::AudioWorkGroup() = default;
+
+AudioWorkGroup::AudioWorkGroup(std::unique_ptr<AudioWorkgroupProvider> provider)
+    : m_provider(std::move(provider)) {}
+
+AudioWorkGroup::~AudioWorkGroup() = default;
+
+AudioWorkGroup::AudioWorkGroup(const AudioWorkGroup& other)
+    : m_provider(other.m_provider ? std::make_unique<AudioWorkgroupProvider>(*other.m_provider) : nullptr) {}
+
+AudioWorkGroup::AudioWorkGroup(AudioWorkGroup&& other) noexcept = default;
+
+AudioWorkGroup& AudioWorkGroup::operator=(const AudioWorkGroup& other)
+{
+    if (this != &other) {
+        m_provider = other.m_provider ? std::make_unique<AudioWorkgroupProvider>(*other.m_provider) : nullptr;
+    }
+    return *this;
+}
+
+AudioWorkGroup& AudioWorkGroup::operator=(AudioWorkGroup&& other) noexcept = default;
+
+bool AudioWorkGroup::join(AudioWorkgroupToken&)
+{
+    return false;
+}
+
+size_t AudioWorkGroup::getMaxParallelThreadCount() const
+{
+    return std::thread::hardware_concurrency();
+}
+
+AudioWorkGroup makeAudioWorkgroup(void*)
+{
+    return {};
+}
+} // namespace muse::audio