From 657bc676830d56ba19ba2a73d7d151fed60d2a09 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Thu, 5 Feb 2026 16:12:25 +0000 Subject: [PATCH 1/9] Add perf microbenchmark for post-teardown frees (issue #809) - add perf test that compares baseline frees vs frees after ThreadAlloc teardown - cover atexit/static destruction scenario to repro post-finalisation slowdown --- src/test/perf/post_teardown/post-teardown.cc | 50 ++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 src/test/perf/post_teardown/post-teardown.cc diff --git a/src/test/perf/post_teardown/post-teardown.cc b/src/test/perf/post_teardown/post-teardown.cc new file mode 100644 index 000000000..ca9296d71 --- /dev/null +++ b/src/test/perf/post_teardown/post-teardown.cc @@ -0,0 +1,50 @@ +#include +#include +#include +#include +#include + +using namespace snmalloc; + +void fill(std::vector& out, size_t count, size_t size) +{ + out.reserve(count); + for (size_t i = 0; i < count; i++) + { + out.push_back(snmalloc::alloc(size)); + } +} + +void drain(const char* label, std::vector& vec, size_t size) +{ + MeasureTime m; + m << label << " (" << vec.size() << " x " << size << " B)"; + for (void* p : vec) + { + snmalloc::dealloc(p, size); + } + vec.clear(); +} + +int main(int, char**) +{ + setup(); + // Issue #809: perf when many objects are freed after the allocator has + // already been finalised (e.g. static/global teardown). Keep counts equal + // for baseline and post-teardown to isolate the teardown cost. + constexpr size_t alloc_count = 1 << 18; + constexpr size_t obj_size = 64; + + std::vector ptrs; + fill(ptrs, alloc_count, obj_size); + drain("Baseline dealloc before finalise", ptrs, obj_size); + + // Simulate the allocator already being torn down before remaining frees + // (post-main / static destruction path from #809). 
+ ThreadAlloc::teardown(); + + fill(ptrs, alloc_count, obj_size); + drain("Immediate dealloc after teardown", ptrs, obj_size); + + return 0; +} From bbd7edc689e8db8a4139340bb0c0c7b0f6ddd307 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Thu, 5 Feb 2026 16:57:26 +0000 Subject: [PATCH 2/9] Reduce time spent in teardown The teardown code can be a performance bottleneck when a lot of deallocations occur after a thread begins to be torn down. This code tracks how many operations have occurred after the thread has been torn down, and will stop performing the tidying with an exponential back-off. This takes the overhead on the microbenchmark from around 50x to 10x. --- src/snmalloc/global/threadalloc.h | 9 +++++---- src/snmalloc/mem/corealloc.h | 26 ++++++++++++++++---------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/src/snmalloc/global/threadalloc.h b/src/snmalloc/global/threadalloc.h index e49c6d5c5..d037995e5 100644 --- a/src/snmalloc/global/threadalloc.h +++ b/src/snmalloc/global/threadalloc.h @@ -93,7 +93,7 @@ namespace snmalloc // we need to record if we are already in that state as we will not // receive another teardown call, so each operation needs to release // the underlying data structures after the call. - static inline thread_local bool teardown_called{false}; + static inline thread_local size_t times_teardown_called{0}; public: /** @@ -114,8 +114,9 @@ namespace snmalloc if (alloc == &default_alloc) return; - teardown_called = true; - alloc->flush(); + times_teardown_called++; + if (bits::is_pow2(times_teardown_called) || times_teardown_called < 128) + alloc->flush(); AllocPool::release(alloc); alloc = const_cast(&default_alloc); } @@ -131,7 +132,7 @@ namespace snmalloc template SNMALLOC_SLOW_PATH static auto check_init_slow(Restart r, Args... 
args) { - bool post_teardown = teardown_called; + bool post_teardown = times_teardown_called > 0; alloc = AllocPool::acquire(); diff --git a/src/snmalloc/mem/corealloc.h b/src/snmalloc/mem/corealloc.h index d83ab9dc7..5ec7bf1f3 100644 --- a/src/snmalloc/mem/corealloc.h +++ b/src/snmalloc/mem/corealloc.h @@ -1169,6 +1169,10 @@ namespace snmalloc template SNMALLOC_SLOW_PATH void dealloc_local_slabs(smallsizeclass_t sizeclass) { + if constexpr (!check_slabs) + if (alloc_classes[sizeclass].unused == 0) + return; + // Return unused slabs of sizeclass_t back to global allocator alloc_classes[sizeclass].available.iterate([this, sizeclass](auto* meta) { auto domesticate = @@ -1420,18 +1424,20 @@ namespace snmalloc for (smallsizeclass_t sizeclass = 0; sizeclass < NUM_SMALL_SIZECLASSES; sizeclass++) { - dealloc_local_slabs(sizeclass); + dealloc_local_slabs(sizeclass); } - laden.iterate( - [domesticate](BackendSlabMetadata* meta) SNMALLOC_FAST_PATH_LAMBDA { - if (!meta->is_large()) - { - meta->free_queue.validate( - freelist::Object::key_root, meta->as_key_tweak(), domesticate); - } - }); - + if constexpr (mitigations(freelist_teardown_validate)) + { + laden.iterate( + [domesticate](BackendSlabMetadata* meta) SNMALLOC_FAST_PATH_LAMBDA { + if (!meta->is_large()) + { + meta->free_queue.validate( + freelist::Object::key_root, meta->as_key_tweak(), domesticate); + } + }); + } // Set the remote_dealloc_cache to immediately slow path. remote_dealloc_cache.capacity = 0; From ecf4a5c85b724da4f2aa892b44b37392cdf9e1d5 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Fri, 6 Feb 2026 11:38:51 +0000 Subject: [PATCH 3/9] Make pthread_fork protection opt in. This improves performance for most cases. The feature is problematic on glibc as there is potential for it to work incorrectly. This disables the feature by default, and can be enabled if someone really cares about it. 
--- CMakeLists.txt | 7 ++++++- src/snmalloc/ds_aal/prevent_fork.h | 13 +++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cad25ef65..20cf36225 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,7 @@ option(SNMALLOC_IPO "Link with IPO/LTO support" OFF) option(SNMALLOC_BENCHMARK_INDIVIDUAL_MITIGATIONS "Build tests and ld_preload for individual mitigations" OFF) option(SNMALLOC_ENABLE_DYNAMIC_LOADING "Build such that snmalloc can be dynamically loaded. This is not required for LD_PRELOAD, and will harm performance if enabled." OFF) option(SNMALLOC_ENABLE_WAIT_ON_ADDRESS "Use wait on address backoff strategy if it is available" ON) +option(SNMALLOC_PTHREAD_FORK_PROTECTION "Guard against forking while allocator locks are held using pthread_atfork hooks" OFF) option(SNMALLOC_ENABLE_FUZZING "Enable fuzzing instrumentation tests" OFF) option(SNMALLOC_USE_SELF_VENDORED_STL "Avoid using system STL" OFF) # Options that apply only if we're not building the header-only library @@ -133,6 +134,9 @@ int main() { } " SNMALLOC_PTHREAD_ATFORK_WORKS) +if (SNMALLOC_PTHREAD_FORK_PROTECTION AND NOT SNMALLOC_PTHREAD_ATFORK_WORKS) + message(FATAL_ERROR "SNMALLOC_PTHREAD_FORK_PROTECTION requires working pthread_atfork support") +endif() if (NOT MSVC AND NOT (SNMALLOC_CLEANUP STREQUAL CXX11_DESTRUCTORS)) # If the target compiler doesn't support -nostdlib++ then we must enable C at @@ -333,8 +337,9 @@ endfunction() add_as_define(SNMALLOC_QEMU_WORKAROUND) add_as_define(SNMALLOC_TRACING) add_as_define(SNMALLOC_CI_BUILD) +add_as_define(SNMALLOC_PTHREAD_FORK_PROTECTION) add_as_define(SNMALLOC_PLATFORM_HAS_GETENTROPY) -add_as_define(SNMALLOC_PTHREAD_ATFORK_WORKS) +#add_as_define(SNMALLOC_PTHREAD_ATFORK_WORKS) add_as_define(SNMALLOC_HAS_LINUX_RANDOM_H) add_as_define(SNMALLOC_HAS_LINUX_FUTEX_H) if (SNMALLOC_NO_REALLOCARRAY) diff --git a/src/snmalloc/ds_aal/prevent_fork.h b/src/snmalloc/ds_aal/prevent_fork.h index 
2ef0c2465..93482150a 100644 --- a/src/snmalloc/ds_aal/prevent_fork.h +++ b/src/snmalloc/ds_aal/prevent_fork.h @@ -10,6 +10,8 @@ namespace snmalloc { + +#ifdef SNMALLOC_PTHREAD_FORK_PROTECTION // This is a simple implementation of a class that can be // used to prevent a process from forking. Holding a lock // in the allocator while forking can lead to deadlocks. @@ -158,4 +160,15 @@ namespace snmalloc threads_preventing_fork--; } }; +#else + // The fork protection can cost a lot and it generally not required. + // This is a dummy implementation of the PreventFork class that does nothing. + class PreventFork + { + public: + PreventFork() {} + + ~PreventFork() {} + }; +#endif } // namespace snmalloc \ No newline at end of file From 1680012a9471af56bc8bdab23058ead7d47d2334 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Fri, 6 Feb 2026 11:39:28 +0000 Subject: [PATCH 4/9] Add branch predictor hints to spin lock --- src/snmalloc/ds_aal/flaglock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/snmalloc/ds_aal/flaglock.h b/src/snmalloc/ds_aal/flaglock.h index a939f8e49..e17ece1c1 100644 --- a/src/snmalloc/ds_aal/flaglock.h +++ b/src/snmalloc/ds_aal/flaglock.h @@ -116,7 +116,7 @@ namespace snmalloc public: FlagLock(FlagWord& lock) : lock(lock) { - while (lock.flag.exchange(true, stl::memory_order_acquire)) + while (SNMALLOC_UNLIKELY(lock.flag.exchange(true, stl::memory_order_acquire))) { // assert_not_owned_by_current_thread is only called when the first // acquiring is failed; which means the lock is already held somewhere @@ -124,7 +124,7 @@ namespace snmalloc lock.assert_not_owned_by_current_thread(); // This loop is better for spin-waiting because it won't issue // expensive write operation (xchg for example). 
- while (lock.flag.load(stl::memory_order_relaxed)) + while (SNMALLOC_UNLIKELY(lock.flag.load(stl::memory_order_relaxed))) { Aal::pause(); } From e0c80df798077bc7c32886758e08d6bf72ce178b Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Fri, 6 Feb 2026 11:40:26 +0000 Subject: [PATCH 5/9] Make in use checking only in debug builds The in_use flag on pooled types is used to check for incorrect pool usage. This is not required for correct code. So moving to a Debug feature. --- src/snmalloc/mem/pooled.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/snmalloc/mem/pooled.h b/src/snmalloc/mem/pooled.h index 489d72f9d..ca4e94101 100644 --- a/src/snmalloc/mem/pooled.h +++ b/src/snmalloc/mem/pooled.h @@ -51,13 +51,21 @@ namespace snmalloc public: void set_in_use() { - if (in_use.exchange(true)) +#ifndef NDEBUG + if (in_use.exchange(true, stl::memory_order_acq_rel)) error("Critical error: double use of Pooled Type!"); +#else + in_use.store(true, stl::memory_order_relaxed); +#endif } void reset_in_use() { - in_use.store(false); +#ifndef NDEBUG + in_use.store(false, stl::memory_order_release); +#else + in_use.store(false, stl::memory_order_relaxed); +#endif } bool debug_is_in_use() From 898e77d23badedba9f07d2771012c83198b09246 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Fri, 6 Feb 2026 11:55:17 +0000 Subject: [PATCH 6/9] Clangformat --- src/snmalloc/ds_aal/flaglock.h | 3 ++- src/snmalloc/ds_aal/prevent_fork.h | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/snmalloc/ds_aal/flaglock.h b/src/snmalloc/ds_aal/flaglock.h index e17ece1c1..c55b80bb1 100644 --- a/src/snmalloc/ds_aal/flaglock.h +++ b/src/snmalloc/ds_aal/flaglock.h @@ -116,7 +116,8 @@ namespace snmalloc public: FlagLock(FlagWord& lock) : lock(lock) { - while (SNMALLOC_UNLIKELY(lock.flag.exchange(true, stl::memory_order_acquire))) + while ( + SNMALLOC_UNLIKELY(lock.flag.exchange(true, stl::memory_order_acquire))) { // 
assert_not_owned_by_current_thread is only called when the first // acquiring is failed; which means the lock is already held somewhere diff --git a/src/snmalloc/ds_aal/prevent_fork.h b/src/snmalloc/ds_aal/prevent_fork.h index 93482150a..fe51ee900 100644 --- a/src/snmalloc/ds_aal/prevent_fork.h +++ b/src/snmalloc/ds_aal/prevent_fork.h @@ -45,7 +45,7 @@ namespace snmalloc // calls would be ignored. static void ensure_init() { -#ifdef SNMALLOC_PTHREAD_ATFORK_WORKS +# ifdef SNMALLOC_PTHREAD_ATFORK_WORKS static stl::Atomic initialised{false}; if (initialised.load(stl::memory_order_acquire)) @@ -53,7 +53,7 @@ namespace snmalloc pthread_atfork(prefork, postfork_parent, postfork_child); initialised.store(true, stl::memory_order_release); -#endif +# endif }; public: From 52cf718c3f1d62d71fffb45bbd7837bf043f0130 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Fri, 6 Feb 2026 14:10:44 +0000 Subject: [PATCH 7/9] Code review! --- CMakeLists.txt | 2 +- src/snmalloc/ds_aal/prevent_fork.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 20cf36225..e017d1e7a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -339,7 +339,7 @@ add_as_define(SNMALLOC_TRACING) add_as_define(SNMALLOC_CI_BUILD) add_as_define(SNMALLOC_PTHREAD_FORK_PROTECTION) add_as_define(SNMALLOC_PLATFORM_HAS_GETENTROPY) -#add_as_define(SNMALLOC_PTHREAD_ATFORK_WORKS) +add_as_define(SNMALLOC_PTHREAD_ATFORK_WORKS) add_as_define(SNMALLOC_HAS_LINUX_RANDOM_H) add_as_define(SNMALLOC_HAS_LINUX_FUTEX_H) if (SNMALLOC_NO_REALLOCARRAY) diff --git a/src/snmalloc/ds_aal/prevent_fork.h b/src/snmalloc/ds_aal/prevent_fork.h index fe51ee900..6e7635cb7 100644 --- a/src/snmalloc/ds_aal/prevent_fork.h +++ b/src/snmalloc/ds_aal/prevent_fork.h @@ -161,7 +161,7 @@ namespace snmalloc } }; #else - // The fork protection can cost a lot and it generally not required. + // The fork protection can cost a lot and it is generally not required. 
// This is a dummy implementation of the PreventFork class that does nothing. class PreventFork { From 0c05f5b568bba6502677f0c1a683fcc9cbd1d0e9 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Fri, 6 Feb 2026 14:33:12 +0000 Subject: [PATCH 8/9] Fix protect fork benchmarks --- src/test/func/protect_fork/protect_fork.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/func/protect_fork/protect_fork.cc b/src/test/func/protect_fork/protect_fork.cc index 28a8f4c7a..026e16256 100644 --- a/src/test/func/protect_fork/protect_fork.cc +++ b/src/test/func/protect_fork/protect_fork.cc @@ -1,6 +1,4 @@ #include -#include - #ifndef SNMALLOC_PTHREAD_ATFORK_WORKS int main() { @@ -9,6 +7,8 @@ int main() } #else +# define SNMALLOC_PTHREAD_FORK_PROTECTION +# include # include # include From a7b37117ea1810a5694dbdbc34b2eb42b864b7ab Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Fri, 6 Feb 2026 16:49:32 +0000 Subject: [PATCH 9/9] Clangformat --- src/test/func/protect_fork/protect_fork.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/func/protect_fork/protect_fork.cc b/src/test/func/protect_fork/protect_fork.cc index 026e16256..77227b202 100644 --- a/src/test/func/protect_fork/protect_fork.cc +++ b/src/test/func/protect_fork/protect_fork.cc @@ -8,8 +8,8 @@ int main() #else # define SNMALLOC_PTHREAD_FORK_PROTECTION -# include # include +# include # include void simulate_allocation()