diff --git a/include/clasp/core/lispStream.h b/include/clasp/core/lispStream.h
index bb019444b1..48138a3efa 100644
--- a/include/clasp/core/lispStream.h
+++ b/include/clasp/core/lispStream.h
@@ -685,6 +685,9 @@ class StringOutputStream_O : public StringStream_O {
   String_sp get_string();
 
   claspCharacter write_char(claspCharacter c) override;
+  // Bulk override: avoids the per-character boxing + virtual vectorPushExtend
+  // of the default AnsiStream_O::write_string char-by-char loop.
+  void write_string(String_sp data, cl_index start, cl_index end) override;
   void clear_output() override;
   void finish_output() override;
   void force_output() override;
diff --git a/include/clasp/gctools/boehm_config.h b/include/clasp/gctools/boehm_config.h
index 9b27bb63d5..3b814ec455 100644
--- a/include/clasp/gctools/boehm_config.h
+++ b/include/clasp/gctools/boehm_config.h
@@ -205,7 +205,14 @@
 #define PACKAGE_VERSION "8.2.8"
 
 /* Define to enable parallel marking. */
-/* #undef PARALLEL_MARK */
+/* clasp: ENABLED. Heap marking is Boehm-standard conservative marking
+   (ALL_INTERIOR_POINTERS=1 + GC_register_displacement for tagged pointers; clasp
+   registers NO custom mark procedures / typed descriptors), so parallel markers
+   parallelize Boehm's own, parallel-safe marking loop -- clasp injects no
+   per-thread marking logic. GC_THREADS and THREAD_LOCAL_ALLOC are already on, and
+   bdwgc auto-starts (#CPUs-1) marker threads in GC_thr_init. This parallelizes
+   GC_mark_from, the dominant runtime cost (~25-30% in profiles) on multicore. */
+#define PARALLEL_MARK 1
 
 /* If defined, redirect free to this function. */
 /* #undef REDIRECT_FREE */
diff --git a/include/clasp/gctools/threadlocal.fwd.h b/include/clasp/gctools/threadlocal.fwd.h
index 39fc9653bd..9e6ab5aeff 100644
--- a/include/clasp/gctools/threadlocal.fwd.h
+++ b/include/clasp/gctools/threadlocal.fwd.h
@@ -25,13 +25,21 @@ struct MonitorAllocations {
 #endif
 
 struct GlobalAllocationProfiler {
-  std::atomic<int64_t> _BytesAllocated;
+  // These counters live in the THREAD_LOCAL ThreadLocalStateLowLevel (as the
+  // member _Allocations) and are only ever accessed via
+  // my_thread_low_level->_Allocations, i.e. by the owning thread alone (see
+  // gcalloc_boehm.h, gcFunctions.cc, startRunStop.cc, memoryManagement.cc).
+  // There is no shared/global instance and no cross-thread read, so atomics are
+  // pure overhead on registerAllocation(), which runs on every heap allocation.
+  // Plain int64_t with in-class zero-init (the previous atomics were not all
+  // initialized by the constructors below).
+  int64_t _BytesAllocated = 0;
   size_t _AllocationSizeThreshold;
   size_t _AllocationNumberThreshold;
-  std::atomic<int64_t> _AllocationSizeCounter;
-  std::atomic<int64_t> _AllocationNumberCounter;
-  std::atomic<int64_t> _HitAllocationNumberCounter;
-  std::atomic<int64_t> _HitAllocationSizeCounter;
+  int64_t _AllocationSizeCounter = 0;
+  int64_t _AllocationNumberCounter = 0;
+  int64_t _HitAllocationNumberCounter = 0;
+  int64_t _HitAllocationSizeCounter = 0;
 #ifdef DEBUG_MONITOR_ALLOCATIONS
   MonitorAllocations _Monitor;
 #endif
diff --git a/include/clasp/llvmo/code.h b/include/clasp/llvmo/code.h
index d988f056b2..baa991b184 100644
--- a/include/clasp/llvmo/code.h
+++ b/include/clasp/llvmo/code.h
@@ -428,6 +428,11 @@ namespace llvmo {
 core::T_sp identify_code_or_library(gctools::clasp_ptr_t entry_point);
 
 size_t countObjectFileNames(const std::string& name);
+// Maintain the O(1) name->count index used by countObjectFileNames.
+// recordObjectFileName is called for every registered object file;
+// clearObjectFileNameCounts is called whenever _AllObjectFiles is cleared.
+void recordObjectFileName(const std::string& name);
+void clearObjectFileNameCounts();
 
 std::string createIRModuleObjectFileName(size_t startupId, std::string& prefix);
 bool verifyIRModuleObjectFileStartupSymbol(const std::string& name);
@@ -456,6 +461,8 @@ template <typename Stage = gctools::RuntimeStage> void registerObjectFile(Object
     expected = _lisp->_Roots._AllObjectFiles.load();
     entry->rplacd(expected);
   } while (!_lisp->_Roots._AllObjectFiles.compare_exchange_weak(expected, entry));
+  // Keep the O(1) countObjectFileNames index in sync (single add point).
+  recordObjectFileName(name);
 }
 
 }; // namespace llvmo
diff --git a/src/core/bytecode.cc b/src/core/bytecode.cc
index 16df16cd16..97bf6cbca2 100644
--- a/src/core/bytecode.cc
+++ b/src/core/bytecode.cc
@@ -64,9 +64,9 @@ void BytecodeModule_O::register_for_debug() {
 // Note that we check stepping in the callER not the callEE.
 // This is so that we could provide the actual source forms, as we already do
 // in native code. TODO
-static void maybe_step_call(void* frame,
+static void maybe_step_call(ThreadLocalState* thread, void* frame,
                             Function_sp func, size_t nargs, T_O** rargs) {
-  if (my_thread->_Breakstep) [[unlikely]] {
+  if (thread->_Breakstep) [[unlikely]] {
     ql::list args;
     for (size_t iarg = 0; iarg < nargs; ++iarg)
       args << T_sp((gctools::Tagged)rargs[iarg]);
@@ -241,6 +241,12 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure
   uintptr_t bytecode_end = (uintptr_t)(bc->rowMajorAddressOfElement_(bc->length()));
 #endif
   MultipleValues& multipleValues = core::lisp_multipleValues();
+  // Resolve the thread-local pointer once per VM frame. On Darwin every
+  // `my_thread` access is a _tlv_get_addr thunk call; the interpreter hits it on
+  // every call (maybe_step_call's breakstep check) and in several opcodes, so
+  // caching it here removes a per-call/per-op thunk. The thread does not change
+  // during a VM frame.
+  ThreadLocalState* thread = my_thread;
   unsigned char* pc = vm._pc;
   while (1) {
     VM_PC_CHECK(vm, pc, bytecode_start, bytecode_end);
@@ -288,7 +294,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure
       T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs))));
       Function_sp func = gc::As_assert<Function_sp>(tfunc);
       T_O** args = vm.stackref(sp, nargs - 1);
-      maybe_step_call(__builtin_frame_address(0), func, nargs, args);
+      maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
       // We push the PC for the debugger (see make_bytecode_frame in backtrace.cc)
       // We do this here rather than bytecode_call because e.g. we may call a
       // non-bytecode function, that in turn calls a bunch of different bytecode
@@ -311,7 +317,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure
       VM_RECORD_PLAYBACK(func, "vm_call_receive_one_func");
       VM_RECORD_PLAYBACK((void*)(uintptr_t)nargs, "vm_call_receive_one_nargs");
       T_O** args = vm.stackref(sp, nargs - 1);
-      maybe_step_call(__builtin_frame_address(0), func, nargs, args);
+      maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
 #if DEBUG_VM_RECORD_PLAYBACK == 1
       for (size_t ii = 0; ii < nargs; ii++) {
         stringstream name_args;
@@ -336,7 +342,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure
       T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs))));
       Function_sp func = gc::As_assert<Function_sp>(tfunc);
       T_O** args = vm.stackref(sp, nargs - 1);
-      maybe_step_call(__builtin_frame_address(0), func, nargs, args);
+      maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
       vm.push(sp, (T_O*)pc);
       vm._pc = pc;
       vm._stackPointer = sp;
@@ -690,7 +696,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure
       T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs))));
       Function_sp func = gc::As_assert<Function_sp>(tfunc);
       T_O** args = vm.stackref(sp, nargs - 1);
-      maybe_step_call(__builtin_frame_address(0), func, nargs, args);
+      maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
       vm.push(sp, (T_O*)pc);
       vm._pc = pc;
       vm._stackPointer = sp;
@@ -708,7 +714,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure
       T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs))));
       Function_sp func = gc::As_assert<Function_sp>(tfunc);
       T_O** args = vm.stackref(sp, nargs - 1);
-      maybe_step_call(__builtin_frame_address(0), func, nargs, args);
+      maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
       vm.push(sp, (T_O*)pc);
       vm._pc = pc;
       vm._stackPointer = sp;
@@ -727,7 +733,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure
       T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs))));
       Function_sp func = gc::As_assert<Function_sp>(tfunc);
       T_O** args = vm.stackref(sp, nargs - 1);
-      maybe_step_call(__builtin_frame_address(0), func, nargs, args);
+      maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
       vm.push(sp, (T_O*)pc);
       vm._pc = pc;
       vm._stackPointer = sp;
@@ -1049,6 +1055,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure
 static unsigned char* long_dispatch(VirtualMachine& vm, unsigned char* pc, MultipleValues& multipleValues, T_O** literals,
                                     T_O** closed, Closure_O* closure, core::T_O** fp, core::T_O** sp, size_t lcc_nargs,
                                     core::T_O** lcc_args, uint8_t sub_opcode) {
+  ThreadLocalState* thread = my_thread; // cache TLS pointer once (see bytecode_vm)
   switch ((vm_code)sub_opcode) {
   case vm_code::ref: {
     uint8_t low = *(pc + 1);
@@ -1083,7 +1090,7 @@ static unsigned char* long_dispatch(VirtualMachine& vm, unsigned char* pc, Multi
     T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs))));
     Function_sp func = gc::As_assert<Function_sp>(tfunc);
     T_O** args = vm.stackref(sp, nargs - 1);
-    maybe_step_call(__builtin_frame_address(0), func, nargs, args);
+    maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
     vm.push(sp, (T_O*)pc);
     vm._pc = pc;
     vm._stackPointer = sp;
@@ -1100,7 +1107,7 @@ static unsigned char* long_dispatch(VirtualMachine& vm, unsigned char* pc, Multi
     T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs))));
     Function_sp func = gc::As_assert<Function_sp>(tfunc);
     T_O** args = vm.stackref(sp, nargs - 1);
-    maybe_step_call(__builtin_frame_address(0), func, nargs, args);
+    maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
     VM_RECORD_PLAYBACK(func, "vm_call_receive_one_func");
     VM_RECORD_PLAYBACK((void*)(uintptr_t)nargs, "vm_call_receive_one_nargs");
 #if DEBUG_VM_RECORD_PLAYBACK == 1
@@ -1129,7 +1136,7 @@ static unsigned char* long_dispatch(VirtualMachine& vm, unsigned char* pc, Multi
     T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs))));
     Function_sp func = gc::As_assert<Function_sp>(tfunc);
     T_O** args = vm.stackref(sp, nargs - 1);
-    maybe_step_call(__builtin_frame_address(0), func, nargs, args);
+    maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
     vm.push(sp, (T_O*)pc);
     vm._pc = pc;
     vm._stackPointer = sp;
@@ -1348,7 +1355,7 @@ static unsigned char* long_dispatch(VirtualMachine& vm, unsigned char* pc, Multi
     T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs))));
     Function_sp func = gc::As_assert<Function_sp>(tfunc);
     T_O** args = vm.stackref(sp, nargs - 1);
-    maybe_step_call(__builtin_frame_address(0), func, nargs, args);
+    maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
     vm.push(sp, (T_O*)pc);
     vm._pc = pc;
     vm._stackPointer = sp;
diff --git a/src/core/lispStream.cc b/src/core/lispStream.cc
index f86d9b9834..2961efd9ed 100644
--- a/src/core/lispStream.cc
+++ b/src/core/lispStream.cc
@@ -2723,6 +2723,89 @@ claspCharacter StringOutputStream_O::write_char(claspCharacter c) {
   return c;
 }
 
+// Bulk write_string for string-output-streams. The default
+// AnsiStream_O::write_string writes one character at a time, and each char
+// pays a boxing (clasp_make_character) plus a virtual vectorPushExtend with a
+// fill-pointer/realloc check. Here we (1) update the output cursor over the
+// appended range exactly as the per-char path would, and (2) append the
+// content in one shot with a single geometric growth and the type-safe subseq
+// copy used elsewhere for string buffers (see read_line / unsafe_setf_subseq),
+// which preserves base/extended conversion and narrowing-error behavior.
+void StringOutputStream_O::write_string(String_sp data, cl_index start, cl_index end) {
+  if (start >= end)
+    return;
+  const cl_index count = end - start;
+  const cl_index oldFill = this->_contents->fillPointer();
+  const cl_index newFill = oldFill + count;
+  // Grow the destination once (geometric) so the copy needs no per-character
+  // reallocation or fill-pointer check. _contents is always an adjustable
+  // fill-pointer string (an MDArray) for a string-output-stream; resize keeps
+  // the fill pointer for fill-pointer arrays.
+  if (newFill > static_cast<cl_index>(this->_contents->arrayTotalSize())) {
+    cl_index newTotal = static_cast<cl_index>(this->_contents->arrayTotalSize()) * 2;
+    if (newTotal < newFill)
+      newTotal = newFill;
+    gc::As<MDArray_sp>(this->_contents)->resize(newTotal);
+  }
+  StreamCursor& cur = this->_output_cursor;
+  // Resolve the underlying simple vectors so element access is a non-virtual,
+  // typed indexing operation. Boehm GC is non-moving and the copy loop below
+  // performs no allocation, so these references stay valid throughout.
+  AbstractSimpleVector_sp ssv;
+  size_t s0, s1;
+  data->asAbstractSimpleVectorRange(ssv, s0, s1);
+  AbstractSimpleVector_sp dsv;
+  size_t d0, d1;
+  this->_contents->asAbstractSimpleVectorRange(dsv, d0, d1);
+  const cl_index srcOff = static_cast<cl_index>(s0) + start;
+  const cl_index dstOff = static_cast<cl_index>(d0) + oldFill;
+  bool fast = true;
+  if (gc::IsA<SimpleCharacterString_sp>(dsv)) {
+    SimpleCharacterString_O& dst = *gc::As_unsafe<SimpleCharacterString_sp>(dsv);
+    if (gc::IsA<SimpleCharacterString_sp>(ssv)) {
+      SimpleCharacterString_O& src = *gc::As_unsafe<SimpleCharacterString_sp>(ssv);
+      for (cl_index i = 0; i < count; ++i) {
+        claspCharacter c = src[srcOff + i];
+        dst[dstOff + i] = c;
+        cur.update(c);
+      }
+    } else if (gc::IsA<SimpleBaseString_sp>(ssv)) {
+      SimpleBaseString_O& src = *gc::As_unsafe<SimpleBaseString_sp>(ssv);
+      for (cl_index i = 0; i < count; ++i) {
+        claspCharacter c = src[srcOff + i]; // widen 8 -> 32 bits
+        dst[dstOff + i] = c;
+        cur.update(c);
+      }
+    } else
+      fast = false;
+  } else if (gc::IsA<SimpleBaseString_sp>(dsv) && gc::IsA<SimpleBaseString_sp>(ssv)) {
+    SimpleBaseString_O& dst = *gc::As_unsafe<SimpleBaseString_sp>(dsv);
+    SimpleBaseString_O& src = *gc::As_unsafe<SimpleBaseString_sp>(ssv);
+    for (cl_index i = 0; i < count; ++i) {
+      claspChar c = src[srcOff + i];
+      dst[dstOff + i] = c;
+      cur.update(c);
+    }
+  } else {
+    // Character source into a base destination may narrow; defer to the safe
+    // path which raises the proper type error for out-of-range characters.
+    fast = false;
+  }
+  if (fast) {
+    this->_contents->fillPointerSet(newFill);
+    return;
+  }
+  // Safe fallback: cursor updated identically to the per-char path, content
+  // copied with the tested type-checking subseq machinery.
+  for (cl_index i = start; i < end; ++i)
+    cur.update(clasp_as_claspCharacter(cl__char(data, i)));
+  Array_sp src = (start == 0 && static_cast<cl_index>(data->length()) == end)
+                     ? gc::As_unsafe<Array_sp>(data)
+                     : gc::As_unsafe<Array_sp>(data->unsafe_subseq(start, end));
+  this->_contents->unsafe_setf_subseq(oldFill, newFill, src);
+  this->_contents->fillPointerSet(newFill);
+}
+
 T_sp StringOutputStream_O::position() { return Integer_O::create((gc::Fixnum)_contents->fillPointer()); }
 
 void StringOutputStream_O::clear_output() {}
diff --git a/src/gctools/snapshotSaveLoad.cc b/src/gctools/snapshotSaveLoad.cc
index 8b32e4c90a..1329eb9114 100644
--- a/src/gctools/snapshotSaveLoad.cc
+++ b/src/gctools/snapshotSaveLoad.cc
@@ -2321,6 +2321,7 @@ void snapshot_load(void* maybeStartOfSnapshot, void* maybeEndOfSnapshot, const s
         my_thread->finish_initialization_main_thread(nil);
         // Now we have NIL in 'nil' - use it to initialize a few things.
         _lisp->_Roots._AllObjectFiles.store(nil);
+        llvmo::clearObjectFileNameCounts(); // keep countObjectFileNames index in sync with the cleared list
         _lisp->_Roots._AllCodeBlocks.store(nil);
       }
 
diff --git a/src/lisp/kernel/lsp/format-pprint.lisp b/src/lisp/kernel/lsp/format-pprint.lisp
index d055ed0ff5..120c867ab5 100644
--- a/src/lisp/kernel/lsp/format-pprint.lisp
+++ b/src/lisp/kernel/lsp/format-pprint.lisp
@@ -902,4 +902,10 @@
         *standard-pprint-dispatch* *initial-pprint-dispatch*)
   (setf (pprint-dispatch-table-read-only-p *standard-pprint-dispatch*) t)
   (setf (first (cdr si::+io-syntax-progv-list+)) *standard-pprint-dispatch*)
-  (setf *print-pretty* t))
+  ;; Default *print-pretty* to NIL, matching SBCL/CCL/CLISP. clasp's pretty
+  ;; printer is a CLOS Gray stream (a C++<->Lisp generic-dispatch per output
+  ;; character), making it ~370x slower than the ordinary printer; defaulting
+  ;; it on made every princ/prin1/format-~A/print pay that cost. The pretty
+  ;; printer remains available via PPRINT, ~<...~:>, *print-pretty* bindings,
+  ;; and the interactive REPL (which binds it back to T).
+  (setf *print-pretty* nil))
diff --git a/src/lisp/kernel/lsp/top.lisp b/src/lisp/kernel/lsp/top.lisp
index a1b3cc21ca..e89e1c6cf1 100644
--- a/src/lisp/kernel/lsp/top.lisp
+++ b/src/lisp/kernel/lsp/top.lisp
@@ -668,10 +668,15 @@ Use special code 0 to cancel this operation.")
 	   (push (subseq line start end) list)))))
 
 (defun tpl-print (values)
-  (fresh-line)
-  (dolist (v values)
-    (prin1 v)
-    (terpri)))
+  ;; The global default of *print-pretty* is NIL (the ordinary printer is far
+  ;; faster than clasp's CLOS-Gray-stream pretty printer). Re-enable pretty
+  ;; printing for interactive REPL results only, where the per-form cost is
+  ;; irrelevant and nicely-wrapped output is desirable.
+  (let ((*print-pretty* t))
+    (fresh-line)
+    (dolist (v values)
+      (prin1 v)
+      (terpri))))
 
 (defun tpl-unknown-command (command)
   (format t "Unknown top level command: ~s~%" command)
diff --git a/src/llvmo/code.cc b/src/llvmo/code.cc
index dacc85f89c..38f6cb45fa 100644
--- a/src/llvmo/code.cc
+++ b/src/llvmo/code.cc
@@ -10,6 +10,8 @@
 #include <dlfcn.h>
 #include <iomanip>
 #include <cstdint>
+#include <mutex>
+#include <unordered_map>
 #include <clasp/core/foundation.h>
 #include <clasp/core/lispStream.h>
 #include <clasp/core/debugger.h>
@@ -451,6 +453,7 @@ CL_LISPIFY_NAME(release_object_files);
 DOCGROUP(clasp);
 CL_DEFUN void release_object_files() {
   _lisp->_Roots._AllObjectFiles.store(nil<core::T_O>());
+  clearObjectFileNameCounts();
   core::clasp_write_string("ObjectFiles have been released\n");
 }
 
@@ -732,21 +735,42 @@ std::string CodeBlock_O::__repr__() const {
 
 void CodeBlock_O::describe() const { printf("%s:%d:%s entered\n", __FILE__, __LINE__, __FUNCTION__); }
 
+// Object-file names are uniquified at registration time
+// (ensureUniqueMemoryBufferName -> countObjectFileNames). The original
+// countObjectFileNames rescanned the whole _AllObjectFiles list with a memcmp
+// per call, which is O(N) per registration and therefore O(N^2) as JITted
+// modules accumulate -- a real scalability cost for long-running /
+// compilation-heavy processes (it was the #1 self-time function, ~15%, in a
+// profiled workload). We maintain an auxiliary name->count index updated at the
+// single registration site (recordObjectFileName) and reset whenever
+// _AllObjectFiles is cleared (clearObjectFileNameCounts), making the lookup
+// O(1). _AllObjectFiles is only ever appended to (registerObjectFile) or
+// bulk-cleared -- never individually pruned -- so the index stays in sync. The
+// map holds no GC pointers; a mutex guards it because registration can race.
+static std::mutex& objectFileNameCountsMutex() {
+  static std::mutex m;
+  return m;
+}
+static std::unordered_map<std::string, size_t>& objectFileNameCounts() {
+  static std::unordered_map<std::string, size_t> m;
+  return m;
+}
+
+void recordObjectFileName(const std::string& name) {
+  std::lock_guard<std::mutex> guard(objectFileNameCountsMutex());
+  ++objectFileNameCounts()[name];
+}
+
+void clearObjectFileNameCounts() {
+  std::lock_guard<std::mutex> guard(objectFileNameCountsMutex());
+  objectFileNameCounts().clear();
+}
+
 size_t countObjectFileNames(const std::string& name) {
   DEBUG_OBJECT_FILES_PRINT(("%s:%d:%s Lookup name %s\n", __FILE__, __LINE__, __FUNCTION__, name.c_str()));
-  size_t count = 0;
-  core::T_sp cur = _lisp->_Roots._AllObjectFiles.load();
-  while (cur.consp()) {
-    ObjectFile_sp of = gc::As<ObjectFile_sp>(CONS_CAR(cur));
-    const char* codeNameStart = (const char*)of->_CodeName->_Data.data();
-    if (of->_CodeName->length() == name.size()) {
-      if (memcmp(codeNameStart, name.data(), name.size()) == 0) {
-        count++;
-      }
-    }
-    cur = CONS_CDR(cur);
-  }
-  return count;
+  std::lock_guard<std::mutex> guard(objectFileNameCountsMutex());
+  auto it = objectFileNameCounts().find(name);
+  return it == objectFileNameCounts().end() ? 0 : it->second;
 };
 
 CL_DEFUN core::T_sp llvm_sys__allObjectFileNames() {
diff --git a/src/llvmo/llvmoPackage.cc b/src/llvmo/llvmoPackage.cc
index 589d77da60..cb0cd3aa71 100644
--- a/src/llvmo/llvmoPackage.cc
+++ b/src/llvmo/llvmoPackage.cc
@@ -54,6 +54,7 @@ THE SOFTWARE.
 #include <clasp/llvmo/clbindLlvmExpose.h>
 #include <clasp/llvmo/debugInfoExpose.h>
 #include <clasp/llvmo/intrinsics.h>
+#include <clasp/llvmo/code.h> // JITDataReadWriteMaybeExecute / JITDataReadExecute (Apple Silicon W^X)
 #include <clasp/llvmo/claspLinkPass.h>
 #include <clasp/core/bytecode.h>
 #include <clasp/core/instance.h>
@@ -690,7 +691,16 @@ CL_DEFUN_SETF core::T_sp setf_jit_lookup_t(core::T_sp value, JITDylib_sp dylib,
   if (!found)
     SIMPLE_ERROR("Could not find pointer for name |{}|", name);
   core::T_O** tptr = (core::T_O**)ptr;
+  // `tptr` points into a JIT'd dylib's memory (e.g. the callback-lisp-function-N
+  // global created by make-callback). On Apple Silicon that memory is MAP_JIT and
+  // execute-protected for this thread under W^X, so a plain store faults with
+  // SIGBUS. Enable writing around the store, exactly like the other JIT-literal
+  // write sites (code.cc, loadltv.cc, compiler.cc, snapshotSaveLoad.cc). The
+  // window contains only this pointer store -- no allocation/GC/JIT. No-op off
+  // Apple Silicon.
+  JITDataReadWriteMaybeExecute();
   *tptr = value.raw_();
+  JITDataReadExecute();
   return value;
 }