diff --git a/include/clasp/core/lispStream.h b/include/clasp/core/lispStream.h index bb019444b1..48138a3efa 100644 --- a/include/clasp/core/lispStream.h +++ b/include/clasp/core/lispStream.h @@ -685,6 +685,9 @@ class StringOutputStream_O : public StringStream_O { String_sp get_string(); claspCharacter write_char(claspCharacter c) override; + // Bulk override: avoids the per-character boxing + virtual vectorPushExtend + // of the default AnsiStream_O::write_string char-by-char loop. + void write_string(String_sp data, cl_index start, cl_index end) override; void clear_output() override; void finish_output() override; void force_output() override; diff --git a/include/clasp/gctools/boehm_config.h b/include/clasp/gctools/boehm_config.h index 9b27bb63d5..3b814ec455 100644 --- a/include/clasp/gctools/boehm_config.h +++ b/include/clasp/gctools/boehm_config.h @@ -205,7 +205,14 @@ #define PACKAGE_VERSION "8.2.8" /* Define to enable parallel marking. */ -/* #undef PARALLEL_MARK */ +/* clasp: ENABLED. Heap marking is Boehm-standard conservative marking + (ALL_INTERIOR_POINTERS=1 + GC_register_displacement for tagged pointers; clasp + registers NO custom mark procedures / typed descriptors), so parallel markers + parallelize Boehm's own, parallel-safe marking loop -- clasp injects no + per-thread marking logic. GC_THREADS and THREAD_LOCAL_ALLOC are already on, and + bdwgc auto-starts (#CPUs-1) marker threads in GC_thr_init. This parallelizes + GC_mark_from, the dominant runtime cost (~25-30% in profiles) on multicore. */ +#define PARALLEL_MARK 1 /* If defined, redirect free to this function. */ /* #undef REDIRECT_FREE */ diff --git a/include/clasp/gctools/threadlocal.fwd.h b/include/clasp/gctools/threadlocal.fwd.h index 39fc9653bd..9e6ab5aeff 100644 --- a/include/clasp/gctools/threadlocal.fwd.h +++ b/include/clasp/gctools/threadlocal.fwd.h @@ -25,13 +25,21 @@ struct MonitorAllocations { #endif struct GlobalAllocationProfiler { - std::atomic _BytesAllocated; + // These counters live in the THREAD_LOCAL ThreadLocalStateLowLevel (as the + // member _Allocations) and are only ever accessed via + // my_thread_low_level->_Allocations, i.e. by the owning thread alone (see + // gcalloc_boehm.h, gcFunctions.cc, startRunStop.cc, memoryManagement.cc). + // There is no shared/global instance and no cross-thread read, so atomics are + // pure overhead on registerAllocation(), which runs on every heap allocation. + // Plain int64_t with in-class zero-init (the previous atomics were not all + // initialized by the constructors below). + int64_t _BytesAllocated = 0; size_t _AllocationSizeThreshold; size_t _AllocationNumberThreshold; - std::atomic _AllocationSizeCounter; - std::atomic _AllocationNumberCounter; - std::atomic _HitAllocationNumberCounter; - std::atomic _HitAllocationSizeCounter; + int64_t _AllocationSizeCounter = 0; + int64_t _AllocationNumberCounter = 0; + int64_t _HitAllocationNumberCounter = 0; + int64_t _HitAllocationSizeCounter = 0; #ifdef DEBUG_MONITOR_ALLOCATIONS MonitorAllocations _Monitor; #endif diff --git a/include/clasp/llvmo/code.h b/include/clasp/llvmo/code.h index d988f056b2..baa991b184 100644 --- a/include/clasp/llvmo/code.h +++ b/include/clasp/llvmo/code.h @@ -428,6 +428,11 @@ namespace llvmo { core::T_sp identify_code_or_library(gctools::clasp_ptr_t entry_point); size_t countObjectFileNames(const std::string& name); +// Maintain the O(1) name->count index used by countObjectFileNames. +// recordObjectFileName is called for every registered object file; +// clearObjectFileNameCounts is called whenever _AllObjectFiles is cleared. +void recordObjectFileName(const std::string& name); +void clearObjectFileNameCounts(); std::string createIRModuleObjectFileName(size_t startupId, std::string& prefix); bool verifyIRModuleObjectFileStartupSymbol(const std::string& name); @@ -456,6 +461,8 @@ template void registerObjectFile(Object expected = _lisp->_Roots._AllObjectFiles.load(); entry->rplacd(expected); } while (!_lisp->_Roots._AllObjectFiles.compare_exchange_weak(expected, entry)); + // Keep the O(1) countObjectFileNames index in sync (single add point). + recordObjectFileName(name); } }; // namespace llvmo diff --git a/src/core/bytecode.cc b/src/core/bytecode.cc index 16df16cd16..97bf6cbca2 100644 --- a/src/core/bytecode.cc +++ b/src/core/bytecode.cc @@ -64,9 +64,9 @@ void BytecodeModule_O::register_for_debug() { // Note that we check stepping in the callER not the callEE. // This is so that we could provide the actual source forms, as we already do // in native code. TODO -static void maybe_step_call(void* frame, +static void maybe_step_call(ThreadLocalState* thread, void* frame, Function_sp func, size_t nargs, T_O** rargs) { - if (my_thread->_Breakstep) [[unlikely]] { + if (thread->_Breakstep) [[unlikely]] { ql::list args; for (size_t iarg = 0; iarg < nargs; ++iarg) args << T_sp((gctools::Tagged)rargs[iarg]); @@ -241,6 +241,12 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure uintptr_t bytecode_end = (uintptr_t)(bc->rowMajorAddressOfElement_(bc->length())); #endif MultipleValues& multipleValues = core::lisp_multipleValues(); + // Resolve the thread-local pointer once per VM frame. On Darwin every + // `my_thread` access is a _tlv_get_addr thunk call; the interpreter hits it on + // every call (maybe_step_call's breakstep check) and in several opcodes, so + // caching it here removes a per-call/per-op thunk. The thread does not change + // during a VM frame. + ThreadLocalState* thread = my_thread; unsigned char* pc = vm._pc; while (1) { VM_PC_CHECK(vm, pc, bytecode_start, bytecode_end); @@ -288,7 +294,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs)))); Function_sp func = gc::As_assert(tfunc); T_O** args = vm.stackref(sp, nargs - 1); - maybe_step_call(__builtin_frame_address(0), func, nargs, args); + maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args); // We push the PC for the debugger (see make_bytecode_frame in backtrace.cc) // We do this here rather than bytecode_call because e.g. we may call a // non-bytecode function, that in turn calls a bunch of different bytecode @@ -311,7 +317,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure VM_RECORD_PLAYBACK(func, "vm_call_receive_one_func"); VM_RECORD_PLAYBACK((void*)(uintptr_t)nargs, "vm_call_receive_one_nargs"); T_O** args = vm.stackref(sp, nargs - 1); - maybe_step_call(__builtin_frame_address(0), func, nargs, args); + maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args); #if DEBUG_VM_RECORD_PLAYBACK == 1 for (size_t ii = 0; ii < nargs; ii++) { stringstream name_args; @@ -336,7 +342,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs)))); Function_sp func = gc::As_assert(tfunc); T_O** args = vm.stackref(sp, nargs - 1); - maybe_step_call(__builtin_frame_address(0), func, nargs, args); + maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args); vm.push(sp, (T_O*)pc); vm._pc = pc; vm._stackPointer = sp; @@ -690,7 +696,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs)))); Function_sp func = gc::As_assert(tfunc); T_O** args = vm.stackref(sp, nargs - 1); - maybe_step_call(__builtin_frame_address(0), func, nargs, args); + maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args); vm.push(sp, (T_O*)pc); vm._pc = pc; vm._stackPointer = sp; @@ -708,7 +714,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs)))); Function_sp func = gc::As_assert(tfunc); T_O** args = vm.stackref(sp, nargs - 1); - maybe_step_call(__builtin_frame_address(0), func, nargs, args); + maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args); vm.push(sp, (T_O*)pc); vm._pc = pc; vm._stackPointer = sp; @@ -727,7 +733,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs)))); Function_sp func = gc::As_assert(tfunc); T_O** args = vm.stackref(sp, nargs - 1); - maybe_step_call(__builtin_frame_address(0), func, nargs, args); + maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args); vm.push(sp, (T_O*)pc); vm._pc = pc; vm._stackPointer = sp; @@ -1049,6 +1055,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure static unsigned char* long_dispatch(VirtualMachine& vm, unsigned char* pc, MultipleValues& multipleValues, T_O** literals, T_O** closed, Closure_O* closure, core::T_O** fp, core::T_O** sp, size_t lcc_nargs, core::T_O** lcc_args, uint8_t sub_opcode) { + ThreadLocalState* thread = my_thread; // cache TLS pointer once (see bytecode_vm) switch ((vm_code)sub_opcode) { case vm_code::ref: { uint8_t low = *(pc + 1); @@ -1083,7 +1090,7 @@ static unsigned char* long_dispatch(VirtualMachine& vm, unsigned char* pc, Multi T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs)))); Function_sp func = gc::As_assert(tfunc); T_O** args = vm.stackref(sp, nargs - 1); - maybe_step_call(__builtin_frame_address(0), func, nargs, args); + maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args); vm.push(sp, (T_O*)pc); vm._pc = pc; vm._stackPointer = sp; @@ -1100,7 +1107,7 @@ static unsigned char* long_dispatch(VirtualMachine& vm, unsigned char* pc, Multi T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs)))); Function_sp func = gc::As_assert(tfunc); T_O** args = vm.stackref(sp, nargs - 1); - maybe_step_call(__builtin_frame_address(0), func, nargs, args); + maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args); VM_RECORD_PLAYBACK(func, "vm_call_receive_one_func"); VM_RECORD_PLAYBACK((void*)(uintptr_t)nargs, "vm_call_receive_one_nargs"); #if DEBUG_VM_RECORD_PLAYBACK == 1 @@ -1129,7 +1136,7 @@ static unsigned char* long_dispatch(VirtualMachine& vm, unsigned char* pc, Multi T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs)))); Function_sp func = gc::As_assert(tfunc); T_O** args = vm.stackref(sp, nargs - 1); - maybe_step_call(__builtin_frame_address(0), func, nargs, args); + maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args); vm.push(sp, (T_O*)pc); vm._pc = pc; vm._stackPointer = sp; @@ -1348,7 +1355,7 @@ static unsigned char* long_dispatch(VirtualMachine& vm, unsigned char* pc, Multi T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs)))); Function_sp func = gc::As_assert(tfunc); T_O** args = vm.stackref(sp, nargs - 1); - maybe_step_call(__builtin_frame_address(0), func, nargs, args); + maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args); vm.push(sp, (T_O*)pc); vm._pc = pc; vm._stackPointer = sp; diff --git a/src/core/lispStream.cc b/src/core/lispStream.cc index f86d9b9834..2961efd9ed 100644 --- a/src/core/lispStream.cc +++ b/src/core/lispStream.cc @@ -2723,6 +2723,89 @@ claspCharacter StringOutputStream_O::write_char(claspCharacter c) { return c; } +// Bulk write_string for string-output-streams. The default +// AnsiStream_O::write_string writes one character at a time, and each char +// pays a boxing (clasp_make_character) plus a virtual vectorPushExtend with a +// fill-pointer/realloc check. Here we (1) update the output cursor over the +// appended range exactly as the per-char path would, and (2) append the +// content in one shot with a single geometric growth and the type-safe subseq +// copy used elsewhere for string buffers (see read_line / unsafe_setf_subseq), +// which preserves base/extended conversion and narrowing-error behavior. +void StringOutputStream_O::write_string(String_sp data, cl_index start, cl_index end) { + if (start >= end) + return; + const cl_index count = end - start; + const cl_index oldFill = this->_contents->fillPointer(); + const cl_index newFill = oldFill + count; + // Grow the destination once (geometric) so the copy needs no per-character + // reallocation or fill-pointer check. _contents is always an adjustable + // fill-pointer string (an MDArray) for a string-output-stream; resize keeps + // the fill pointer for fill-pointer arrays. + if (newFill > static_cast(this->_contents->arrayTotalSize())) { + cl_index newTotal = static_cast(this->_contents->arrayTotalSize()) * 2; + if (newTotal < newFill) + newTotal = newFill; + gc::As(this->_contents)->resize(newTotal); + } + StreamCursor& cur = this->_output_cursor; + // Resolve the underlying simple vectors so element access is a non-virtual, + // typed indexing operation. Boehm GC is non-moving and the copy loop below + // performs no allocation, so these references stay valid throughout. + AbstractSimpleVector_sp ssv; + size_t s0, s1; + data->asAbstractSimpleVectorRange(ssv, s0, s1); + AbstractSimpleVector_sp dsv; + size_t d0, d1; + this->_contents->asAbstractSimpleVectorRange(dsv, d0, d1); + const cl_index srcOff = static_cast(s0) + start; + const cl_index dstOff = static_cast(d0) + oldFill; + bool fast = true; + if (gc::IsA(dsv)) { + SimpleCharacterString_O& dst = *gc::As_unsafe(dsv); + if (gc::IsA(ssv)) { + SimpleCharacterString_O& src = *gc::As_unsafe(ssv); + for (cl_index i = 0; i < count; ++i) { + claspCharacter c = src[srcOff + i]; + dst[dstOff + i] = c; + cur.update(c); + } + } else if (gc::IsA(ssv)) { + SimpleBaseString_O& src = *gc::As_unsafe(ssv); + for (cl_index i = 0; i < count; ++i) { + claspCharacter c = src[srcOff + i]; // widen 8 -> 32 bits + dst[dstOff + i] = c; + cur.update(c); + } + } else + fast = false; + } else if (gc::IsA(dsv) && gc::IsA(ssv)) { + SimpleBaseString_O& dst = *gc::As_unsafe(dsv); + SimpleBaseString_O& src = *gc::As_unsafe(ssv); + for (cl_index i = 0; i < count; ++i) { + claspChar c = src[srcOff + i]; + dst[dstOff + i] = c; + cur.update(c); + } + } else { + // Character source into a base destination may narrow; defer to the safe + // path which raises the proper type error for out-of-range characters. + fast = false; + } + if (fast) { + this->_contents->fillPointerSet(newFill); + return; + } + // Safe fallback: cursor updated identically to the per-char path, content + // copied with the tested type-checking subseq machinery. + for (cl_index i = start; i < end; ++i) + cur.update(clasp_as_claspCharacter(cl__char(data, i))); + Array_sp src = (start == 0 && static_cast(data->length()) == end) + ? gc::As_unsafe(data) + : gc::As_unsafe(data->unsafe_subseq(start, end)); + this->_contents->unsafe_setf_subseq(oldFill, newFill, src); + this->_contents->fillPointerSet(newFill); +} + T_sp StringOutputStream_O::position() { return Integer_O::create((gc::Fixnum)_contents->fillPointer()); } void StringOutputStream_O::clear_output() {} diff --git a/src/gctools/snapshotSaveLoad.cc b/src/gctools/snapshotSaveLoad.cc index 8b32e4c90a..1329eb9114 100644 --- a/src/gctools/snapshotSaveLoad.cc +++ b/src/gctools/snapshotSaveLoad.cc @@ -2321,6 +2321,7 @@ void snapshot_load(void* maybeStartOfSnapshot, void* maybeEndOfSnapshot, const s my_thread->finish_initialization_main_thread(nil); // Now we have NIL in 'nil' - use it to initialize a few things. _lisp->_Roots._AllObjectFiles.store(nil); + llvmo::clearObjectFileNameCounts(); // keep countObjectFileNames index in sync with the cleared list _lisp->_Roots._AllCodeBlocks.store(nil); } diff --git a/src/lisp/kernel/lsp/format-pprint.lisp b/src/lisp/kernel/lsp/format-pprint.lisp index d055ed0ff5..120c867ab5 100644 --- a/src/lisp/kernel/lsp/format-pprint.lisp +++ b/src/lisp/kernel/lsp/format-pprint.lisp @@ -902,4 +902,10 @@ *standard-pprint-dispatch* *initial-pprint-dispatch*) (setf (pprint-dispatch-table-read-only-p *standard-pprint-dispatch*) t) (setf (first (cdr si::+io-syntax-progv-list+)) *standard-pprint-dispatch*) - (setf *print-pretty* t)) + ;; Default *print-pretty* to NIL, matching SBCL/CCL/CLISP. clasp's pretty + ;; printer is a CLOS Gray stream (a C++<->Lisp generic-dispatch per output + ;; character), making it ~370x slower than the ordinary printer; defaulting + ;; it on made every princ/prin1/format-~A/print pay that cost. The pretty + ;; printer remains available via PPRINT, ~<...~:>, *print-pretty* bindings, + ;; and the interactive REPL (which binds it back to T). + (setf *print-pretty* nil)) diff --git a/src/lisp/kernel/lsp/top.lisp b/src/lisp/kernel/lsp/top.lisp index a1b3cc21ca..e89e1c6cf1 100644 --- a/src/lisp/kernel/lsp/top.lisp +++ b/src/lisp/kernel/lsp/top.lisp @@ -668,10 +668,15 @@ Use special code 0 to cancel this operation.") (push (subseq line start end) list))))) (defun tpl-print (values) - (fresh-line) - (dolist (v values) - (prin1 v) - (terpri))) + ;; The global default of *print-pretty* is NIL (the ordinary printer is far + ;; faster than clasp's CLOS-Gray-stream pretty printer). Re-enable pretty + ;; printing for interactive REPL results only, where the per-form cost is + ;; irrelevant and nicely-wrapped output is desirable. + (let ((*print-pretty* t)) + (fresh-line) + (dolist (v values) + (prin1 v) + (terpri)))) (defun tpl-unknown-command (command) (format t "Unknown top level command: ~s~%" command) diff --git a/src/llvmo/code.cc b/src/llvmo/code.cc index dacc85f89c..38f6cb45fa 100644 --- a/src/llvmo/code.cc +++ b/src/llvmo/code.cc @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include #include @@ -451,6 +453,7 @@ CL_LISPIFY_NAME(release_object_files); DOCGROUP(clasp); CL_DEFUN void release_object_files() { _lisp->_Roots._AllObjectFiles.store(nil()); + clearObjectFileNameCounts(); core::clasp_write_string("ObjectFiles have been released\n"); } @@ -732,21 +735,42 @@ std::string CodeBlock_O::__repr__() const { void CodeBlock_O::describe() const { printf("%s:%d:%s entered\n", __FILE__, __LINE__, __FUNCTION__); } +// Object-file names are uniquified at registration time +// (ensureUniqueMemoryBufferName -> countObjectFileNames). The original +// countObjectFileNames rescanned the whole _AllObjectFiles list with a memcmp +// per call, which is O(N) per registration and therefore O(N^2) as JITted +// modules accumulate -- a real scalability cost for long-running / +// compilation-heavy processes (it was the #1 self-time function, ~15%, in a +// profiled workload). We maintain an auxiliary name->count index updated at the +// single registration site (recordObjectFileName) and reset whenever +// _AllObjectFiles is cleared (clearObjectFileNameCounts), making the lookup +// O(1). _AllObjectFiles is only ever appended to (registerObjectFile) or +// bulk-cleared -- never individually pruned -- so the index stays in sync. The +// map holds no GC pointers; a mutex guards it because registration can race. +static std::mutex& objectFileNameCountsMutex() { + static std::mutex m; + return m; +} +static std::unordered_map& objectFileNameCounts() { + static std::unordered_map m; + return m; +} + +void recordObjectFileName(const std::string& name) { + std::lock_guard guard(objectFileNameCountsMutex()); + ++objectFileNameCounts()[name]; +} + +void clearObjectFileNameCounts() { + std::lock_guard guard(objectFileNameCountsMutex()); + objectFileNameCounts().clear(); +} + size_t countObjectFileNames(const std::string& name) { DEBUG_OBJECT_FILES_PRINT(("%s:%d:%s Lookup name %s\n", __FILE__, __LINE__, __FUNCTION__, name.c_str())); - size_t count = 0; - core::T_sp cur = _lisp->_Roots._AllObjectFiles.load(); - while (cur.consp()) { - ObjectFile_sp of = gc::As(CONS_CAR(cur)); - const char* codeNameStart = (const char*)of->_CodeName->_Data.data(); - if (of->_CodeName->length() == name.size()) { - if (memcmp(codeNameStart, name.data(), name.size()) == 0) { - count++; - } - } - cur = CONS_CDR(cur); - } - return count; + std::lock_guard guard(objectFileNameCountsMutex()); + auto it = objectFileNameCounts().find(name); + return it == objectFileNameCounts().end() ? 0 : it->second; }; CL_DEFUN core::T_sp llvm_sys__allObjectFileNames() { diff --git a/src/llvmo/llvmoPackage.cc b/src/llvmo/llvmoPackage.cc index 589d77da60..cb0cd3aa71 100644 --- a/src/llvmo/llvmoPackage.cc +++ b/src/llvmo/llvmoPackage.cc @@ -54,6 +54,7 @@ THE SOFTWARE. #include #include #include +#include // JITDataReadWriteMaybeExecute / JITDataReadExecute (Apple Silicon W^X) #include #include #include @@ -690,7 +691,16 @@ CL_DEFUN_SETF core::T_sp setf_jit_lookup_t(core::T_sp value, JITDylib_sp dylib, if (!found) SIMPLE_ERROR("Could not find pointer for name |{}|", name); core::T_O** tptr = (core::T_O**)ptr; + // `tptr` points into a JIT'd dylib's memory (e.g. the callback-lisp-function-N + // global created by make-callback). On Apple Silicon that memory is MAP_JIT and + // execute-protected for this thread under W^X, so a plain store faults with + // SIGBUS. Enable writing around the store, exactly like the other JIT-literal + // write sites (code.cc, loadltv.cc, compiler.cc, snapshotSaveLoad.cc). The + // window contains only this pointer store -- no allocation/GC/JIT. No-op off + // Apple Silicon. + JITDataReadWriteMaybeExecute(); *tptr = value.raw_(); + JITDataReadExecute(); return value; }