Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions include/clasp/core/lispStream.h
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,9 @@ class StringOutputStream_O : public StringStream_O {
String_sp get_string();

claspCharacter write_char(claspCharacter c) override;
// Bulk override: avoids the per-character boxing + virtual vectorPushExtend
// of the default AnsiStream_O::write_string char-by-char loop.
void write_string(String_sp data, cl_index start, cl_index end) override;
void clear_output() override;
void finish_output() override;
void force_output() override;
Expand Down
9 changes: 8 additions & 1 deletion include/clasp/gctools/boehm_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,14 @@
#define PACKAGE_VERSION "8.2.8"

/* Define to enable parallel marking. */
/* #undef PARALLEL_MARK */
/* clasp: ENABLED. Heap marking is Boehm-standard conservative marking
(ALL_INTERIOR_POINTERS=1 + GC_register_displacement for tagged pointers; clasp
registers NO custom mark procedures / typed descriptors), so parallel markers
parallelize Boehm's own, parallel-safe marking loop -- clasp injects no
per-thread marking logic. GC_THREADS and THREAD_LOCAL_ALLOC are already on, and
bdwgc auto-starts (#CPUs-1) marker threads in GC_thr_init. This parallelizes
GC_mark_from, the dominant runtime cost (~25-30% in profiles) on multicore. */
#define PARALLEL_MARK 1

/* If defined, redirect free to this function. */
/* #undef REDIRECT_FREE */
Expand Down
18 changes: 13 additions & 5 deletions include/clasp/gctools/threadlocal.fwd.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,21 @@ struct MonitorAllocations {
#endif

struct GlobalAllocationProfiler {
std::atomic<int64_t> _BytesAllocated;
// These counters live in the THREAD_LOCAL ThreadLocalStateLowLevel (as the
// member _Allocations) and are only ever accessed via
// my_thread_low_level->_Allocations, i.e. by the owning thread alone (see
// gcalloc_boehm.h, gcFunctions.cc, startRunStop.cc, memoryManagement.cc).
// There is no shared/global instance and no cross-thread read, so atomics are
// pure overhead on registerAllocation(), which runs on every heap allocation.
// Plain int64_t with in-class zero-init (the previous atomics were not all
// initialized by the constructors below).
int64_t _BytesAllocated = 0;
size_t _AllocationSizeThreshold;
size_t _AllocationNumberThreshold;
std::atomic<int64_t> _AllocationSizeCounter;
std::atomic<int64_t> _AllocationNumberCounter;
std::atomic<int64_t> _HitAllocationNumberCounter;
std::atomic<int64_t> _HitAllocationSizeCounter;
int64_t _AllocationSizeCounter = 0;
int64_t _AllocationNumberCounter = 0;
int64_t _HitAllocationNumberCounter = 0;
int64_t _HitAllocationSizeCounter = 0;
#ifdef DEBUG_MONITOR_ALLOCATIONS
MonitorAllocations _Monitor;
#endif
Expand Down
7 changes: 7 additions & 0 deletions include/clasp/llvmo/code.h
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,11 @@ namespace llvmo {
core::T_sp identify_code_or_library(gctools::clasp_ptr_t entry_point);

size_t countObjectFileNames(const std::string& name);
// Maintain the O(1) name->count index used by countObjectFileNames.
// recordObjectFileName is called for every registered object file;
// clearObjectFileNameCounts is called whenever _AllObjectFiles is cleared.
void recordObjectFileName(const std::string& name);
void clearObjectFileNameCounts();

std::string createIRModuleObjectFileName(size_t startupId, std::string& prefix);
bool verifyIRModuleObjectFileStartupSymbol(const std::string& name);
Expand Down Expand Up @@ -456,6 +461,8 @@ template <typename Stage = gctools::RuntimeStage> void registerObjectFile(Object
expected = _lisp->_Roots._AllObjectFiles.load();
entry->rplacd(expected);
} while (!_lisp->_Roots._AllObjectFiles.compare_exchange_weak(expected, entry));
// Keep the O(1) countObjectFileNames index in sync (single add point).
recordObjectFileName(name);
}

}; // namespace llvmo
31 changes: 19 additions & 12 deletions src/core/bytecode.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,9 @@ void BytecodeModule_O::register_for_debug() {
// Note that we check stepping in the callER not the callEE.
// This is so that we could provide the actual source forms, as we already do
// in native code. TODO
static void maybe_step_call(void* frame,
static void maybe_step_call(ThreadLocalState* thread, void* frame,
Function_sp func, size_t nargs, T_O** rargs) {
if (my_thread->_Breakstep) [[unlikely]] {
if (thread->_Breakstep) [[unlikely]] {
ql::list args;
for (size_t iarg = 0; iarg < nargs; ++iarg)
args << T_sp((gctools::Tagged)rargs[iarg]);
Expand Down Expand Up @@ -241,6 +241,12 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure
uintptr_t bytecode_end = (uintptr_t)(bc->rowMajorAddressOfElement_(bc->length()));
#endif
MultipleValues& multipleValues = core::lisp_multipleValues();
// Resolve the thread-local pointer once per VM frame. On Darwin every
// `my_thread` access is a _tlv_get_addr thunk call; the interpreter hits it on
// every call (maybe_step_call's breakstep check) and in several opcodes, so
// caching it here removes a per-call/per-op thunk. The thread does not change
// during a VM frame.
ThreadLocalState* thread = my_thread;
unsigned char* pc = vm._pc;
while (1) {
VM_PC_CHECK(vm, pc, bytecode_start, bytecode_end);
Expand Down Expand Up @@ -288,7 +294,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure
T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs))));
Function_sp func = gc::As_assert<Function_sp>(tfunc);
T_O** args = vm.stackref(sp, nargs - 1);
maybe_step_call(__builtin_frame_address(0), func, nargs, args);
maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
// We push the PC for the debugger (see make_bytecode_frame in backtrace.cc)
// We do this here rather than bytecode_call because e.g. we may call a
// non-bytecode function, that in turn calls a bunch of different bytecode
Expand All @@ -311,7 +317,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure
VM_RECORD_PLAYBACK(func, "vm_call_receive_one_func");
VM_RECORD_PLAYBACK((void*)(uintptr_t)nargs, "vm_call_receive_one_nargs");
T_O** args = vm.stackref(sp, nargs - 1);
maybe_step_call(__builtin_frame_address(0), func, nargs, args);
maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
#if DEBUG_VM_RECORD_PLAYBACK == 1
for (size_t ii = 0; ii < nargs; ii++) {
stringstream name_args;
Expand All @@ -336,7 +342,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure
T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs))));
Function_sp func = gc::As_assert<Function_sp>(tfunc);
T_O** args = vm.stackref(sp, nargs - 1);
maybe_step_call(__builtin_frame_address(0), func, nargs, args);
maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
vm.push(sp, (T_O*)pc);
vm._pc = pc;
vm._stackPointer = sp;
Expand Down Expand Up @@ -690,7 +696,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure
T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs))));
Function_sp func = gc::As_assert<Function_sp>(tfunc);
T_O** args = vm.stackref(sp, nargs - 1);
maybe_step_call(__builtin_frame_address(0), func, nargs, args);
maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
vm.push(sp, (T_O*)pc);
vm._pc = pc;
vm._stackPointer = sp;
Expand All @@ -708,7 +714,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure
T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs))));
Function_sp func = gc::As_assert<Function_sp>(tfunc);
T_O** args = vm.stackref(sp, nargs - 1);
maybe_step_call(__builtin_frame_address(0), func, nargs, args);
maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
vm.push(sp, (T_O*)pc);
vm._pc = pc;
vm._stackPointer = sp;
Expand All @@ -727,7 +733,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure
T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs))));
Function_sp func = gc::As_assert<Function_sp>(tfunc);
T_O** args = vm.stackref(sp, nargs - 1);
maybe_step_call(__builtin_frame_address(0), func, nargs, args);
maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
vm.push(sp, (T_O*)pc);
vm._pc = pc;
vm._stackPointer = sp;
Expand Down Expand Up @@ -1049,6 +1055,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure
static unsigned char* long_dispatch(VirtualMachine& vm, unsigned char* pc, MultipleValues& multipleValues, T_O** literals,
T_O** closed, Closure_O* closure, core::T_O** fp, core::T_O** sp, size_t lcc_nargs,
core::T_O** lcc_args, uint8_t sub_opcode) {
ThreadLocalState* thread = my_thread; // cache TLS pointer once (see bytecode_vm)
switch ((vm_code)sub_opcode) {
case vm_code::ref: {
uint8_t low = *(pc + 1);
Expand Down Expand Up @@ -1083,7 +1090,7 @@ static unsigned char* long_dispatch(VirtualMachine& vm, unsigned char* pc, Multi
T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs))));
Function_sp func = gc::As_assert<Function_sp>(tfunc);
T_O** args = vm.stackref(sp, nargs - 1);
maybe_step_call(__builtin_frame_address(0), func, nargs, args);
maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
vm.push(sp, (T_O*)pc);
vm._pc = pc;
vm._stackPointer = sp;
Expand All @@ -1100,7 +1107,7 @@ static unsigned char* long_dispatch(VirtualMachine& vm, unsigned char* pc, Multi
T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs))));
Function_sp func = gc::As_assert<Function_sp>(tfunc);
T_O** args = vm.stackref(sp, nargs - 1);
maybe_step_call(__builtin_frame_address(0), func, nargs, args);
maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
VM_RECORD_PLAYBACK(func, "vm_call_receive_one_func");
VM_RECORD_PLAYBACK((void*)(uintptr_t)nargs, "vm_call_receive_one_nargs");
#if DEBUG_VM_RECORD_PLAYBACK == 1
Expand Down Expand Up @@ -1129,7 +1136,7 @@ static unsigned char* long_dispatch(VirtualMachine& vm, unsigned char* pc, Multi
T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs))));
Function_sp func = gc::As_assert<Function_sp>(tfunc);
T_O** args = vm.stackref(sp, nargs - 1);
maybe_step_call(__builtin_frame_address(0), func, nargs, args);
maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
vm.push(sp, (T_O*)pc);
vm._pc = pc;
vm._stackPointer = sp;
Expand Down Expand Up @@ -1348,7 +1355,7 @@ static unsigned char* long_dispatch(VirtualMachine& vm, unsigned char* pc, Multi
T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs))));
Function_sp func = gc::As_assert<Function_sp>(tfunc);
T_O** args = vm.stackref(sp, nargs - 1);
maybe_step_call(__builtin_frame_address(0), func, nargs, args);
maybe_step_call(thread, __builtin_frame_address(0), func, nargs, args);
vm.push(sp, (T_O*)pc);
vm._pc = pc;
vm._stackPointer = sp;
Expand Down
83 changes: 83 additions & 0 deletions src/core/lispStream.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2723,6 +2723,89 @@ claspCharacter StringOutputStream_O::write_char(claspCharacter c) {
return c;
}

// Bulk write_string for string-output-streams. The default
// AnsiStream_O::write_string writes one character at a time, and each char
// pays a boxing (clasp_make_character) plus a virtual vectorPushExtend with a
// fill-pointer/realloc check. Here we (1) update the output cursor over the
// appended range exactly as the per-char path would, and (2) append the
// content in one shot with a single geometric growth and the type-safe subseq
// copy used elsewhere for string buffers (see read_line / unsafe_setf_subseq),
// which preserves base/extended conversion and narrowing-error behavior.
void StringOutputStream_O::write_string(String_sp data, cl_index start, cl_index end) {
if (start >= end)
return;
const cl_index count = end - start;
const cl_index oldFill = this->_contents->fillPointer();
const cl_index newFill = oldFill + count;
// Grow the destination once (geometric) so the copy needs no per-character
// reallocation or fill-pointer check. _contents is always an adjustable
// fill-pointer string (an MDArray) for a string-output-stream; resize keeps
// the fill pointer for fill-pointer arrays.
if (newFill > static_cast<cl_index>(this->_contents->arrayTotalSize())) {
cl_index newTotal = static_cast<cl_index>(this->_contents->arrayTotalSize()) * 2;
if (newTotal < newFill)
newTotal = newFill;
gc::As<MDArray_sp>(this->_contents)->resize(newTotal);
}
StreamCursor& cur = this->_output_cursor;
// Resolve the underlying simple vectors so element access is a non-virtual,
// typed indexing operation. Boehm GC is non-moving and the copy loop below
// performs no allocation, so these references stay valid throughout.
AbstractSimpleVector_sp ssv;
size_t s0, s1;
data->asAbstractSimpleVectorRange(ssv, s0, s1);
AbstractSimpleVector_sp dsv;
size_t d0, d1;
this->_contents->asAbstractSimpleVectorRange(dsv, d0, d1);
const cl_index srcOff = static_cast<cl_index>(s0) + start;
const cl_index dstOff = static_cast<cl_index>(d0) + oldFill;
bool fast = true;
if (gc::IsA<SimpleCharacterString_sp>(dsv)) {
SimpleCharacterString_O& dst = *gc::As_unsafe<SimpleCharacterString_sp>(dsv);
if (gc::IsA<SimpleCharacterString_sp>(ssv)) {
SimpleCharacterString_O& src = *gc::As_unsafe<SimpleCharacterString_sp>(ssv);
for (cl_index i = 0; i < count; ++i) {
claspCharacter c = src[srcOff + i];
dst[dstOff + i] = c;
cur.update(c);
}
} else if (gc::IsA<SimpleBaseString_sp>(ssv)) {
SimpleBaseString_O& src = *gc::As_unsafe<SimpleBaseString_sp>(ssv);
for (cl_index i = 0; i < count; ++i) {
claspCharacter c = src[srcOff + i]; // widen 8 -> 32 bits
dst[dstOff + i] = c;
cur.update(c);
}
} else
fast = false;
} else if (gc::IsA<SimpleBaseString_sp>(dsv) && gc::IsA<SimpleBaseString_sp>(ssv)) {
SimpleBaseString_O& dst = *gc::As_unsafe<SimpleBaseString_sp>(dsv);
SimpleBaseString_O& src = *gc::As_unsafe<SimpleBaseString_sp>(ssv);
for (cl_index i = 0; i < count; ++i) {
claspChar c = src[srcOff + i];
dst[dstOff + i] = c;
cur.update(c);
}
} else {
// Character source into a base destination may narrow; defer to the safe
// path which raises the proper type error for out-of-range characters.
fast = false;
}
if (fast) {
this->_contents->fillPointerSet(newFill);
return;
}
// Safe fallback: cursor updated identically to the per-char path, content
// copied with the tested type-checking subseq machinery.
for (cl_index i = start; i < end; ++i)
cur.update(clasp_as_claspCharacter(cl__char(data, i)));
Array_sp src = (start == 0 && static_cast<cl_index>(data->length()) == end)
? gc::As_unsafe<Array_sp>(data)
: gc::As_unsafe<Array_sp>(data->unsafe_subseq(start, end));
this->_contents->unsafe_setf_subseq(oldFill, newFill, src);
this->_contents->fillPointerSet(newFill);
}

T_sp StringOutputStream_O::position() { return Integer_O::create((gc::Fixnum)_contents->fillPointer()); }

void StringOutputStream_O::clear_output() {}
Expand Down
1 change: 1 addition & 0 deletions src/gctools/snapshotSaveLoad.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2321,6 +2321,7 @@ void snapshot_load(void* maybeStartOfSnapshot, void* maybeEndOfSnapshot, const s
my_thread->finish_initialization_main_thread(nil);
// Now we have NIL in 'nil' - use it to initialize a few things.
_lisp->_Roots._AllObjectFiles.store(nil);
llvmo::clearObjectFileNameCounts(); // keep countObjectFileNames index in sync with the cleared list
_lisp->_Roots._AllCodeBlocks.store(nil);
}

Expand Down
8 changes: 7 additions & 1 deletion src/lisp/kernel/lsp/format-pprint.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -902,4 +902,10 @@
*standard-pprint-dispatch* *initial-pprint-dispatch*)
(setf (pprint-dispatch-table-read-only-p *standard-pprint-dispatch*) t)
(setf (first (cdr si::+io-syntax-progv-list+)) *standard-pprint-dispatch*)
(setf *print-pretty* t))
;; Default *print-pretty* to NIL, matching SBCL/CCL/CLISP. clasp's pretty
;; printer is a CLOS Gray stream (a C++<->Lisp generic-dispatch per output
;; character), making it ~370x slower than the ordinary printer; defaulting
;; it on made every princ/prin1/format-~A/print pay that cost. The pretty
;; printer remains available via PPRINT, ~<...~:>, *print-pretty* bindings,
;; and the interactive REPL (which binds it back to T).
(setf *print-pretty* nil))
13 changes: 9 additions & 4 deletions src/lisp/kernel/lsp/top.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -668,10 +668,15 @@ Use special code 0 to cancel this operation.")
(push (subseq line start end) list)))))

(defun tpl-print (values)
(fresh-line)
(dolist (v values)
(prin1 v)
(terpri)))
;; The global default of *print-pretty* is NIL (the ordinary printer is far
;; faster than clasp's CLOS-Gray-stream pretty printer). Re-enable pretty
;; printing for interactive REPL results only, where the per-form cost is
;; irrelevant and nicely-wrapped output is desirable.
(let ((*print-pretty* t))
(fresh-line)
(dolist (v values)
(prin1 v)
(terpri))))

(defun tpl-unknown-command (command)
(format t "Unknown top level command: ~s~%" command)
Expand Down
50 changes: 37 additions & 13 deletions src/llvmo/code.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#include <dlfcn.h>
#include <iomanip>
#include <cstdint>
#include <mutex>
#include <unordered_map>
#include <clasp/core/foundation.h>
#include <clasp/core/lispStream.h>
#include <clasp/core/debugger.h>
Expand Down Expand Up @@ -451,6 +453,7 @@ CL_LISPIFY_NAME(release_object_files);
DOCGROUP(clasp);
CL_DEFUN void release_object_files() {
_lisp->_Roots._AllObjectFiles.store(nil<core::T_O>());
clearObjectFileNameCounts();
core::clasp_write_string("ObjectFiles have been released\n");
}

Expand Down Expand Up @@ -732,21 +735,42 @@ std::string CodeBlock_O::__repr__() const {

void CodeBlock_O::describe() const { printf("%s:%d:%s entered\n", __FILE__, __LINE__, __FUNCTION__); }

// Object-file names are uniquified at registration time
// (ensureUniqueMemoryBufferName -> countObjectFileNames). The original
// countObjectFileNames rescanned the whole _AllObjectFiles list with a memcmp
// per call, which is O(N) per registration and therefore O(N^2) as JITted
// modules accumulate -- a real scalability cost for long-running /
// compilation-heavy processes (it was the #1 self-time function, ~15%, in a
// profiled workload). We maintain an auxiliary name->count index updated at the
// single registration site (recordObjectFileName) and reset whenever
// _AllObjectFiles is cleared (clearObjectFileNameCounts), making the lookup
// O(1). _AllObjectFiles is only ever appended to (registerObjectFile) or
// bulk-cleared -- never individually pruned -- so the index stays in sync. The
// map holds no GC pointers; a mutex guards it because registration can race.
static std::mutex& objectFileNameCountsMutex() {
static std::mutex m;
return m;
}
static std::unordered_map<std::string, size_t>& objectFileNameCounts() {
static std::unordered_map<std::string, size_t> m;
return m;
}

void recordObjectFileName(const std::string& name) {
std::lock_guard<std::mutex> guard(objectFileNameCountsMutex());
++objectFileNameCounts()[name];
}

void clearObjectFileNameCounts() {
std::lock_guard<std::mutex> guard(objectFileNameCountsMutex());
objectFileNameCounts().clear();
}

size_t countObjectFileNames(const std::string& name) {
DEBUG_OBJECT_FILES_PRINT(("%s:%d:%s Lookup name %s\n", __FILE__, __LINE__, __FUNCTION__, name.c_str()));
size_t count = 0;
core::T_sp cur = _lisp->_Roots._AllObjectFiles.load();
while (cur.consp()) {
ObjectFile_sp of = gc::As<ObjectFile_sp>(CONS_CAR(cur));
const char* codeNameStart = (const char*)of->_CodeName->_Data.data();
if (of->_CodeName->length() == name.size()) {
if (memcmp(codeNameStart, name.data(), name.size()) == 0) {
count++;
}
}
cur = CONS_CDR(cur);
}
return count;
std::lock_guard<std::mutex> guard(objectFileNameCountsMutex());
auto it = objectFileNameCounts().find(name);
return it == objectFileNameCounts().end() ? 0 : it->second;
};

CL_DEFUN core::T_sp llvm_sys__allObjectFileNames() {
Expand Down
Loading
Loading