Skip to content

Commit ed74ca7

Browse files
committed
claude driven performance improvements
1 parent 3ddc64b commit ed74ca7

File tree

2 files changed

+64
-60
lines changed

2 files changed

+64
-60
lines changed

include/iso_alloc_ds.h

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ typedef int64_t bitmap_index_t;
1919
typedef uint16_t zone_lookup_table_t;
2020
typedef uint16_t chunk_lookup_table_t;
2121

22-
#if ZONE_FREE_LIST_SZ > 255
22+
#if ZONE_FREE_LIST_SZ >= 255
2323
typedef uint16_t free_bit_slot_t;
2424
#define FREE_LIST_SHF 16
2525
#else
@@ -28,31 +28,36 @@ typedef uint8_t free_bit_slot_t;
2828
#endif
2929

3030
typedef struct {
31+
/* Hot fields: all fit within the first 64-byte cache line.
32+
* These are accessed on every alloc/free operation, so keeping
33+
* them co-located avoids extra cache misses (Agner Fog §9.4). */
3134
void *user_pages_start; /* Start of the pages backing this zone */
3235
void *bitmap_start; /* Start of the bitmap */
3336
int64_t next_free_bit_slot; /* The last bit slot returned by get_next_free_bit_slot */
34-
bit_slot_t free_bit_slots[ZONE_FREE_LIST_SZ]; /* A cache of bit slots that point to freed chunks */
3537
uint64_t canary_secret; /* Each zone has its own canary secret */
3638
uint64_t pointer_mask; /* Each zone has its own pointer protection secret */
3739
bitmap_index_t max_bitmap_idx; /* Max bitmap index for this bitmap */
3840
uint32_t chunk_size; /* Size of chunks managed by this zone */
41+
free_bit_slot_t free_bit_slots_usable; /* The oldest members of the free cache are served first */
42+
free_bit_slot_t free_bit_slots_index; /* Tracks how many entries in the cache are filled */
43+
bool is_full; /* Flags whether this zone is full to avoid bit slot searches */
44+
bool internal; /* Zones can be managed by iso_alloc or private */
45+
#if MEMORY_TAGGING
46+
bool tagged; /* Zone supports memory tagging */
47+
#endif
48+
int8_t preallocated_bitmap_idx; /* The bitmap is preallocated and its index */
49+
#if CPU_PIN
50+
uint8_t cpu_core; /* What CPU core this zone is pinned to */
51+
#endif
52+
/* Warm/cold fields: accessed less frequently */
3953
uint32_t bitmap_size; /* Size of the bitmap in bytes */
4054
uint32_t af_count; /* Increment/Decrement with each alloc/free operation */
4155
uint32_t chunk_count; /* Total number of chunks in this zone */
4256
uint32_t alloc_count; /* Total number of lifetime allocations */
4357
uint16_t index; /* Zone index */
4458
uint16_t next_sz_index; /* What is the index of the next zone of this size */
45-
free_bit_slot_t free_bit_slots_index; /* Tracks how many entries in the cache are filled */
46-
free_bit_slot_t free_bit_slots_usable; /* The oldest members of the free cache are served first */
47-
int8_t preallocated_bitmap_idx; /* The bitmap is preallocated and its index */
48-
#if CPU_PIN
49-
uint8_t cpu_core; /* What CPU core this zone is pinned to */
50-
#endif
51-
bool internal; /* Zones can be managed by iso_alloc or private */
52-
bool is_full; /* Flags whether this zone is full to avoid bit slot searches */
53-
#if MEMORY_TAGGING
54-
bool tagged; /* Zone supports memory tagging */
55-
#endif
59+
/* Large cold array: only accessed when refilling the free list */
60+
bit_slot_t free_bit_slots[ZONE_FREE_LIST_SZ]; /* A cache of bit slots that point to freed chunks */
5661
} __attribute__((packed, aligned(sizeof(int64_t)))) iso_alloc_zone_t;
5762

5863
/* Meta data for big allocations are allocated near the

src/iso_alloc.c

Lines changed: 46 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -603,15 +603,19 @@ INTERNAL_HIDDEN void fill_free_bit_slots(iso_alloc_zone_t *zone) {
603603
}
604604
}
605605
} else {
606-
for(uint64_t j = 0; j < BITS_PER_QWORD; j += BITS_PER_CHUNK) {
607-
if((GET_BIT(bts, j)) == 0) {
608-
free_bit_slots[free_bit_slots_index] = (bm_idx_shf + j);
609-
free_bit_slots_index++;
606+
/* Use ctzll to skip directly to each free slot instead of
607+
* iterating all 32 even-bit positions (Agner Fog §14.3). */
608+
uint64_t free_mask = ~(uint64_t)bts & USED_BIT_VECTOR;
610609

611-
if(UNLIKELY(free_bit_slots_index >= ZONE_FREE_LIST_SZ)) {
612-
break;
613-
}
610+
while(free_mask) {
611+
free_bit_slots[free_bit_slots_index] = (bm_idx_shf + __builtin_ctzll(free_mask));
612+
free_bit_slots_index++;
613+
614+
if(UNLIKELY(free_bit_slots_index >= ZONE_FREE_LIST_SZ)) {
615+
break;
614616
}
617+
618+
free_mask &= free_mask - 1; /* clear lowest set bit */
615619
}
616620
}
617621
}
@@ -729,24 +733,19 @@ INTERNAL_HIDDEN bit_slot_t iso_scan_zone_free_slot_slow(iso_alloc_zone_t *zone)
729733

730734
for(bitmap_index_t i = 0; i < max; i += 2) {
731735
int64x2_t im = vld1q_s64(&bm[i]);
732-
int64_t i0 = vgetq_lane_s64(im, 0);
733736

734-
if(i0 != USED_BIT_VECTOR) {
735-
for(int64_t j = 0; j < BITS_PER_QWORD; j += BITS_PER_CHUNK) {
736-
if((GET_BIT(i0, j)) == 0) {
737-
return ((i << BITS_PER_QWORD_SHIFT) + j);
738-
}
739-
}
737+
/* Use ctzll to find the first free slot in O(1) instead of
738+
* scanning bit-by-bit. USED_BIT_VECTOR selects even-position
739+
* bits (one per chunk); inverting gives 1s where chunks are
740+
* free (Agner Fog §14.3 - use bitwise ops on multiple values). */
741+
uint64_t free_mask0 = ~(uint64_t)vgetq_lane_s64(im, 0) & USED_BIT_VECTOR;
742+
if(free_mask0) {
743+
return ((i << BITS_PER_QWORD_SHIFT) + __builtin_ctzll(free_mask0));
740744
}
741745

742-
int64_t i1 = vgetq_lane_s64(im, 1);
743-
744-
if(i1 != USED_BIT_VECTOR) {
745-
for(int64_t j = 0; j < BITS_PER_QWORD; j += BITS_PER_CHUNK) {
746-
if((GET_BIT(i1, j)) == 0) {
747-
return (((i + 1) << BITS_PER_QWORD_SHIFT) + j);
748-
}
749-
}
746+
uint64_t free_mask1 = ~(uint64_t)vgetq_lane_s64(im, 1) & USED_BIT_VECTOR;
747+
if(free_mask1) {
748+
return (((i + 1) << BITS_PER_QWORD_SHIFT) + __builtin_ctzll(free_mask1));
750749
}
751750
}
752751

@@ -758,28 +757,26 @@ INTERNAL_HIDDEN bit_slot_t iso_scan_zone_free_slot_slow(iso_alloc_zone_t *zone)
758757
max = (zone->max_bitmap_idx >> 1);
759758

760759
for(size_t i = 0; i < max; i++) {
761-
__int128_t bts = ebm[i];
760+
unsigned __int128 bts = (unsigned __int128)ebm[i];
762761

763-
for(int64_t j = 0; j < BITS_PER_ODWORD; j += BITS_PER_CHUNK) {
764-
if((GET_BIT(bts, j)) == 0) {
765-
return ((i << BITS_PER_ODWORD_SHIFT) + j);
766-
}
762+
/* Split 128-bit word into two 64-bit halves and use ctzll */
763+
uint64_t free_lo = ~(uint64_t)bts & USED_BIT_VECTOR;
764+
if(free_lo) {
765+
return ((i << BITS_PER_ODWORD_SHIFT) + __builtin_ctzll(free_lo));
766+
}
767+
768+
uint64_t free_hi = ~(uint64_t)(bts >> 64) & USED_BIT_VECTOR;
769+
if(free_hi) {
770+
return ((i << BITS_PER_ODWORD_SHIFT) + 64 + __builtin_ctzll(free_hi));
767771
}
768772
}
769773
#endif
770774
bm = (bitmap_index_t *) zone->bitmap_start;
771775

772776
for(bitmap_index_t i = max; i < zone->max_bitmap_idx; i++) {
773-
bit_slot_t bts = bm[i];
774-
775-
if(bts != USED_BIT_VECTOR) {
776-
continue;
777-
}
778-
779-
for(int64_t j = 0; j < BITS_PER_QWORD; j += BITS_PER_CHUNK) {
780-
if((GET_BIT(bts, j)) == 0) {
781-
return ((i << BITS_PER_QWORD_SHIFT) + j);
782-
}
777+
uint64_t free_mask = ~(uint64_t)bm[i] & USED_BIT_VECTOR;
778+
if(free_mask) {
779+
return ((i << BITS_PER_QWORD_SHIFT) + __builtin_ctzll(free_mask));
783780
}
784781
}
785782

@@ -1130,11 +1127,13 @@ INTERNAL_HIDDEN ASSUME_ALIGNED void *_iso_alloc(iso_alloc_zone_t *zone, size_t s
11301127
/* Hot Path: Check the zone cache for a zone this
11311128
* thread recently used for an alloc/free operation.
11321129
* It's likely we are allocating a similar size chunk
1133-
* and this will speed up that operation */
1130+
* and this will speed up that operation.
1131+
* Scan newest-to-oldest (LIFO) since the most recently
1132+
* used zone is most likely to have free slots available. */
11341133
size_t _zone_cache_count = zone_cache_count;
11351134
_tzc *tzc = zone_cache;
11361135

1137-
for(size_t i = 0; i < _zone_cache_count; i++) {
1136+
for(size_t i = _zone_cache_count; i-- > 0;) {
11381137
if(tzc[i].chunk_size >= size) {
11391138
iso_alloc_zone_t *_zone = tzc[i].zone;
11401139
if(is_zone_usable(_zone, size) != NULL) {
@@ -1415,15 +1414,15 @@ INTERNAL_HIDDEN void iso_free_chunk_from_zone(iso_alloc_zone_t *zone, void *rest
14151414
#else
14161415
const uint64_t chunk_offset = (uint64_t) (p - UNMASK_USER_PTR(zone));
14171416
#endif
1418-
1419-
const size_t chunk_number = (chunk_offset / zone->chunk_size);
1417+
const size_t chunk_size = zone->chunk_size;
1418+
const size_t chunk_number = (chunk_offset / chunk_size);
14201419
const bit_slot_t bit_slot = (chunk_number << BITS_PER_CHUNK_SHIFT);
14211420
const bit_slot_t dwords_to_bit_slot = (bit_slot >> BITS_PER_QWORD_SHIFT);
14221421

14231422
/* Ensure the pointer is a multiple of chunk size. */
1424-
if(UNLIKELY((chunk_offset % zone->chunk_size) != 0)) {
1423+
if(UNLIKELY((chunk_offset % chunk_size) != 0)) {
14251424
LOG_AND_ABORT("Chunk %d at 0x%p is not a multiple of zone[%d] chunk size %d. Off by %lu bits",
1426-
chunk_offset, p, zone->index, zone->chunk_size, (chunk_offset & (zone->chunk_size - 1)));
1425+
chunk_offset, p, zone->index, chunk_size, (chunk_offset & (chunk_size - 1)));
14271426
}
14281427

14291428
if(UNLIKELY(dwords_to_bit_slot > zone->max_bitmap_idx)) {
@@ -1455,10 +1454,10 @@ INTERNAL_HIDDEN void iso_free_chunk_from_zone(iso_alloc_zone_t *zone, void *rest
14551454
UNSET_BIT(b, which_bit);
14561455
insert_free_bit_slot(zone, bit_slot);
14571456
#if !ENABLE_ASAN && SANITIZE_CHUNKS
1458-
__iso_memset(p, POISON_BYTE, zone->chunk_size);
1457+
__iso_memset(p, POISON_BYTE, chunk_size);
14591458
#endif
14601459
} else {
1461-
__iso_memset(p, POISON_BYTE, zone->chunk_size);
1460+
__iso_memset(p, POISON_BYTE, chunk_size);
14621461
}
14631462

14641463
bm[dwords_to_bit_slot] = b;
@@ -1475,14 +1474,14 @@ INTERNAL_HIDDEN void iso_free_chunk_from_zone(iso_alloc_zone_t *zone, void *rest
14751474
if((chunk_number + 1) != zone->chunk_count) {
14761475
const bit_slot_t bit_slot_over = ((chunk_number + 1) << BITS_PER_CHUNK_SHIFT);
14771476
if((GET_BIT(bm[(bit_slot_over >> BITS_PER_QWORD_SHIFT)], (WHICH_BIT(bit_slot_over) + 1))) == 1) {
1478-
check_canary(zone, p + zone->chunk_size);
1477+
check_canary(zone, p + chunk_size);
14791478
}
14801479
}
14811480

14821481
if(chunk_number != 0) {
14831482
const bit_slot_t bit_slot_under = ((chunk_number - 1) << BITS_PER_CHUNK_SHIFT);
14841483
if((GET_BIT(bm[(bit_slot_under >> BITS_PER_QWORD_SHIFT)], (WHICH_BIT(bit_slot_under) + 1))) == 1) {
1485-
check_canary(zone, p - zone->chunk_size);
1484+
check_canary(zone, p - chunk_size);
14861485
}
14871486
}
14881487
#endif

0 commit comments

Comments (0)