@@ -603,15 +603,19 @@ INTERNAL_HIDDEN void fill_free_bit_slots(iso_alloc_zone_t *zone) {
603603 }
604604 }
605605 } else {
606- for (uint64_t j = 0 ; j < BITS_PER_QWORD ; j += BITS_PER_CHUNK ) {
607- if ((GET_BIT (bts , j )) == 0 ) {
608- free_bit_slots [free_bit_slots_index ] = (bm_idx_shf + j );
609- free_bit_slots_index ++ ;
606+ /* Use ctzll to skip directly to each free slot instead of
607+ * iterating all 32 even-bit positions (Agner Fog §14.3). */
608+ uint64_t free_mask = ~(uint64_t )bts & USED_BIT_VECTOR ;
610609
611- if (UNLIKELY (free_bit_slots_index >= ZONE_FREE_LIST_SZ )) {
612- break ;
613- }
610+ while (free_mask ) {
611+ free_bit_slots [free_bit_slots_index ] = (bm_idx_shf + __builtin_ctzll (free_mask ));
612+ free_bit_slots_index ++ ;
613+
614+ if (UNLIKELY (free_bit_slots_index >= ZONE_FREE_LIST_SZ )) {
615+ break ;
614616 }
617+
618+ free_mask &= free_mask - 1 ; /* clear lowest set bit */
615619 }
616620 }
617621 }
@@ -729,24 +733,19 @@ INTERNAL_HIDDEN bit_slot_t iso_scan_zone_free_slot_slow(iso_alloc_zone_t *zone)
729733
730734 for (bitmap_index_t i = 0 ; i < max ; i += 2 ) {
731735 int64x2_t im = vld1q_s64 (& bm [i ]);
732- int64_t i0 = vgetq_lane_s64 (im , 0 );
733736
734- if (i0 != USED_BIT_VECTOR ) {
735- for (int64_t j = 0 ; j < BITS_PER_QWORD ; j += BITS_PER_CHUNK ) {
736- if ((GET_BIT (i0 , j )) == 0 ) {
737- return ((i << BITS_PER_QWORD_SHIFT ) + j );
738- }
739- }
737+ /* Use ctzll to find the first free slot in O(1) instead of
738+ * scanning bit-by-bit. USED_BIT_VECTOR selects even-position
739+ * bits (one per chunk); inverting gives 1s where chunks are
740+ * free (Agner Fog §14.3 - use bitwise ops on multiple values). */
741+ uint64_t free_mask0 = ~(uint64_t )vgetq_lane_s64 (im , 0 ) & USED_BIT_VECTOR ;
742+ if (free_mask0 ) {
743+ return ((i << BITS_PER_QWORD_SHIFT ) + __builtin_ctzll (free_mask0 ));
740744 }
741745
742- int64_t i1 = vgetq_lane_s64 (im , 1 );
743-
744- if (i1 != USED_BIT_VECTOR ) {
745- for (int64_t j = 0 ; j < BITS_PER_QWORD ; j += BITS_PER_CHUNK ) {
746- if ((GET_BIT (i1 , j )) == 0 ) {
747- return (((i + 1 ) << BITS_PER_QWORD_SHIFT ) + j );
748- }
749- }
746+ uint64_t free_mask1 = ~(uint64_t )vgetq_lane_s64 (im , 1 ) & USED_BIT_VECTOR ;
747+ if (free_mask1 ) {
748+ return (((i + 1 ) << BITS_PER_QWORD_SHIFT ) + __builtin_ctzll (free_mask1 ));
750749 }
751750 }
752751
@@ -758,28 +757,26 @@ INTERNAL_HIDDEN bit_slot_t iso_scan_zone_free_slot_slow(iso_alloc_zone_t *zone)
758757 max = (zone -> max_bitmap_idx >> 1 );
759758
760759 for (size_t i = 0 ; i < max ; i ++ ) {
761- __int128_t bts = ebm [i ];
760+ unsigned __int128 bts = ( unsigned __int128 ) ebm [i ];
762761
763- for (int64_t j = 0 ; j < BITS_PER_ODWORD ; j += BITS_PER_CHUNK ) {
764- if ((GET_BIT (bts , j )) == 0 ) {
765- return ((i << BITS_PER_ODWORD_SHIFT ) + j );
766- }
762+ /* Split 128-bit word into two 64-bit halves and use ctzll */
763+ uint64_t free_lo = ~(uint64_t )bts & USED_BIT_VECTOR ;
764+ if (free_lo ) {
765+ return ((i << BITS_PER_ODWORD_SHIFT ) + __builtin_ctzll (free_lo ));
766+ }
767+
768+ uint64_t free_hi = ~(uint64_t )(bts >> 64 ) & USED_BIT_VECTOR ;
769+ if (free_hi ) {
770+ return ((i << BITS_PER_ODWORD_SHIFT ) + 64 + __builtin_ctzll (free_hi ));
767771 }
768772 }
769773#endif
770774 bm = (bitmap_index_t * ) zone -> bitmap_start ;
771775
772776 for (bitmap_index_t i = max ; i < zone -> max_bitmap_idx ; i ++ ) {
773- bit_slot_t bts = bm [i ];
774-
775- if (bts != USED_BIT_VECTOR ) {
776- continue ;
777- }
778-
779- for (int64_t j = 0 ; j < BITS_PER_QWORD ; j += BITS_PER_CHUNK ) {
780- if ((GET_BIT (bts , j )) == 0 ) {
781- return ((i << BITS_PER_QWORD_SHIFT ) + j );
782- }
777+ uint64_t free_mask = ~(uint64_t )bm [i ] & USED_BIT_VECTOR ;
778+ if (free_mask ) {
779+ return ((i << BITS_PER_QWORD_SHIFT ) + __builtin_ctzll (free_mask ));
783780 }
784781 }
785782
@@ -1130,11 +1127,13 @@ INTERNAL_HIDDEN ASSUME_ALIGNED void *_iso_alloc(iso_alloc_zone_t *zone, size_t s
11301127 /* Hot Path: Check the zone cache for a zone this
11311128 * thread recently used for an alloc/free operation.
11321129 * It's likely we are allocating a similar size chunk
1133- * and this will speed up that operation */
1130+ * and this will speed up that operation.
1131+ * Scan newest-to-oldest (LIFO) since the most recently
1132+ * used zone is most likely to have free slots available. */
11341133 size_t _zone_cache_count = zone_cache_count ;
11351134 _tzc * tzc = zone_cache ;
11361135
1137- for (size_t i = 0 ; i < _zone_cache_count ; i ++ ) {
1136+ for (size_t i = _zone_cache_count ; i -- > 0 ; ) {
11381137 if (tzc [i ].chunk_size >= size ) {
11391138 iso_alloc_zone_t * _zone = tzc [i ].zone ;
11401139 if (is_zone_usable (_zone , size ) != NULL ) {
@@ -1415,15 +1414,15 @@ INTERNAL_HIDDEN void iso_free_chunk_from_zone(iso_alloc_zone_t *zone, void *rest
14151414#else
14161415 const uint64_t chunk_offset = (uint64_t ) (p - UNMASK_USER_PTR (zone ));
14171416#endif
1418-
1419- const size_t chunk_number = (chunk_offset / zone -> chunk_size );
1417+ const size_t chunk_size = zone -> chunk_size ;
1418+ const size_t chunk_number = (chunk_offset / chunk_size );
14201419 const bit_slot_t bit_slot = (chunk_number << BITS_PER_CHUNK_SHIFT );
14211420 const bit_slot_t dwords_to_bit_slot = (bit_slot >> BITS_PER_QWORD_SHIFT );
14221421
14231422 /* Ensure the pointer is a multiple of chunk size. */
1424- if (UNLIKELY ((chunk_offset % zone -> chunk_size ) != 0 )) {
1423+ if (UNLIKELY ((chunk_offset % chunk_size ) != 0 )) {
14251424 LOG_AND_ABORT ("Chunk %d at 0x%p is not a multiple of zone[%d] chunk size %d. Off by %lu bits" ,
1426- chunk_offset , p , zone -> index , zone -> chunk_size , (chunk_offset & (zone -> chunk_size - 1 )));
1425+ chunk_offset , p , zone -> index , chunk_size , (chunk_offset & (chunk_size - 1 )));
14271426 }
14281427
14291428 if (UNLIKELY (dwords_to_bit_slot > zone -> max_bitmap_idx )) {
@@ -1455,10 +1454,10 @@ INTERNAL_HIDDEN void iso_free_chunk_from_zone(iso_alloc_zone_t *zone, void *rest
14551454 UNSET_BIT (b , which_bit );
14561455 insert_free_bit_slot (zone , bit_slot );
14571456#if !ENABLE_ASAN && SANITIZE_CHUNKS
1458- __iso_memset (p , POISON_BYTE , zone -> chunk_size );
1457+ __iso_memset (p , POISON_BYTE , chunk_size );
14591458#endif
14601459 } else {
1461- __iso_memset (p , POISON_BYTE , zone -> chunk_size );
1460+ __iso_memset (p , POISON_BYTE , chunk_size );
14621461 }
14631462
14641463 bm [dwords_to_bit_slot ] = b ;
@@ -1475,14 +1474,14 @@ INTERNAL_HIDDEN void iso_free_chunk_from_zone(iso_alloc_zone_t *zone, void *rest
14751474 if ((chunk_number + 1 ) != zone -> chunk_count ) {
14761475 const bit_slot_t bit_slot_over = ((chunk_number + 1 ) << BITS_PER_CHUNK_SHIFT );
14771476 if ((GET_BIT (bm [(bit_slot_over >> BITS_PER_QWORD_SHIFT )], (WHICH_BIT (bit_slot_over ) + 1 ))) == 1 ) {
1478- check_canary (zone , p + zone -> chunk_size );
1477+ check_canary (zone , p + chunk_size );
14791478 }
14801479 }
14811480
14821481 if (chunk_number != 0 ) {
14831482 const bit_slot_t bit_slot_under = ((chunk_number - 1 ) << BITS_PER_CHUNK_SHIFT );
14841483 if ((GET_BIT (bm [(bit_slot_under >> BITS_PER_QWORD_SHIFT )], (WHICH_BIT (bit_slot_under ) + 1 ))) == 1 ) {
1485- check_canary (zone , p - zone -> chunk_size );
1484+ check_canary (zone , p - chunk_size );
14861485 }
14871486 }
14881487#endif
0 commit comments