Skip to content

Commit 6599811

Browse files
authored
Merge pull request #237 from struct/perf_improvements_3_26
Perf improvements 3 26
2 parents 9c7b930 + b6276f8 commit 6599811

File tree

10 files changed

+177
-79
lines changed

10 files changed

+177
-79
lines changed

Makefile

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -229,9 +229,14 @@ PROTECT_FREE_BIG_ZONES = -DPROTECT_FREE_BIG_ZONES=0
229229
## incurs a small performance cost
230230
MASK_PTRS = -DMASK_PTRS=1
231231

232-
## IsoAlloc uses ARM64 Neon instructions where possible. You can
233-
## explicitly disable that here
232+
## IsoAlloc uses ARM64 Neon instructions where possible. Automatically
233+
## enabled on ARM/AArch64 hosts, disabled everywhere else.
234+
ARCH := $(shell uname -m)
235+
ifneq ($(filter aarch64 arm%,$(ARCH)),)
236+
DONT_USE_NEON = -DDONT_USE_NEON=0
237+
else
234238
DONT_USE_NEON = -DDONT_USE_NEON=1
239+
endif
235240

236241
## We start with the standard C++ specifics but giving
237242
## the liberty to choose the gnu++* variants and/or
@@ -350,9 +355,22 @@ library: clean
350355
## ABORT_ON_UNOWNED_PTR=0 silently drops pointers not owned by isoalloc
351356
## (e.g. those allocated by libc before the isoalloc constructor fires)
352357
## instead of aborting. All other flags are identical to 'library'.
353-
library_perf: ABORT_ON_UNOWNED_PTR = -DABORT_ON_UNOWNED_PTR=0
354-
library_perf: clean
355-
@echo "make library_perf"
358+
library_less_strict: ABORT_ON_UNOWNED_PTR = -DABORT_ON_UNOWNED_PTR=0
359+
library_less_strict: clean
360+
@echo "make library_less_strict"
361+
$(CC) $(CFLAGS) $(LIBRARY) $(OPTIMIZE) $(OS_FLAGS) $(C_SRCS) -o $(BUILD_DIR)/$(LIBNAME)
362+
$(STRIP)
363+
364+
## Build a performance-optimized library with the most expensive security
365+
## features disabled. Intended for benchmarking and performance measurement.
366+
## All other flags inherit from the top-level defaults.
367+
library_benchmark: DISABLE_CANARY = -DDISABLE_CANARY=1
368+
library_benchmark: PRE_POPULATE_PAGES = -DPRE_POPULATE_PAGES=1
369+
library_benchmark: RANDOMIZE_FREELIST = -DRANDOMIZE_FREELIST=0
370+
library_benchmark: MASK_PTRS = -DMASK_PTRS=0
371+
library_benchmark: ABORT_ON_UNOWNED_PTR = -DABORT_ON_UNOWNED_PTR=0
372+
library_benchmark: clean
373+
@echo "make library_benchmark"
356374
$(CC) $(CFLAGS) $(LIBRARY) $(OPTIMIZE) $(OS_FLAGS) $(C_SRCS) -o $(BUILD_DIR)/$(LIBNAME)
357375
$(STRIP)
358376

@@ -456,7 +474,7 @@ libc_sanity_tests: clean library_debug_unit_tests
456474
$(CC) $(CFLAGS) $(EXE_CFLAGS) $(DEBUG_LOG_FLAGS) $(GDB_FLAGS) $(OS_FLAGS) tests/memcpy_sanity.c $(ISO_ALLOC_PRINTF_SRC) -o $(BUILD_DIR)/memcpy_sanity $(LDFLAGS)
457475
$(CC) $(CFLAGS) $(EXE_CFLAGS) $(DEBUG_LOG_FLAGS) $(GDB_FLAGS) $(OS_FLAGS) tests/memmove_sanity.c $(ISO_ALLOC_PRINTF_SRC) -o $(BUILD_DIR)/memmove_sanity $(LDFLAGS)
458476
$(CC) $(CFLAGS) $(EXE_CFLAGS) $(DEBUG_LOG_FLAGS) $(GDB_FLAGS) $(OS_FLAGS) tests/bzero_sanity.c $(ISO_ALLOC_PRINTF_SRC) -o $(BUILD_DIR)/bzero_sanity $(LDFLAGS)
459-
build/memset_sanity ; build/memcpy_sanity; build/memmove_sanity; build/bzero_sanity ;
477+
LD_LIBRARY_PATH=build/ build/memset_sanity ; LD_LIBRARY_PATH=build/ build/memcpy_sanity; LD_LIBRARY_PATH=build/ build/memmove_sanity; LD_LIBRARY_PATH=build/ build/bzero_sanity
460478

461479
fuzz_test: clean library_debug_unit_tests
462480
@echo "make fuzz_test"

PERFORMANCE.md

Lines changed: 33 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,16 @@ If you know your program will not require multi-threaded access to IsoAlloc you
3434

3535
`DISABLE_CANARY` can be set to 1 to disable the creation and verification of canary chunks. This removes a useful security feature but will significantly improve performance and RSS.
3636

37+
`MASK_PTRS` is enabled by default and causes the `user_pages_start` and `bitmap_start` pointers stored in every zone's metadata to be XOR'd with a per-zone random secret between alloc and free operations. This protects against attackers who can read or corrupt zone metadata. Each alloc and free pays a small cost for these XOR operations. Setting `MASK_PTRS=0` removes this overhead at the cost of this security property.
38+
39+
`CANARY_COUNT_DIV` in `conf.h` controls what fraction of chunks in a zone are reserved as canaries. It is used as a right-shift on the total chunk count: `chunk_count >> CANARY_COUNT_DIV`. The default value of 7 reserves less than 1% of chunks. Increasing this value reduces canary density and frees more chunks for user allocations; decreasing it increases security coverage at the cost of usable memory.
40+
41+
`ZONE_ALLOC_RETIRE` in `conf.h` controls how frequently zones are retired and replaced. A zone is retired once it has completed `ZONE_ALLOC_RETIRE * max_chunk_count_for_zone` total alloc/free cycles. Lowering this value causes zones to be replaced more often, reducing the window for use-after-free exploitation but increasing the frequency of zone creation. `BIG_ZONE_ALLOC_RETIRE` is the equivalent for big zones.
42+
43+
`SMALL_MEM_STARTUP` reduces the number and size of default zones created at startup. This decreases initial RSS at the cost of more frequent zone creation for programs with diverse allocation sizes.
44+
45+
`STRONG_SIZE_ISOLATION` enforces stricter isolation by size class. When enabled, chunk sizes are rounded up to a smaller set of buckets which increases isolation between differently-sized allocations. This may increase per-allocation waste but reduces cross-size heap exploitation primitives.
46+
3747
By default IsoAlloc will attempt to use Huge Pages (for both Linux and Mac OS) for any allocations that are a multiple of 2 mb in size. This is the default huge page size on most systems but it might not be on yours. On Linux you can check the value for your system by running the following command:
3848

3949
```
@@ -143,35 +153,37 @@ iso_realloc/iso_free 1834784 tests completed in 0.901481 seconds
143153
The following benchmarks were collected from [mimalloc-bench](https://github.com/daanx/mimalloc-bench) with the default configuration of IsoAlloc. As you can see from the data IsoAlloc is competitive with other allocators for some benchmarks but clearly falls behind on others. For any benchmark that IsoAlloc scores poorly on I was able to tweak its build to improve the CPU time and memory consumption. It's worth noting that IsoAlloc was able to stay competitive even with performing many security checks not present in other allocators. Please note these are 'best case' measurements, not averages.
144154

145155
```
156+
make library_benchmark
157+
146158
#------------------------------------------------------------------
147159
# test alloc time rss user sys page-faults page-reclaims
148-
cfrac je 02.99 4912 2.99 0.00 0 454
149-
cfrac mi 03.01 2484 3.00 0.00 0 346
150-
cfrac iso 05.84 26616 5.75 0.09 0 6502
160+
cfrac je 03.07 4552 3.06 0.00 0 454
161+
cfrac mi 02.97 2484 2.96 0.00 0 347
162+
cfrac iso 04.78 30612 4.69 0.09 0 7503
151163
152-
espresso je 02.52 4872 2.50 0.01 0 538
153-
espresso mi 02.46 3060 2.45 0.01 0 3637
154-
espresso iso 03.65 69876 3.56 0.09 0 21695
164+
espresso je 02.51 4872 2.50 0.01 0 540
165+
espresso mi 02.43 3032 2.42 0.01 0 3630
166+
espresso iso 03.16 69608 3.07 0.07 0 30334
155167
156-
barnes je 01.62 60268 1.59 0.02 0 16687
157-
barnes mi 01.71 57672 1.68 0.02 0 16550
158-
barnes iso 01.66 74628 1.62 0.03 0 20851
168+
barnes je 01.71 59900 1.67 0.03 0 16686
169+
barnes mi 01.65 57672 1.62 0.02 0 16550
170+
barnes iso 01.65 74812 1.62 0.03 0 20849
159171
160-
gs je 00.16 37592 0.15 0.01 0 5808
161-
gs mi 00.16 32588 0.13 0.02 0 5109
162-
gs iso 00.23 71152 0.16 0.07 0 19698
172+
gs je 00.17 37748 0.15 0.01 0 5814
173+
gs mi 00.16 33888 0.14 0.01 0 5109
174+
gs iso 00.22 68136 0.15 0.06 0 18916
163175
164-
larsonN je 1.171 266596 98.81 0.92 0 409842
165-
larsonN mi 1.016 299768 99.38 0.44 0 83755
166-
larsonN iso 918.582 126528 99.64 0.37 0 31368
176+
larsonN je 1.188 261884 98.91 0.92 0 421848
177+
larsonN mi 1.016 299752 99.53 0.38 0 80202
178+
larsonN iso 1328.904 121096 6.15 69.78 0 30219
167179
168-
rocksdb je 02.48 162424 2.05 0.63 0 38384
169-
rocksdb mi 02.48 159812 2.04 0.66 0 37464
170-
rocksdb iso 02.74 197220 2.49 0.55 0 46815
180+
rocksdb je 02.46 162340 2.05 0.63 0 38383
181+
rocksdb mi 02.33 160156 1.92 0.63 0 37585
182+
rocksdb iso 02.96 195948 2.64 0.66 0 46584
171183
172-
redis je 3.180 9496 0.14 0.02 0 1538
173-
redis mi 3.080 7088 0.12 0.03 0 1256
174-
redis iso 6.880 52816 0.31 0.05 0 16317
184+
redis je 3.160 9492 0.13 0.02 0 1528
185+
redis mi 2.780 7084 0.12 0.02 0 1257
186+
redis iso 7.579 50516 0.35 0.05 0 15187
175187
```
176188

177189
IsoAlloc isn't quite ready for performance sensitive server workloads. However it's more than fast enough for client side mobile/desktop applications with risky C/C++ attack surfaces. These environments have threat models similar to what IsoAlloc was designed for.

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ When enabled, the `CPU_PIN` feature will restrict allocations from a given zone
6161
* A chunk can be permanently free'd with a call to `iso_free_permanently`.
6262
* If `SANITIZE_CHUNKS` is set all user chunks are cleared when passed to `iso_free` with the constant `0xde`.
6363
* When freeing a chunk the canary in adjacent chunks above/below are verified.
64-
* Some important zone metadata pointers are masked in-between `iso_alloc` and `iso_free` operations.
64+
* When `MASK_PTRS` is enabled (default) the `user_pages_start` and `bitmap_start` pointers stored in zone metadata are XOR'd with a per-zone random secret between alloc and free operations, making them useless to an attacker who reads or corrupts zone metadata.
6565
* Passing a pointer to `iso_free` that was not allocated with `iso_alloc` will abort.
6666
* Pointers passed to `iso_free` must be 8 byte aligned, and a multiple of the zone chunk size.
6767
* The free bit slot cache provides a chunk quarantine or delayed free mechanism.
@@ -76,6 +76,7 @@ When enabled, the `CPU_PIN` feature will restrict allocations from a given zone
7676
* Randomized hints are passed to `mmap` to ensure contiguous page ranges are not allocated.
7777
* When `ABORT_ON_NULL` is enabled IsoAlloc will abort instead of returning `NULL`.
7878
* By default `NO_ZERO_ALLOCATIONS` will return a pointer to a page marked `PROT_NONE` for all `0` sized allocations.
79+
* When `ABORT_ON_UNOWNED_PTR` is enabled (default) IsoAlloc will abort whenever it is passed a pointer it does not own.
7980
* When `ABORT_NO_ENTROPY` is enabled IsoAlloc will abort when it can't gather enough entropy.
8081
* When `RANDOMIZE_FREELIST` is enabled IsoAlloc will randomize the free list upon creation. May have a perf hit.
8182
* Zones are retired and replaced after they've allocated and freed a specific number of chunks. This is calculated as `ZONE_ALLOC_RETIRE * max_chunk_count_for_zone`.
@@ -94,6 +95,8 @@ The Makefile targets are very simple:
9495

9596
`make library` - Builds a release version of the library without C++ support
9697

98+
`make library_less_strict` - Builds a release library with `ABORT_ON_UNOWNED_PTR=0`. Recommended when using IsoAlloc via `LD_PRELOAD`.
99+
97100
`make library_debug` - Builds a debug version of the library
98101

99102
`make library_debug_no_output` - Builds a debug version of the library with no logging output

include/iso_alloc_ds.h

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
#define SZ_TO_ZONE_LOOKUP_IDX(size) size >> 4
1313

1414
#define CHUNK_TO_ZONE_TABLE_SZ (65535 * sizeof(uint16_t))
15-
#define ADDR_TO_CHUNK_TABLE(p) (((uintptr_t) p >> 32) & 0xffff)
15+
#define ADDR_TO_CHUNK_TABLE(p) (((uintptr_t) p >> 22) & 0xffff)
1616

1717
typedef int64_t bit_slot_t;
1818
typedef int64_t bitmap_index_t;
@@ -36,7 +36,7 @@ typedef struct {
3636
int64_t next_free_bit_slot; /* The last bit slot returned by get_next_free_bit_slot */
3737
uint64_t canary_secret; /* Each zone has its own canary secret */
3838
uint64_t pointer_mask; /* Each zone has its own pointer protection secret */
39-
bitmap_index_t max_bitmap_idx; /* Max bitmap index for this bitmap */
39+
uint16_t max_bitmap_idx; /* Max bitmap index for this bitmap */
4040
uint32_t chunk_size; /* Size of chunks managed by this zone */
4141
free_bit_slot_t free_bit_slots_usable; /* The oldest members of the free cache are served first */
4242
free_bit_slot_t free_bit_slots_index; /* Tracks how many entries in the cache are filled */
@@ -50,7 +50,7 @@ typedef struct {
5050
uint8_t cpu_core; /* What CPU core this zone is pinned to */
5151
#endif
5252
/* Warm/cold fields: accessed less frequently */
53-
uint32_t bitmap_size; /* Size of the bitmap in bytes */
53+
uint16_t bitmap_size; /* Size of the bitmap in bytes */
5454
uint32_t af_count; /* Increment/Decrement with each alloc/free operation */
5555
uint32_t chunk_count; /* Total number of chunks in this zone */
5656
uint32_t alloc_count; /* Total number of lifetime allocations */
@@ -133,9 +133,12 @@ typedef struct {
133133
* it can find the next zone that holds the same size
134134
* chunks. The lookup table helps us find the first zone
135135
* that holds a specific size in O(1) time */
136-
zone_lookup_table_t zone_lookup_table[ZONE_LOOKUP_TABLE_SZ];
136+
/* Array sized to cover indices 0..(SMALL_SIZE_MAX>>4) inclusive, then
137+
* rounded to a multiple of 4 entries so the array occupies a whole
138+
* number of 8-byte words and bitmaps[] remains naturally aligned. */
139+
zone_lookup_table_t zone_lookup_table[(SMALL_SIZE_MAX >> 4) + 4];
137140
/* For chunk sizes >= 1024 our bitmap size is smaller
138-
* than a page. This optimization preallocates pages to
141+
* than a page. This optimization preallocates pages to
139142
* hold multiple bitmaps for these zones */
140143
iso_alloc_bitmap_t bitmaps[sizeof(small_bitmap_sizes) / sizeof(int)];
141144
uint64_t zone_handle_mask;

include/iso_alloc_internal.h

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,14 @@ extern uint32_t g_page_size_shift;
300300
static_assert(SMALLEST_CHUNK_SZ >= 16, "SMALLEST_CHUNK_SZ is too small, must be at least 16");
301301
static_assert(SMALL_SIZE_MAX <= 131072, "SMALL_SIZE_MAX is too big, cannot exceed 131072");
302302

303+
/* bitmap_size = (ZONE_USER_SIZE / SMALLEST_CHUNK_SZ) * BITS_PER_CHUNK / BITS_PER_BYTE
304+
* max_bitmap_idx = bitmap_size / sizeof(uint64_t)
305+
* Both fields are uint16_t in iso_alloc_zone_t, so verify they fit. */
306+
static_assert((ZONE_USER_SIZE * BITS_PER_CHUNK / BITS_PER_BYTE / SMALLEST_CHUNK_SZ) <= UINT16_MAX,
307+
"bitmap_size overflows uint16_t: SMALLEST_CHUNK_SZ is too small (must be > 16)");
308+
static_assert((ZONE_USER_SIZE * BITS_PER_CHUNK / BITS_PER_BYTE / SMALLEST_CHUNK_SZ / sizeof(uint64_t)) <= UINT16_MAX,
309+
"max_bitmap_idx overflows uint16_t: SMALLEST_CHUNK_SZ is too small");
310+
303311
#if THREAD_SUPPORT
304312
#if USE_SPINLOCK
305313
extern atomic_flag root_busy_flag;
@@ -368,7 +376,7 @@ INTERNAL_HIDDEN INLINE void populate_zone_cache(iso_alloc_zone_t *zone);
368376
INTERNAL_HIDDEN INLINE void flush_chunk_quarantine(void);
369377
INTERNAL_HIDDEN INLINE void clear_zone_cache(void);
370378
INTERNAL_HIDDEN iso_alloc_big_zone_t *iso_find_big_zone(void *p, bool remove);
371-
INTERNAL_HIDDEN iso_alloc_zone_t *is_zone_usable(iso_alloc_zone_t *zone, size_t size);
379+
INTERNAL_HIDDEN FLATTEN iso_alloc_zone_t *is_zone_usable(iso_alloc_zone_t *zone, size_t size);
372380
INTERNAL_HIDDEN iso_alloc_zone_t *find_suitable_zone(size_t size);
373381
INTERNAL_HIDDEN iso_alloc_zone_t *iso_new_zone(size_t size, bool internal);
374382
INTERNAL_HIDDEN iso_alloc_zone_t *_iso_new_zone(size_t size, bool internal, int32_t index);
@@ -377,7 +385,7 @@ INTERNAL_HIDDEN iso_alloc_zone_t *iso_find_zone_range(void *p);
377385
INTERNAL_HIDDEN iso_alloc_zone_t *search_chunk_lookup_table(const void *p);
378386
INTERNAL_HIDDEN bit_slot_t iso_scan_zone_free_slot_slow(iso_alloc_zone_t *zone);
379387
INTERNAL_HIDDEN bit_slot_t iso_scan_zone_free_slot(iso_alloc_zone_t *zone);
380-
INTERNAL_HIDDEN bit_slot_t get_next_free_bit_slot(iso_alloc_zone_t *zone);
388+
INTERNAL_HIDDEN INLINE bit_slot_t get_next_free_bit_slot(iso_alloc_zone_t *zone);
381389
INTERNAL_HIDDEN iso_alloc_root *iso_alloc_new_root(void);
382390
INTERNAL_HIDDEN bool is_pow2(uint64_t sz);
383391
INTERNAL_HIDDEN bool _is_zone_retired(iso_alloc_zone_t *zone);
@@ -408,7 +416,7 @@ INTERNAL_HIDDEN void *_untag_ptr(void *p, iso_alloc_zone_t *zone);
408416
INTERNAL_HIDDEN void _free_big_zone_list(iso_alloc_big_zone_t *head);
409417
INTERNAL_HIDDEN ASSUME_ALIGNED void *_iso_big_alloc(size_t size);
410418
INTERNAL_HIDDEN ASSUME_ALIGNED void *_iso_alloc(iso_alloc_zone_t *zone, size_t size);
411-
INTERNAL_HIDDEN ASSUME_ALIGNED void *_iso_alloc_bitslot_from_zone(bit_slot_t bitslot, iso_alloc_zone_t *zone);
419+
INTERNAL_HIDDEN INLINE ASSUME_ALIGNED void *_iso_alloc_bitslot_from_zone(bit_slot_t bitslot, iso_alloc_zone_t *zone);
412420
INTERNAL_HIDDEN ASSUME_ALIGNED void *_iso_calloc(size_t nmemb, size_t size);
413421
INTERNAL_HIDDEN void *_iso_alloc_ptr_search(void *n, bool poison);
414422
INTERNAL_HIDDEN INLINE uint64_t us_rand_uint64(uint64_t *seed);

0 commit comments

Comments
 (0)