From 0fa5efcffb837f818cf69f1cefb4d7f148776081 Mon Sep 17 00:00:00 2001 From: Nils Goroll Date: Sat, 18 Jan 2025 20:39:45 +0100 Subject: [PATCH 1/8] Store the original size argument in struct binary_fuse{8,16}_s ... in preparation of a more compact serialization format: All other parameters except for the Seed are derived from the size parameter. The drawback is that this format is sensitive to changes of binary_fuse8_allocate(). Due to alignment, this does not need any more space on 64bit. (There were 5 32bit values inbetween two 64bit values) Yet formally, this is a breaking change of the in-core format, which should not be used to store information across versions. See follow up commits for new compact serialization formats. --- include/binaryfusefilter.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/binaryfusefilter.h b/include/binaryfusefilter.h index 7049f3f..6f5d5d1 100644 --- a/include/binaryfusefilter.h +++ b/include/binaryfusefilter.h @@ -67,6 +67,7 @@ static inline uint64_t binary_fuse_rng_splitmix64(uint64_t *seed) { typedef struct binary_fuse8_s { uint64_t Seed; + uint32_t Size; uint32_t SegmentLength; uint32_t SegmentLengthMask; uint32_t SegmentCount; @@ -222,6 +223,7 @@ static inline double binary_fuse_calculate_size_factor(uint32_t arity, static inline bool binary_fuse8_allocate(uint32_t size, binary_fuse8_t *filter) { uint32_t arity = 3; + filter->Size = size; filter->SegmentLength = size == 0 ? 
4 : binary_fuse_calculate_segment_length(arity, size); if (filter->SegmentLength > 262144) { filter->SegmentLength = 262144; @@ -258,6 +260,7 @@ static inline void binary_fuse8_free(binary_fuse8_t *filter) { free(filter->Fingerprints); filter->Fingerprints = NULL; filter->Seed = 0; + filter->Size = 0; filter->SegmentLength = 0; filter->SegmentLengthMask = 0; filter->SegmentCount = 0; @@ -459,6 +462,7 @@ static inline bool binary_fuse8_populate(uint64_t *keys, uint32_t size, typedef struct binary_fuse16_s { uint64_t Seed; + uint32_t Size; uint32_t SegmentLength; uint32_t SegmentLengthMask; uint32_t SegmentCount; @@ -512,6 +516,7 @@ static inline bool binary_fuse16_contain(uint64_t key, static inline bool binary_fuse16_allocate(uint32_t size, binary_fuse16_t *filter) { uint32_t arity = 3; + filter->Size = size; filter->SegmentLength = size == 0 ? 4 : binary_fuse_calculate_segment_length(arity, size); if (filter->SegmentLength > 262144) { filter->SegmentLength = 262144; @@ -548,6 +553,7 @@ static inline void binary_fuse16_free(binary_fuse16_t *filter) { free(filter->Fingerprints); filter->Fingerprints = NULL; filter->Seed = 0; + filter->Size = 0; filter->SegmentLength = 0; filter->SegmentLengthMask = 0; filter->SegmentCount = 0; From 2e1915cac897b8eed396ffb86f81939feefe6677 Mon Sep 17 00:00:00 2001 From: Nils Goroll Date: Sun, 19 Jan 2025 16:33:44 +0100 Subject: [PATCH 2/8] Add {xor,binary_fuse}{8,16}_{pack,unpack} serialization formats. Rationale: As mentioned in the previous commit, for binary_fuse filters, we do not need to save values derived from the size, saving 5 x sizeof(uint32_t). For both filter implementations, we add a bitmap to indicate non-zero fingerprint values. This adds 1/{8,16} of the fingerprint array size, but saves one or two bytes for each zero fingerprint. The net result is a packed format which can not be compressed further by zlib for the bundled unit tests. 
Note that this format is incompatible with the existing _serialize() format and, in the case of binary_fuse, sensitive to changes of the derived parameters in _allocate. Interface: We add _pack_bytes() to match _serialization_bytes(). _pack() and _unpack() match _serialize() and _deserialize(). The existing _{de,}serialize() interfaces take a buffer pointer only and thus implicitly assume that the buffer will be of sufficient size. For the new functions, we add a size_t parameter indicating the size of the buffer and check its bounds in the implementation. _pack() returns the used size or zero for "does not fit", so when called with a buffer of arbitrary size, the used space or error condition can be determined without an additional call to _pack_bytes(), avoiding duplicate work. Implementation: We add some XOR_bitf_* macros to address words and individual bits of bitfields. The XOR_ser and XOR_deser macros have the otherwise repeated code for bounds checking and the actual serialization. Because the implementations for the 8 and 16 bit words are equal except for the data type, we add macros and create the actual functions by expanding the macros with the possible data types. Alternatives considered: Compared to _{de,}serialize(), the new functions need to copy individual fingerprint words rather than the whole array at once, which is less efficient. Therefore, an implementation using Duff's Device with branchless code was attempted but dismissed because avoiding out-of-bounds access would require an over-allocated buffer. 
--- include/binaryfusefilter.h | 107 +++++++++++++++++++++++++++++++++++++ include/xorfilter.h | 104 +++++++++++++++++++++++++++++++++++ 2 files changed, 211 insertions(+) diff --git a/include/binaryfusefilter.h b/include/binaryfusefilter.h index 6f5d5d1..14dbf4e 100644 --- a/include/binaryfusefilter.h +++ b/include/binaryfusefilter.h @@ -864,4 +864,111 @@ static inline bool binary_fuse8_deserialize(binary_fuse8_t * filter, const char return true; } +// minimal bitfield implementation +#define XOR_bitf_w (sizeof(uint8_t) * 8) +#define XOR_bitf_sz(bits) (((bits) + XOR_bitf_w - 1) / XOR_bitf_w) +#define XOR_bitf_word(bit) (bit / XOR_bitf_w) +#define XOR_bitf_bit(bit) ((1U << (bit % XOR_bitf_w)) % 256) + +#define XOR_ser(buf, lim, src) do { \ + if ((buf) + sizeof src > (lim)) \ + return (0); \ + memcpy(buf, &src, sizeof src); \ + buf += sizeof src; \ +} while (0) + +#define XOR_deser(dst, buf, lim) do { \ + if ((buf) + sizeof dst > (lim)) \ + return (false); \ + memcpy(&dst, buf, sizeof dst); \ + buf += sizeof dst; \ +} while (0) + +// return required space for binary_fuse{8,16}_pack() +#define XOR_bytesf(fuse) \ +static inline size_t binary_ ## fuse ## _pack_bytes(const binary_ ## fuse ## _t *filter) \ +{ \ + size_t sz = 0; \ + sz += sizeof filter->Seed; \ + sz += sizeof filter->Size; \ + sz += XOR_bitf_sz(filter->ArrayLength); \ + for (size_t i = 0; i < filter->ArrayLength; i++) { \ + if (filter->Fingerprints[i] == 0) \ + continue; \ + sz += sizeof filter->Fingerprints[i]; \ + } \ + return (sz); \ +} + +// serialize as packed format, return size used or 0 for insufficient space +#define XOR_packf(fuse) \ +static inline size_t binary_ ## fuse ## _pack(const binary_ ## fuse ## _t *filter, char *buffer, size_t space) { \ + uint8_t *s = (uint8_t *)(void *)buffer; \ + uint8_t *buf = s, *e = buf + space; \ + \ + XOR_ser(buf, e, filter->Seed); \ + XOR_ser(buf, e, filter->Size); \ + size_t bsz = XOR_bitf_sz(filter->ArrayLength); \ + if (buf + bsz > e) \ + return (0); \ + 
uint8_t *bitf = buf; \ + memset(bitf, 0, bsz); \ + buf += bsz; \ + \ + for (size_t i = 0; i < filter->ArrayLength; i++) { \ + if (filter->Fingerprints[i] == 0) \ + continue; \ + bitf[XOR_bitf_word(i)] |= XOR_bitf_bit(i); \ + XOR_ser(buf, e, filter->Fingerprints[i]); \ + } \ + return ((size_t)(buf - s)); \ +} + +#define XOR_unpackf(fuse) \ +static inline bool binary_ ## fuse ## _unpack(binary_ ## fuse ## _t *filter, const char *buffer, size_t len) \ +{ \ + const uint8_t *s = (const uint8_t *)(const void *)buffer; \ + const uint8_t *buf = s, *e = buf + len; \ + bool r; \ + \ + uint64_t Seed; \ + uint32_t Size; \ + \ + memset(filter, 0, sizeof *filter); \ + XOR_deser(Seed, buf, e); \ + XOR_deser(Size, buf, e); \ + r = binary_ ## fuse ## _allocate(Size, filter); \ + if (! r) \ + return (r); \ + filter->Seed = Seed; \ + const uint8_t *bitf = buf; \ + buf += XOR_bitf_sz(filter->ArrayLength); \ + for (size_t i = 0; i < filter->ArrayLength; i++) { \ + if ((bitf[XOR_bitf_word(i)] & XOR_bitf_bit(i)) == 0) \ + continue; \ + XOR_deser(filter->Fingerprints[i], buf, e); \ + } \ + return (true); \ +} + +#define XOR_packers(fuse) \ +XOR_bytesf(fuse) \ +XOR_packf(fuse) \ +XOR_unpackf(fuse) \ + +XOR_packers(fuse8) +XOR_packers(fuse16) + +#undef XOR_packers +#undef XOR_bytesf +#undef XOR_packf +#undef XOR_unpackf + +#undef XOR_bitf_w +#undef XOR_bitf_sz +#undef XOR_bitf_word +#undef XOR_bitf_bit +#undef XOR_ser +#undef XOR_deser + #endif diff --git a/include/xorfilter.h b/include/xorfilter.h index d87ef65..d2cb1ce 100644 --- a/include/xorfilter.h +++ b/include/xorfilter.h @@ -1349,5 +1349,109 @@ static inline bool xor8_deserialize(xor8_t * filter, const char *buffer) { return true; } +// minimal bitfield implementation +#define XOR_bitf_w (sizeof(uint8_t) * 8) +#define XOR_bitf_sz(bits) (((bits) + XOR_bitf_w - 1) / XOR_bitf_w) +#define XOR_bitf_word(bit) (bit / XOR_bitf_w) +#define XOR_bitf_bit(bit) ((1U << (bit % XOR_bitf_w)) % 256) + +#define XOR_ser(buf, lim, src) do { \ + if 
((buf) + sizeof src > (lim)) \ + return (0); \ + memcpy(buf, &src, sizeof src); \ + buf += sizeof src; \ +} while (0) + +#define XOR_deser(dst, buf, lim) do { \ + if ((buf) + sizeof dst > (lim)) \ + return (false); \ + memcpy(&dst, buf, sizeof dst); \ + buf += sizeof dst; \ +} while (0) + +// return required space for binary_xor{8,16}_pack() +#define XOR_bytesf(xbits) \ +static inline size_t xor ## xbits ## _pack_bytes(const xor ## xbits ## _t *filter) \ +{ \ + size_t sz = 0; \ + size_t capacity = 3 * filter->blockLength; \ + sz += sizeof filter->seed; \ + sz += sizeof filter->blockLength; \ + sz += XOR_bitf_sz(capacity); \ + for (size_t i = 0; i < capacity; i++) { \ + if (filter->fingerprints[i] == 0) \ + continue; \ + sz += sizeof filter->fingerprints[i]; \ + } \ + return (sz); \ +} + +// serialize as packed format, return size used or 0 for insufficient space +#define XOR_packf(xbits) \ +static inline size_t xor ## xbits ## _pack(const xor ## xbits ## _t *filter, char *buffer, size_t space) { \ + uint8_t *s = (uint8_t *)(void *)buffer; \ + uint8_t *buf = s, *e = buf + space; \ + size_t capacity = 3 * filter->blockLength; \ + \ + XOR_ser(buf, e, filter->seed); \ + XOR_ser(buf, e, filter->blockLength); \ + size_t bsz = XOR_bitf_sz(capacity); \ + if (buf + bsz > e) \ + return (0); \ + uint8_t *bitf = buf; \ + memset(bitf, 0, bsz); \ + buf += bsz; \ + \ + for (size_t i = 0; i < capacity; i++) { \ + if (filter->fingerprints[i] == 0) \ + continue; \ + bitf[XOR_bitf_word(i)] |= XOR_bitf_bit(i); \ + XOR_ser(buf, e, filter->fingerprints[i]); \ + } \ + return ((size_t)(buf - s)); \ +} + +#define XOR_unpackf(xbits) \ +static inline bool xor ## xbits ## _unpack(xor ## xbits ## _t *filter, const char *buffer, size_t len) \ +{ \ + const uint8_t *s = (const uint8_t *)(const void *)buffer; \ + const uint8_t *buf = s, *e = buf + len; \ + \ + memset(filter, 0, sizeof *filter); \ + XOR_deser(filter->seed, buf, e); \ + XOR_deser(filter->blockLength, buf, e); \ + size_t capacity = 3 
* filter->blockLength; \ + filter->fingerprints = (uint ## xbits ## _t *)calloc(capacity, sizeof filter->fingerprints[0]); \ + if (filter->fingerprints == NULL) \ + return (false); \ + const uint8_t *bitf = buf; \ + buf += XOR_bitf_sz(capacity); \ + for (size_t i = 0; i < capacity; i++) { \ + if ((bitf[XOR_bitf_word(i)] & XOR_bitf_bit(i)) == 0) \ + continue; \ + XOR_deser(filter->fingerprints[i], buf, e); \ + } \ + return (true); \ +} + +#define XOR_packers(xbits) \ +XOR_bytesf(xbits) \ +XOR_packf(xbits) \ +XOR_unpackf(xbits) \ + +XOR_packers(8) +XOR_packers(16) + +#undef XOR_packers +#undef XOR_bytesf +#undef XOR_packf +#undef XOR_unpackf + +#undef XOR_bitf_w +#undef XOR_bitf_sz +#undef XOR_bitf_word +#undef XOR_bitf_bit +#undef XOR_ser +#undef XOR_deser #endif From 777cf309054d245086298adaf7483ea7799a6e08 Mon Sep 17 00:00:00 2001 From: Nils Goroll Date: Sun, 19 Jan 2025 16:34:36 +0100 Subject: [PATCH 3/8] Adjust unit tests to new _{un,}pack() interface To exercise the new code without too much of a change to the existing unit test, we change the signature of _{un,}serialize_gen() to take an additional (const) size_t argument, which we ignore for _{un,}serialize(). We add to the reported metrics absolute and relative size information for the "in-core" and "wire" format, the latter jointly referencing to _{un,}serialize() and _{un,}pack(). 
--- tests/unit.c | 102 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 89 insertions(+), 13 deletions(-) diff --git a/tests/unit.c b/tests/unit.c index f6dbe00..4eec1b4 100644 --- a/tests/unit.c +++ b/tests/unit.c @@ -8,16 +8,24 @@ #define F1(t, a, rt, t1, p1) rt GFNAM(t, a)(t1 p1) { return FNAM(t, a)(p1); } #define F2(t, a, rt, t1, p1, t2, p2) rt GFNAM(t, a)(t1 p1, t2 p2) { return FNAM(t, a)(p1, p2); } #define F3(t, a, rt, t1, p1, t2, p2, t3, p3) rt GFNAM(t, a)(t1 p1, t2 p2, t3 p3) { return FNAM(t, a)(p1, p2, p3); } +// map 3-argument _gen to 2-argument, discarding last +#define F32(t, a, rt, t1, p1, t2, p2, t3, p3) rt GFNAM(t, a)(t1 p1, t2 p2, t3 p3) { (void)p3; return FNAM(t, a)(p1, p2); } +// void return, ignore return value +#define F3V(t, a, rt, t1, p1, t2, p2, t3, p3) rt GFNAM(t, a)(t1 p1, t2 p2, t3 p3) { (void)FNAM(t, a)(p1, p2, p3); } #define GEN_THUNKS(ftype) \ F2(ftype, allocate, bool, uint32_t, size, void*, filter) \ F1(ftype, free, void, void*, filter) \ F1(ftype, size_in_bytes, size_t, const void*, filter) \ F1(ftype, serialization_bytes, size_t, void*, filter) \ - F2(ftype, serialize, void, void*, filter, char*, buffer) \ - F2(ftype, deserialize, bool, void*, filter, const char*, buffer) \ + F32(ftype, serialize, void, void*, filter, char*, buffer, size_t, len) \ + F32(ftype, deserialize, bool, void*, filter, const char*, buffer, size_t, len) \ F3(ftype, populate, bool, uint64_t*, keys, uint32_t, size, void*, filter) \ - F2(ftype, contain, bool, uint64_t, key, const void*, filter) + F2(ftype, contain, bool, uint64_t, key, const void*, filter) \ + F1(ftype, pack_bytes, size_t, void*, filter) \ + F3V(ftype, pack, void, void*, filter, char*, buffer, size_t, len) \ + F3(ftype, unpack, bool, void*, filter, const char*, buffer, size_t, len) + GEN_THUNKS(xor8) GEN_THUNKS(xor16) @@ -32,8 +40,8 @@ bool test(size_t size, size_t repeated_size, void *filter, void (*free_filter)(void *filter), size_t (*size_in_bytes)(const void *filter), 
size_t (*serialization_bytes)(void *filter), - void (*serialize)(void *filter, char *buffer), - bool (*deserialize)(void *filter, const char *buffer), + void (*serialize)(void *filter, char *buffer, size_t len), + bool (*deserialize)(void *filter, const char *buffer, size_t len), bool (*populate)(uint64_t *keys, uint32_t size, void *filter), bool (*contain)(uint64_t key, const void *filter)) { allocate((uint32_t)size, filter); @@ -56,9 +64,9 @@ bool test(size_t size, size_t repeated_size, void *filter, size_t buffer_size = serialization_bytes(filter); char *buffer = (char*)malloc(buffer_size); - serialize(filter, buffer); + serialize(filter, buffer, buffer_size); free_filter(filter); - deserialize(filter, buffer); + deserialize(filter, buffer, buffer_size); free(buffer); for (size_t i = 0; i < size; i++) { if (!(contain)(big_set[i], filter)) { @@ -79,10 +87,14 @@ bool test(size_t size, size_t repeated_size, void *filter, } double fpp = (double)random_matches * 1.0 / (double)trials; printf(" fpp %3.5f (estimated) \n", fpp); - double bpe = (double)size_in_bytes(filter) * 8.0 / (double)size; - printf(" bits per entry %3.2f\n", bpe); - printf(" bits per entry %3.2f (theoretical lower bound)\n", - log(fpp)/log(2)); - printf(" efficiency ratio %3.3f \n", bpe /(- log(fpp)/log(2))); + size_t core_size = size_in_bytes(filter); + printf(" size in-core %zu wire %zu\n", core_size, buffer_size); + double cbpe = (double)core_size * 8.0 / (double)size; + double wbpe = (double)buffer_size * 8.0 / (double)size; + printf(" bits per entry in-core %3.2f wire %3.2f\n", cbpe, wbpe); + double bound = - log(fpp)/log(2); + printf(" bits per entry %3.2f (theoretical lower bound)\n", bound); + printf(" efficiency ratio in-core %3.3f wire %3.3f\n", cbpe/bound, wbpe/bound); free_filter(filter); free(big_set); return true; @@ -132,6 +144,35 @@ bool testxor16(size_t size) { } +bool testxor8pack(size_t size) { + printf("testing xor8 pack/unpack\n"); + xor8_t filter; + return test(size, 0, 
&filter, + xor8_allocate_gen, + xor8_free_gen, + xor8_size_in_bytes_gen, + xor8_pack_bytes_gen, + xor8_pack_gen, + xor8_unpack_gen, + xor8_populate_gen, + xor8_contain_gen); +} + +bool testxor16pack(size_t size) { + printf("testing xor16 pack/unpack\n"); + xor8_t filter; + return test(size, 0, &filter, + xor16_allocate_gen, + xor16_free_gen, + xor16_size_in_bytes_gen, + xor16_pack_bytes_gen, + xor16_pack_gen, + xor16_unpack_gen, + xor16_populate_gen, + xor16_contain_gen); +} + + bool testbufferedxor16(size_t size) { printf("testing buffered xor16\n"); @@ -161,8 +202,6 @@ bool testbinaryfuse8(size_t size, size_t repeated_size) { binary_fuse8_contain_gen); } - - bool testbinaryfuse16(size_t size, size_t repeated_size) { printf("testing binary fuse16 with size %zu and %zu duplicates\n", size, repeated_size); binary_fuse16_t filter; @@ -177,6 +216,35 @@ bool testbinaryfuse16(size_t size, size_t repeated_size) { binary_fuse16_contain_gen); } + +bool testbinaryfuse8pack(size_t size, size_t repeated_size) { + printf("testing binary fuse8 pack/unpack with size %zu and %zu duplicates\n", size, repeated_size); + binary_fuse8_t filter; + return test(size, repeated_size, &filter, + binary_fuse8_allocate_gen, + binary_fuse8_free_gen, + binary_fuse8_size_in_bytes_gen, + binary_fuse8_pack_bytes_gen, + binary_fuse8_pack_gen, + binary_fuse8_unpack_gen, + binary_fuse8_populate_gen, + binary_fuse8_contain_gen); +} + +bool testbinaryfuse16pack(size_t size, size_t repeated_size) { + printf("testing binary fuse16 pack/unpack with size %zu and %zu duplicates\n", size, repeated_size); + binary_fuse16_t filter; + return test(size, repeated_size, &filter, + binary_fuse16_allocate_gen, + binary_fuse16_free_gen, + binary_fuse16_size_in_bytes_gen, + binary_fuse16_pack_bytes_gen, + binary_fuse16_pack_gen, + binary_fuse16_unpack_gen, + binary_fuse16_populate_gen, + binary_fuse16_contain_gen); +} + void failure_rate_binary_fuse16() { printf("testing binary fuse16 for failure rate\n"); // we 
construct many 5000-long input cases and check the probability of failure. @@ -208,6 +276,10 @@ int main() { printf("\n"); if(!testbinaryfuse16(size, 0)) { abort(); } printf("\n"); + if(!testbinaryfuse8pack(size, 0)) { abort(); } + printf("\n"); + if(!testbinaryfuse16pack(size, 0)) { abort(); } + printf("\n"); if(!testbinaryfuse8(size, 10)) { abort(); } printf("\n"); if(!testbinaryfuse16(size, 10)) { abort(); } @@ -220,6 +292,10 @@ int main() { printf("\n"); if(!testxor16(size)) { abort(); } printf("\n"); + if(!testxor8pack(size)) { abort(); } + printf("\n"); + if(!testxor16pack(size)) { abort(); } + printf("\n"); printf("======\n"); } From 5e2aff2b50bbf7b6d34a78174cf50cf13f3c9b95 Mon Sep 17 00:00:00 2001 From: Nils Goroll Date: Tue, 21 Jan 2025 11:10:19 +0100 Subject: [PATCH 4/8] Document the new _{un,}pack() interface --- README.md | 40 +++++++++++++++++++++++++++++++++++++--- tests/unit.c | 27 +++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 97f072f..3de6d0c 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,19 @@ about 0.0015%. The type is `binary_fuse16_t` and you may use it with functions such as `binary_fuse16_allocate`, `binary_fuse16_populate`, `binary_fuse8_contain` and `binary_fuse8_free`. -You may serialize the data as follows: +For serialization, there is a choice between an unpacked and a packed format. + +The unpacked format is roughly of the same size as in-core data, but uses most +efficient memory copy operations. + +The packed format avoids storing zero bytes and is considered near optimal (it +can not be compressed further by zlib and its required space is very close to +the theoretical lower limit), but it needs to copy individual words, so it +should be expected to be somewhat slower. + +The two formats use slightly different APIs. 
+ +You may serialize and deserialize in unpacked format as follows: ```C size_t buffer_size = binary_fuse16_serialization_bytes(&filter); @@ -65,9 +77,31 @@ You may serialize the data as follows: free(buffer); ``` -The serialization does not handle endianess: it is expected that you will serialize -and deserialize on the little endian systems. (Big endian systems are vanishingly rare.) +To serialize and deserialize in packed format, use the `_pack_bytes()`, +`_pack()` and `_unpack()` functions. The latter two have an additional `size_t` +argument for the buffer length. `_pack()` can be used with a buffer of arbitrary +size, it returns the used space if serialization fit into the buffer or 0 +otherwise. + +For example: + +```C + size_t buffer_size = binary_fuse16_pack_bytes(&filter); + char *buffer = (char*)malloc(buffer_size); + if (binary_fuse16_pack(&filter, buffer, buffer_size) != buffer_size) { + printf("pack failed\n"); + free(buffer); + return; + } + binary_fuse16_free(&filter); + if (! binary_fuse16_unpack(&filter, buffer, buffer_size)) { + printf("unpack failed\n"); + } + free(buffer); +``` +Neither serialization format handles endianness changes: it is expected that you +serialize and deserialize with equal byte order. ## C++ wrapper diff --git a/tests/unit.c b/tests/unit.c index 4eec1b4..cd7ad0c 100644 --- a/tests/unit.c +++ b/tests/unit.c @@ -268,7 +268,34 @@ void failure_rate_binary_fuse16() { free(big_set); } +// test code from the example in the README +void readme_pack() { + binary_fuse16_t filter = {0}; + if (! binary_fuse16_allocate(64, &filter)) { + printf("allocation failed\n"); + return; + } + + // begin example snippet + size_t buffer_size = binary_fuse16_pack_bytes(&filter); + char *buffer = (char*)malloc(buffer_size); + if (binary_fuse16_pack(&filter, buffer, buffer_size) != buffer_size) { + printf("pack failed\n"); + free(buffer); + return; + } + binary_fuse16_free(&filter); + if (! 
binary_fuse16_unpack(&filter, buffer, buffer_size)) { + printf("unpack failed\n"); + } + free(buffer); + // end example snippet + + binary_fuse16_free(&filter); +} + int main() { + readme_pack(); failure_rate_binary_fuse16(); for(size_t size = 1000; size <= 1000000; size *= 300) { printf("== size = %zu \n", size); From cb642596af5698452f27238b3325f7b2d4304a1c Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Tue, 21 Jan 2025 21:16:30 -0500 Subject: [PATCH 5/8] tuning the wording and adding a spaceusage benchmark --- README.md | 11 ++-- benchmarks/CMakeLists.txt | 3 + benchmarks/spaceusage.c | 119 ++++++++++++++++++++++++++++++++++++++ tests/CMakeLists.txt | 4 +- 4 files changed, 130 insertions(+), 7 deletions(-) create mode 100644 benchmarks/spaceusage.c diff --git a/README.md b/README.md index 3de6d0c..e10ccb5 100644 --- a/README.md +++ b/README.md @@ -59,10 +59,9 @@ For serialization, there is a choice between an unpacked and a packed format. The unpacked format is roughly of the same size as in-core data, but uses most efficient memory copy operations. -The packed format avoids storing zero bytes and is considered near optimal (it -can not be compressed further by zlib and its required space is very close to -the theoretical lower limit), but it needs to copy individual words, so it -should be expected to be somewhat slower. +The packed format avoids storing zero bytes and relies on a bitset to locate them, so it +should be expected to be somewhat slower. The packed format might be smaller or larger. +When in doubt, prefer the regular (unpacked) format. The two formats use slightly different APIs. @@ -77,11 +76,13 @@ You may serialize and deserialize in unpacked format as follows: free(buffer); ``` +This should be the default. + To serialize and deserialize in packed format, use the `_pack_bytes()`, `_pack()` and `_unpack()` functions. The latter two have an additional `size_t` argument for the buffer length. 
`_pack()` can be used with a buffer of arbitrary size, it returns the used space if serialization fit into the buffer or 0 -otherwise. +otherwise. Note that the packed format will be slower and may not save space. For example: diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index a273c92..54d88fd 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,2 +1,5 @@ add_executable(bench bench.c) target_link_libraries(bench PUBLIC xor_singleheader) + +add_executable(spaceusage spaceusage.c) +target_link_libraries(spaceusage PUBLIC xor_singleheader) \ No newline at end of file diff --git a/benchmarks/spaceusage.c b/benchmarks/spaceusage.c new file mode 100644 index 0000000..4d6867f --- /dev/null +++ b/benchmarks/spaceusage.c @@ -0,0 +1,119 @@ +#include "binaryfusefilter.h" +#include "xorfilter.h" +#include +#include + +typedef struct { + size_t standard; + size_t pack; +} sizes; + +sizes fuse16(size_t n) { + binary_fuse16_t filter = {0}; + if (! binary_fuse16_allocate(n, &filter)) { + printf("allocation failed\n"); + return (sizes) {0, 0}; + } + uint64_t* big_set = malloc(n * sizeof(uint64_t)); + for(size_t i = 0; i < n; i++) { + big_set[i] = i; + } + bool is_ok = binary_fuse16_populate(big_set, n, &filter); + if(! is_ok ) { + printf("populating failed\n"); + } + free(big_set); + sizes s = { + .standard = binary_fuse16_serialization_bytes(&filter), + .pack = binary_fuse16_pack_bytes(&filter) + }; + binary_fuse16_free(&filter); + return s; +} + +sizes fuse8(size_t n) { + binary_fuse8_t filter = {0}; + if (! binary_fuse8_allocate(n, &filter)) { + printf("allocation failed\n"); + return (sizes) {0, 0}; + } + uint64_t* big_set = malloc(n * sizeof(uint64_t)); + for(size_t i = 0; i < n; i++) { + big_set[i] = i; + } + bool is_ok = binary_fuse8_populate(big_set, n, &filter); + if(! 
is_ok ) { + printf("populating failed\n"); + } + free(big_set); + sizes s = { + .standard = binary_fuse8_serialization_bytes(&filter), + .pack = binary_fuse8_pack_bytes(&filter) + }; + binary_fuse8_free(&filter); + return s; +} + +sizes xor16(size_t n) { + xor16_t filter = {0}; + if (! xor16_allocate(n, &filter)) { + printf("allocation failed\n"); + return (sizes) {0, 0}; + } + uint64_t* big_set = malloc(n * sizeof(uint64_t)); + for(size_t i = 0; i < n; i++) { + big_set[i] = i; + } + bool is_ok = xor16_populate(big_set, n, &filter); + if(! is_ok ) { + printf("populating failed\n"); + } + free(big_set); + sizes s = { + .standard = xor16_serialization_bytes(&filter), + .pack = xor16_pack_bytes(&filter) + }; + xor16_free(&filter); + return s; +} + +sizes xor8(size_t n) { + xor8_t filter = {0}; + if (! xor8_allocate(n, &filter)) { + printf("allocation failed\n"); + return (sizes) {0, 0}; + } + uint64_t* big_set = malloc(n * sizeof(uint64_t)); + for(size_t i = 0; i < n; i++) { + big_set[i] = i; + } + bool is_ok = xor8_populate(big_set, n, &filter); + if(! 
is_ok ) { + printf("populating failed\n"); + } + free(big_set); + sizes s = { + .standard = xor8_serialization_bytes(&filter), + .pack = xor8_pack_bytes(&filter) + }; + xor8_free(&filter); + + return s; +} + +int main() { + for (size_t n = 10; n <= 10000000; n *= 2) { + printf("%-10zu ", n); // Align number to 10 characters wide + sizes f16 = fuse16(n); + sizes f8 = fuse8(n); + sizes x16 = xor16(n); + sizes x8 = xor8(n); + + printf("fuse16: %5.2f %5.2f ", (double)f16.standard * 8.0 / n, (double)f16.pack * 8.0 / n); + printf("fuse8: %5.2f %5.2f ", (double)f8.standard * 8.0 / n, (double)f8.pack * 8.0 / n); + printf("xor16: %5.2f %5.2f ", (double)x16.standard * 8.0 / n, (double)x16.pack * 8.0 / n); + printf("xor8: %5.2f %5.2f ", (double)x8.standard * 8.0 / n, (double)x8.pack * 8.0 / n); + printf("\n"); + } + return EXIT_SUCCESS; +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e23be17..3000ca9 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -14,9 +14,9 @@ else() # *nix -Wall -Wextra -Wshadow -Wcast-qual -Wconversion -Wsign-conversion -Werror) if (NOT MINGW) # sanitizers are not supported under mingw - list(APPEND TEST_COMPILE_OPTIONS -fsanitize=address,undefined,leak) + list(APPEND TEST_COMPILE_OPTIONS -fsanitize=address,undefined) # sanitsizers need to be specified at link time as well - target_link_options(unit PRIVATE -fsanitize=address,leak,undefined) + target_link_options(unit PRIVATE -fsanitize=address,undefined) endif() endif() From ee6a5c7d278670210c0d516f988954b6e4d79b61 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Tue, 21 Jan 2025 21:22:12 -0500 Subject: [PATCH 6/8] changing the wording. --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e10ccb5..ce51668 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,7 @@ efficient memory copy operations. 
The packed format avoids storing zero bytes and relies on a bitset to locate them, so it should be expected to be somewhat slower. The packed format might be smaller or larger. +It might be beneficial when using 16-bit binary fuse filters. When in doubt, prefer the regular (unpacked) format. The two formats use slightly different APIs. @@ -82,7 +83,8 @@ To serialize and deserialize in packed format, use the `_pack_bytes()`, `_pack()` and `_unpack()` functions. The latter two have an additional `size_t` argument for the buffer length. `_pack()` can be used with a buffer of arbitrary size, it returns the used space if serialization fit into the buffer or 0 -otherwise. Note that the packed format will be slower and may not save space. +otherwise. Note that the packed format will be slower and may not save space +although it is likely beneficial when using the 16-bit binary fuse filters. For example: From e75ad543db8d4767ff3a7825120c4aa9d2b5ddd3 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Tue, 21 Jan 2025 21:25:35 -0500 Subject: [PATCH 7/8] rewording. --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ce51668..cc2ca15 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,8 @@ efficient memory copy operations. The packed format avoids storing zero bytes and relies on a bitset to locate them, so it should be expected to be somewhat slower. The packed format might be smaller or larger. -It might be beneficial when using 16-bit binary fuse filters. +It might be beneficial when using 16-bit binary fuse filters for users who need to preserve +every byte, and who do not care about the computational overhead. When in doubt, prefer the regular (unpacked) format. The two formats use slightly different APIs. @@ -84,7 +85,7 @@ To serialize and deserialize in packed format, use the `_pack_bytes()`, argument for the buffer length. 
`_pack()` can be used with a buffer of arbitrary size, it returns the used space if serialization fit into the buffer or 0 otherwise. Note that the packed format will be slower and may not save space -although it is likely beneficial when using the 16-bit binary fuse filters. +although it is likely smaller on disk when using the 16-bit binary fuse filters. For example: From 78a7fe7c0fee71abf213b2f04592216341e1881c Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Tue, 21 Jan 2025 21:33:16 -0500 Subject: [PATCH 8/8] explicit casts --- include/xorfilter.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/xorfilter.h b/include/xorfilter.h index d2cb1ce..a07d397 100644 --- a/include/xorfilter.h +++ b/include/xorfilter.h @@ -1374,7 +1374,7 @@ static inline bool xor8_deserialize(xor8_t * filter, const char *buffer) { static inline size_t xor ## xbits ## _pack_bytes(const xor ## xbits ## _t *filter) \ { \ size_t sz = 0; \ - size_t capacity = 3 * filter->blockLength; \ + size_t capacity = (size_t)(3 * filter->blockLength); \ sz += sizeof filter->seed; \ sz += sizeof filter->blockLength; \ sz += XOR_bitf_sz(capacity); \ @@ -1391,7 +1391,7 @@ static inline size_t xor ## xbits ## _pack_bytes(const xor ## xbits ## _t *filte static inline size_t xor ## xbits ## _pack(const xor ## xbits ## _t *filter, char *buffer, size_t space) { \ uint8_t *s = (uint8_t *)(void *)buffer; \ uint8_t *buf = s, *e = buf + space; \ - size_t capacity = 3 * filter->blockLength; \ + size_t capacity = (size_t)(3 * filter->blockLength); \ \ XOR_ser(buf, e, filter->seed); \ XOR_ser(buf, e, filter->blockLength); \ @@ -1420,7 +1420,7 @@ static inline bool xor ## xbits ## _unpack(xor ## xbits ## _t *filter, const cha memset(filter, 0, sizeof *filter); \ XOR_deser(filter->seed, buf, e); \ XOR_deser(filter->blockLength, buf, e); \ - size_t capacity = 3 * filter->blockLength; \ + size_t capacity = (size_t)(3 * filter->blockLength); \ filter->fingerprints = (uint ## xbits ## _t 
*)calloc(capacity, sizeof filter->fingerprints[0]); \ if (filter->fingerprints == NULL) \ return (false); \