16 changes: 14 additions & 2 deletions README.md
@@ -64,10 +64,10 @@ type BinaryFuse8 struct {

When constructing the filter, you should ensure that there are not too many duplicate keys for best results.

# Generic (8-bit, 16-bit, 32-bit)
## Generic (8-bit, 16-bit, 32-bit)

By default, we use 8-bit fingerprints which provide a 0.4% false positive rate. Some users might want to reduce
this false positive rate at the expensive of more memory usage. For this purpose, we provide a generic type
this false positive rate at the expense of more memory usage. For this purpose, we provide a generic type
(`NewBinaryFuse[T]`).

```Go
@@ -80,6 +80,18 @@ The 32-bit fingerprints are provided but not recommended. Most users will want t
The Binary Fuse filters use about 9 bits per key in the 8-bit case and 18 bits per key in the 16-bit case
for sufficiently large sets (hundreds of thousands of keys). Per-key memory usage is higher when the set is smaller.
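
As a rough sanity check, the effective footprint can be measured from the exported `Fingerprints` field. The following is a hypothetical snippet (it assumes a `keys []uint64` slice is in scope and is not part of the library's documentation):
```Go
// Measure the effective bits per key of an 8-bit filter
// (expect roughly 9 for hundreds of thousands of keys).
filter, err := xorfilter.NewBinaryFuse[uint8](keys)
if err != nil {
	// handle the error (e.g., an empty key set)
}
bitsPerKey := 8 * float64(len(filter.Fingerprints)) / float64(len(keys))
fmt.Printf("%.1f bits per key\n", bitsPerKey)
```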

## Memory reuse for repeated builds

When building many filters, memory can be reused (reducing allocation and GC
overhead) with a `BinaryFuseBuilder`:
```Go
var builder xorfilter.BinaryFuseBuilder
for {
	filter8, _ := xorfilter.BuildBinaryFuse[uint8](&builder, keys)
	filter16, _ := xorfilter.BuildBinaryFuse[uint16](&builder, keys)
	// ...
}
```
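
Note that each call reuses the builder's internal scratch buffers, so a single `BinaryFuseBuilder` should not be used from multiple goroutines at once.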

# Implementations of xor filters in other programming languages

89 changes: 74 additions & 15 deletions binaryfusefilter.go
@@ -4,6 +4,7 @@ import (
"errors"
"math"
"math/bits"
"unsafe"
)

type Unsigned interface {
@@ -20,26 +21,57 @@ type BinaryFuse[T Unsigned] struct {
Fingerprints []T
}

// NewBinaryFuse fills the filter with provided keys. For best results,
// the caller should avoid having too many duplicated keys.
// NewBinaryFuse creates a binary fuse filter with provided keys. For best
// results, the caller should avoid having too many duplicated keys.
//
// The function can mutate the given keys slice to remove duplicates.
//
// The function may return an error if the set is empty.
func NewBinaryFuse[T Unsigned](keys []uint64) (*BinaryFuse[T], error) {
var b BinaryFuseBuilder
filter, err := BuildBinaryFuse[T](&b, keys)
if err != nil {
return nil, err
}
return &filter, nil
}

// BinaryFuseBuilder can be used to reuse memory allocations across multiple
// BinaryFuse builds.
type BinaryFuseBuilder struct {
alone reusableBuffer
t2hash reusableBuffer
reverseOrder reusableBuffer
t2count reusableBuffer
reverseH reusableBuffer
startPos reusableBuffer
fingerprints reusableBuffer
}

// BuildBinaryFuse creates a binary fuse filter with provided keys, reusing
// buffers from the BinaryFuseBuilder if possible. For best results, the caller
// should avoid having too many duplicated keys.
//
// The function can mutate the given keys slice to remove duplicates.
//
// The function may return an error if the set is empty.
func BuildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (BinaryFuse[T], error) {
size := uint32(len(keys))
filter := &BinaryFuse[T]{}
filter.initializeParameters(size)
var filter BinaryFuse[T]
filter.initializeParameters(b, size)
rngcounter := uint64(1)
filter.Seed = splitmix64(&rngcounter)
capacity := uint32(len(filter.Fingerprints))

alone := make([]uint32, capacity)
alone := reuseBuffer[uint32](&b.alone, int(capacity))
// the lowest 2 bits are the h index (0, 1, or 2)
// so we only have 6 bits for counting;
// but that's sufficient
t2count := make([]T, capacity)
reverseH := make([]T, size)
t2count := reuseBuffer[T](&b.t2count, int(capacity))
reverseH := reuseBuffer[T](&b.reverseH, int(size))

t2hash := make([]uint64, capacity)
reverseOrder := make([]uint64, size+1)
t2hash := reuseBuffer[uint64](&b.t2hash, int(capacity))
reverseOrder := reuseBuffer[uint64](&b.reverseOrder, int(size+1))
reverseOrder[size] = 1

// the array h0, h1, h2, h0, h1, h2
@@ -50,16 +82,16 @@ func NewBinaryFuse[T Unsigned](keys []uint64) (*BinaryFuse[T], error) {
for {
iterations += 1
if iterations > MaxIterations {
// The probability of this happening is lower than the
// the cosmic-ray probability (i.e., a cosmic ray corrupts your system).
return nil, errors.New("too many iterations")
// The probability of this happening is lower than the cosmic-ray
// probability (i.e., a cosmic ray corrupts your system).
return BinaryFuse[T]{}, errors.New("too many iterations")
}

blockBits := 1
for (1 << blockBits) < filter.SegmentCount {
blockBits += 1
}
startPos := make([]uint, 1<<blockBits)
startPos := reuseBuffer[uint](&b.startPos, 1<<blockBits)
for i := range startPos {
// important: we do not want i * size to overflow!!!
startPos[i] = uint((uint64(i) * uint64(size)) >> blockBits)
@@ -216,7 +248,7 @@ func NewBinaryFuse[T Unsigned](keys []uint64) (*BinaryFuse[T], error) {
return filter, nil
}

func (filter *BinaryFuse[T]) initializeParameters(size uint32) {
func (filter *BinaryFuse[T]) initializeParameters(b *BinaryFuseBuilder, size uint32) {
arity := uint32(3)
filter.SegmentLength = calculateSegmentLength(arity, size)
if filter.SegmentLength > 262144 {
@@ -238,7 +270,7 @@ func (filter *BinaryFuse[T]) initializeParameters(size uint32) {
}
arrayLength = (filter.SegmentCount + arity - 1) * filter.SegmentLength
filter.SegmentCountLength = filter.SegmentCount * filter.SegmentLength
filter.Fingerprints = make([]T, arrayLength)
filter.Fingerprints = reuseBuffer[T](&b.fingerprints, int(arrayLength))
}

func (filter *BinaryFuse[T]) mod3(x T) T {
@@ -292,3 +324,30 @@ func calculateSizeFactor(arity uint32, size uint32) float64 {
return 2.0
}
}

// reusableBuffer allows reuse of a backing buffer to avoid allocations for
// slices of integers.
type reusableBuffer struct {
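// buf is a []uint64 so that the backing memory is suitably aligned (and
// sized in whole 8-byte words) for reinterpretation as a slice of any
// integer element type; see reuseBuffer.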
buf []uint64
}

type integer interface {
~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64
}

// reuseBuffer returns a zeroed slice of the given length, reusing the
// previous backing buffer when it has enough capacity.
func reuseBuffer[T integer](b *reusableBuffer, size int) []T {
const sizeOfUint64 = 8
// Our backing buffer is a []uint64. Figure out how many uint64s we need
// to back a []T of the requested size.
bufSize := int((uintptr(size)*unsafe.Sizeof(T(0)) + sizeOfUint64 - 1) / sizeOfUint64)
if cap(b.buf) >= bufSize {
clear(b.buf[:bufSize])
} else {
// We need to allocate a new buffer. Increase by at least 25% to amortize
// allocations; this is what append() does for large enough slices.
b.buf = make([]uint64, max(bufSize, cap(b.buf)+cap(b.buf)/4))
}
return unsafe.Slice((*T)(unsafe.Pointer(unsafe.SliceData(b.buf))), size)
}
32 changes: 32 additions & 0 deletions binaryfusefilter_test.go
@@ -3,10 +3,12 @@ package xorfilter
import (
"fmt"
"math/rand/v2"
"slices"
"testing"

"github.com/cespare/xxhash/v2"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

const NUM_KEYS = 1e6
@@ -329,7 +331,37 @@ func TestBinaryFuseN_Issue35(t *testing.T) {
if !e {
panic(i)
}
}
}
}

func TestBinaryFuseBuilder(t *testing.T) {
// Verify that repeated builds with the same builder create the exact same
// filter as using NewBinaryFuse.
var bld BinaryFuseBuilder
for i := 0; i < 100; i++ {
n := 1 + rand.IntN(1<<rand.IntN(20))
keys := make([]uint64, n)
for j := range keys {
keys[j] = rand.Uint64()
}
switch rand.IntN(3) {
case 0:
crossCheckFuseBuilder[uint8](t, &bld, keys)
case 1:
crossCheckFuseBuilder[uint16](t, &bld, keys)
case 2:
crossCheckFuseBuilder[uint32](t, &bld, keys)
}
}
}

func crossCheckFuseBuilder[T Unsigned](t *testing.T, bld *BinaryFuseBuilder, keys []uint64) {
t.Helper()
filter, err := BuildBinaryFuse[T](bld, slices.Clone(keys))
require.NoError(t, err)
expected, err := NewBinaryFuse[T](keys)
require.NoError(t, err)
require.Equal(t, *expected, filter)
}
3 changes: 2 additions & 1 deletion xorfilter.go
@@ -99,7 +99,8 @@ func scanCount(Qi []keyindex, setsi []xorset) ([]keyindex, int) {
return Qi, QiSize
}

// The maximum number of iterations allowed before the populate function returns an error
// MaxIterations is the maximum number of iterations allowed before the populate
// function returns an error.
var MaxIterations = 1024

// Populate fills the filter with provided keys. For best results,