Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 106 additions & 0 deletions examples/reservoir_example_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package examples

import (
"fmt"
"testing"

"github.com/apache/datasketches-go/sampling"
"github.com/stretchr/testify/assert"
)

func TestReservoirSamplingWithIntegers(t *testing.T) {
// Create a reservoir sketch with k=10 (max 10 samples)
sketch, err := sampling.NewReservoirItemsSketch[int64](10)
assert.NoError(t, err)

// Add 1000 items to the stream
for i := int64(1); i <= 1000; i++ {
sketch.Update(i)
}

// The sketch maintains exactly k samples
assert.Equal(t, 10, sketch.NumSamples())
assert.Equal(t, int64(1000), sketch.N())

samples := sketch.Samples()
fmt.Printf("Sampled %d integers from stream of %d\n", len(samples), sketch.N())
fmt.Printf("Samples: %v\n", samples)
}

func TestReservoirSamplingWithStrings(t *testing.T) {
// Generic sketch can work with any type
sketch, err := sampling.NewReservoirItemsSketch[string](5)
assert.NoError(t, err)

words := []string{"apple", "banana", "cherry", "date", "elderberry", "fig", "grape", "honeydew"}
for _, word := range words {
sketch.Update(word)
}

assert.Equal(t, 5, sketch.NumSamples())
fmt.Printf("Sampled words: %v\n", sketch.Samples())
}

func TestReservoirSamplingWithStructs(t *testing.T) {
type LogEntry struct {
Timestamp int64
Message string
}

sketch, err := sampling.NewReservoirItemsSketch[LogEntry](3)
assert.NoError(t, err)

// Simulate streaming log entries
for i := int64(1); i <= 100; i++ {
sketch.Update(LogEntry{
Timestamp: i,
Message: fmt.Sprintf("Log message %d", i),
})
}

assert.Equal(t, 3, sketch.NumSamples())
fmt.Printf("Sampled log entries: %+v\n", sketch.Samples())
}

func TestReservoirUnion(t *testing.T) {
// Distributed sampling: each node samples independently
node1, _ := sampling.NewReservoirItemsSketch[int64](10)
node2, _ := sampling.NewReservoirItemsSketch[int64](10)

for i := int64(1); i <= 500; i++ {
node1.Update(i)
}
for i := int64(501); i <= 1000; i++ {
node2.Update(i)
}

// Merge samples from both nodes
union, err := sampling.NewReservoirItemsUnion[int64](10)
assert.NoError(t, err)

union.UpdateSketch(node1)
union.UpdateSketch(node2)

result, err := union.Result()
assert.NoError(t, err)

assert.Equal(t, 10, result.NumSamples())
fmt.Printf("Union result: %d samples from combined stream\n", result.NumSamples())
}
113 changes: 113 additions & 0 deletions sampling/reservoir_items_sketch.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package sampling

import (
"errors"
"math/rand"
)

// ResizeFactor controls how the internal array grows.
// Note: Go's slice append has automatic resizing, so this is kept for
// API compatibility with the Java version. Can be removed if not needed.
type ResizeFactor int

const (
ResizeX1 ResizeFactor = 1
ResizeX2 ResizeFactor = 2
ResizeX4 ResizeFactor = 4
ResizeX8 ResizeFactor = 8

defaultResizeFactor = ResizeX8
minK = 1
)

// ReservoirItemsSketch provides a uniform random sample of items
// from a stream of unknown size using the reservoir sampling algorithm.
//
// The algorithm works in two phases:
// - Initial phase (n < k): all items are stored
// - Steady state (n >= k): each new item replaces a random item with probability k/n
//
// This ensures each item has equal probability k/n of being in the final sample.
type ReservoirItemsSketch[T any] struct {
k int // maximum reservoir size
n int64 // total items seen
data []T // reservoir storage
}

// NewReservoirItemsSketch creates a new reservoir sketch with the given capacity k.
func NewReservoirItemsSketch[T any](k int) (*ReservoirItemsSketch[T], error) {
if k < minK {
return nil, errors.New("k must be at least 1")
}

return &ReservoirItemsSketch[T]{
k: k,
n: 0,
data: make([]T, 0, min(k, int(defaultResizeFactor))),
}, nil
}

// Update adds an item to the sketch using reservoir sampling algorithm.
func (s *ReservoirItemsSketch[T]) Update(item T) {
if s.n < int64(s.k) {
// Initial phase: store all items until reservoir is full
s.data = append(s.data, item)
} else {
// Steady state: replace with probability k/n
j := rand.Int63n(s.n + 1)
if j < int64(s.k) {
s.data[j] = item
}
}
s.n++
}

// K returns the maximum reservoir capacity.
func (s *ReservoirItemsSketch[T]) K() int {
return s.k
}

// N returns the total number of items seen by the sketch.
func (s *ReservoirItemsSketch[T]) N() int64 {
return s.n
}

// NumSamples returns the number of items currently in the reservoir.
func (s *ReservoirItemsSketch[T]) NumSamples() int {
return len(s.data)
}

// Samples returns a copy of the items in the reservoir.
func (s *ReservoirItemsSketch[T]) Samples() []T {
result := make([]T, len(s.data))
copy(result, s.data)
return result
}

// IsEmpty returns true if no items have been seen.
func (s *ReservoirItemsSketch[T]) IsEmpty() bool {
return s.n == 0
}

// Reset clears the sketch while preserving capacity k.
func (s *ReservoirItemsSketch[T]) Reset() {
s.n = 0
s.data = s.data[:0]
}
Loading