From ad5c163b85981762d08db7bd7aad40f253a13d3b Mon Sep 17 00:00:00 2001 From: Fengzdadi Date: Wed, 31 Dec 2025 21:05:49 -0500 Subject: [PATCH] feat: add example tests for Theta, Bloom Filter, Count-Min, and Tuple sketches --- examples/bloom_filter_example_test.go | 90 +++++++++++++++++++ examples/count_min_example_test.go | 84 ++++++++++++++++++ examples/theta_example_test.go | 89 +++++++++++++++++++ examples/tuple_example_test.go | 121 ++++++++++++++++++++++++++ 4 files changed, 384 insertions(+) create mode 100644 examples/bloom_filter_example_test.go create mode 100644 examples/count_min_example_test.go create mode 100644 examples/theta_example_test.go create mode 100644 examples/tuple_example_test.go diff --git a/examples/bloom_filter_example_test.go b/examples/bloom_filter_example_test.go new file mode 100644 index 0000000..4fb3904 --- /dev/null +++ b/examples/bloom_filter_example_test.go @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package examples + +import ( + "fmt" + "testing" + + "github.com/apache/datasketches-go/filters" + "github.com/stretchr/testify/assert" +) + +func TestBloomFilter(t *testing.T) { + // Create a Bloom filter for 1000 items with 1% false positive rate + filter, err := filters.NewBloomFilterByAccuracy(1000, 0.01) + assert.NoError(t, err) + assert.True(t, filter.IsEmpty()) + + // Add items to the filter + for i := 0; i < 500; i++ { + err := filter.UpdateString(fmt.Sprintf("user_%d", i)) + assert.NoError(t, err) + } + assert.False(t, filter.IsEmpty()) + + // Query for items in the filter + assert.True(t, filter.QueryString("user_0")) + assert.True(t, filter.QueryString("user_100")) + + // Query for items not in the filter (may have false positives) + notFoundCount := 0 + for i := 1000; i < 1100; i++ { + if !filter.QueryString(fmt.Sprintf("user_%d", i)) { + notFoundCount++ + } + } + assert.Greater(t, notFoundCount, 90) + + // Use different data types + _ = filter.UpdateInt64(12345) + assert.True(t, filter.QueryInt64(12345)) + + // QueryAndUpdate for atomic test-and-set + wasPresent := filter.QueryAndUpdateString("new_item") + assert.False(t, wasPresent) + wasPresent = filter.QueryAndUpdateString("new_item") + assert.True(t, wasPresent) + + // Create a second filter for union + filter2, err := filters.NewBloomFilterByAccuracy(1000, 0.01) + assert.NoError(t, err) + for i := 250; i < 750; i++ { + _ = filter2.UpdateString(fmt.Sprintf("user_%d", i)) + } + + // Union two filters + filter3, err := filters.NewBloomFilterByAccuracy(1000, 0.01) + assert.NoError(t, err) + for i := 0; i < 500; i++ { + _ = filter3.UpdateString(fmt.Sprintf("user_%d", i)) + } + err = filter3.Union(filter2) + assert.NoError(t, err) + assert.True(t, filter3.QueryString("user_0")) + assert.True(t, filter3.QueryString("user_600")) + + // Serialize and deserialize + bytes, err := filter.ToCompactSlice() + assert.NoError(t, err) + + restored, err := filters.NewBloomFilterFromSlice(bytes) + assert.NoError(t, err) + assert.True(t, restored.QueryString("user_0")) + assert.Equal(t, filter.BitsUsed(), restored.BitsUsed()) +} diff --git a/examples/count_min_example_test.go b/examples/count_min_example_test.go new file mode 100644 index 0000000..b49313d --- /dev/null +++ b/examples/count_min_example_test.go @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package examples + +import ( + "bytes" + "fmt" + "testing" + + "github.com/apache/datasketches-go/count" + "github.com/stretchr/testify/assert" +) + +func TestCountMinSketch(t *testing.T) { + seed := int64(12345) + + // Create a Count-Min Sketch with suggested parameters + numBuckets, err := count.SuggestNumBuckets(0.1) + assert.NoError(t, err) + numHashes, err := count.SuggestNumHashes(0.99) + assert.NoError(t, err) + + sketch, err := count.NewCountMinSketch(numHashes, numBuckets, seed) + assert.NoError(t, err) + + // Update with frequency data + for i := 0; i < 1000; i++ { + _ = sketch.UpdateString("apple", 1) + } + for i := 0; i < 500; i++ { + _ = sketch.UpdateString("banana", 1) + } + for i := 0; i < 100; i++ { + _ = sketch.UpdateString(fmt.Sprintf("item_%d", i), 1) + } + + // Get frequency estimates (Count-Min never underestimates) + assert.GreaterOrEqual(t, sketch.GetEstimateString("apple"), int64(1000)) + assert.GreaterOrEqual(t, sketch.GetEstimateString("banana"), int64(500)) + + // Update with weight + _ = sketch.UpdateString("grape", 50) + assert.GreaterOrEqual(t, sketch.GetEstimateString("grape"), int64(50)) + + // Create a second sketch for merging + sketch2, err := count.NewCountMinSketch(numHashes, numBuckets, seed) + assert.NoError(t, err) + for i := 0; i < 500; i++ { + _ = sketch2.UpdateString("apple", 1) + } + for i := 0; i < 300; i++ { + _ = sketch2.UpdateString("orange", 1) + } + + // Merge sketches + err = sketch.Merge(sketch2) + assert.NoError(t, err) + assert.GreaterOrEqual(t, sketch.GetEstimateString("apple"), int64(1500)) + assert.GreaterOrEqual(t, sketch.GetEstimateString("orange"), int64(300)) + + // Serialize and deserialize + var buf bytes.Buffer + err = sketch.Serialize(&buf) + assert.NoError(t, err) + + restored, err := sketch.Deserialize(buf.Bytes(), seed) + assert.NoError(t, err) + assert.Equal(t, sketch.GetTotalWeight(), restored.GetTotalWeight()) +} diff --git a/examples/theta_example_test.go b/examples/theta_example_test.go new file mode 100644 index 0000000..76cfbdf --- /dev/null +++ b/examples/theta_example_test.go @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package examples + +import ( + "fmt" + "testing" + + "github.com/apache/datasketches-go/theta" + "github.com/stretchr/testify/assert" +) + +func TestThetaSketch(t *testing.T) { + // Create a new Theta sketch + sketch, err := theta.NewQuickSelectUpdateSketch() + assert.NoError(t, err) + + // Update the sketch with 1000 items + for i := 0; i < 1000; i++ { + _ = sketch.UpdateString(fmt.Sprintf("item_%d", i)) + } + + // Get the estimate of the number of unique items + estimate := sketch.Estimate() + assert.InDelta(t, 1000, estimate, 1000*0.05) + + // Create a second sketch with overlapping items + sketch2, err := theta.NewQuickSelectUpdateSketch(theta.WithUpdateSketchLgK(14)) + assert.NoError(t, err) + + for i := 500; i < 1500; i++ { + _ = sketch2.UpdateString(fmt.Sprintf("item_%d", i)) + } + + // Convert to compact form for set operations + compact1 := sketch.Compact(true) + compact2 := sketch2.Compact(true) + + // Compute union of two sketches + union, err := theta.NewUnion() + assert.NoError(t, err) + err = union.Update(compact1) + assert.NoError(t, err) + err = union.Update(compact2) + assert.NoError(t, err) + + unionResult, err := union.OrderedResult() + assert.NoError(t, err) + assert.InDelta(t, 1500, unionResult.Estimate(), 1500*0.05) + + // Compute intersection of two sketches + intersection := theta.NewIntersection() + err = intersection.Update(compact1) + assert.NoError(t, err) + err = intersection.Update(compact2) + assert.NoError(t, err) + + intersectionResult, err := intersection.OrderedResult() + assert.NoError(t, err) + assert.InDelta(t, 500, intersectionResult.Estimate(), 500*0.1) + + // Compute set difference (A \ B) + aNotBResult, err := theta.ANotB(compact1, compact2, theta.DefaultSeed, true) + assert.NoError(t, err) + assert.InDelta(t, 500, aNotBResult.Estimate(), 500*0.1) + + // Serialize and deserialize + bytes1, err := compact1.MarshalBinary() + assert.NoError(t, err) + + deserialized, err := theta.WrapCompactSketch(bytes1, theta.DefaultSeed) + assert.NoError(t, err) + assert.InDelta(t, estimate, deserialized.Estimate(), 1) +} diff --git a/examples/tuple_example_test.go b/examples/tuple_example_test.go new file mode 100644 index 0000000..4867a42 --- /dev/null +++ b/examples/tuple_example_test.go @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package examples + +import ( + "fmt" + "testing" + + "github.com/apache/datasketches-go/tuple" + "github.com/stretchr/testify/assert" +) + +// SumSummary is a custom Summary type that sums float64 values. +type SumSummary struct { + sum float64 + count int +} + +func (s *SumSummary) Reset() { + s.sum = 0 + s.count = 0 +} + +func (s *SumSummary) Clone() tuple.Summary { + return &SumSummary{sum: s.sum, count: s.count} +} + +func (s *SumSummary) Update(value float64) { + s.sum += value + s.count++ +} + +func (s *SumSummary) GetSum() float64 { return s.sum } +func (s *SumSummary) GetCount() int { return s.count } +func (s *SumSummary) String() string { return fmt.Sprintf("{sum: %.2f, count: %d}", s.sum, s.count) } +func newSumSummary() *SumSummary { return &SumSummary{} } + +// SumMergePolicy implements tuple.Policy for merging SumSummary instances. +type SumMergePolicy struct{} + +func (p *SumMergePolicy) Apply(internal *SumSummary, incoming *SumSummary) { + internal.sum += incoming.sum + internal.count += incoming.count +} + +func TestTupleSketch(t *testing.T) { + // Create a Tuple Sketch with custom Summary + sketch, err := tuple.NewUpdateSketch[*SumSummary, float64](newSumSummary) + assert.NoError(t, err) + + // Update with aggregated data (customer spending) + _ = sketch.UpdateString("alice", 100.50) + _ = sketch.UpdateString("alice", 50.25) + _ = sketch.UpdateString("alice", 75.00) + _ = sketch.UpdateString("bob", 200.00) + _ = sketch.UpdateString("bob", 30.00) + + for i := 0; i < 100; i++ { + _ = sketch.UpdateString(fmt.Sprintf("customer_%d", i), 10.0) + } + + // Verify distinct count + assert.InDelta(t, 102, sketch.Estimate(), 10) + + // Verify aggregated values + for _, summary := range sketch.All() { + if summary.GetCount() == 3 { + assert.InDelta(t, 225.75, summary.GetSum(), 0.01) + } + if summary.GetCount() == 2 && summary.GetSum() > 200 { + assert.InDelta(t, 230.00, summary.GetSum(), 0.01) + } + } + + // Create a second sketch for set operations + sketch2, err := tuple.NewUpdateSketch[*SumSummary, float64](newSumSummary) + assert.NoError(t, err) + _ = sketch2.UpdateString("alice", 150.00) + _ = sketch2.UpdateString("diana", 300.00) + + // Compact the sketches + compact1, err := sketch.Compact(true) + assert.NoError(t, err) + compact2, err := sketch2.Compact(true) + assert.NoError(t, err) + + // Union with custom merge policy + mergePolicy := &SumMergePolicy{} + union, err := tuple.NewUnion[*SumSummary](mergePolicy) + assert.NoError(t, err) + _ = union.Update(compact1) + _ = union.Update(compact2) + + unionResult, err := union.Result(true) + assert.NoError(t, err) + assert.InDelta(t, 103, unionResult.Estimate(), 10) + + // Intersection + intersection := tuple.NewIntersection[*SumSummary](mergePolicy) + _ = intersection.Update(compact1) + _ = intersection.Update(compact2) + + intersectionResult, err := intersection.Result(true) + assert.NoError(t, err) + assert.InDelta(t, 1, intersectionResult.Estimate(), 1) +}