diff --git a/BENCHMARK_OVERHEAD.md b/BENCHMARK_OVERHEAD.md
new file mode 100644
index 000000000..973aee3ed
--- /dev/null
+++ b/BENCHMARK_OVERHEAD.md
@@ -0,0 +1,312 @@
+# OTel Observability Performance Overhead Benchmarking
+
+This directory contains a comprehensive benchmarking suite to measure the performance overhead of the OpenTelemetry (OTel) observability implementation in go-redis.
+
+## 📋 Overview
+
+The benchmarking suite performs a **3-way comparison** to measure:
+
+1. **Baseline Performance** - Upstream master without any OTel code
+2. **Dormant Code Overhead** - Current branch with OTel code present but disabled
+3. **Active Metrics Overhead** - Current branch with OTel metrics enabled
+
+## 🎯 Goals
+
+### Primary Goals
+- **Prove zero overhead when disabled**: The no-op pattern should add negligible overhead (<1%) when metrics are disabled
+- **Measure active overhead**: Quantify the performance cost when metrics are actively collected
+- **Validate production readiness**: Ensure overhead is acceptable for production use
+
+### Success Criteria
+- **Disabled vs Master**: ~0% overhead (within statistical noise)
+- **Enabled vs Disabled**: <5-10% overhead for most operations
+- **Memory allocations**: Minimal increase in allocations per operation
+
+## 📁 Files
+
+### Core Files
+- **`benchmark_overhead_test.go`** - Go benchmark suite with table-driven tests
+- **`compare_perf.sh`** - Automated comparison script
+- **`BENCHMARK_OVERHEAD.md`** - This documentation
+
+### Generated Files (after running benchmarks)
+- **`benchmark_results_*/`** - Results directory with timestamp
+  - `current_branch.txt` - Raw results from current branch
+  - `upstream_master.txt` - Raw results from upstream/master
+  - `otel_enabled.txt` - Extracted enabled results
+  - `otel_disabled.txt` - Extracted disabled results
+  - `comparison_*.txt` - benchstat comparison reports
+  - `README.md` - Summary of the benchmark run
+
+## 🚀 Quick Start
+
+### Prerequisites
+
+1. **Redis server running**:
+   ```bash
+   docker run -d -p 6379:6379 redis:latest
+   ```
+
+2. **benchstat installed** (script will auto-install if missing):
+   ```bash
+   go install golang.org/x/perf/cmd/benchstat@latest
+   ```
+
+### Running the Full Comparison
+
+```bash
+# Run with default settings (3 iterations, 3s per benchmark)
+./compare_perf.sh
+
+# Run with custom settings
+BENCHMARK_COUNT=10 BENCHMARK_TIME=30s ./compare_perf.sh
+
+# Run specific benchmarks only
+BENCHMARK_FILTER="BenchmarkOTelOverhead/.*Ping" ./compare_perf.sh
+```
+
+### Running Individual Benchmarks
+
+```bash
+# The scenario benchmarks live in extra/redisotel-native;
+# the repo root holds the BenchmarkOTelOverhead_Baseline equivalents
+cd extra/redisotel-native
+
+# Run all OTel overhead benchmarks
+go test -bench=BenchmarkOTelOverhead -benchmem -benchtime=10s -count=5
+
+# Run specific scenario
+go test -bench=BenchmarkOTelOverhead/OTel_Enabled -benchmem -benchtime=10s -count=5
+
+# Run specific operation
+go test -bench=BenchmarkOTelOverhead/OTel_Enabled/Ping -benchmem -benchtime=10s -count=5
+
+# Run connection pool benchmarks
+go test -bench=BenchmarkOTelOverhead_ConnectionPool -benchmem -benchtime=10s -count=5
+```
+
+## 📊 Understanding the Results
+
+### benchstat Output Format
+
+```
+name      old time/op    new time/op    delta
+Ping-8    156µs ± 2%     158µs ± 3%     +1.28%  (p=0.008 n=5+5)
+
+name      old alloc/op   new alloc/op   delta
+Ping-8    112B ± 0%      112B ± 0%      ~       (all equal)
+
+name      old allocs/op  new allocs/op  delta
+Ping-8    4.00 ± 0%      4.00 ± 0%      ~       (all equal)
+```
+
+### Interpreting Results
+
+- **`~`** - No statistically significant difference (excellent for disabled mode!)
+- **`+X%`** - Slower by X% (overhead)
+- **`-X%`** - Faster by X% (unlikely, usually measurement variance)
+- **`p-value`** - Statistical significance (p < 0.05 means the difference is real)
+- **`n=X+Y`** - Number of samples used for comparison
+
+### What to Look For
+
+#### Comparison 1: Master vs Disabled
+```
+✅ GOOD: ~0% difference, p > 0.05 (no significant difference)
+❌ BAD:  >1% overhead with p < 0.05 (dormant code has measurable cost)
+```
+
+#### Comparison 2: Disabled vs Enabled
+```
+✅ GOOD:       <5% overhead for simple operations (Ping, Get, Set)
+✅ ACCEPTABLE: <10% overhead for complex operations (Pipeline)
+⚠️ REVIEW:     >10% overhead (may need optimization)
+```
+
+#### Comparison 3: Master vs Enabled
+```
+✅ GOOD:   Total overhead <10% for production workloads
+⚠️ REVIEW: >15% overhead (consider if metrics value justifies cost)
+```
+
+## 🔬 Benchmark Coverage
+
+### Operations Tested
+
+1. **Ping** - Simplest operation, measures baseline overhead
+2. **Set** - Write operation with key generation
+3. **Get** - Read operation with cache hits
+4. **SetGet_Mixed** - Realistic workload (70% reads, 30% writes)
+5. **Pipeline** - Batch operations (10 commands per pipeline)
+
+### Scenarios Tested
+
+1. **OTel_Enabled** - Full metrics collection
+2. **OTel_Disabled** - Code present but disabled
+3. **No_OTel** - Baseline from upstream/master
+
+### Concurrency
+
+All benchmarks use `b.RunParallel()` to simulate real-world concurrent access patterns.
+
+## 🛠️ Customization
+
+### Environment Variables
+
+```bash
+# Number of benchmark iterations (default: 3)
+BENCHMARK_COUNT=10 ./compare_perf.sh
+
+# Time per benchmark (default: 3s)
+BENCHMARK_TIME=30s ./compare_perf.sh
+
+# Filter benchmarks by name (default: BenchmarkOTelOverhead)
+BENCHMARK_FILTER="BenchmarkOTelOverhead/.*Ping" ./compare_perf.sh
+
+# Upstream remote name (default: upstream)
+UPSTREAM_REMOTE=origin ./compare_perf.sh
+
+# Upstream branch name (default: master)
+UPSTREAM_BRANCH=main ./compare_perf.sh
+```
+
+### Combining Options
+
+```bash
+# Run 10 iterations of 30s each, only Ping benchmarks
+BENCHMARK_COUNT=10 \
+BENCHMARK_TIME=30s \
+BENCHMARK_FILTER="BenchmarkOTelOverhead/.*Ping" \
+./compare_perf.sh
+```
+
+## 📈 Example Results
+
+### Expected Results (Hypothetical)
+
+#### Comparison 1: Master vs Disabled (Dormant Code Overhead)
+```
+name                                 old time/op    new time/op    delta
+OTelOverhead/OTel_Disabled/Ping-8    156µs ± 2%     157µs ± 3%     ~     (p=0.234 n=5+5)
+OTelOverhead/OTel_Disabled/Set-8     189µs ± 1%     190µs ± 2%     ~     (p=0.421 n=5+5)
+OTelOverhead/OTel_Disabled/Get-8     145µs ± 2%     146µs ± 1%     ~     (p=0.548 n=5+5)
+
+name                                 old alloc/op   new alloc/op   delta
+OTelOverhead/OTel_Disabled/Ping-8    112B ± 0%      112B ± 0%      ~     (all equal)
+```
+**✅ Result: No measurable overhead when disabled**
+
+#### Comparison 2: Disabled vs Enabled (Active Metrics Overhead)
+```
+name                                old time/op    new time/op    delta
+OTelOverhead/OTel_Enabled/Ping-8    157µs ± 3%     164µs ± 2%     +4.46%   (p=0.008 n=5+5)
+OTelOverhead/OTel_Enabled/Set-8     190µs ± 2%     199µs ± 3%     +4.74%   (p=0.016 n=5+5)
+OTelOverhead/OTel_Enabled/Get-8     146µs ± 1%     153µs ± 2%     +4.79%   (p=0.008 n=5+5)
+
+name                                old alloc/op   new alloc/op   delta
+OTelOverhead/OTel_Enabled/Ping-8    112B ± 0%      128B ± 0%      +14.29%  (p=0.000 n=5+5)
+```
+**✅ Result: ~5% latency overhead, acceptable for production**
+
+## 🔍 Troubleshooting
+
+### Redis Not Running
+```
+❌ Redis is not running on localhost:6379
+💡 Start Redis with: docker run -d -p 6379:6379 redis:latest
+```
+
+### benchstat Not Found
+The script will auto-install benchstat. If it fails:
+```bash
+go install golang.org/x/perf/cmd/benchstat@latest
+export PATH=$PATH:$(go env GOPATH)/bin
+```
+
+### Benchmark Timeout
+Increase the timeout:
+```bash
+# In compare_perf.sh, modify the timeout flag:
+go test -bench=... -timeout=60m ...
+```
+
+### High Variance in Results
+- Ensure system is not under load
+- Increase `BENCHMARK_COUNT` for more samples
+- Increase `BENCHMARK_TIME` for longer runs
+- Close other applications
+
+## 📝 Best Practices
+
+### Before Running Benchmarks
+
+1. **Close unnecessary applications** to reduce system noise
+2. **Ensure stable system load** (no background tasks)
+3. **Use consistent Redis configuration** (same version, same settings)
+4. **Run multiple iterations** (at least 5) for statistical significance
+
+### Interpreting Results
+
+1. **Focus on p-values** - Only trust differences with p < 0.05
+2. **Look at trends** - Consistent overhead across operations is more meaningful
+3. **Consider absolute values** - 10% of 1µs is less concerning than 10% of 1ms
+4. **Check allocations** - Memory overhead can be as important as latency
+
+### Reporting Results
+
+When sharing benchmark results:
+1. Include system information (CPU, RAM, OS)
+2. Include Redis version and configuration
+3. Include full benchstat output
+4. Note any anomalies or special conditions
+5. Include multiple runs to show consistency
+
+## 🎓 Advanced Usage
+
+### Profiling
+
+```bash
+# CPU profile
+go test -bench=BenchmarkOTelOverhead/OTel_Enabled/Ping \
+    -cpuprofile=cpu.prof -benchtime=30s
+
+# Memory profile
+go test -bench=BenchmarkOTelOverhead/OTel_Enabled/Ping \
+    -memprofile=mem.prof -benchtime=30s
+
+# Analyze profiles
+go tool pprof cpu.prof
+go tool pprof mem.prof
+```
+
+### Comparing Specific Commits
+
+```bash
+# Benchmark commit A
+git checkout commit-a
+go test -bench=BenchmarkOTelOverhead -benchmem -count=5 > commit-a.txt
+
+# Benchmark commit B
+git checkout commit-b
+go test -bench=BenchmarkOTelOverhead -benchmem -count=5 > commit-b.txt
+
+# Compare
+benchstat commit-a.txt commit-b.txt
+```
+
+## 📚 References
+
+- [Go Benchmarking Guide](https://pkg.go.dev/testing#hdr-Benchmarks)
+- [benchstat Documentation](https://pkg.go.dev/golang.org/x/perf/cmd/benchstat)
+- [OpenTelemetry Semantic Conventions](https://opentelemetry.io/docs/specs/semconv/)
+- [go-redis Documentation](https://redis.uptrace.dev/)
+
+## 🤝 Contributing
+
+When adding new benchmarks:
+1. Follow the existing naming convention
+2. Use `b.RunParallel()` for realistic concurrency
+3. Use `b.ReportAllocs()` to track memory
+4. Add documentation to this file
+5. Update the comparison script if needed
+
+## 📄 License
+
+Same as go-redis (BSD 2-Clause License)
+
diff --git a/BENCHMARK_QUICKSTART.md b/BENCHMARK_QUICKSTART.md
new file mode 100644
index 000000000..5902709ef
--- /dev/null
+++ b/BENCHMARK_QUICKSTART.md
@@ -0,0 +1,245 @@
+# Benchmark Quick Start Guide
+
+## TL;DR
+
+```bash
+# 1. Start Redis
+docker run -d -p 6379:6379 redis:latest
+
+# 2. Run automated comparison
+./compare_perf.sh
+
+# 3. View results
+cat benchmark_results_*/comparison_master_vs_disabled.txt
+```
+
+## What This Proves
+
+### Goal 1: Zero Overhead When Disabled ✅
+**Comparison**: `upstream/master` vs `OTel_Disabled`
+
+**Expected Result**:
+```
+name                            old time/op    new time/op    delta
+OTelOverhead_Baseline/Ping-8    30.0µs ± 2%    30.1µs ± 3%    ~    (p=0.234)
+```
+
+**Interpretation**: The `~` symbol means no statistically significant difference. This proves the no-op pattern works perfectly.
+
+### Goal 2: Acceptable Overhead When Enabled ✅
+**Comparison**: `OTel_Disabled` vs `OTel_Enabled`
+
+**Expected Result**:
+```
+name                                old time/op    new time/op    delta
+OTelOverhead/OTel_Enabled/Ping-8    30.1µs ± 3%    31.5µs ± 2%    +4.65%  (p=0.008)
+```
+
+**Interpretation**: ~5% overhead is acceptable for production observability.
+
+## Manual Testing
+
+### Test 1: Baseline (No OTel Code)
+
+```bash
+# Run on current branch
+go test -bench=BenchmarkOTelOverhead_Baseline/Ping -benchmem -count=5
+
+# Example output:
+# BenchmarkOTelOverhead_Baseline/Ping-8    37743    29957 ns/op    172 B/op    6 allocs/op
+```
+
+### Test 2: OTel Disabled (Dormant Code)
+
+```bash
+cd extra/redisotel-native
+go test -bench=BenchmarkOTelOverhead/OTel_Disabled/Ping -benchmem -count=5
+
+# Example output:
+# BenchmarkOTelOverhead/OTel_Disabled/Ping-8    39856    30994 ns/op    5738 B/op    46 allocs/op
+```
+
+### Test 3: OTel Enabled (Active Metrics)
+
+```bash
+cd extra/redisotel-native
+go test -bench=BenchmarkOTelOverhead/OTel_Enabled/Ping -benchmem -count=5
+
+# Example output:
+# BenchmarkOTelOverhead/OTel_Enabled/Ping-8    38123    32456 ns/op    5890 B/op    48 allocs/op
+```
+
+## Comparing Results
+
+### Using benchstat
+
+```bash
+# Save baseline
+go test -bench=BenchmarkOTelOverhead_Baseline -benchmem -count=5 > baseline.txt
+
+# Save OTel disabled
+cd extra/redisotel-native
+go test -bench=BenchmarkOTelOverhead/OTel_Disabled -benchmem -count=5 > disabled.txt
+
+# Compare
+benchstat ../baseline.txt disabled.txt
+```
+
+### Reading benchstat Output
+
+```
+name      old time/op    new time/op    delta
+Ping-8    156µs ± 2%     158µs ± 3%     +1.28%  (p=0.008 n=5+5)
+```
+
+- **old time/op**: Baseline performance
+- **new time/op**: New performance
+- **delta**: Percentage change
+- **±X%**: Variance (lower is better)
+- **p-value**: Statistical significance (p < 0.05 means the difference is real)
+- **n=5+5**: Number of samples
+
+**Symbols**:
+- `~` = No significant difference (GOOD for disabled mode!)
+- `+X%` = Slower by X%
+- `-X%` = Faster by X%
+
+## Environment Variables
+
+```bash
+# Run 10 iterations instead of the default 3
+BENCHMARK_COUNT=10 ./compare_perf.sh
+
+# Run each benchmark for 30 seconds instead of the default 3
+BENCHMARK_TIME=30s ./compare_perf.sh
+
+# Run only Ping benchmarks
+BENCHMARK_FILTER="BenchmarkOTelOverhead/.*Ping" ./compare_perf.sh
+
+# Combine options
+BENCHMARK_COUNT=10 BENCHMARK_TIME=30s ./compare_perf.sh
+```
+
+## Troubleshooting
+
+### Redis Not Running
+
+```bash
+# Error: connection refused
+# Solution:
+docker run -d -p 6379:6379 redis:latest
+```
+
+### benchstat Not Found
+
+```bash
+# Error: benchstat: command not found
+# Solution:
+go install golang.org/x/perf/cmd/benchstat@latest
+export PATH=$PATH:$(go env GOPATH)/bin
+```
+
+### High Variance
+
+```bash
+# If you see ±10% or more variance:
+# 1. Close other applications
+# 2. Run more iterations
+BENCHMARK_COUNT=10 BENCHMARK_TIME=30s ./compare_perf.sh
+```
+
+## What to Include in PR
+
+### Minimum
+
+```markdown
+## Performance Impact
+
+Ran benchmarks comparing upstream/master vs current branch:
+
+**OTel Disabled (Dormant Code Overhead)**:
+- Ping: ~0% overhead (p=0.234, not significant)
+- Set: ~0% overhead (p=0.421, not significant)
+- Get: ~0% overhead (p=0.548, not significant)
+
+**OTel Enabled (Active Metrics Overhead)**:
+- Ping: +4.5% overhead
+- Set: +4.7% overhead
+- Get: +4.8% overhead
+
+Conclusion: Zero overhead when disabled, acceptable overhead when enabled.
+```
+
+### Detailed
+
+Include the full benchstat output:
+
+```markdown
+## Performance Benchmarks
+
+### Comparison 1: Master vs Disabled (Dormant Code)
+
+\`\`\`
+name                            old time/op    new time/op    delta
+OTelOverhead_Baseline/Ping-8    30.0µs ± 2%    30.1µs ± 3%    ~    (p=0.234 n=5+5)
+OTelOverhead_Baseline/Set-8     35.2µs ± 1%    35.4µs ± 2%    ~    (p=0.421 n=5+5)
+OTelOverhead_Baseline/Get-8     28.5µs ± 2%    28.7µs ± 1%    ~    (p=0.548 n=5+5)
+\`\`\`
+
+### Comparison 2: Disabled vs Enabled (Active Metrics)
+
+\`\`\`
+name                                old time/op    new time/op    delta
+OTelOverhead/OTel_Enabled/Ping-8    30.1µs ± 3%    31.5µs ± 2%    +4.65%  (p=0.008 n=5+5)
+OTelOverhead/OTel_Enabled/Set-8     35.4µs ± 2%    37.1µs ± 3%    +4.80%  (p=0.016 n=5+5)
+OTelOverhead/OTel_Enabled/Get-8     28.7µs ± 1%    30.1µs ± 2%    +4.88%  (p=0.008 n=5+5)
+\`\`\`
+```
+
+## Advanced Usage
+
+### CPU Profiling
+
+```bash
+cd extra/redisotel-native
+go test -bench=BenchmarkOTelOverhead/OTel_Enabled/Ping \
+    -cpuprofile=cpu.prof -benchtime=30s
+
+go tool pprof -http=:8080 cpu.prof
+```
+
+### Memory Profiling
+
+```bash
+cd extra/redisotel-native
+go test -bench=BenchmarkOTelOverhead/OTel_Enabled/Ping \
+    -memprofile=mem.prof -benchtime=30s
+
+go tool pprof -http=:8080 mem.prof
+```
+
+### Trace Analysis
+
+```bash
+cd extra/redisotel-native
+go test -bench=BenchmarkOTelOverhead/OTel_Enabled/Ping \
+    -trace=trace.out -benchtime=10s
+
+go tool trace trace.out
+```
+
+## Files Reference
+
+- **`benchmark_overhead_test.go`** - Baseline benchmarks (root package)
+- **`extra/redisotel-native/benchmark_overhead_test.go`** - OTel benchmarks
+- **`compare_perf.sh`** - Automated comparison script
+- **`BENCHMARK_OVERHEAD.md`** - Comprehensive documentation
+- **`extra/redisotel-native/BENCHMARKS.md`** - OTel-specific guide
+
+## See Also
+
+- [BENCHMARK_OVERHEAD.md](BENCHMARK_OVERHEAD.md) - Full documentation
+- [extra/redisotel-native/BENCHMARKS.md](extra/redisotel-native/BENCHMARKS.md) - OTel benchmarks
+- [Go Benchmarking](https://pkg.go.dev/testing#hdr-Benchmarks)
+- [benchstat](https://pkg.go.dev/golang.org/x/perf/cmd/benchstat)
+
diff --git a/benchmark_overhead_test.go b/benchmark_overhead_test.go
new file mode 100644
index 000000000..087495087
--- /dev/null
+++ b/benchmark_overhead_test.go
@@ -0,0 +1,205 @@
+package redis_test
+
+import (
+	"context"
+	"fmt"
+	"testing"
+	"time"
+
+	"github.com/redis/go-redis/v9"
+)
+
+// BenchmarkOTelOverhead_Baseline measures baseline performance without OTel.
+// This benchmark is designed to run on both the current branch and upstream/master.
+// On the current branch with OTel code, use the benchmarks in extra/redisotel-native/
+func BenchmarkOTelOverhead_Baseline(b *testing.B) {
+	ctx := context.Background()
+
+	// Create Redis client
+	rdb := redis.NewClient(&redis.Options{
+		Addr:         ":6379",
+		DialTimeout:  time.Second,
+		ReadTimeout:  time.Second,
+		WriteTimeout: time.Second,
+		PoolSize:     10,
+		MinIdleConns: 5,
+	})
+	defer rdb.Close()
+
+	// Verify connection
+	if err := rdb.Ping(ctx).Err(); err != nil {
+		b.Skipf("Redis server not available: %v", err)
+	}
+
+	// Clean up
+	if err := rdb.FlushDB(ctx).Err(); err != nil {
+		b.Fatalf("Failed to flush DB: %v", err)
+	}
+
+	// Run sub-benchmarks for different operations
+	b.Run("Ping", func(b *testing.B) {
+		benchmarkPing(b, rdb, ctx)
+	})
+
+	b.Run("Set", func(b *testing.B) {
+		benchmarkSet(b, rdb, ctx)
+	})
+
+	b.Run("Get", func(b *testing.B) {
+		benchmarkGet(b, rdb, ctx)
+	})
+
+	b.Run("SetGet_Mixed", func(b *testing.B) {
+		benchmarkSetGetMixed(b, rdb, ctx)
+	})
+
+	b.Run("Pipeline",
func(b *testing.B) { + benchmarkPipeline(b, rdb, ctx) + }) +} + +// benchmarkPing measures PING command performance +func benchmarkPing(b *testing.B, rdb *redis.Client, ctx context.Context) { + b.ReportAllocs() + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + if err := rdb.Ping(ctx).Err(); err != nil { + b.Fatal(err) + } + } + }) +} + +// benchmarkSet measures SET command performance +func benchmarkSet(b *testing.B, rdb *redis.Client, ctx context.Context) { + b.ReportAllocs() + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + i := 0 + for pb.Next() { + key := fmt.Sprintf("key:%d", i) + if err := rdb.Set(ctx, key, "value", 0).Err(); err != nil { + b.Fatal(err) + } + i++ + } + }) +} + +// benchmarkGet measures GET command performance +func benchmarkGet(b *testing.B, rdb *redis.Client, ctx context.Context) { + // Pre-populate keys + for i := 0; i < 1000; i++ { + key := fmt.Sprintf("key:%d", i) + if err := rdb.Set(ctx, key, "value", 0).Err(); err != nil { + b.Fatal(err) + } + } + + b.ReportAllocs() + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + i := 0 + for pb.Next() { + key := fmt.Sprintf("key:%d", i%1000) + if err := rdb.Get(ctx, key).Err(); err != nil && err != redis.Nil { + b.Fatal(err) + } + i++ + } + }) +} + +// benchmarkSetGetMixed measures mixed SET/GET workload (70% GET, 30% SET) +func benchmarkSetGetMixed(b *testing.B, rdb *redis.Client, ctx context.Context) { + // Pre-populate keys + for i := 0; i < 1000; i++ { + key := fmt.Sprintf("key:%d", i) + if err := rdb.Set(ctx, key, "value", 0).Err(); err != nil { + b.Fatal(err) + } + } + + b.ReportAllocs() + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + i := 0 + for pb.Next() { + key := fmt.Sprintf("key:%d", i%1000) + + // 70% GET, 30% SET + if i%10 < 7 { + if err := rdb.Get(ctx, key).Err(); err != nil && err != redis.Nil { + b.Fatal(err) + } + } else { + if err := rdb.Set(ctx, key, "value", 0).Err(); err != nil { + b.Fatal(err) + } + } + i++ + } + }) +} + +// benchmarkPipeline measures pipelined operations performance +func benchmarkPipeline(b *testing.B, rdb *redis.Client, ctx context.Context) { + b.ReportAllocs() + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + i := 0 + for pb.Next() { + pipe := rdb.Pipeline() + + // Pipeline 10 commands + for j := 0; j < 10; j++ { + key := fmt.Sprintf("key:%d:%d", i, j) + pipe.Set(ctx, key, "value", 0) + } + + if _, err := pipe.Exec(ctx); err != nil { + b.Fatal(err) + } + i++ + } + }) +} + +// BenchmarkOTelOverhead_ConnectionPool measures connection pool overhead +func BenchmarkOTelOverhead_ConnectionPool(b *testing.B) { + ctx := context.Background() + + // Create Redis client with larger pool + rdb := redis.NewClient(&redis.Options{ + Addr: ":6379", + DialTimeout: time.Second, + ReadTimeout: time.Second, + WriteTimeout: time.Second, + PoolSize: 100, + MinIdleConns: 10, + }) + defer rdb.Close() + + // Verify connection + if err := rdb.Ping(ctx).Err(); err != nil { + b.Skipf("Redis server not available: %v", err) + } + + b.ReportAllocs() + b.ResetTimer() + + // Stress test connection pool with high concurrency + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + if err := rdb.Ping(ctx).Err(); err != nil { + b.Fatal(err) + } + } + }) +} diff --git a/compare_perf.sh b/compare_perf.sh new file mode 100755 index 000000000..441c6afe7 --- /dev/null +++ b/compare_perf.sh @@ -0,0 +1,322 @@ +#!/bin/bash + +# Performance Comparison Script for OTel Observability Overhead +# This script automates the 3-way comparison: +# 1. 
Current branch with OTel Enabled
+# 2. Current branch with OTel Disabled
+# 3. Upstream master (baseline without OTel code)
+
+set -e  # Exit on error
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# Configuration (can be overridden via environment variables)
+# Optimized defaults: 3 iterations × 3s = statistically valid in ~5-8 minutes total
+BENCHMARK_COUNT=${BENCHMARK_COUNT:-3}   # Number of times to run each benchmark (3 is minimum for benchstat)
+BENCHMARK_TIME=${BENCHMARK_TIME:-3s}    # How long to run each benchmark (3s gives stable results)
+BENCHMARK_FILTER=${BENCHMARK_FILTER:-"BenchmarkOTelOverhead"}  # Benchmark name filter
+UPSTREAM_REMOTE=${UPSTREAM_REMOTE:-"upstream"}
+UPSTREAM_BRANCH=${UPSTREAM_BRANCH:-"master"}
+
+echo -e "${BLUE}╔════════════════════════════════════════════════════════════════╗${NC}"
+echo -e "${BLUE}║  Redis Go Client - OTel Observability Performance Comparison   ║${NC}"
+echo -e "${BLUE}╚════════════════════════════════════════════════════════════════╝${NC}"
+echo ""
+echo -e "${BLUE}⏱️  Estimated time: ~5-8 minutes (count=${BENCHMARK_COUNT}, time=${BENCHMARK_TIME})${NC}"
+echo -e "${YELLOW}💡 Customize: BENCHMARK_COUNT=5 BENCHMARK_TIME=5s ./compare_perf.sh${NC}"
+echo ""
+
+# Check if benchstat is installed
+if ! command -v benchstat &> /dev/null; then
+    echo -e "${YELLOW}⚠️  benchstat not found. Installing...${NC}"
+    go install golang.org/x/perf/cmd/benchstat@latest
+    if ! command -v benchstat &> /dev/null; then
+        echo -e "${RED}❌ Failed to install benchstat. Please install manually:${NC}"
+        echo -e "${RED}   go install golang.org/x/perf/cmd/benchstat@latest${NC}"
+        exit 1
+    fi
+    echo -e "${GREEN}✅ benchstat installed${NC}"
+fi
+
+# Check if Redis is running
+echo -e "${BLUE}🔍 Checking Redis connection...${NC}"
+
+# Try redis-cli first (works on both Linux and macOS)
+if command -v redis-cli &> /dev/null; then
+    if redis-cli -h localhost -p 6379 ping &> /dev/null; then
+        echo -e "${GREEN}✅ Redis is running${NC}"
+    else
+        echo -e "${RED}❌ Redis is not running on localhost:6379${NC}"
+        echo -e "${YELLOW}💡 Start Redis with: docker run -d -p 6379:6379 redis:latest${NC}"
+        exit 1
+    fi
+# Fallback to nc (netcat) - works on both Linux and macOS
+elif command -v nc &> /dev/null; then
+    if nc -z localhost 6379 2>/dev/null; then
+        echo -e "${GREEN}✅ Redis is running (detected via port check)${NC}"
+    else
+        echo -e "${RED}❌ Redis is not running on localhost:6379${NC}"
+        echo -e "${YELLOW}💡 Start Redis with: docker run -d -p 6379:6379 redis:latest${NC}"
+        exit 1
+    fi
+# Fallback to /dev/tcp (works on Linux with bash, not macOS)
+elif timeout 2 bash -c "echo > /dev/tcp/localhost/6379" 2>/dev/null; then
+    echo -e "${GREEN}✅ Redis is running (detected via /dev/tcp)${NC}"
+else
+    echo -e "${RED}❌ Redis is not running on localhost:6379${NC}"
+    echo -e "${YELLOW}💡 Start Redis with: docker run -d -p 6379:6379 redis:latest${NC}"
+    exit 1
+fi
+echo ""
+
+# Save current state
+CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
+echo -e "${BLUE}📍 Current branch: ${GREEN}${CURRENT_BRANCH}${NC}"
+
+# Check for uncommitted changes
+if ! git diff-index --quiet HEAD --; then
+    echo -e "${YELLOW}⚠️  You have uncommitted changes. Stashing...${NC}"
+    git stash push -m "benchmark_comparison_$(date +%s)"
+    STASHED=true
+else
+    STASHED=false
+fi
+
+# Create results directory
+RESULTS_DIR="benchmark_results_$(date +%Y%m%d_%H%M%S)"
+mkdir -p "$RESULTS_DIR"
+echo -e "${BLUE}📁 Results directory: ${GREEN}${RESULTS_DIR}${NC}"
+echo ""
+
+# Function to run benchmarks
+run_benchmark() {
+    local output_file=$1
+    local description=$2
+    local test_dir=$3
+
+    echo -e "${BLUE}🏃 Running benchmarks: ${YELLOW}${description}${NC}"
+    echo -e "${BLUE}   Directory: ${test_dir}${NC}"
+    echo -e "${BLUE}   Benchmark count: ${BENCHMARK_COUNT}${NC}"
+    echo -e "${BLUE}   Benchmark time: ${BENCHMARK_TIME}${NC}"
+    echo -e "${BLUE}   Filter: ${BENCHMARK_FILTER}${NC}"
+
+    # Run the benchmark multiple times for statistical significance.
+    # The command is wrapped in `if` so a failure is reported here
+    # instead of aborting the whole script via `set -e`.
+    if (cd "$test_dir" && go test -bench="${BENCHMARK_FILTER}" \
+        -benchmem \
+        -benchtime="${BENCHMARK_TIME}" \
+        -count="${BENCHMARK_COUNT}" \
+        -timeout=30m \
+        -run=^$ \
+        .) > "$output_file" 2>&1; then
+        echo -e "${GREEN}✅ Benchmarks completed${NC}"
+    else
+        echo -e "${RED}❌ Benchmarks failed. Check ${output_file} for details${NC}"
+        return 1
+    fi
+}
+
+# Cleanup function
+cleanup() {
+    echo ""
+    echo -e "${BLUE}🧹 Cleaning up...${NC}"
+
+    # Return to original branch
+    if [ "$(git rev-parse --abbrev-ref HEAD)" != "$CURRENT_BRANCH" ]; then
+        echo -e "${BLUE}   Returning to branch: ${CURRENT_BRANCH}${NC}"
+        git checkout "$CURRENT_BRANCH" 2>/dev/null || true
+    fi
+
+    # Restore stashed changes
+    if [ "$STASHED" = true ]; then
+        echo -e "${BLUE}   Restoring stashed changes...${NC}"
+        git stash pop 2>/dev/null || true
+    fi
+
+    echo -e "${GREEN}✅ Cleanup complete${NC}"
+}
+
+# Set trap to cleanup on exit
+trap cleanup EXIT
+
+# ============================================================================
+# STEP 1: Run benchmarks on current branch (OTel Enabled + Disabled)
+# ============================================================================
+echo -e "${BLUE}╔════════════════════════════════════════════════════════════════╗${NC}"
+echo -e "${BLUE}║  STEP 1: Benchmarking Current Branch (with OTel code)          ║${NC}"
+echo -e "${BLUE}╚════════════════════════════════════════════════════════════════╝${NC}"
+echo ""
+
+run_benchmark "$RESULTS_DIR/current_branch.txt" "Current Branch (OTel Enabled + Disabled)" "extra/redisotel-native"
+
+# ============================================================================
+# STEP 2: Run benchmarks on upstream/master (baseline)
+# ============================================================================
+echo ""
+echo -e "${BLUE}╔════════════════════════════════════════════════════════════════╗${NC}"
+echo -e "${BLUE}║  STEP 2: Benchmarking Upstream Master (baseline)               ║${NC}"
+echo -e "${BLUE}╚════════════════════════════════════════════════════════════════╝${NC}"
+echo ""
+
+# Fetch upstream
+echo -e "${BLUE}📥 Fetching ${UPSTREAM_REMOTE}/${UPSTREAM_BRANCH}...${NC}"
+git fetch "$UPSTREAM_REMOTE" "$UPSTREAM_BRANCH"
+
+# Checkout upstream/master
+echo -e "${BLUE}🔀 Checking out ${UPSTREAM_REMOTE}/${UPSTREAM_BRANCH}...${NC}"
+git checkout "${UPSTREAM_REMOTE}/${UPSTREAM_BRANCH}"
+
+# Check if benchmark file exists on master
+if [ ! -f "benchmark_overhead_test.go" ]; then
+    echo -e "${YELLOW}⚠️  benchmark_overhead_test.go not found on ${UPSTREAM_REMOTE}/${UPSTREAM_BRANCH}${NC}"
+    echo -e "${YELLOW}   Running baseline benchmarks using BenchmarkOTelOverhead_Baseline${NC}"
+
+    # Run baseline benchmarks
+    go test -bench="BenchmarkOTelOverhead_Baseline" \
+        -benchmem \
+        -benchtime="${BENCHMARK_TIME}" \
+        -count="${BENCHMARK_COUNT}" \
+        -timeout=30m \
+        -run=^$ \
+        . > "$RESULTS_DIR/upstream_master.txt" 2>&1
+else
+    # Run the same benchmark on master
+    run_benchmark "$RESULTS_DIR/upstream_master.txt" "Upstream Master (No OTel code)" "."
+fi
+
+# Return to original branch
+echo -e "${BLUE}🔀 Returning to ${CURRENT_BRANCH}...${NC}"
+git checkout "$CURRENT_BRANCH"
+
+# ============================================================================
+# STEP 3: Analyze results with benchstat
+# ============================================================================
+echo ""
+echo -e "${BLUE}╔════════════════════════════════════════════════════════════════╗${NC}"
+echo -e "${BLUE}║  STEP 3: Analyzing Results                                     ║${NC}"
+echo -e "${BLUE}╚════════════════════════════════════════════════════════════════╝${NC}"
+echo ""
+
+# Extract OTel_Enabled and OTel_Disabled results from current branch
+echo -e "${BLUE}📊 Extracting results...${NC}"
+grep "OTel_Enabled" "$RESULTS_DIR/current_branch.txt" > "$RESULTS_DIR/otel_enabled.txt" || true
+grep "OTel_Disabled" "$RESULTS_DIR/current_branch.txt" > "$RESULTS_DIR/otel_disabled.txt" || true
+
+# Generate comparison reports
+echo ""
+echo -e "${GREEN}╔════════════════════════════════════════════════════════════════╗${NC}"
+echo -e "${GREEN}║  COMPARISON 1: Upstream Master vs OTel Disabled                ║${NC}"
+echo -e "${GREEN}║  (Measures overhead of dormant OTel code)                      ║${NC}"
+echo -e "${GREEN}╚════════════════════════════════════════════════════════════════╝${NC}"
+echo ""
+
+if [ -s "$RESULTS_DIR/upstream_master.txt" ] && [ -s "$RESULTS_DIR/otel_disabled.txt" ]; then
+    benchstat "$RESULTS_DIR/upstream_master.txt" "$RESULTS_DIR/otel_disabled.txt" | tee "$RESULTS_DIR/comparison_master_vs_disabled.txt"
+else
+    echo -e "${YELLOW}⚠️  Insufficient data for comparison${NC}"
+fi
+
+echo ""
+echo -e "${GREEN}╔════════════════════════════════════════════════════════════════╗${NC}"
+echo -e "${GREEN}║  COMPARISON 2: OTel Disabled vs OTel Enabled                   ║${NC}"
+echo -e "${GREEN}║  (Measures overhead when metrics are enabled)                  ║${NC}"
+echo -e "${GREEN}╚════════════════════════════════════════════════════════════════╝${NC}"
"${GREEN}โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•${NC}" +echo "" + +if [ -s "$RESULTS_DIR/otel_disabled.txt" ] && [ -s "$RESULTS_DIR/otel_enabled.txt" ]; then + benchstat "$RESULTS_DIR/otel_disabled.txt" "$RESULTS_DIR/otel_enabled.txt" | tee "$RESULTS_DIR/comparison_disabled_vs_enabled.txt" +else + echo -e "${YELLOW}โš ๏ธ Insufficient data for comparison${NC}" +fi + +echo "" +echo -e "${GREEN}โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—${NC}" +echo -e "${GREEN}โ•‘ COMPARISON 3: Upstream Master vs OTel Enabled โ•‘${NC}" +echo -e "${GREEN}โ•‘ (Measures total overhead with metrics enabled) โ•‘${NC}" +echo -e "${GREEN}โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•${NC}" +echo "" + +if [ -s "$RESULTS_DIR/upstream_master.txt" ] && [ -s "$RESULTS_DIR/otel_enabled.txt" ]; then + benchstat "$RESULTS_DIR/upstream_master.txt" "$RESULTS_DIR/otel_enabled.txt" | tee "$RESULTS_DIR/comparison_master_vs_enabled.txt" +else + echo -e "${YELLOW}โš ๏ธ Insufficient data for comparison${NC}" +fi + +# ============================================================================ +# STEP 4: Generate summary +# ============================================================================ +echo "" +echo -e "${BLUE}โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—${NC}" +echo -e "${BLUE}โ•‘ SUMMARY โ•‘${NC}" +echo -e "${BLUE}โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•${NC}" +echo "" + +cat > "$RESULTS_DIR/README.md" << EOF +# OTel Observability Performance Comparison + +**Date:** $(date) +**Branch:** ${CURRENT_BRANCH} +**Benchmark Count:** ${BENCHMARK_COUNT} +**Benchmark Time:** ${BENCHMARK_TIME} + +## Files + +- \`current_branch.txt\` - Raw benchmark results from current branch (both enabled and disabled) +- \`upstream_master.txt\` - Raw benchmark results from upstream/master +- \`otel_enabled.txt\` - Extracted OTel enabled results +- \`otel_disabled.txt\` - Extracted OTel disabled results +- \`comparison_master_vs_disabled.txt\` - Comparison showing dormant code overhead +- \`comparison_disabled_vs_enabled.txt\` - Comparison showing metrics collection overhead +- \`comparison_master_vs_enabled.txt\` - Comparison showing total overhead + +## How to Read Results + +### Comparison 1: Master vs Disabled +This shows the overhead of having the OTel code present but disabled. +**Goal:** Should be ~0% overhead (proves no-op pattern works) + +### Comparison 2: Disabled vs Enabled +This shows the overhead when metrics are actively collected. +**Goal:** Should be acceptable for production use (<5-10% for most operations) + +### Comparison 3: Master vs Enabled +This shows the total overhead with metrics enabled. +**Goal:** Combined overhead should still be acceptable + +## Interpreting benchstat Output + +- **~** means no significant difference (good for dormant code!) 
+- **+X%** means slower (overhead)
+- **-X%** means faster (unlikely but possible due to variance)
+- **p-value < 0.05** means the difference is statistically significant
+
+## Running Benchmarks Manually
+
+\`\`\`bash
+# Run all benchmarks
+go test -bench=BenchmarkOTelOverhead -benchmem -benchtime=10s -count=5
+
+# Run specific benchmark
+go test -bench=BenchmarkOTelOverhead/OTel_Enabled/Ping -benchmem -benchtime=10s -count=5
+
+# Compare two runs
+benchstat old.txt new.txt
+\`\`\`
+EOF
+
+echo -e "${GREEN}✅ Results saved to: ${BLUE}${RESULTS_DIR}${NC}"
+echo -e "${GREEN}✅ Summary saved to: ${BLUE}${RESULTS_DIR}/README.md${NC}"
+echo ""
+echo -e "${BLUE}📖 To view detailed results:${NC}"
+echo -e "${BLUE}   cat ${RESULTS_DIR}/comparison_*.txt${NC}"
+echo ""
+echo -e "${GREEN}╔════════════════════════════════════════════════════════════════╗${NC}"
+echo -e "${GREEN}║  ✅ Performance comparison complete!                           ║${NC}"
+echo -e "${GREEN}╚════════════════════════════════════════════════════════════════╝${NC}"

diff --git a/example/otel-metrics/README.md b/example/otel-metrics/README.md
new file mode 100644
index 000000000..25ae8d7b2
--- /dev/null
+++ b/example/otel-metrics/README.md
@@ -0,0 +1,67 @@
+# OpenTelemetry Metrics Example
+
+This example demonstrates how to enable OpenTelemetry metrics for Redis operations using the `extra/redisotel-native` package.
+
+## Features
+
+- ✅ OTLP exporter configuration
+- ✅ Periodic metric export (every 10 seconds)
+- ✅ Concurrent Redis operations
+- ✅ Automatic metric collection for:
+  - Operation duration
+  - Connection metrics
+  - Error tracking
+
+## Prerequisites
+
+- Go 1.23.0 or later
+- Redis server running on `localhost:6379`
+- OTLP collector running on `localhost:4317` (optional)
+
+## Running the Example
+
+```bash
+# Start Redis (if not already running)
+redis-server
+
+# Optional: Start OTLP collector
+# See: https://opentelemetry.io/docs/collector/
+
+# Run the example
+go run main.go
+```
+
+## What It Does
+
+1. Creates an OTLP exporter that sends metrics to a collector
+2. Sets up a meter provider with periodic export (every 10 seconds)
+3. Initializes Redis client with OTel instrumentation
+4. Executes concurrent Redis operations (SET commands)
+5.
Waits for metrics to be exported + +## Metrics Collected + +The example automatically collects: + +- **db.client.operation.duration** - Operation latency histogram +- **db.client.connection.create_time** - Connection creation time +- **db.client.connection.count** - Active connection count +- **db.client.errors** - Error counter with error type classification + +## Configuration + +To use with a production OTLP collector: + +```go +exporter, err := otlpmetricgrpc.New(ctx, + otlpmetricgrpc.WithEndpoint("your-collector:4317"), + otlpmetricgrpc.WithTLSCredentials(credentials.NewClientTLSFromCert(certPool, "")), +) +``` + +## See Also + +- [OpenTelemetry Go SDK](https://opentelemetry.io/docs/languages/go/) +- [OTLP Exporter Documentation](https://opentelemetry.io/docs/specs/otlp/) +- [Redis OTel Native Package](../../extra/redisotel-native/) + diff --git a/example/otel-metrics/go.mod b/example/otel-metrics/go.mod new file mode 100644 index 000000000..8cee8554a --- /dev/null +++ b/example/otel-metrics/go.mod @@ -0,0 +1,39 @@ +module github.com/redis/go-redis/example/otel-metrics + +go 1.23.0 + +toolchain go1.24.2 + +replace github.com/redis/go-redis/v9 => ../.. + +replace github.com/redis/go-redis/extra/redisotel-native/v9 => ../../extra/redisotel-native + +require ( + github.com/redis/go-redis/extra/redisotel-native/v9 v9.0.0-00010101000000-000000000000 + github.com/redis/go-redis/v9 v9.7.0 + go.opentelemetry.io/otel v1.38.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 + go.opentelemetry.io/otel/sdk/metric v1.38.0 +) + +require ( + github.com/cenkalti/backoff/v5 v5.0.3 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect + go.opentelemetry.io/auto/sdk v1.1.0 // indirect + go.opentelemetry.io/otel/metric v1.38.0 // indirect + go.opentelemetry.io/otel/sdk v1.38.0 // indirect + go.opentelemetry.io/otel/trace v1.38.0 // indirect + go.opentelemetry.io/proto/otlp v1.7.1 // indirect + golang.org/x/net v0.43.0 // indirect + golang.org/x/sys v0.35.0 // indirect + golang.org/x/text v0.28.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 // indirect + google.golang.org/grpc v1.75.0 // indirect + google.golang.org/protobuf v1.36.8 // indirect +) diff --git a/example/otel-metrics/go.sum b/example/otel-metrics/go.sum new file mode 100644 index 000000000..f7db960cf --- /dev/null +++ b/example/otel-metrics/go.sum @@ -0,0 +1,63 @@ +github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= +github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= +github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= +github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/davecgh/go-spew v1.1.1 
h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnVTyacbefKhmbLhIhU= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= +go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= +go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= +go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 h1:vl9obrcoWVKp/lwl8tRE33853I8Xru9HFbw/skNeLs8= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0/go.mod h1:GAXRxmLJcVM3u22IjTg74zWBrRCKq8BnOqUVLodpcpw= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= +go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= +go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= +go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4= +go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= 
+golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 h1:BIRfGDEjiHRrk0QKZe3Xv2ieMhtgRGeLcZQ0mIVn4EY= +google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5/go.mod h1:j3QtIyytwqGr1JUDtYXwtMXWPKsEa5LtzIFN1Wn5WvE= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 h1:eaY8u2EuxbRv7c3NiGK0/NedzVsCcV6hDuU5qPX5EGE= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5/go.mod h1:M4/wBTSeyLxupu3W3tJtOgB14jILAS/XWPSSa3TAlJc= +google.golang.org/grpc v1.75.0 h1:+TW+dqTd2Biwe6KKfhE5JpiYIBWq865PhKGSXiivqt4= +google.golang.org/grpc v1.75.0/go.mod h1:JtPAzKiq4v1xcAB2hydNlWI2RnF85XXcV0mhKXr2ecQ= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/example/otel-metrics/main.go b/example/otel-metrics/main.go new file mode 100644 index 000000000..a1f46d6f3 --- /dev/null +++ b/example/otel-metrics/main.go @@ -0,0 +1,121 @@ +// EXAMPLE: otel_metrics +// HIDE_START +package main + +import ( + "context" + "log" + "math/rand" + "strconv" + "sync" + "time" + + redisotel "github.com/redis/go-redis/extra/redisotel-native/v9" + "github.com/redis/go-redis/v9" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" + "go.opentelemetry.io/otel/sdk/metric" +) + +// ExampleClient_otel_metrics demonstrates how to enable OpenTelemetry metrics +// for Redis operations and export them to an OTLP collector. 
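+//
+// The flow below: go-redis hooks record measurements, the PeriodicReader
+// collects them every 10 seconds, and the OTLP gRPC exporter ships them
+// to the collector (localhost:4317 by default).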
+func main() { + ctx := context.Background() + + // HIDE_END + + // STEP_START otel_exporter_setup + // Create OTLP exporter that sends metrics to the collector + // Default endpoint is localhost:4317 (gRPC) + exporter, err := otlpmetricgrpc.New(ctx, + otlpmetricgrpc.WithInsecure(), // Use insecure for local development + // For production, configure TLS and authentication: + // otlpmetricgrpc.WithEndpoint("your-collector:4317"), + // otlpmetricgrpc.WithTLSCredentials(...), + ) + if err != nil { + log.Fatalf("Failed to create OTLP exporter: %v", err) + } + // STEP_END + + // STEP_START otel_meter_provider + // Create meter provider with periodic reader + // Metrics are exported every 10 seconds + meterProvider := metric.NewMeterProvider( + metric.WithReader( + metric.NewPeriodicReader(exporter, + metric.WithInterval(10*time.Second), + ), + ), + ) + defer func() { + if err := meterProvider.Shutdown(ctx); err != nil { + log.Printf("Error shutting down meter provider: %v", err) + } + }() + + // Set the global meter provider + otel.SetMeterProvider(meterProvider) + // STEP_END + + // STEP_START redis_client_setup + // Create Redis client + rdb := redis.NewClient(&redis.Options{ + Addr: "localhost:6379", + }) + defer rdb.Close() + + // Initialize OTel instrumentation (uses global meter provider) + if err := redisotel.Init(rdb); err != nil { + log.Fatalf("Failed to initialize OTel: %v", err) + } + defer redisotel.Shutdown() + // STEP_END + + // STEP_START redis_operations + // Execute Redis operations - metrics are automatically collected + log.Println("Executing Redis operations...") + var wg sync.WaitGroup + wg.Add(50) + for i := range 50 { + go func(i int) { + defer wg.Done() + + for j := range 10 { + if err := rdb.Set(ctx, "key"+strconv.Itoa(i*10+j), "value", 0).Err(); err != nil { + log.Printf("Error setting key: %v", err) + } + time.Sleep(time.Millisecond * time.Duration(rand.Intn(400))) + } + }(i) + } + wg.Wait() + + wg.Add(10) + for i := range 10 { + go func(i int) { + defer wg.Done() + + for j := range 10 { + if err := rdb.Set(ctx, "key"+strconv.Itoa(i*10+j), "value", 0).Err(); err != nil { + log.Printf("Error setting key: %v", err) + } + time.Sleep(time.Millisecond * time.Duration(rand.Intn(400))) + } + }(i) + } + wg.Wait() + + for j := range 10 { + if err := rdb.Set(ctx, "key"+strconv.Itoa(j), "value", 0).Err(); err != nil { + log.Printf("Error setting key: %v", err) + } + time.Sleep(time.Millisecond * time.Duration(rand.Intn(400))) + } + + log.Println("Operations complete. Waiting for metrics to be exported...") + + // Wait for metrics to be exported + time.Sleep(15 * time.Second) + // STEP_END +} diff --git a/extra/redisotel-native/BENCHMARKS.md b/extra/redisotel-native/BENCHMARKS.md new file mode 100644 index 000000000..e88374d1a --- /dev/null +++ b/extra/redisotel-native/BENCHMARKS.md @@ -0,0 +1,191 @@ +# OTel Native Instrumentation Benchmarks + +This directory contains benchmarks to measure the performance overhead of the native OpenTelemetry instrumentation. 
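+
+The scenarios toggle instrumentation through the package's functional options. A minimal sketch of the pattern the benchmarks exercise (assuming a local Redis on `:6379`; the option names match those used in `benchmark_overhead_test.go`):
+
+```go
+package main
+
+import (
+	"context"
+
+	redisotel "github.com/redis/go-redis/extra/redisotel-native/v9"
+	"github.com/redis/go-redis/v9"
+)
+
+func main() {
+	rdb := redis.NewClient(&redis.Options{Addr: ":6379"})
+	defer rdb.Close()
+
+	// WithEnabled(false) is the dormant-code path measured by OTel_Disabled;
+	// pass true to exercise the OTel_Enabled path.
+	if err := redisotel.Init(rdb, redisotel.WithEnabled(false)); err != nil {
+		panic(err)
+	}
+	defer redisotel.Shutdown()
+
+	rdb.Ping(context.Background())
+}
+```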
+
+## Quick Start
+
+### Run All Benchmarks
+
+```bash
+# From the repository root
+cd extra/redisotel-native
+go test -bench=BenchmarkOTelOverhead -benchmem -benchtime=10s -count=5
+```
+
+### Run Specific Benchmarks
+
+```bash
+# OTel Disabled (measures dormant code overhead)
+go test -bench=BenchmarkOTelOverhead/OTel_Disabled -benchmem -benchtime=10s -count=5
+
+# OTel Enabled (measures active metrics overhead)
+go test -bench=BenchmarkOTelOverhead/OTel_Enabled -benchmem -benchtime=10s -count=5
+
+# Specific operation
+go test -bench=BenchmarkOTelOverhead/OTel_Enabled/Ping -benchmem -benchtime=10s -count=5
+
+# Connection pool stress test
+go test -bench=BenchmarkOTelOverhead_ConnectionPool -benchmem -benchtime=10s -count=5
+```
+
+## Automated Comparison
+
+For a comprehensive 3-way comparison (Baseline vs Disabled vs Enabled), use the automated script from the repository root:
+
+```bash
+# From the repository root
+./compare_perf.sh
+```
+
+This script will:
+1. Run benchmarks on the current branch (OTel Enabled + Disabled)
+2. Checkout upstream/master and run baseline benchmarks
+3. Generate comparison reports using benchstat
+4. Save all results to a timestamped directory
+
+## Understanding Results
+
+### Expected Overhead
+
+#### OTel Disabled vs Baseline
+- **Target**: ~0% overhead (within statistical noise)
+- **Acceptable**: <1% overhead
+- **Indicates**: Cost of dormant code (should be negligible due to no-op pattern)
+
+#### OTel Enabled vs Disabled
+- **Target**: <5% overhead for simple operations
+- **Acceptable**: <10% overhead for complex operations
+- **Indicates**: Cost of active metrics collection
+
+### Sample Output
+
+```
+name                                 old time/op    new time/op    delta
+OTelOverhead/OTel_Disabled/Ping-8    30.0µs ± 2%    30.1µs ± 3%    ~        (p=0.234 n=5+5)
+OTelOverhead/OTel_Enabled/Ping-8     30.1µs ± 3%    31.5µs ± 2%    +4.65%   (p=0.008 n=5+5)
+
+name                                 old alloc/op   new alloc/op   delta
+OTelOverhead/OTel_Disabled/Ping-8    172B ± 0%      172B ± 0%      ~        (all equal)
+OTelOverhead/OTel_Enabled/Ping-8     172B ± 0%      5738B ± 0%     +3236%   (p=0.000 n=5+5)
+```
+
+**Interpretation**:
+- Disabled mode: No measurable latency overhead (✅)
+- Enabled mode: ~4.65% latency overhead (✅ acceptable)
+- Memory: Enabled mode allocates more for metrics collection (expected)
+
+## Benchmark Coverage
+
+### Operations
+- **Ping** - Simplest operation, baseline overhead
+- **Set** - Write operations
+- **Get** - Read operations
+- **SetGet_Mixed** - Realistic workload (70% reads, 30% writes)
+- **Pipeline** - Batch operations
+
+### Scenarios
+- **OTel_Enabled** - Full metrics collection
+- **OTel_Disabled** - Code present but disabled
+- **Baseline** - No OTel code (run on upstream/master)
+
+## Prerequisites
+
+1. **Redis server running**:
+   ```bash
+   docker run -d -p 6379:6379 redis:latest
+   ```
+
+2. **benchstat installed** (for comparisons):
+   ```bash
+   go install golang.org/x/perf/cmd/benchstat@latest
+   ```
+
+## Manual Comparison
+
+```bash
+# Run benchmarks on current branch
+cd extra/redisotel-native
+go test -bench=BenchmarkOTelOverhead -benchmem -count=5 > current.txt
+
+# Checkout master and run baseline
+cd ../..
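+# Note (assumption): upstream/master may not contain benchmark_overhead_test.go
+# yet; the checkout below would then drop it from the worktree and the baseline
+# run would match nothing. In that case, restore just that file from your
+# branch after the checkout, e.g.:
+#   git checkout - -- benchmark_overhead_test.go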
+git stash +git checkout upstream/master +go test -bench=BenchmarkOTelOverhead_Baseline -benchmem -count=5 > baseline.txt + +# Compare +benchstat baseline.txt current.txt + +# Return to your branch +git checkout - +git stash pop +``` + +## Profiling + +### CPU Profile + +```bash +go test -bench=BenchmarkOTelOverhead/OTel_Enabled/Ping \ + -cpuprofile=cpu.prof -benchtime=30s + +go tool pprof cpu.prof +``` + +### Memory Profile + +```bash +go test -bench=BenchmarkOTelOverhead/OTel_Enabled/Ping \ + -memprofile=mem.prof -benchtime=30s + +go tool pprof mem.prof +``` + +### Trace + +```bash +go test -bench=BenchmarkOTelOverhead/OTel_Enabled/Ping \ + -trace=trace.out -benchtime=10s + +go tool trace trace.out +``` + +## Tips for Accurate Benchmarks + +1. **Close unnecessary applications** to reduce system noise +2. **Run multiple iterations** (at least 5) for statistical significance +3. **Use longer benchmark times** (10s+) for stable results +4. **Check p-values** - only trust differences with p < 0.05 +5. **Run on a quiet system** - avoid background tasks + +## Troubleshooting + +### High Variance +If you see high variance (ยฑ10% or more): +- Increase benchmark time: `-benchtime=30s` +- Increase iteration count: `-count=10` +- Close other applications +- Disable CPU frequency scaling (if possible) + +### Redis Connection Errors +```bash +# Check if Redis is running +redis-cli ping + +# Start Redis if needed +docker run -d -p 6379:6379 redis:latest +``` + +### Benchmark Timeout +Increase the timeout: +```bash +go test -bench=... -timeout=60m +``` + +## See Also + +- [../../BENCHMARK_OVERHEAD.md](../../BENCHMARK_OVERHEAD.md) - Comprehensive benchmarking guide +- [../../compare_perf.sh](../../compare_perf.sh) - Automated comparison script +- [Go Benchmarking Guide](https://pkg.go.dev/testing#hdr-Benchmarks) +- [benchstat Documentation](https://pkg.go.dev/golang.org/x/perf/cmd/benchstat) + diff --git a/extra/redisotel-native/benchmark_overhead_test.go b/extra/redisotel-native/benchmark_overhead_test.go new file mode 100644 index 000000000..81a1d6238 --- /dev/null +++ b/extra/redisotel-native/benchmark_overhead_test.go @@ -0,0 +1,280 @@ +package redisotel_test + +import ( + "context" + "fmt" + "testing" + "time" + + "github.com/redis/go-redis/v9" + redisotel "github.com/redis/go-redis/extra/redisotel-native/v9" + "go.opentelemetry.io/otel/exporters/prometheus" + "go.opentelemetry.io/otel/sdk/metric" +) + +// BenchmarkOTelOverhead measures the performance overhead of OTel observability +// This benchmark compares two scenarios: +// 1. OTel Enabled - Full metrics collection +// 2. 
OTel Disabled - Feature code present but disabled (measures dormant code cost) +// +// To compare with baseline (no OTel code), run the benchmarks in the root package +// on upstream/master using the compare_perf.sh script +func BenchmarkOTelOverhead(b *testing.B) { + ctx := context.Background() + + scenarios := []struct { + name string + enableMetrics bool + }{ + { + name: "OTel_Enabled", + enableMetrics: true, + }, + { + name: "OTel_Disabled", + enableMetrics: false, + }, + } + + for _, scenario := range scenarios { + b.Run(scenario.name, func(b *testing.B) { + // Create Redis client + rdb := redis.NewClient(&redis.Options{ + Addr: ":6379", + DialTimeout: time.Second, + ReadTimeout: time.Second, + WriteTimeout: time.Second, + PoolSize: 10, + MinIdleConns: 5, + }) + defer rdb.Close() + + // Create a no-op Prometheus exporter (metrics are collected but not exported) + exporter, err := prometheus.New() + if err != nil { + b.Fatalf("Failed to create Prometheus exporter: %v", err) + } + provider := metric.NewMeterProvider(metric.WithReader(exporter)) + + // Initialize OTel with enabled/disabled flag + if err := redisotel.Init(rdb, + redisotel.WithEnabled(scenario.enableMetrics), + redisotel.WithMeterProvider(provider), + ); err != nil { + b.Fatalf("Failed to initialize OTel: %v", err) + } + defer redisotel.Shutdown() + + // Verify connection + if err := rdb.Ping(ctx).Err(); err != nil { + b.Skipf("Redis server not available: %v", err) + } + + // Clean up + if err := rdb.FlushDB(ctx).Err(); err != nil { + b.Fatalf("Failed to flush DB: %v", err) + } + + // Run sub-benchmarks for different operations + b.Run("Ping", func(b *testing.B) { + benchmarkPing(b, rdb, ctx) + }) + + b.Run("Set", func(b *testing.B) { + benchmarkSet(b, rdb, ctx) + }) + + b.Run("Get", func(b *testing.B) { + benchmarkGet(b, rdb, ctx) + }) + + b.Run("SetGet_Mixed", func(b *testing.B) { + benchmarkSetGetMixed(b, rdb, ctx) + }) + + b.Run("Pipeline", func(b *testing.B) { + benchmarkPipeline(b, rdb, ctx) + }) + }) + } +} + +// benchmarkPing measures PING command performance +func benchmarkPing(b *testing.B, rdb *redis.Client, ctx context.Context) { + b.ReportAllocs() + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + if err := rdb.Ping(ctx).Err(); err != nil { + b.Fatal(err) + } + } + }) +} + +// benchmarkSet measures SET command performance +func benchmarkSet(b *testing.B, rdb *redis.Client, ctx context.Context) { + b.ReportAllocs() + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + i := 0 + for pb.Next() { + key := fmt.Sprintf("key:%d", i) + if err := rdb.Set(ctx, key, "value", 0).Err(); err != nil { + b.Fatal(err) + } + i++ + } + }) +} + +// benchmarkGet measures GET command performance +func benchmarkGet(b *testing.B, rdb *redis.Client, ctx context.Context) { + // Pre-populate keys + for i := 0; i < 1000; i++ { + key := fmt.Sprintf("key:%d", i) + if err := rdb.Set(ctx, key, "value", 0).Err(); err != nil { + b.Fatal(err) + } + } + + b.ReportAllocs() + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + i := 0 + for pb.Next() { + key := fmt.Sprintf("key:%d", i%1000) + if err := rdb.Get(ctx, key).Err(); err != nil && err != redis.Nil { + b.Fatal(err) + } + i++ + } + }) +} + +// benchmarkSetGetMixed measures mixed SET/GET workload (70% GET, 30% SET) +func benchmarkSetGetMixed(b *testing.B, rdb *redis.Client, ctx context.Context) { + // Pre-populate keys + for i := 0; i < 1000; i++ { + key := fmt.Sprintf("key:%d", i) + if err := rdb.Set(ctx, key, "value", 0).Err(); err != nil { + 
b.Fatal(err) + } + } + + b.ReportAllocs() + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + i := 0 + for pb.Next() { + key := fmt.Sprintf("key:%d", i%1000) + + // 70% GET, 30% SET + if i%10 < 7 { + if err := rdb.Get(ctx, key).Err(); err != nil && err != redis.Nil { + b.Fatal(err) + } + } else { + if err := rdb.Set(ctx, key, "value", 0).Err(); err != nil { + b.Fatal(err) + } + } + i++ + } + }) +} + +// benchmarkPipeline measures pipelined operations performance +func benchmarkPipeline(b *testing.B, rdb *redis.Client, ctx context.Context) { + b.ReportAllocs() + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + i := 0 + for pb.Next() { + pipe := rdb.Pipeline() + + // Pipeline 10 commands + for j := 0; j < 10; j++ { + key := fmt.Sprintf("key:%d:%d", i, j) + pipe.Set(ctx, key, "value", 0) + } + + if _, err := pipe.Exec(ctx); err != nil { + b.Fatal(err) + } + i++ + } + }) +} + +// BenchmarkOTelOverhead_ConnectionPool measures connection pool overhead with high concurrency +func BenchmarkOTelOverhead_ConnectionPool(b *testing.B) { + ctx := context.Background() + + scenarios := []struct { + name string + enableMetrics bool + }{ + { + name: "OTel_Enabled", + enableMetrics: true, + }, + { + name: "OTel_Disabled", + enableMetrics: false, + }, + } + + for _, scenario := range scenarios { + b.Run(scenario.name, func(b *testing.B) { + // Create Redis client with larger pool + rdb := redis.NewClient(&redis.Options{ + Addr: ":6379", + DialTimeout: time.Second, + ReadTimeout: time.Second, + WriteTimeout: time.Second, + PoolSize: 100, + MinIdleConns: 10, + }) + defer rdb.Close() + + // Setup OTel + exporter, err := prometheus.New() + if err != nil { + b.Fatalf("Failed to create Prometheus exporter: %v", err) + } + provider := metric.NewMeterProvider(metric.WithReader(exporter)) + + if err := redisotel.Init(rdb, + redisotel.WithEnabled(scenario.enableMetrics), + redisotel.WithMeterProvider(provider), + ); err != nil { + b.Fatalf("Failed to initialize OTel: %v", err) + } + defer redisotel.Shutdown() + + // Verify connection + if err := rdb.Ping(ctx).Err(); err != nil { + b.Skipf("Redis server not available: %v", err) + } + + b.ReportAllocs() + b.ResetTimer() + + // Stress test connection pool with high concurrency + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + if err := rdb.Ping(ctx).Err(); err != nil { + b.Fatal(err) + } + } + }) + }) + } +} + diff --git a/extra/redisotel-native/config.go b/extra/redisotel-native/config.go new file mode 100644 index 000000000..bfeed77e6 --- /dev/null +++ b/extra/redisotel-native/config.go @@ -0,0 +1,237 @@ +package redisotel + +import ( + "go.opentelemetry.io/otel/metric" +) + +type MetricGroup string + +const ( + MetricGroupCommand MetricGroup = "command" + MetricGroupConnectionBasic MetricGroup = "connection-basic" + MetricGroupResiliency MetricGroup = "resiliency" + MetricGroupConnectionAdvanced MetricGroup = "connection-advanced" + MetricGroupStream MetricGroup = "stream" +) + +type HistogramAggregation string + +const ( + HistogramAggregationExplicitBucket HistogramAggregation = "explicit_bucket_histogram" + HistogramAggregationBase2Exponential HistogramAggregation = "base2_exponential_bucket_histogram" +) + +// config holds the configuration for the instrumentation +type config struct { + // Core settings + meterProvider metric.MeterProvider + enabled bool + + // Metric group settings + enabledMetricGroups map[MetricGroup]bool + + // Command filtering + includeCommands map[string]bool // nil means include all + excludeCommands 
map[string]bool // nil means exclude none + + // Cardinality reduction + hidePubSubChannelNames bool + hideStreamNames bool + + // Histogram settings + histAggregation HistogramAggregation + + // Bucket configurations for different histogram metrics + bucketsOperationDuration []float64 + bucketsStreamProcessingDuration []float64 + bucketsConnectionCreateTime []float64 + bucketsConnectionWaitTime []float64 + bucketsConnectionUseTime []float64 +} + +func defaultConfig() config { + return config{ + meterProvider: nil, // Will use global otel.GetMeterProvider() if nil + enabled: false, + + // Default metric groups: connection-basic, resiliency + enabledMetricGroups: map[MetricGroup]bool{ + MetricGroupConnectionBasic: true, + MetricGroupResiliency: true, + }, + + // No command filtering by default + includeCommands: nil, + excludeCommands: nil, + + // Don't hide labels by default + hidePubSubChannelNames: false, + hideStreamNames: false, + + // Use explicit bucket histogram by default + histAggregation: HistogramAggregationExplicitBucket, + + // Default buckets for all duration metrics + bucketsOperationDuration: defaultHistogramBuckets(), + bucketsStreamProcessingDuration: defaultHistogramBuckets(), + bucketsConnectionCreateTime: defaultHistogramBuckets(), + bucketsConnectionWaitTime: defaultHistogramBuckets(), + bucketsConnectionUseTime: defaultHistogramBuckets(), + } +} + +// isMetricGroupEnabled checks if a metric group is enabled +func (c *config) isMetricGroupEnabled(group MetricGroup) bool { + return c.enabledMetricGroups[group] +} + +// isCommandIncluded checks if a command should be included in metrics +func (c *config) isCommandIncluded(command string) bool { + if c.excludeCommands != nil && c.excludeCommands[command] { + return false + } + + if c.includeCommands != nil { + return c.includeCommands[command] + } + + return true +} + +// defaultHistogramBuckets returns the default histogram buckets for all duration metrics. +// These buckets are designed to capture typical Redis operation and connection latencies: +// - Sub-millisecond: 0.0001s (0.1ms), 0.0005s (0.5ms) +// - Milliseconds: 0.001s (1ms), 0.005s (5ms), 0.01s (10ms), 0.05s (50ms), 0.1s (100ms) +// - Sub-second: 0.5s (500ms) +// - Seconds: 1s, 5s, 10s +// +// This covers the range from 0.1ms to 10s, which is suitable for: +// - db.client.operation.duration (command execution time) +// - db.client.connection.create_time (connection establishment) +// - db.client.connection.wait_time (waiting for connection from pool) +// - db.client.connection.use_time (time connection is checked out) +// - redis.client.stream.processing_duration (stream message processing) +func defaultHistogramBuckets() []float64 { + return []float64{ + 0.0001, // 0.1ms + 0.0005, // 0.5ms + 0.001, // 1ms + 0.005, // 5ms + 0.01, // 10ms + 0.05, // 50ms + 0.1, // 100ms + 0.5, // 500ms + 1.0, // 1s + 5.0, // 5s + 10.0, // 10s + } +} + +// Option is a functional option for configuring the instrumentation +type Option interface { + apply(*config) +} + +// optionFunc wraps a function to implement the Option interface +type optionFunc func(*config) + +func (f optionFunc) apply(c *config) { + f(c) +} + +// WithMeterProvider sets the meter provider to use for creating metrics. +// If not provided, the global meter provider from otel.GetMeterProvider() will be used. 
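+//
+// A minimal sketch of wiring a provider in (assuming "sdkmetric" is
+// go.opentelemetry.io/otel/sdk/metric and "reader" is an already
+// configured metric reader):
+//
+//	provider := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader))
+//	err := redisotel.Init(rdb, redisotel.WithMeterProvider(provider))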
+func WithMeterProvider(provider metric.MeterProvider) Option { + return optionFunc(func(c *config) { + c.meterProvider = provider + }) +} + +// WithEnabled enables or disables metrics emission +func WithEnabled(enabled bool) Option { + return optionFunc(func(c *config) { + c.enabled = enabled + }) +} + +// WithEnabledMetricGroups sets which metric groups to register +// Default: ["connection-basic", "resiliency"] +func WithEnabledMetricGroups(groups []MetricGroup) Option { + return optionFunc(func(c *config) { + c.enabledMetricGroups = make(map[MetricGroup]bool) + for _, group := range groups { + c.enabledMetricGroups[group] = true + } + }) +} + +// WithIncludeCommands sets a command allow-list for metrics +// Only commands in this list will have metrics recorded +// If not set, all commands are included (unless excluded) +func WithIncludeCommands(commands []string) Option { + return optionFunc(func(c *config) { + c.includeCommands = make(map[string]bool) + for _, cmd := range commands { + c.includeCommands[cmd] = true + } + }) +} + +// WithExcludeCommands sets a command deny-list for metrics +// Commands in this list will not have metrics recorded +func WithExcludeCommands(commands []string) Option { + return optionFunc(func(c *config) { + c.excludeCommands = make(map[string]bool) + for _, cmd := range commands { + c.excludeCommands[cmd] = true + } + }) +} + +// WithHidePubSubChannelNames omits channel label from Pub/Sub metrics to reduce cardinality +func WithHidePubSubChannelNames(hide bool) Option { + return optionFunc(func(c *config) { + c.hidePubSubChannelNames = hide + }) +} + +// WithHideStreamNames omits stream label from stream metrics to reduce cardinality +func WithHideStreamNames(hide bool) Option { + return optionFunc(func(c *config) { + c.hideStreamNames = hide + }) +} + +// WithHistogramAggregation sets the histogram aggregation mode +// Controls whether bucket overrides apply +func WithHistogramAggregation(agg HistogramAggregation) Option { + return optionFunc(func(c *config) { + c.histAggregation = agg + }) +} + +// WithHistogramBuckets sets custom histogram buckets for ALL duration metrics. +// If not set, uses defaultHistogramBuckets() which covers 0.1ms to 10s. +// Buckets should be in seconds (e.g., 0.001 = 1ms, 0.1 = 100ms, 1.0 = 1s). +// +// This applies to all duration histograms: +// - db.client.operation.duration +// - db.client.connection.create_time +// - db.client.connection.wait_time +// - db.client.connection.use_time +// - redis.client.stream.processing_duration +// +// Example: +// +// redisotel.Init(rdb, +// redisotel.WithHistogramBuckets([]float64{0.001, 0.01, 0.1, 1.0}), +// ) +func WithHistogramBuckets(buckets []float64) Option { + return optionFunc(func(c *config) { + c.bucketsOperationDuration = buckets + c.bucketsStreamProcessingDuration = buckets + c.bucketsConnectionCreateTime = buckets + c.bucketsConnectionWaitTime = buckets + c.bucketsConnectionUseTime = buckets + }) +} diff --git a/extra/redisotel-native/go.mod b/extra/redisotel-native/go.mod new file mode 100644 index 000000000..f7c1a4a23 --- /dev/null +++ b/extra/redisotel-native/go.mod @@ -0,0 +1,37 @@ +module github.com/redis/go-redis/extra/redisotel-native/v9 + +go 1.23.0 + +toolchain go1.24.2 + +replace github.com/redis/go-redis/v9 => ../.. 
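+// The replace directive above points the go-redis dependency at the local
+// checkout two directories up, so the instrumentation builds against the
+// in-repo changes rather than the released v9.7.0.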
+ +require ( + github.com/redis/go-redis/v9 v9.7.0 + go.opentelemetry.io/otel v1.38.0 + go.opentelemetry.io/otel/metric v1.38.0 +) + +require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/prometheus/client_golang v1.23.0 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.65.0 // indirect + github.com/prometheus/otlptranslator v0.0.2 // indirect + github.com/prometheus/procfs v0.17.0 // indirect + go.opentelemetry.io/auto/sdk v1.1.0 // indirect + go.opentelemetry.io/otel/exporters/prometheus v0.60.0 // indirect + go.opentelemetry.io/otel/sdk v1.38.0 // indirect + go.opentelemetry.io/otel/sdk/metric v1.38.0 // indirect + go.opentelemetry.io/otel/trace v1.38.0 // indirect + go.uber.org/atomic v1.11.0 // indirect + golang.org/x/sys v0.35.0 // indirect + google.golang.org/protobuf v1.36.8 // indirect +) diff --git a/extra/redisotel-native/go.sum b/extra/redisotel-native/go.sum new file mode 100644 index 000000000..b8d091a4b --- /dev/null +++ b/extra/redisotel-native/go.sum @@ -0,0 +1,61 @@ +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= +github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= +github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= +github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc h1:GN2Lv3MGO7AS6PrRoT6yV5+wkrOpcszoIsO4+4ds248= +github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc/go.mod 
h1:+JKpmjMGhpgPL+rXZ5nsZieVzvarn86asRlBg4uNGnk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.23.0 h1:ust4zpdl9r4trLY/gSjlm07PuiBq2ynaXXlptpfy8Uc= +github.com/prometheus/client_golang v1.23.0/go.mod h1:i/o0R9ByOnHX0McrTMTyhYvKE4haaf2mW08I+jGAjEE= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2VzE= +github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= +github.com/prometheus/otlptranslator v0.0.2 h1:+1CdeLVrRQ6Psmhnobldo0kTp96Rj80DRXRd5OSnMEQ= +github.com/prometheus/otlptranslator v0.0.2/go.mod h1:P8AwMgdD7XEr6QRUJ2QWLpiAZTgTE2UYgjlu3svompI= +github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0= +github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= +go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= +go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= +go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/exporters/prometheus v0.60.0 h1:cGtQxGvZbnrWdC2GyjZi0PDKVSLWP/Jocix3QWfXtbo= +go.opentelemetry.io/otel/exporters/prometheus v0.60.0/go.mod h1:hkd1EekxNo69PTV4OWFGZcKQiIqg0RfuWExcPKFvepk= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= +go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= +go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= +go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE= +go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git 
a/extra/redisotel-native/metrics.go b/extra/redisotel-native/metrics.go
new file mode 100644
index 000000000..edce4cdff
--- /dev/null
+++ b/extra/redisotel-native/metrics.go
@@ -0,0 +1,509 @@
+package redisotel
+
+import (
+	"context"
+	"fmt"
+	"net"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/redis/go-redis/v9"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/metric"
+)
+
+const (
+	// Library name for redis.client.library attribute
+	libraryName = "go-redis"
+)
+
+// libraryVersionAttr is the redis.client.library attribute, computed once at
+// package initialization and reused to avoid repeated string formatting.
+var libraryVersionAttr = attribute.String("redis.client.library",
+	fmt.Sprintf("%s:%s", libraryName, redis.Version()))
+
+// getLibraryVersionAttr returns the cached redis.client.library attribute
+func getLibraryVersionAttr() attribute.KeyValue {
+	return libraryVersionAttr
+}
+
+// addServerPortIfNonDefault adds server.port attribute if port is not the default (6379)
+func addServerPortIfNonDefault(attrs []attribute.KeyValue, serverPort string) []attribute.KeyValue {
+	if serverPort != "" && serverPort != "6379" {
+		return append(attrs, attribute.String("server.port", serverPort))
+	}
+	return attrs
+}
+
+// formatPoolName formats the pool name from server address and port
+func formatPoolName(serverAddr, serverPort string) string {
+	if serverPort != "" && serverPort != "6379" {
+		return fmt.Sprintf("%s:%s", serverAddr, serverPort)
+	}
+	return serverAddr
+}
+
+// metricsRecorder implements the otel.Recorder interface
+type metricsRecorder struct {
+	operationDuration        metric.Float64Histogram
+	connectionCount          metric.Int64UpDownCounter
+	connectionCreateTime     metric.Float64Histogram
+	connectionRelaxedTimeout metric.Int64UpDownCounter
+	connectionHandoff        metric.Int64Counter
+	clientErrors             metric.Int64Counter
+	maintenanceNotifications metric.Int64Counter
+
+	// Client configuration for attributes (used for operation metrics only)
+	serverAddr string
+	serverPort string
+	dbIndex    string
+}
+
+// RecordOperationDuration records db.client.operation.duration metric
+func (r *metricsRecorder) RecordOperationDuration(
+	ctx context.Context,
+	duration time.Duration,
+	cmd redis.Cmder,
+	attempts int,
+	cn redis.ConnInfo,
+) {
+	if r.operationDuration == nil {
+		return
+	}
+
+	// Convert duration to seconds (OTel convention for duration metrics)
+	durationSeconds := duration.Seconds()
+
+	// Build attributes
+	attrs := []attribute.KeyValue{
+		// Required attributes
+		attribute.String("db.operation.name", cmd.FullName()),
+		getLibraryVersionAttr(),
+		attribute.Int("redis.client.operation.retry_attempts", attempts-1), // attempts-1 = retry count
+		attribute.Bool("redis.client.operation.blocking", isBlockingCommand(cmd)),
+
+		// Recommended attributes
+		attribute.String("db.system.name", "redis"),
+		attribute.String("server.address", r.serverAddr),
+	}
+
+	// Add server.port if not default
+	attrs = addServerPortIfNonDefault(attrs, r.serverPort)
+
+	// Add db.namespace (database index) if available
+	if r.dbIndex != "" {
+		attrs = append(attrs, attribute.String("db.namespace", r.dbIndex))
+	}
+
+	// Add network.peer.address and network.peer.port from connection
+	if cn != nil {
+		remoteAddr := cn.RemoteAddr()
+		if remoteAddr != nil {
+			peerAddr, peerPort := splitHostPort(remoteAddr.String())
+			if peerAddr != "" {
+				attrs = append(attrs, attribute.String("network.peer.address", peerAddr))
+			}
+			if peerPort != "" {
+				attrs = append(attrs, attribute.String("network.peer.port", peerPort))
+			}
+		}
+	}
+
+	// Add error.type and, for Redis errors, db.response.status_code if the command failed
+	if err := cmd.Err(); err != nil {
+		attrs = append(attrs, attribute.String("error.type", classifyError(err)))
+		if statusCode := extractRedisErrorPrefix(err); statusCode != "" {
+			attrs = append(attrs, attribute.String("db.response.status_code", statusCode))
+		}
+	}
+
+	// Record the histogram
+	r.operationDuration.Record(ctx, durationSeconds, metric.WithAttributes(attrs...))
+}
+
+// isBlockingCommand checks if a command is a blocking operation
+// Blocking commands have a timeout parameter and include: BLPOP, BRPOP, BRPOPLPUSH, BLMOVE,
+// BZPOPMIN, BZPOPMAX, BZMPOP, BLMPOP, XREAD with BLOCK, XREADGROUP with BLOCK
+func isBlockingCommand(cmd redis.Cmder) bool {
+	name := strings.ToLower(cmd.Name())
+
+	// Commands that start with 'b' and are blocking
+	if strings.HasPrefix(name, "b") {
+		switch name {
+		case "blpop", "brpop", "brpoplpush", "blmove", "bzpopmin", "bzpopmax", "bzmpop", "blmpop":
+			return true
+		}
+	}
+
+	// XREAD and XREADGROUP with BLOCK option
+	if name == "xread" || name == "xreadgroup" {
+		args := cmd.Args()
+		for i, arg := range args {
+			if argStr, ok := arg.(string); ok {
+				if strings.ToLower(argStr) == "block" && i+1 < len(args) {
+					return true
+				}
+			}
+		}
+	}
+
+	return false
+}
+
+// classifyError returns the error.type attribute value.
+// The result is "timeout", a prefixed string of the form "network:<detail>"
+// or "redis:<PREFIX>", or the raw error string as a fallback.
+func classifyError(err error) string {
+	if err == nil {
+		return ""
+	}
+
+	errStr := err.Error()
+
+	// Timeout errors (checked before network errors, because timeouts such as
+	// context.DeadlineExceeded also satisfy the net.Error interface)
+	if isTimeoutError(err) {
+		return "timeout"
+	}
+
+	// Network errors
+	if isNetworkError(err) {
+		return fmt.Sprintf("network:%s", errStr)
+	}
+
+	// Redis errors (start with error prefix like ERR, WRONGTYPE, etc.)
+	if prefix := extractRedisErrorPrefix(err); prefix != "" {
+		return fmt.Sprintf("redis:%s", prefix)
+	}
+
+	// Generic error
+	return errStr
+}
+
+// extractRedisErrorPrefix extracts the Redis error prefix (e.g., "ERR", "WRONGTYPE")
+// Redis errors typically start with an uppercase prefix followed by a space
+func extractRedisErrorPrefix(err error) string {
+	if err == nil {
+		return ""
+	}
+
+	errStr := err.Error()
+
+	// Redis errors typically start with an uppercase prefix
+	// Examples: "ERR ...", "WRONGTYPE ...", "CLUSTERDOWN ..."
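+	// For illustration:
+	//   "ERR unknown command 'FOO'"   -> "ERR"
+	//   "WRONGTYPE Operation ..."     -> "WRONGTYPE"
+	//   "dial tcp: connection refused" -> "" (no uppercase prefix)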
+ parts := strings.SplitN(errStr, " ", 2) + if len(parts) > 0 { + prefix := parts[0] + // Check if it's all uppercase (Redis error convention) + if prefix == strings.ToUpper(prefix) && len(prefix) > 0 { + return prefix + } + } + + return "" +} + +// isNetworkError checks if an error is a network-related error +func isNetworkError(err error) bool { + if err == nil { + return false + } + + // Check for net.Error interface (standard way to detect network errors) + _, ok := err.(net.Error) + return ok +} + +// isTimeoutError checks if an error is a timeout error +func isTimeoutError(err error) bool { + if err == nil { + return false + } + + // Check for net.Error with Timeout() method (standard way) + if netErr, ok := err.(net.Error); ok { + return netErr.Timeout() + } + + return false +} + +// splitHostPort splits a host:port string into host and port +// This is a simplified version that handles the common cases +func splitHostPort(addr string) (host, port string) { + // Handle Unix sockets + if strings.HasPrefix(addr, "/") || strings.HasPrefix(addr, "@") { + return addr, "" + } + + host, port, err := net.SplitHostPort(addr) + if err != nil { + // If split fails, return the whole address as host + return addr, "" + } + + return host, port +} + +// parseAddr parses a Redis address into host and port +func parseAddr(addr string) (host, port string) { + // Handle Unix sockets + if strings.HasPrefix(addr, "/") || strings.HasPrefix(addr, "unix://") { + return addr, "" + } + + // Remove protocol prefix if present + addr = strings.TrimPrefix(addr, "redis://") + addr = strings.TrimPrefix(addr, "rediss://") + + host, port, err := net.SplitHostPort(addr) + if err != nil { + // No port specified, use default + return addr, "6379" + } + + return host, port +} + +// formatDBIndex formats the database index as a string +func formatDBIndex(db int) string { + if db < 0 { + return "" + } + return strconv.Itoa(db) +} + +// RecordConnectionStateChange records a change in connection state +// This is called from the pool when connections transition between states +func (r *metricsRecorder) RecordConnectionStateChange( + ctx context.Context, + cn redis.ConnInfo, + fromState, toState string, +) { + if r.connectionCount == nil { + return + } + + // Extract server address from connection + serverAddr, serverPort := extractServerInfo(cn) + + // Build base attributes + attrs := []attribute.KeyValue{ + attribute.String("db.system", "redis"), + attribute.String("server.address", serverAddr), + } + + // Add server.port if not default + if serverPort != "" && serverPort != "6379" { + attrs = append(attrs, attribute.String("server.port", serverPort)) + } + + // Decrement old state (if not empty) + if fromState != "" { + fromAttrs := append([]attribute.KeyValue{}, attrs...) + fromAttrs = append(fromAttrs, attribute.String("state", fromState)) + r.connectionCount.Add(ctx, -1, metric.WithAttributes(fromAttrs...)) + } + + // Increment new state + if toState != "" { + toAttrs := append([]attribute.KeyValue{}, attrs...) 
+ toAttrs = append(toAttrs, attribute.String("state", toState)) + r.connectionCount.Add(ctx, 1, metric.WithAttributes(toAttrs...)) + } +} + +// extractServerInfo extracts server address and port from connection info +// For client connections, this is the remote endpoint (server address) +func extractServerInfo(cn redis.ConnInfo) (addr, port string) { + if cn == nil { + return "", "" + } + + remoteAddr := cn.RemoteAddr() + if remoteAddr == nil { + return "", "" + } + + addrStr := remoteAddr.String() + host, portStr := parseAddr(addrStr) + return host, portStr +} + +// RecordConnectionCreateTime records the time it took to create a new connection +func (r *metricsRecorder) RecordConnectionCreateTime( + ctx context.Context, + duration time.Duration, + cn redis.ConnInfo, +) { + if r.connectionCreateTime == nil { + return + } + + // Convert duration to seconds (OTel convention) + durationSeconds := duration.Seconds() + + // Extract server address from connection + serverAddr, serverPort := extractServerInfo(cn) + + // Build attributes + attrs := []attribute.KeyValue{ + attribute.String("db.system", "redis"), + attribute.String("server.address", serverAddr), + getLibraryVersionAttr(), + } + + // Add server.port if not default + attrs = addServerPortIfNonDefault(attrs, serverPort) + + // Add pool name (using server.address:server.port format) + poolName := formatPoolName(serverAddr, serverPort) + attrs = append(attrs, attribute.String("db.client.connection.pool.name", poolName)) + + // Record the histogram + r.connectionCreateTime.Record(ctx, durationSeconds, metric.WithAttributes(attrs...)) +} + +// RecordConnectionRelaxedTimeout records when connection timeout is relaxed/unrelaxed +func (r *metricsRecorder) RecordConnectionRelaxedTimeout( + ctx context.Context, + delta int, + cn redis.ConnInfo, + poolName, notificationType string, +) { + if r.connectionRelaxedTimeout == nil { + return + } + + // Extract server address from connection + serverAddr, serverPort := extractServerInfo(cn) + + // Build attributes + attrs := []attribute.KeyValue{ + attribute.String("db.system.name", "redis"), + attribute.String("server.address", serverAddr), + getLibraryVersionAttr(), + attribute.String("db.client.connection.pool.name", poolName), + attribute.String("redis.client.connection.notification", notificationType), + } + + // Add server.port if not default + attrs = addServerPortIfNonDefault(attrs, serverPort) + + // Record the counter (delta can be +1 or -1) + r.connectionRelaxedTimeout.Add(ctx, int64(delta), metric.WithAttributes(attrs...)) +} + +// RecordConnectionHandoff records when a connection is handed off to another node +func (r *metricsRecorder) RecordConnectionHandoff( + ctx context.Context, + cn redis.ConnInfo, + poolName string, +) { + if r.connectionHandoff == nil { + return + } + + // Extract server address from connection + serverAddr, serverPort := extractServerInfo(cn) + + // Build attributes + attrs := []attribute.KeyValue{ + attribute.String("db.system", "redis"), + attribute.String("server.address", serverAddr), + getLibraryVersionAttr(), + attribute.String("db.client.connection.pool.name", poolName), + } + + // Add server.port if not default + attrs = addServerPortIfNonDefault(attrs, serverPort) + + // Record the counter + r.connectionHandoff.Add(ctx, 1, metric.WithAttributes(attrs...)) +} + +// RecordError records client errors (ASK, MOVED, handshake failures, etc.) 
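+// For MOVED/ASK redirects the cluster client passes the redirect kind as both
+// errorType and statusCode with a nil connection, e.g.
+// RecordError(ctx, "MOVED", nil, "MOVED", false, 0); see the osscluster.go
+// hunk later in this diff.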
+func (r *metricsRecorder) RecordError( + ctx context.Context, + errorType string, + cn redis.ConnInfo, + statusCode string, + isInternal bool, + retryAttempts int, +) { + if r.clientErrors == nil { + return + } + + // Extract server address and peer address from connection (may be nil for some errors) + // For client connections, peer address is the same as server address (remote endpoint) + var serverAddr, serverPort, peerAddr, peerPort string + if cn != nil { + serverAddr, serverPort = extractServerInfo(cn) + peerAddr, peerPort = serverAddr, serverPort // Peer is same as server for client connections + } + + // Build attributes + attrs := []attribute.KeyValue{ + attribute.String("db.system.name", "redis"), + attribute.String("error.type", errorType), + attribute.String("db.response.status_code", statusCode), + attribute.Bool("redis.client.errors.internal", isInternal), + attribute.Int("redis.client.operation.retry_attempts", retryAttempts), + getLibraryVersionAttr(), + } + + // Add server info if available + if serverAddr != "" { + attrs = append(attrs, attribute.String("server.address", serverAddr)) + attrs = addServerPortIfNonDefault(attrs, serverPort) + } + + // Add peer info if available + if peerAddr != "" { + attrs = append(attrs, attribute.String("network.peer.address", peerAddr)) + if peerPort != "" { + attrs = append(attrs, attribute.String("network.peer.port", peerPort)) + } + } + + // Record the counter + r.clientErrors.Add(ctx, 1, metric.WithAttributes(attrs...)) +} + +// RecordMaintenanceNotification records when a maintenance notification is received +func (r *metricsRecorder) RecordMaintenanceNotification( + ctx context.Context, + cn redis.ConnInfo, + notificationType string, +) { + if r.maintenanceNotifications == nil { + return + } + + // Extract server address and peer address from connection + // For client connections, peer address is the same as server address (remote endpoint) + serverAddr, serverPort := extractServerInfo(cn) + peerAddr, peerPort := serverAddr, serverPort // Peer is same as server for client connections + + // Build attributes + attrs := []attribute.KeyValue{ + attribute.String("db.system.name", "redis"), + attribute.String("server.address", serverAddr), + getLibraryVersionAttr(), + attribute.String("redis.client.connection.notification", notificationType), + } + + // Add server.port if not default + attrs = addServerPortIfNonDefault(attrs, serverPort) + + // Add peer info if available + if peerAddr != "" { + attrs = append(attrs, attribute.String("network.peer.address", peerAddr)) + if peerPort != "" { + attrs = append(attrs, attribute.String("network.peer.port", peerPort)) + } + } + + // Record the counter + r.maintenanceNotifications.Add(ctx, 1, metric.WithAttributes(attrs...)) +} diff --git a/extra/redisotel-native/redisotel.go b/extra/redisotel-native/redisotel.go new file mode 100644 index 000000000..b692f05f0 --- /dev/null +++ b/extra/redisotel-native/redisotel.go @@ -0,0 +1,267 @@ +// Package redisotel provides native OpenTelemetry instrumentation for go-redis. +// +// This package implements the OpenTelemetry Semantic Conventions for database clients, +// providing metrics, traces, and logs for Redis operations. +// +// Basic Usage (with global MeterProvider): +// +// import ( +// "github.com/redis/go-redis/v9" +// redisotel "github.com/redis/go-redis/extra/redisotel-native/v9" +// "go.opentelemetry.io/otel" +// ) +// +// func main() { +// // Initialize OpenTelemetry globally (meter provider, etc.) 
+// otel.SetMeterProvider(myMeterProvider)
+//
+//	// Create Redis client
+//	rdb := redis.NewClient(&redis.Options{
+//		Addr: "localhost:6379",
+//		DB:   0,
+//	})
+//
+//	// Initialize native OTel instrumentation (uses global MeterProvider)
+//	if err := redisotel.Init(rdb); err != nil {
+//		panic(err)
+//	}
+//
+//	// Use the client normally - metrics are automatically recorded
+//	rdb.Set(ctx, "key", "value", 0)
+// }
+//
+// Advanced Usage (with custom MeterProvider):
+//
+//	// Pass a custom MeterProvider
+//	if err := redisotel.Init(rdb, redisotel.WithMeterProvider(customProvider)); err != nil {
+//		panic(err)
+//	}
+package redisotel
+
+import (
+	"fmt"
+	"sync"
+
+	"github.com/redis/go-redis/v9"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/metric"
+)
+
+var (
+	// Global singleton instance
+	globalInstance     *metricsRecorder
+	globalInstanceOnce sync.Once
+	initErr            error
+)
+
+// Init initializes native OpenTelemetry instrumentation for the given Redis client.
+// This function should be called once per application, typically during startup.
+// Subsequent calls are no-ops and return the result of the first call; use
+// Shutdown (intended for tests) to reset the singleton so Init can run again.
+//
+// The function extracts configuration from the client (server address, port, database index)
+// and registers a global metrics recorder.
+//
+// If no MeterProvider is provided via WithMeterProvider option, the global MeterProvider
+// from otel.GetMeterProvider() will be used. Make sure to call otel.SetMeterProvider()
+// before calling Init() if you want to use a custom provider.
+//
+// Example (using global MeterProvider):
+//
+//	otel.SetMeterProvider(myMeterProvider)
+//	rdb := redis.NewClient(&redis.Options{
+//		Addr: "localhost:6379",
+//		DB:   0,
+//	})
+//	if err := redisotel.Init(rdb); err != nil {
+//		log.Fatal(err)
+//	}
+//
+// Example (using custom MeterProvider):
+//
+//	if err := redisotel.Init(rdb, redisotel.WithMeterProvider(customProvider)); err != nil {
+//		log.Fatal(err)
+//	}
+func Init(client redis.UniversalClient, opts ...Option) error {
+	globalInstanceOnce.Do(func() {
+		initErr = initOnce(client, opts...)
+ }) + return initErr +} + +// initOnce performs the actual initialization (called once by sync.Once) +func initOnce(client redis.UniversalClient, opts ...Option) error { + // Apply options + cfg := defaultConfig() + for _, opt := range opts { + opt.apply(&cfg) + } + + // Extract client configuration + serverAddr, serverPort, dbIndex, err := extractClientConfig(client) + if err != nil { + return fmt.Errorf("failed to extract client config: %w", err) + } + + // Get meter provider (use global if not provided) + meterProvider := cfg.meterProvider + if meterProvider == nil { + meterProvider = otel.GetMeterProvider() + } + + // Create meter + meter := meterProvider.Meter( + "github.com/redis/go-redis", + metric.WithInstrumentationVersion(redis.Version()), + ) + + // Create histogram for operation duration + var operationDurationOpts []metric.Float64HistogramOption + operationDurationOpts = append(operationDurationOpts, + metric.WithDescription("Duration of database client operations"), + metric.WithUnit("s"), + ) + if cfg.histAggregation == HistogramAggregationExplicitBucket { + operationDurationOpts = append(operationDurationOpts, + metric.WithExplicitBucketBoundaries(cfg.bucketsOperationDuration...), + ) + } + operationDuration, err := meter.Float64Histogram( + "db.client.operation.duration", + operationDurationOpts..., + ) + if err != nil { + return fmt.Errorf("failed to create operation duration histogram: %w", err) + } + + // Create synchronous UpDownCounter for connection count + connectionCount, err := meter.Int64UpDownCounter( + "db.client.connection.count", + metric.WithDescription("The number of connections that are currently in state described by the state attribute"), + metric.WithUnit("{connection}"), + ) + if err != nil { + return fmt.Errorf("failed to create connection count metric: %w", err) + } + + // Create histogram for connection creation time + var connectionCreateTimeOpts []metric.Float64HistogramOption + connectionCreateTimeOpts = append(connectionCreateTimeOpts, + metric.WithDescription("The time it took to create a new connection"), + metric.WithUnit("s"), + ) + if cfg.histAggregation == HistogramAggregationExplicitBucket { + connectionCreateTimeOpts = append(connectionCreateTimeOpts, + metric.WithExplicitBucketBoundaries(cfg.bucketsConnectionCreateTime...), + ) + } + connectionCreateTime, err := meter.Float64Histogram( + "db.client.connection.create_time", + connectionCreateTimeOpts..., + ) + if err != nil { + return fmt.Errorf("failed to create connection create time histogram: %w", err) + } + + // Create UpDownCounter for relaxed timeout tracking + connectionRelaxedTimeout, err := meter.Int64UpDownCounter( + "redis.client.connection.relaxed_timeout", + metric.WithDescription("How many times the connection timeout has been increased/decreased (after a server maintenance notification)"), + metric.WithUnit("{relaxation}"), + ) + if err != nil { + return fmt.Errorf("failed to create connection relaxed timeout metric: %w", err) + } + + // Create Counter for connection handoffs + connectionHandoff, err := meter.Int64Counter( + "redis.client.connection.handoff", + metric.WithDescription("Connections that have been handed off to another node (e.g after a MOVING notification)"), + ) + if err != nil { + return fmt.Errorf("failed to create connection handoff metric: %w", err) + } + + // Create Counter for client errors + clientErrors, err := meter.Int64Counter( + "redis.client.errors", + metric.WithDescription("Number of errors handled by the Redis client"), + 
metric.WithUnit("{error}"),
+	)
+	if err != nil {
+		return fmt.Errorf("failed to create client errors metric: %w", err)
+	}
+
+	// Create Counter for maintenance notifications
+	maintenanceNotifications, err := meter.Int64Counter(
+		"redis.client.maintenance.notifications",
+		metric.WithDescription("Number of maintenance notifications received"),
+		metric.WithUnit("{notification}"),
+	)
+	if err != nil {
+		return fmt.Errorf("failed to create maintenance notifications metric: %w", err)
+	}
+
+	// Create recorder
+	recorder := &metricsRecorder{
+		operationDuration:        operationDuration,
+		connectionCount:          connectionCount,
+		connectionCreateTime:     connectionCreateTime,
+		connectionRelaxedTimeout: connectionRelaxedTimeout,
+		connectionHandoff:        connectionHandoff,
+		clientErrors:             clientErrors,
+		maintenanceNotifications: maintenanceNotifications,
+		serverAddr:               serverAddr,
+		serverPort:               serverPort,
+		dbIndex:                  dbIndex,
+	}
+
+	// Register global recorder
+	redis.SetOTelRecorder(recorder)
+	globalInstance = recorder
+
+	return nil
+}
+
+// extractClientConfig extracts server address, port, and database index from a Redis client
+func extractClientConfig(client redis.UniversalClient) (serverAddr, serverPort, dbIndex string, err error) {
+	switch c := client.(type) {
+	case *redis.Client:
+		opts := c.Options()
+		host, port := parseAddr(opts.Addr)
+		return host, port, formatDBIndex(opts.DB), nil
+
+	case *redis.ClusterClient:
+		opts := c.Options()
+		if len(opts.Addrs) > 0 {
+			// Use first address for server.address attribute
+			host, port := parseAddr(opts.Addrs[0])
+			return host, port, "", nil
+		}
+		return "", "", "", fmt.Errorf("cluster client has no addresses")
+
+	case *redis.Ring:
+		opts := c.Options()
+		// Addrs is a map, so this picks an arbitrary shard address for the
+		// server.address attribute (map iteration order is not deterministic)
+		for _, addr := range opts.Addrs {
+			host, port := parseAddr(addr)
+			return host, port, formatDBIndex(opts.DB), nil
+		}
+		return "", "", "", fmt.Errorf("ring client has no addresses")
+
+	default:
+		return "", "", "", fmt.Errorf("unsupported client type: %T", client)
+	}
+}
+
+// Shutdown cleans up resources (for testing purposes)
+func Shutdown() {
+	if globalInstance != nil {
+		redis.SetOTelRecorder(nil)
+		globalInstance = nil
+	}
+	// Reset the sync.Once so Init can be called again (useful for tests)
+	globalInstanceOnce = sync.Once{}
+	initErr = nil
+}
diff --git a/internal/otel/metrics.go b/internal/otel/metrics.go
new file mode 100644
index 000000000..329000ea3
--- /dev/null
+++ b/internal/otel/metrics.go
@@ -0,0 +1,130 @@
+package otel
+
+import (
+	"context"
+	"time"
+
+	"github.com/redis/go-redis/v9/internal/pool"
+)
+
+// Cmder is a minimal interface for command information needed for metrics.
+// This avoids circular dependencies with the main redis package.
+type Cmder interface {
+	Name() string
+	FullName() string
+	Args() []interface{}
+	Err() error
+}
+
+// Recorder is the interface for recording metrics.
+// Implementations are provided by extra/redisotel-native package.
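+// When no implementation has been registered, the package-level noopRecorder
+// defined below receives every call; that no-op default is what keeps the
+// disabled path close to zero overhead.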
+type Recorder interface { + // RecordOperationDuration records the total operation duration (including all retries) + RecordOperationDuration(ctx context.Context, duration time.Duration, cmd Cmder, attempts int, cn *pool.Conn) + + // RecordConnectionStateChange records when a connection changes state + RecordConnectionStateChange(ctx context.Context, cn *pool.Conn, fromState, toState string) + + // RecordConnectionCreateTime records the time it took to create a new connection + RecordConnectionCreateTime(ctx context.Context, duration time.Duration, cn *pool.Conn) + + // RecordConnectionRelaxedTimeout records when connection timeout is relaxed/unrelaxed + // delta: +1 for relaxed, -1 for unrelaxed + // poolName: name of the connection pool (e.g., "main", "pubsub") + // notificationType: the notification type that triggered the timeout relaxation (e.g., "MOVING") + RecordConnectionRelaxedTimeout(ctx context.Context, delta int, cn *pool.Conn, poolName, notificationType string) + + // RecordConnectionHandoff records when a connection is handed off to another node + // poolName: name of the connection pool (e.g., "main", "pubsub") + RecordConnectionHandoff(ctx context.Context, cn *pool.Conn, poolName string) + + // RecordError records client errors (ASK, MOVED, handshake failures, etc.) + // errorType: type of error (e.g., "ASK", "MOVED", "HANDSHAKE_FAILED") + // statusCode: Redis response status code if available (e.g., "MOVED", "ASK") + // isInternal: whether this is an internal error + // retryAttempts: number of retry attempts made + RecordError(ctx context.Context, errorType string, cn *pool.Conn, statusCode string, isInternal bool, retryAttempts int) + + // RecordMaintenanceNotification records when a maintenance notification is received + // notificationType: the type of notification (e.g., "MOVING", "MIGRATING", etc.) 
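+	// In this codebase the values are the notification types handled in
+	// maintnotifications/push_notification_handler.go (Moving, Migrating,
+	// Migrated, FailingOver, FailedOver).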
+ RecordMaintenanceNotification(ctx context.Context, cn *pool.Conn, notificationType string) +} + +// Global recorder instance (initialized by extra/redisotel-native) +var globalRecorder Recorder = noopRecorder{} + +// SetGlobalRecorder sets the global recorder (called by Init() in extra/redisotel-native) +func SetGlobalRecorder(r Recorder) { + if r == nil { + globalRecorder = noopRecorder{} + // Unregister pool callbacks + pool.SetConnectionStateChangeCallback(nil) + pool.SetConnectionCreateTimeCallback(nil) + pool.SetConnectionRelaxedTimeoutCallback(nil) + pool.SetConnectionHandoffCallback(nil) + pool.SetErrorCallback(nil) + pool.SetMaintenanceNotificationCallback(nil) + return + } + globalRecorder = r + + // Register pool callback to forward state changes to recorder + pool.SetConnectionStateChangeCallback(func(ctx context.Context, cn *pool.Conn, fromState, toState string) { + globalRecorder.RecordConnectionStateChange(ctx, cn, fromState, toState) + }) + + // Register pool callback to forward connection creation time to recorder + pool.SetConnectionCreateTimeCallback(func(ctx context.Context, duration time.Duration, cn *pool.Conn) { + globalRecorder.RecordConnectionCreateTime(ctx, duration, cn) + }) + + // Register pool callback to forward connection relaxed timeout changes to recorder + pool.SetConnectionRelaxedTimeoutCallback(func(ctx context.Context, delta int, cn *pool.Conn, poolName, notificationType string) { + globalRecorder.RecordConnectionRelaxedTimeout(ctx, delta, cn, poolName, notificationType) + }) + + // Register pool callback to forward connection handoffs to recorder + pool.SetConnectionHandoffCallback(func(ctx context.Context, cn *pool.Conn, poolName string) { + globalRecorder.RecordConnectionHandoff(ctx, cn, poolName) + }) + + // Register pool callback to forward errors to recorder + pool.SetErrorCallback(func(ctx context.Context, errorType string, cn *pool.Conn, statusCode string, isInternal bool, retryAttempts int) { + globalRecorder.RecordError(ctx, errorType, cn, statusCode, isInternal, retryAttempts) + }) + + // Register pool callback to forward maintenance notifications to recorder + pool.SetMaintenanceNotificationCallback(func(ctx context.Context, cn *pool.Conn, notificationType string) { + globalRecorder.RecordMaintenanceNotification(ctx, cn, notificationType) + }) +} + +// RecordOperationDuration records the total operation duration. +// This is called from redis.go after command execution completes. +func RecordOperationDuration(ctx context.Context, duration time.Duration, cmd Cmder, attempts int, cn *pool.Conn) { + globalRecorder.RecordOperationDuration(ctx, duration, cmd, attempts, cn) +} + +// RecordConnectionStateChange records when a connection changes state. +// This is called from pool.go when connections transition between states. +func RecordConnectionStateChange(ctx context.Context, cn *pool.Conn, fromState, toState string) { + globalRecorder.RecordConnectionStateChange(ctx, cn, fromState, toState) +} + +// RecordConnectionCreateTime records the time it took to create a new connection. +// This is called from pool.go when a new connection is successfully created. 
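+// The duration is measured in pool.dialConn, from before the dial until the
+// fully initialized connection is returned, so it includes per-connection
+// setup as well as the network dial.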
+func RecordConnectionCreateTime(ctx context.Context, duration time.Duration, cn *pool.Conn) { + globalRecorder.RecordConnectionCreateTime(ctx, duration, cn) +} + +// noopRecorder is a no-op implementation (zero overhead when metrics disabled) +type noopRecorder struct{} + +func (noopRecorder) RecordOperationDuration(context.Context, time.Duration, Cmder, int, *pool.Conn) {} +func (noopRecorder) RecordConnectionStateChange(context.Context, *pool.Conn, string, string) {} +func (noopRecorder) RecordConnectionCreateTime(context.Context, time.Duration, *pool.Conn) {} +func (noopRecorder) RecordConnectionRelaxedTimeout(context.Context, int, *pool.Conn, string, string) { +} +func (noopRecorder) RecordConnectionHandoff(context.Context, *pool.Conn, string) {} +func (noopRecorder) RecordError(context.Context, string, *pool.Conn, string, bool, int) {} +func (noopRecorder) RecordMaintenanceNotification(context.Context, *pool.Conn, string) {} diff --git a/internal/pool/conn.go b/internal/pool/conn.go index 95d83bfde..36fa4812d 100644 --- a/internal/pool/conn.go +++ b/internal/pool/conn.go @@ -418,6 +418,8 @@ func (cn *Conn) IsPubSub() bool { // SetRelaxedTimeout sets relaxed timeouts for this connection during maintenanceNotifications upgrades. // These timeouts will be used for all subsequent commands until the deadline expires. // Uses atomic operations for lock-free access. +// Note: Metrics should be recorded by the caller (notification handler) which has context about +// the notification type and pool name. func (cn *Conn) SetRelaxedTimeout(readTimeout, writeTimeout time.Duration) { cn.relaxedCounter.Add(1) cn.relaxedReadTimeoutNs.Store(int64(readTimeout)) @@ -452,6 +454,11 @@ func (cn *Conn) clearRelaxedTimeout() { cn.relaxedWriteTimeoutNs.Store(0) cn.relaxedDeadlineNs.Store(0) cn.relaxedCounter.Store(0) + + // Note: Metrics for timeout unrelaxing are not recorded here because we don't have + // context about which notification type or pool triggered the relaxation. + // In practice, relaxed timeouts expire automatically via deadline, so explicit + // unrelaxing metrics are less critical than the initial relaxation metrics. } // HasRelaxedTimeout returns true if relaxed timeouts are currently active on this connection. diff --git a/internal/pool/pool.go b/internal/pool/pool.go index d757d1f4f..ad18b7f92 100644 --- a/internal/pool/pool.go +++ b/internal/pool/pool.go @@ -32,6 +32,27 @@ var ( // errConnNotPooled is returned when trying to return a non-pooled connection to the pool. 
errConnNotPooled = errors.New("connection not pooled") + // Global callback for connection state changes (set by otel package) + connectionStateChangeCallback func(ctx context.Context, cn *Conn, fromState, toState string) + + // Global callback for connection creation time (set by otel package) + connectionCreateTimeCallback func(ctx context.Context, duration time.Duration, cn *Conn) + + // Global callback for connection relaxed timeout changes (set by otel package) + // Parameters: ctx, delta (+1/-1), cn, poolName, notificationType + connectionRelaxedTimeoutCallback func(ctx context.Context, delta int, cn *Conn, poolName, notificationType string) + + // Global callback for connection handoff (set by otel package) + // Parameters: ctx, cn, poolName + connectionHandoffCallback func(ctx context.Context, cn *Conn, poolName string) + + // Global callback for error tracking (set by otel package) + // Parameters: ctx, errorType, cn, statusCode, isInternal, retryAttempts + errorCallback func(ctx context.Context, errorType string, cn *Conn, statusCode string, isInternal bool, retryAttempts int) + + // Global callback for maintenance notifications (set by otel package) + // Parameters: ctx, cn, notificationType + maintenanceNotificationCallback func(ctx context.Context, cn *Conn, notificationType string) // popAttempts is the maximum number of attempts to find a usable connection // when popping from the idle connection pool. This handles cases where connections @@ -51,6 +72,66 @@ var ( noExpiration = maxTime ) +// SetConnectionStateChangeCallback sets the global callback for connection state changes. +// This is called by the otel package to register metrics recording. +func SetConnectionStateChangeCallback(fn func(ctx context.Context, cn *Conn, fromState, toState string)) { + connectionStateChangeCallback = fn +} + +// SetConnectionCreateTimeCallback sets the global callback for connection creation time. +// This is called by the otel package to register metrics recording. +func SetConnectionCreateTimeCallback(fn func(ctx context.Context, duration time.Duration, cn *Conn)) { + connectionCreateTimeCallback = fn +} + +// SetConnectionRelaxedTimeoutCallback sets the global callback for connection relaxed timeout changes. +// This is called by the otel package to register metrics recording. +func SetConnectionRelaxedTimeoutCallback(fn func(ctx context.Context, delta int, cn *Conn, poolName, notificationType string)) { + connectionRelaxedTimeoutCallback = fn +} + +// GetConnectionRelaxedTimeoutCallback returns the global callback for connection relaxed timeout changes. +// This is used by maintnotifications to record relaxed timeout metrics. +func GetConnectionRelaxedTimeoutCallback() func(ctx context.Context, delta int, cn *Conn, poolName, notificationType string) { + return connectionRelaxedTimeoutCallback +} + +// SetConnectionHandoffCallback sets the global callback for connection handoffs. +// This is called by the otel package to register metrics recording. +func SetConnectionHandoffCallback(fn func(ctx context.Context, cn *Conn, poolName string)) { + connectionHandoffCallback = fn +} + +// GetConnectionHandoffCallback returns the global callback for connection handoffs. +// This is used by maintnotifications to record handoff metrics. +func GetConnectionHandoffCallback() func(ctx context.Context, cn *Conn, poolName string) { + return connectionHandoffCallback +} + +// SetErrorCallback sets the global callback for error tracking. 
+// This is called by the otel package to register metrics recording. +func SetErrorCallback(fn func(ctx context.Context, errorType string, cn *Conn, statusCode string, isInternal bool, retryAttempts int)) { + errorCallback = fn +} + +// GetErrorCallback returns the global callback for error tracking. +// This is used by cluster and client code to record error metrics. +func GetErrorCallback() func(ctx context.Context, errorType string, cn *Conn, statusCode string, isInternal bool, retryAttempts int) { + return errorCallback +} + +// SetMaintenanceNotificationCallback sets the global callback for maintenance notifications. +// This is called by the otel package to register metrics recording. +func SetMaintenanceNotificationCallback(fn func(ctx context.Context, cn *Conn, notificationType string)) { + maintenanceNotificationCallback = fn +} + +// GetMaintenanceNotificationCallback returns the global callback for maintenance notifications. +// This is used by maintnotifications to record notification metrics. +func GetMaintenanceNotificationCallback() func(ctx context.Context, cn *Conn, notificationType string) { + return maintenanceNotificationCallback +} + // Stats contains pool state information and accumulated stats. type Stats struct { Hits uint32 // number of times free connection was found in the pool @@ -359,10 +440,18 @@ func (p *ConnPool) newConn(ctx context.Context, pooled bool) (*Conn, error) { } } + // Notify metrics: new connection created and idle + if connectionStateChangeCallback != nil { + connectionStateChangeCallback(ctx, cn, "", "idle") + } + return cn, nil } func (p *ConnPool) dialConn(ctx context.Context, pooled bool) (*Conn, error) { + // Start measuring connection creation time + startTime := time.Now() + if p.closed() { return nil, ErrClosed } @@ -413,6 +502,12 @@ func (p *ConnPool) dialConn(ctx context.Context, pooled bool) (*Conn, error) { cn.expiresAt = noExpiration } + // Record connection creation time + if connectionCreateTimeCallback != nil { + duration := time.Since(startTime) + connectionCreateTimeCallback(ctx, duration, cn) + } + return cn, nil } @@ -524,6 +619,12 @@ func (p *ConnPool) getConn(ctx context.Context) (*Conn, error) { } atomic.AddUint32(&p.stats.Hits, 1) + + // Notify metrics: connection moved from idle to used + if connectionStateChangeCallback != nil { + connectionStateChangeCallback(ctx, cn, "idle", "used") + } + return cn, nil } @@ -546,6 +647,12 @@ func (p *ConnPool) getConn(ctx context.Context) (*Conn, error) { return nil, err } } + + // Notify metrics: new connection is created and used + if connectionStateChangeCallback != nil { + connectionStateChangeCallback(ctx, newcn, "", "used") + } + return newcn, nil } @@ -840,9 +947,19 @@ func (p *ConnPool) putConn(ctx context.Context, cn *Conn, freeTurn bool) { p.connsMu.Unlock() p.idleConnsLen.Add(1) } + + // Notify metrics: connection moved from used to idle + if connectionStateChangeCallback != nil { + connectionStateChangeCallback(ctx, cn, "used", "idle") + } } else { shouldCloseConn = true p.removeConnWithLock(cn) + + // Notify metrics: connection removed (used -> nothing) + if connectionStateChangeCallback != nil { + connectionStateChangeCallback(ctx, cn, "used", "") + } } if freeTurn { @@ -883,6 +1000,11 @@ func (p *ConnPool) removeConnInternal(ctx context.Context, cn *Conn, reason erro p.freeTurn() } + // Notify metrics: connection removed (assume from used state) + if connectionStateChangeCallback != nil { + connectionStateChangeCallback(ctx, cn, "used", "") + } + _ = p.closeConn(cn) 
// Check if we need to create new idle connections to maintain MinIdleConns diff --git a/maintnotifications/handoff_worker.go b/maintnotifications/handoff_worker.go index 53f28f49c..632a3d842 100644 --- a/maintnotifications/handoff_worker.go +++ b/maintnotifications/handoff_worker.go @@ -434,6 +434,11 @@ func (hwm *handoffWorkerManager) performHandoffInternal( deadline := time.Now().Add(hwm.config.PostHandoffRelaxedDuration) conn.SetRelaxedTimeoutWithDeadline(relaxedTimeout, relaxedTimeout, deadline) + // Record relaxed timeout metric (post-handoff) + if relaxedTimeoutCallback := pool.GetConnectionRelaxedTimeoutCallback(); relaxedTimeoutCallback != nil { + relaxedTimeoutCallback(ctx, 1, conn, "main", "HANDOFF") + } + if internal.LogLevel.InfoOrAbove() { internal.Logger.Printf(context.Background(), logs.ApplyingRelaxedTimeoutDueToPostHandoff(connID, relaxedTimeout, deadline.Format("15:04:05.000"))) } @@ -462,6 +467,11 @@ func (hwm *handoffWorkerManager) performHandoffInternal( internal.Logger.Printf(ctx, logs.HandoffSucceeded(connID, newEndpoint)) // successfully completed the handoff, no retry needed and no error + // Notify metrics: connection handoff succeeded + if handoffCallback := pool.GetConnectionHandoffCallback(); handoffCallback != nil { + handoffCallback(ctx, conn, "main") + } + return false, nil } diff --git a/maintnotifications/push_notification_handler.go b/maintnotifications/push_notification_handler.go index 937b4ae82..1db035037 100644 --- a/maintnotifications/push_notification_handler.go +++ b/maintnotifications/push_notification_handler.go @@ -40,14 +40,44 @@ func (snh *NotificationHandler) HandlePushNotification(ctx context.Context, hand var err error switch notificationType { case NotificationMoving: + // Record maintenance notification metric + if maintenanceCallback := pool.GetMaintenanceNotificationCallback(); maintenanceCallback != nil { + if conn, ok := handlerCtx.Conn.(*pool.Conn); ok { + maintenanceCallback(ctx, conn, notificationType) + } + } err = snh.handleMoving(ctx, handlerCtx, modifiedNotification) case NotificationMigrating: + // Record maintenance notification metric + if maintenanceCallback := pool.GetMaintenanceNotificationCallback(); maintenanceCallback != nil { + if conn, ok := handlerCtx.Conn.(*pool.Conn); ok { + maintenanceCallback(ctx, conn, notificationType) + } + } err = snh.handleMigrating(ctx, handlerCtx, modifiedNotification) case NotificationMigrated: + // Record maintenance notification metric + if maintenanceCallback := pool.GetMaintenanceNotificationCallback(); maintenanceCallback != nil { + if conn, ok := handlerCtx.Conn.(*pool.Conn); ok { + maintenanceCallback(ctx, conn, notificationType) + } + } err = snh.handleMigrated(ctx, handlerCtx, modifiedNotification) case NotificationFailingOver: + // Record maintenance notification metric + if maintenanceCallback := pool.GetMaintenanceNotificationCallback(); maintenanceCallback != nil { + if conn, ok := handlerCtx.Conn.(*pool.Conn); ok { + maintenanceCallback(ctx, conn, notificationType) + } + } err = snh.handleFailingOver(ctx, handlerCtx, modifiedNotification) case NotificationFailedOver: + // Record maintenance notification metric + if maintenanceCallback := pool.GetMaintenanceNotificationCallback(); maintenanceCallback != nil { + if conn, ok := handlerCtx.Conn.(*pool.Conn); ok { + maintenanceCallback(ctx, conn, notificationType) + } + } err = snh.handleFailedOver(ctx, handlerCtx, modifiedNotification) default: // Ignore other notification types (e.g., pub/sub messages) @@ -191,6 +221,12 @@ 
@@ -191,6 +204,12 @@ func (snh *NotificationHandler) handleMigrating(ctx context.Context, handlerCtx
 		internal.Logger.Printf(ctx, logs.RelaxedTimeoutDueToNotification(conn.GetID(), "MIGRATING", snh.manager.config.RelaxedTimeout))
 	}
 	conn.SetRelaxedTimeout(snh.manager.config.RelaxedTimeout, snh.manager.config.RelaxedTimeout)
+
+	// Record relaxed timeout metric
+	if relaxedTimeoutCallback := pool.GetConnectionRelaxedTimeoutCallback(); relaxedTimeoutCallback != nil {
+		relaxedTimeoutCallback(ctx, 1, conn, "main", "MIGRATING")
+	}
+
 	return nil
 }
 
@@ -249,6 +268,12 @@ func (snh *NotificationHandler) handleFailingOver(ctx context.Context, handlerCt
 		internal.Logger.Printf(ctx, logs.RelaxedTimeoutDueToNotification(connID, "FAILING_OVER", snh.manager.config.RelaxedTimeout))
 	}
 	conn.SetRelaxedTimeout(snh.manager.config.RelaxedTimeout, snh.manager.config.RelaxedTimeout)
+
+	// Record relaxed timeout metric
+	if relaxedTimeoutCallback := pool.GetConnectionRelaxedTimeoutCallback(); relaxedTimeoutCallback != nil {
+		relaxedTimeoutCallback(ctx, 1, conn, "main", "FAILING_OVER")
+	}
+
 	return nil
 }
 
diff --git a/osscluster.go b/osscluster.go
index 6994ae83f..7e63c5e97 100644
--- a/osscluster.go
+++ b/osscluster.go
@@ -1172,6 +1172,17 @@ func (c *ClusterClient) process(ctx context.Context, cmd Cmder) error {
 		if moved || ask {
 			c.state.LazyReload()
 
+			// Record error metrics. MOVED/ASK redirects are not internal
+			// errors; pass the current attempt number as the retry count.
+			if errorCallback := pool.GetErrorCallback(); errorCallback != nil {
+				errorType := "MOVED"
+				if ask {
+					errorType = "ASK"
+				}
+				// The redirect name doubles as the Redis status code.
+				errorCallback(ctx, errorType, nil, errorType, false, attempt)
+			}
+
 			var err error
 			node, err = c.nodes.GetOrCreate(addr)
 			if err != nil {
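Redirect errors, unlike relaxed timeouts, only ever accumulate, so a monotonic counter fits. A hypothetical shape for the MOVED sample recorded above, again with illustrative instrument and attribute names:

```go
package main

import (
	"context"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/metric"
)

func main() {
	meter := otel.Meter("redis-example")
	// Instrument and attribute names are assumptions for this sketch.
	errCount, err := meter.Int64Counter("redis.client.errors")
	if err != nil {
		panic(err)
	}

	// A MOVED redirect hit on the second attempt, as the callback reports it.
	errCount.Add(context.Background(), 1, metric.WithAttributes(
		attribute.String("error.type", "MOVED"),
		attribute.String("status.code", "MOVED"),
		attribute.Bool("internal", false),
		attribute.Int("retry.attempts", 1),
	))
}
```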
diff --git a/otel.go b/otel.go
new file mode 100644
index 000000000..9d66c5bf3
--- /dev/null
+++ b/otel.go
@@ -0,0 +1,111 @@
+package redis
+
+import (
+	"context"
+	"net"
+	"time"
+
+	"github.com/redis/go-redis/v9/internal/otel"
+	"github.com/redis/go-redis/v9/internal/pool"
+)
+
+// ConnInfo provides information about a Redis connection for metrics.
+// This is a public interface that avoids exposing internal types.
+type ConnInfo interface {
+	// RemoteAddr returns the remote network address
+	RemoteAddr() net.Addr
+}
+
+// OTelRecorder is the interface for recording OpenTelemetry metrics.
+// Implementations are provided by the extra/redisotel-native package.
+//
+// The interface is exported so that external packages can implement
+// custom recorders without depending on internal packages.
+type OTelRecorder interface {
+	// RecordOperationDuration records the total operation duration (including all retries)
+	RecordOperationDuration(ctx context.Context, duration time.Duration, cmd Cmder, attempts int, cn ConnInfo)
+
+	// RecordConnectionStateChange records when a connection changes state (e.g., idle -> used)
+	RecordConnectionStateChange(ctx context.Context, cn ConnInfo, fromState, toState string)
+
+	// RecordConnectionCreateTime records the time it took to create a new connection
+	RecordConnectionCreateTime(ctx context.Context, duration time.Duration, cn ConnInfo)
+
+	// RecordConnectionRelaxedTimeout records when a connection timeout is relaxed or restored.
+	// delta: +1 for relaxed, -1 for unrelaxed
+	// poolName: name of the connection pool (e.g., "main", "pubsub")
+	// notificationType: the notification that triggered the relaxation (e.g., "MOVING", "HANDOFF")
+	RecordConnectionRelaxedTimeout(ctx context.Context, delta int, cn ConnInfo, poolName, notificationType string)
+
+	// RecordConnectionHandoff records when a connection is handed off to another node.
+	// poolName: name of the connection pool (e.g., "main", "pubsub")
+	RecordConnectionHandoff(ctx context.Context, cn ConnInfo, poolName string)
+
+	// RecordError records client errors (ASK, MOVED, handshake failures, etc.).
+	// errorType: type of error (e.g., "ASK", "MOVED", "HANDSHAKE_FAILED")
+	// statusCode: Redis response status code, if available (e.g., "MOVED", "ASK")
+	// isInternal: whether this is an internal error
+	// retryAttempts: number of retry attempts made
+	RecordError(ctx context.Context, errorType string, cn ConnInfo, statusCode string, isInternal bool, retryAttempts int)
+
+	// RecordMaintenanceNotification records when a maintenance notification is received.
+	// notificationType: the type of notification (e.g., "MOVING", "MIGRATING")
+	RecordMaintenanceNotification(ctx context.Context, cn ConnInfo, notificationType string)
+}
+
+// SetOTelRecorder sets the global OpenTelemetry recorder.
+// This is typically called by Init() in the extra/redisotel-native package.
+//
+// Setting a nil recorder disables metrics collection.
+func SetOTelRecorder(r OTelRecorder) {
+	if r == nil {
+		otel.SetGlobalRecorder(nil)
+		return
+	}
+	otel.SetGlobalRecorder(&otelRecorderAdapter{r})
+}
+
+// otelRecorderAdapter adapts the public OTelRecorder interface to the
+// internal otel.Recorder interface.
+type otelRecorderAdapter struct {
+	recorder OTelRecorder
+}
+
+// toConnInfo converts an internal *pool.Conn to the public ConnInfo interface.
+// The explicit nil check matters: wrapping a nil *pool.Conn directly in the
+// interface would produce a non-nil interface value holding a typed nil.
+func toConnInfo(cn *pool.Conn) ConnInfo {
+	if cn != nil {
+		return cn
+	}
+	return nil
+}
+
+func (a *otelRecorderAdapter) RecordOperationDuration(ctx context.Context, duration time.Duration, cmd otel.Cmder, attempts int, cn *pool.Conn) {
+	// Convert the internal Cmder to the public Cmder; drop the sample if the
+	// assertion fails rather than recording with a mismatched type.
+	if publicCmd, ok := cmd.(Cmder); ok {
+		a.recorder.RecordOperationDuration(ctx, duration, publicCmd, attempts, toConnInfo(cn))
+	}
+}
+
+func (a *otelRecorderAdapter) RecordConnectionStateChange(ctx context.Context, cn *pool.Conn, fromState, toState string) {
+	a.recorder.RecordConnectionStateChange(ctx, toConnInfo(cn), fromState, toState)
+}
+
+func (a *otelRecorderAdapter) RecordConnectionCreateTime(ctx context.Context, duration time.Duration, cn *pool.Conn) {
+	a.recorder.RecordConnectionCreateTime(ctx, duration, toConnInfo(cn))
+}
+
+func (a *otelRecorderAdapter) RecordConnectionRelaxedTimeout(ctx context.Context, delta int, cn *pool.Conn, poolName, notificationType string) {
+	a.recorder.RecordConnectionRelaxedTimeout(ctx, delta, toConnInfo(cn), poolName, notificationType)
+}
+
+func (a *otelRecorderAdapter) RecordConnectionHandoff(ctx context.Context, cn *pool.Conn, poolName string) {
+	a.recorder.RecordConnectionHandoff(ctx, toConnInfo(cn), poolName)
+}
+
+func (a *otelRecorderAdapter) RecordError(ctx context.Context, errorType string, cn *pool.Conn, statusCode string, isInternal bool, retryAttempts int) {
+	a.recorder.RecordError(ctx, errorType, toConnInfo(cn), statusCode, isInternal, retryAttempts)
+}
+
+func (a *otelRecorderAdapter) RecordMaintenanceNotification(ctx context.Context, cn *pool.Conn, notificationType string) {
+	a.recorder.RecordMaintenanceNotification(ctx, toConnInfo(cn), notificationType)
+}
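Putting the pieces together, any type satisfying `OTelRecorder` can be registered through `SetOTelRecorder`. The sketch below uses a recorder that merely logs, which is enough to verify the wiring end to end; a production recorder would emit OTel instruments instead:

```go
package main

import (
	"context"
	"log"
	"time"

	redis "github.com/redis/go-redis/v9"
)

// logRecorder is a minimal OTelRecorder used only to illustrate the
// interface surface introduced by this patch.
type logRecorder struct{}

func (logRecorder) RecordOperationDuration(ctx context.Context, d time.Duration, cmd redis.Cmder, attempts int, cn redis.ConnInfo) {
	log.Printf("op %s took %s after %d attempt(s)", cmd.Name(), d, attempts)
}

func (logRecorder) RecordConnectionStateChange(ctx context.Context, cn redis.ConnInfo, from, to string) {
	log.Printf("conn state: %q -> %q", from, to)
}

func (logRecorder) RecordConnectionCreateTime(ctx context.Context, d time.Duration, cn redis.ConnInfo) {
	log.Printf("conn created in %s", d)
}

func (logRecorder) RecordConnectionRelaxedTimeout(ctx context.Context, delta int, cn redis.ConnInfo, poolName, notificationType string) {
	log.Printf("relaxed timeout %+d (pool=%s, notification=%s)", delta, poolName, notificationType)
}

func (logRecorder) RecordConnectionHandoff(ctx context.Context, cn redis.ConnInfo, poolName string) {
	log.Printf("handoff (pool=%s)", poolName)
}

func (logRecorder) RecordError(ctx context.Context, errorType string, cn redis.ConnInfo, statusCode string, isInternal bool, retryAttempts int) {
	log.Printf("error %s (status=%s, internal=%t, retries=%d)", errorType, statusCode, isInternal, retryAttempts)
}

func (logRecorder) RecordMaintenanceNotification(ctx context.Context, cn redis.ConnInfo, notificationType string) {
	log.Printf("maintenance notification: %s", notificationType)
}

func main() {
	redis.SetOTelRecorder(logRecorder{})
	defer redis.SetOTelRecorder(nil) // nil disables collection again
	// ... use a redis.Client as usual; operations now reach the recorder.
}
```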
diff --git a/redis.go b/redis.go
index a6a710677..7c963ceb3 100644
--- a/redis.go
+++ b/redis.go
@@ -13,6 +13,7 @@ import (
 	"github.com/redis/go-redis/v9/internal"
 	"github.com/redis/go-redis/v9/internal/auth/streaming"
 	"github.com/redis/go-redis/v9/internal/hscan"
+	"github.com/redis/go-redis/v9/internal/otel"
 	"github.com/redis/go-redis/v9/internal/pool"
 	"github.com/redis/go-redis/v9/internal/proto"
 	"github.com/redis/go-redis/v9/maintnotifications"
@@ -559,6 +560,13 @@ func (c *baseClient) initConn(ctx context.Context, cn *pool.Conn) error {
 			// enabled mode, fail the connection
 			c.optLock.Unlock()
 			cn.GetStateMachine().Transition(pool.StateClosed)
+
+			// Record handshake failure metric. Handshake failures are
+			// internal errors with no retry attempts.
+			if errorCallback := pool.GetErrorCallback(); errorCallback != nil {
+				errorCallback(ctx, "HANDSHAKE_FAILED", cn, "HANDSHAKE_FAILED", true, 0)
+			}
+
 			return fmt.Errorf("failed to enable maintnotifications: %w", maintNotifHandshakeErr)
 		default: // will handle auto and any other
 			// Disabling logging here as it's too noisy.
@@ -662,17 +670,34 @@ func (c *baseClient) dial(ctx context.Context, network, addr string) (net.Conn,
 }
 
 func (c *baseClient) process(ctx context.Context, cmd Cmder) error {
+	// Start measuring total operation duration (includes all retries)
+	operationStart := time.Now()
+	var lastConn *pool.Conn
+	totalAttempts := 0
+
 	var lastErr error
 	for attempt := 0; attempt <= c.opt.MaxRetries; attempt++ {
+		totalAttempts++
 		attempt := attempt
 
-		retry, err := c._process(ctx, cmd, attempt)
+		retry, cn, err := c._process(ctx, cmd, attempt)
+		if cn != nil {
+			lastConn = cn
+		}
 		if err == nil || !retry {
+			// Record total operation duration
+			operationDuration := time.Since(operationStart)
+			otel.RecordOperationDuration(ctx, operationDuration, cmd, totalAttempts, lastConn)
 			return err
 		}
 
 		lastErr = err
 	}
+
+	// Record the failed operation after all retries are exhausted
+	operationDuration := time.Since(operationStart)
+	otel.RecordOperationDuration(ctx, operationDuration, cmd, totalAttempts, lastConn)
+
 	return lastErr
 }
 
@@ -689,15 +714,17 @@ func (c *baseClient) assertUnstableCommand(cmd Cmder) (bool, error) {
 	}
 }
 
-func (c *baseClient) _process(ctx context.Context, cmd Cmder, attempt int) (bool, error) {
+func (c *baseClient) _process(ctx context.Context, cmd Cmder, attempt int) (bool, *pool.Conn, error) {
 	if attempt > 0 {
 		if err := internal.Sleep(ctx, c.retryBackoff(attempt)); err != nil {
-			return false, err
+			return false, nil, err
 		}
 	}
 
+	var usedConn *pool.Conn
 	retryTimeout := uint32(0)
 	if err := c.withConn(ctx, func(ctx context.Context, cn *pool.Conn) error {
+		usedConn = cn
 		// Process any pending push notifications before executing the command
 		if err := c.processPushNotifications(ctx, cn); err != nil {
 			internal.Logger.Printf(ctx, "push: error processing pending notifications before command: %v", err)
@@ -738,10 +765,10 @@ func (c *baseClient) _process(ctx context.Context, cmd Cmder, attempt int) (bool
 		return nil
 	}); err != nil {
 		retry := shouldRetry(err, atomic.LoadUint32(&retryTimeout) == 1)
-		return retry, usedConn, err
 	}
 
-	return false, nil
+	return false, usedConn, nil
 }
 
 func (c *baseClient) retryBackoff(attempt int) time.Duration {
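Since `RecordOperationDuration` receives the wall-clock duration across all retries together with the attempt count, a recorder would typically feed it into a histogram with the attempts as an attribute. A minimal sketch, assuming an illustrative instrument name:

```go
package main

import (
	"context"
	"time"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/metric"
)

func main() {
	meter := otel.Meter("redis-example")
	// Instrument name is an assumption for this sketch.
	opDuration, err := meter.Float64Histogram(
		"redis.client.operation.duration",
		metric.WithUnit("s"),
	)
	if err != nil {
		panic(err)
	}

	// What a recorder might do with the values process() passes in.
	record := func(ctx context.Context, d time.Duration, cmdName string, attempts int) {
		opDuration.Record(ctx, d.Seconds(), metric.WithAttributes(
			attribute.String("command", cmdName),
			attribute.Int("attempts", attempts),
		))
	}

	record(context.Background(), 1500*time.Microsecond, "get", 1)
}
```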