@@ -1085,7 +1085,15 @@ func BenchmarkCancel_Atomic_Done_Parallel(b *testing.B) {
10851085}
10861086```
10871087
1088- ### 4.2 `internal/queue/queue_bench_test.go`
1088+ ### 4.2 Queue Benchmarks: Goroutine Topology
1089+
1090+ Queue performance varies dramatically based on how goroutines interact. We benchmark three scenarios:
1091+
1092+ #### 4.2.1 Single Goroutine (Baseline)
1093+
1094+ Push+pop in the same goroutine—no lock contention:
1095+
1096+ **File:** `internal/queue/queue_bench_test.go`
10891097
10901098```go
10911099package queue_test
@@ -1099,6 +1107,7 @@ import (
10991107var sinkInt int
11001108var sinkOK bool
11011109
1110+ // Single goroutine: push+pop in same routine (no contention)
11021111func BenchmarkQueue_Channel_PushPop_Direct(b *testing.B) {
11031112 q := queue.NewChannel[int](1024)
11041113 b.ReportAllocs()
@@ -1128,38 +1137,250 @@ func BenchmarkQueue_RingBuffer_PushPop_Direct(b *testing.B) {
11281137 sinkInt = val
11291138 sinkOK = ok
11301139}
1140+ ```
1141+
1142+ **Expected results:**
1143+
1144+ | Implementation | Latency | Notes |
1145+ |----------------|---------|-------|
1146+ | Channel | ~39 ns | Go channel with no contention |
1147+ | RingBuffer (guarded) | ~36 ns | SPSC guards add overhead |
1148+ | RingBuffer (unguarded) | ~9.5 ns | True lock-free performance |
1149+
1150+ #### 4.2.2 SPSC: 1 Producer → 1 Consumer (2 Goroutines)
1151+
1152+ The classic producer/consumer pattern—one goroutine writes, another reads:
1153+
1154+ **File:** `internal/combined/combined_bench_test.go`
1155+
1156+ ```go
1157+ // BenchmarkPipeline_Channel benchmarks 2-goroutine SPSC with channels.
1158+ func BenchmarkPipeline_Channel(b *testing.B) {
1159+ q := queue.NewChannel[int](1024)
1160+ done := make(chan struct{})
1161+
1162+ // Consumer goroutine
1163+ go func() {
1164+ for {
1165+ select {
1166+ case <-done:
1167+ return
1168+ default:
1169+ q.Pop()
1170+ }
1171+ }
1172+ }()
11311173
1132- func BenchmarkQueue_Channel_PushPop_Interface(b *testing.B) {
1133- var q queue.Queue[int] = queue.NewChannel[int](1024)
11341174 b.ReportAllocs()
11351175 b.ResetTimer()
11361176
1137- var val int
1138- var ok bool
1177+ // Producer (benchmark loop)
11391178 for i := 0; i < b.N; i++ {
1140- q.Push(i)
1141- val, ok = q.Pop()
1179+ for !q.Push(i) {
1180+ // Spin until push succeeds
1181+ }
11421182 }
1143- sinkInt = val
1144- sinkOK = ok
1183+
1184+ b.StopTimer()
1185+ close(done)
11451186}
11461187
1147- func BenchmarkQueue_RingBuffer_PushPop_Interface(b *testing.B) {
1148- var q queue.Queue[int] = queue.NewRingBuffer[int](1024)
1188+ // BenchmarkPipeline_RingBuffer benchmarks 2-goroutine SPSC with ring buffer.
1189+ func BenchmarkPipeline_RingBuffer(b *testing.B) {
1190+ q := queue.NewRingBuffer[int](1024)
1191+ done := make(chan struct{})
1192+
1193+ // Consumer goroutine (single consumer - SPSC contract)
1194+ go func() {
1195+ for {
1196+ select {
1197+ case <-done:
1198+ return
1199+ default:
1200+ q.Pop()
1201+ }
1202+ }
1203+ }()
1204+
11491205 b.ReportAllocs()
11501206 b.ResetTimer()
11511207
1152- var val int
1153- var ok bool
1208+ // Producer (single producer - SPSC contract)
11541209 for i := 0; i < b.N; i++ {
1155- q.Push(i)
1156- val, ok = q.Pop()
1210+ for !q.Push(i) {}
11571211 }
1158- sinkInt = val
1159- sinkOK = ok
1212+
1213+ b.StopTimer()
1214+ close(done)
1215+ }
1216+ ```
1217+
1218+ **Expected results:**
1219+
1220+ | Implementation | Latency | Speedup |
1221+ |----------------|---------|---------|
1222+ | Channel | ~128 ns | baseline |
1223+ | RingBuffer (guarded) | ~147 ns | 0.9x (slower due to guards!) |
1224+ | RingBuffer (unguarded) | ~39 ns | **3.3x** |
1225+
1226+ #### 4.2.3 MPSC: N Producers → 1 Consumer (Channels Only)
1227+
1228+ Multiple producers sending to one consumer—a very common Go pattern:
1229+
1230+ ```go
1231+ // BenchmarkMPSC_Channel_2Producers benchmarks 2 producers -> 1 consumer.
1232+ func BenchmarkMPSC_Channel_2Producers(b *testing.B) {
1233+ ch := make(chan int, 1024)
1234+ done := make(chan struct{})
1235+ consumerDone := make(chan struct{})
1236+
1237+ // Consumer goroutine
1238+ go func() {
1239+ defer close(consumerDone)
1240+ for {
1241+ select {
1242+ case <-done:
1243+ return
1244+ case <-ch:
1245+ default:
1246+ }
1247+ }
1248+ }()
1249+
1250+ b.ReportAllocs()
1251+ b.ResetTimer()
1252+
1253+ b.RunParallel(func(pb *testing.PB) {
1254+ i := 0
1255+ for pb.Next() {
1256+ for {
1257+ select {
1258+ case ch <- i:
1259+ goto sent
1260+ default:
1261+ }
1262+ }
1263+ sent:
1264+ i++
1265+ }
1266+ })
1267+
1268+ b.StopTimer()
1269+ close(done)
1270+ <-consumerDone
11601271}
11611272```
11621273
1274+ **Expected results (showing channel lock contention):**
1275+
1276+ | Producers | Channel Latency | vs SPSC |
1277+ |-----------|-----------------|---------|
1278+ | 1 (SPSC) | ~128 ns | baseline |
1279+ | 2 | ~5.9 µs | 46x slower |
1280+ | 4 | ~26 µs | 200x slower |
1281+ | 8 | ~49 µs | 380x slower |
1282+
1283+ > **Why this matters:** Channel lock contention scales poorly. For high-throughput MPSC, use go-lock-free-ring.
1284+
1285+ #### 4.2.4 go-lock-free-ring Comparison
1286+
1287+ The [go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring) library provides a sharded MPSC ring buffer that dramatically outperforms channels under contention.
1288+
1289+ **File:** `internal/combined/lockfreering_bench_test.go`
1290+
1291+ ```go
1292+ import ring "github.com/randomizedcoder/go-lock-free-ring"
1293+
1294+ // BenchmarkLFR_SPSC_ShardedRing1 - go-lock-free-ring with 1 shard
1295+ func BenchmarkLFR_SPSC_ShardedRing1(b *testing.B) {
1296+ r, _ := ring.NewShardedRing(1024, 1)
1297+ done := make(chan struct{})
1298+
1299+ go func() {
1300+ for {
1301+ select {
1302+ case <-done:
1303+ return
1304+ default:
1305+ r.TryRead()
1306+ }
1307+ }
1308+ }()
1309+
1310+ b.ResetTimer()
1311+ for i := 0; i < b.N; i++ {
1312+ for !r.Write(0, i) {}
1313+ }
1314+ b.StopTimer()
1315+ close(done)
1316+ }
1317+
1318+ // BenchmarkLFR_MPSC_ShardedRing_8P_8S - 8 producers, 8 shards
1319+ func BenchmarkLFR_MPSC_ShardedRing_8P_8S(b *testing.B) {
1320+ r, _ := ring.NewShardedRing(2048, 8)
1321+ done := make(chan struct{})
1322+ consumerDone := make(chan struct{})
1323+
1324+ go func() {
1325+ defer close(consumerDone)
1326+ for {
1327+ select {
1328+ case <-done:
1329+ return
1330+ default:
1331+ r.TryRead()
1332+ }
1333+ }
1334+ }()
1335+
1336+ var producerID atomic.Uint64
1337+ b.SetParallelism(8)
1338+ b.ResetTimer()
1339+
1340+ b.RunParallel(func(pb *testing.PB) {
1341+ pid := producerID.Add(1) - 1
1342+ i := 0
1343+ for pb.Next() {
1344+ for !r.Write(pid, i) {}
1345+ i++
1346+ }
1347+ })
1348+
1349+ b.StopTimer()
1350+ close(done)
1351+ <-consumerDone
1352+ }
1353+ ```
1354+
1355+ **Comparison Results:**
1356+
1357+ ##### SPSC (1 Producer → 1 Consumer)
1358+
1359+ | Implementation | Latency | Allocs | Speedup |
1360+ |----------------|---------|--------|---------|
1361+ | Channel | 248 ns | 0 | baseline |
1362+ | go-lock-free-ring (1 shard) | 114 ns | 1 | 2.2x |
1363+ | **Our SPSC Ring (unguarded)** | **36.5 ns** | **0** | **6.8x** |
1364+
1365+ > For pure SPSC, our simple ring buffer wins due to minimal overhead and zero allocations.
1366+
1367+ ##### MPSC (N Producers → 1 Consumer)
1368+
1369+ | Producers | Channel | go-lock-free-ring | Speedup |
1370+ |-----------|---------|-------------------|---------|
1371+ | 4 | 35.3 µs | 539 ns | **65x** |
1372+ | 8 | 47.1 µs | 464 ns | **101x** |
1373+
1374+ > The sharded design of go-lock-free-ring eliminates lock contention, providing **65-100x** speedup.
1375+
1376+ ##### Choosing the Right Queue
1377+
1378+ | Pattern | Best Choice | Why |
1379+ |---------|-------------|-----|
1380+ | 1 producer, 1 consumer | Our SPSC Ring | Fastest, zero allocs |
1381+ | N producers, 1 consumer | go-lock-free-ring | Sharding eliminates contention |
1382+ | Simple/infrequent | Channel | Simplicity matters more |
1383+
11631384### 4.3 `internal/tick/tick_bench_test.go`
11641385
11651386` ` ` go
@@ -1392,79 +1613,30 @@ func BenchmarkCombined_CancelTick_Optimized(b *testing.B) {
13921613
13931614> **Why this matters:** Isolated benchmarks often show 10-20x speedups, but real loops have multiple operations. The combined benchmark shows the *actual* end-to-end improvement you'll see in production.
13941615
1395- ### 4.5 Two-Goroutine SPSC Pipeline Benchmark
1616+ ### 4.5 Queue Benchmark Summary
13961617
1397- The **most representative** benchmark for real Go systems—a producer/consumer pipeline:
1618+ > **Note:** The 2-goroutine SPSC pipeline and MPSC benchmarks are now documented in sections 4.2.2 and 4.2.3 respectively, as part of the comprehensive queue benchmark suite.
13981619
1399- ```go
1400- // internal/combined/pipeline_bench_test.go
1401- package combined_test
1620+ **Key benchmark commands:**
14021621
1403- import (
1404- "testing"
1405-
1406- "github.com/randomizedcoder/some-go-benchmarks/internal/queue"
1407- )
1408-
1409- func BenchmarkPipeline_Channel(b *testing.B) {
1410- q := queue.NewChannel[int](1024)
1411- done := make(chan struct{})
1412-
1413- // Consumer
1414- go func() {
1415- for {
1416- select {
1417- case <-done:
1418- return
1419- default:
1420- q.Pop()
1421- }
1422- }
1423- }()
1424-
1425- b.ReportAllocs()
1426- b.ResetTimer()
1427-
1428- for i := 0; i < b.N; i++ {
1429- for !q.Push(i) {
1430- // Spin until push succeeds
1431- }
1432- }
1433-
1434- b.StopTimer()
1435- close(done)
1436- }
1437-
1438- func BenchmarkPipeline_RingBuffer(b *testing.B) {
1439- q := queue.NewRingBuffer[int](1024)
1440- done := make(chan struct{})
1622+ ```bash
1623+ # Single-goroutine (baseline)
1624+ go test -bench=BenchmarkQueue -benchmem ./internal/queue
14411625
1442- // Consumer (single goroutine - SPSC contract)
1443- go func() {
1444- for {
1445- select {
1446- case <-done:
1447- return
1448- default:
1449- q.Pop()
1450- }
1451- }
1452- }()
1626+ # 2-goroutine SPSC pipeline
1627+ go test -bench=BenchmarkPipeline -benchmem ./internal/combined
14531628
1454- b.ReportAllocs()
1455- b.ResetTimer()
1629+ # MPSC (multiple producers)
1630+ go test -bench=BenchmarkMPSC -benchmem ./internal/combined
1631+ ```
14561632
1457- // Producer (single goroutine - SPSC contract)
1458- for i := 0; i < b.N; i++ {
1459- for !q.Push(i) {
1460- // Spin until push succeeds
1461- }
1462- }
1633+ **What these benchmarks reveal:**
14631634
1464- b.StopTimer()
1465- close(done)
1466- }
1467- ```
1635+ | Pattern | Best Use Case |
1636+ |---------|---------------|
1637+ | Single goroutine | Testing raw queue overhead |
1638+ | SPSC (2 goroutines) | Classic producer/consumer pipelines |
1639+ | MPSC (N producers) | Fan-in patterns, worker pools |
14681640
14691641### 4.6 Benchmark Methodology Validation
14701642
0 commit comments