Skip to content

Commit f1f93d3

Browse files
Merge pull request #2 from randomizedcoder/init
improvements
2 parents e4f9505 + 56de290 commit f1f93d3

8 files changed

Lines changed: 1200 additions & 115 deletions

File tree

IMPLEMENTATION_PLAN.md

Lines changed: 255 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -1085,7 +1085,15 @@ func BenchmarkCancel_Atomic_Done_Parallel(b *testing.B) {
10851085
}
10861086
```
10871087

1088-
### 4.2 `internal/queue/queue_bench_test.go`
1088+
### 4.2 Queue Benchmarks: Goroutine Topology
1089+
1090+
Queue performance varies dramatically based on how goroutines interact. We benchmark three goroutine topologies (single-goroutine baseline, SPSC, and MPSC), plus a comparison against an external lock-free library:
1091+
1092+
#### 4.2.1 Single Goroutine (Baseline)
1093+
1094+
Push+pop in the same goroutine—no lock contention:
1095+
1096+
**File:** `internal/queue/queue_bench_test.go`
10891097

10901098
```go
10911099
package queue_test
@@ -1099,6 +1107,7 @@ import (
10991107
var sinkInt int
11001108
var sinkOK bool
11011109
1110+
// Single goroutine: push+pop in same routine (no contention)
11021111
func BenchmarkQueue_Channel_PushPop_Direct(b *testing.B) {
11031112
q := queue.NewChannel[int](1024)
11041113
b.ReportAllocs()
@@ -1128,38 +1137,250 @@ func BenchmarkQueue_RingBuffer_PushPop_Direct(b *testing.B) {
11281137
sinkInt = val
11291138
sinkOK = ok
11301139
}
1140+
```
1141+
1142+
**Expected results:**
1143+
1144+
| Implementation | Latency | Notes |
1145+
|----------------|---------|-------|
1146+
| Channel | ~39 ns | Go channel with no contention |
1147+
| RingBuffer (guarded) | ~36 ns | SPSC guards add overhead vs the unguarded variant |
1148+
| RingBuffer (unguarded) | ~9.5 ns | True lock-free performance |
1149+
1150+
#### 4.2.2 SPSC: 1 Producer → 1 Consumer (2 Goroutines)
1151+
1152+
The classic producer/consumer pattern—one goroutine writes, another reads:
1153+
1154+
**File:** `internal/combined/combined_bench_test.go`
1155+
1156+
```go
1157+
// BenchmarkPipeline_Channel benchmarks 2-goroutine SPSC with channels.
1158+
func BenchmarkPipeline_Channel(b *testing.B) {
1159+
q := queue.NewChannel[int](1024)
1160+
done := make(chan struct{})
1161+
1162+
// Consumer goroutine
1163+
go func() {
1164+
for {
1165+
select {
1166+
case <-done:
1167+
return
1168+
default:
1169+
q.Pop()
1170+
}
1171+
}
1172+
}()
11311173
1132-
func BenchmarkQueue_Channel_PushPop_Interface(b *testing.B) {
1133-
var q queue.Queue[int] = queue.NewChannel[int](1024)
11341174
b.ReportAllocs()
11351175
b.ResetTimer()
11361176
1137-
var val int
1138-
var ok bool
1177+
// Producer (benchmark loop)
11391178
for i := 0; i < b.N; i++ {
1140-
q.Push(i)
1141-
val, ok = q.Pop()
1179+
for !q.Push(i) {
1180+
// Spin until push succeeds
1181+
}
11421182
}
1143-
sinkInt = val
1144-
sinkOK = ok
1183+
1184+
b.StopTimer()
1185+
close(done)
11451186
}
11461187
1147-
func BenchmarkQueue_RingBuffer_PushPop_Interface(b *testing.B) {
1148-
var q queue.Queue[int] = queue.NewRingBuffer[int](1024)
1188+
// BenchmarkPipeline_RingBuffer benchmarks 2-goroutine SPSC with ring buffer.
1189+
func BenchmarkPipeline_RingBuffer(b *testing.B) {
1190+
q := queue.NewRingBuffer[int](1024)
1191+
done := make(chan struct{})
1192+
1193+
// Consumer goroutine (single consumer - SPSC contract)
1194+
go func() {
1195+
for {
1196+
select {
1197+
case <-done:
1198+
return
1199+
default:
1200+
q.Pop()
1201+
}
1202+
}
1203+
}()
1204+
11491205
b.ReportAllocs()
11501206
b.ResetTimer()
11511207
1152-
var val int
1153-
var ok bool
1208+
// Producer (single producer - SPSC contract)
11541209
for i := 0; i < b.N; i++ {
1155-
q.Push(i)
1156-
val, ok = q.Pop()
1210+
for !q.Push(i) {}
11571211
}
1158-
sinkInt = val
1159-
sinkOK = ok
1212+
1213+
b.StopTimer()
1214+
close(done)
1215+
}
1216+
```
1217+
1218+
**Expected results:**
1219+
1220+
| Implementation | Latency | Speedup |
1221+
|----------------|---------|---------|
1222+
| Channel | ~128 ns | baseline |
1223+
| RingBuffer (guarded) | ~147 ns | 0.9x (slower due to guards!) |
1224+
| RingBuffer (unguarded) | ~39 ns | **3.3x** |
1225+
1226+
#### 4.2.3 MPSC: N Producers → 1 Consumer (Channels Only)
1227+
1228+
Multiple producers sending to one consumer—a very common Go pattern:
1229+
1230+
```go
1231+
// BenchmarkMPSC_Channel_2Producers benchmarks 2 producers -> 1 consumer.
1232+
func BenchmarkMPSC_Channel_2Producers(b *testing.B) {
1233+
ch := make(chan int, 1024)
1234+
done := make(chan struct{})
1235+
consumerDone := make(chan struct{})
1236+
1237+
// Consumer goroutine
1238+
go func() {
1239+
defer close(consumerDone)
1240+
for {
1241+
select {
1242+
case <-done:
1243+
return
1244+
case <-ch:
1245+
default:
1246+
}
1247+
}
1248+
}()
1249+
1250+
b.ReportAllocs()
1251+
b.ResetTimer()
1252+
1253+
b.RunParallel(func(pb *testing.PB) {
1254+
i := 0
1255+
for pb.Next() {
1256+
for {
1257+
select {
1258+
case ch <- i:
1259+
goto sent
1260+
default:
1261+
}
1262+
}
1263+
sent:
1264+
i++
1265+
}
1266+
})
1267+
1268+
b.StopTimer()
1269+
close(done)
1270+
<-consumerDone
11601271
}
11611272
```
11621273

1274+
**Expected results (showing channel lock contention):**
1275+
1276+
| Producers | Channel Latency | vs SPSC |
1277+
|-----------|-----------------|---------|
1278+
| 1 (SPSC) | ~128 ns | baseline |
1279+
| 2 | ~5.9 µs | 46x slower |
1280+
| 4 | ~26 µs | 200x slower |
1281+
| 8 | ~49 µs | 380x slower |
1282+
1283+
> **Why this matters:** Channel lock contention scales poorly. For high-throughput MPSC, use go-lock-free-ring.
1284+
1285+
#### 4.2.4 go-lock-free-ring Comparison
1286+
1287+
The [go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring) library provides a sharded MPSC ring buffer that dramatically outperforms channels under contention.
1288+
1289+
**File:** `internal/combined/lockfreering_bench_test.go`
1290+
1291+
```go
1292+
import ring "github.com/randomizedcoder/go-lock-free-ring"
1293+
1294+
// BenchmarkLFR_SPSC_ShardedRing1 - go-lock-free-ring with 1 shard
1295+
func BenchmarkLFR_SPSC_ShardedRing1(b *testing.B) {
1296+
r, _ := ring.NewShardedRing(1024, 1)
1297+
done := make(chan struct{})
1298+
1299+
go func() {
1300+
for {
1301+
select {
1302+
case <-done:
1303+
return
1304+
default:
1305+
r.TryRead()
1306+
}
1307+
}
1308+
}()
1309+
1310+
b.ResetTimer()
1311+
for i := 0; i < b.N; i++ {
1312+
for !r.Write(0, i) {}
1313+
}
1314+
b.StopTimer()
1315+
close(done)
1316+
}
1317+
1318+
// BenchmarkLFR_MPSC_ShardedRing_8P_8S - 8 producers, 8 shards
1319+
func BenchmarkLFR_MPSC_ShardedRing_8P_8S(b *testing.B) {
1320+
r, _ := ring.NewShardedRing(2048, 8)
1321+
done := make(chan struct{})
1322+
consumerDone := make(chan struct{})
1323+
1324+
go func() {
1325+
defer close(consumerDone)
1326+
for {
1327+
select {
1328+
case <-done:
1329+
return
1330+
default:
1331+
r.TryRead()
1332+
}
1333+
}
1334+
}()
1335+
1336+
var producerID atomic.Uint64
1337+
b.SetParallelism(8)
1338+
b.ResetTimer()
1339+
1340+
b.RunParallel(func(pb *testing.PB) {
1341+
pid := producerID.Add(1) - 1
1342+
i := 0
1343+
for pb.Next() {
1344+
for !r.Write(pid, i) {}
1345+
i++
1346+
}
1347+
})
1348+
1349+
b.StopTimer()
1350+
close(done)
1351+
<-consumerDone
1352+
}
1353+
```
1354+
1355+
**Comparison Results:**
1356+
1357+
##### SPSC (1 Producer → 1 Consumer)
1358+
1359+
| Implementation | Latency | Allocs | Speedup |
1360+
|----------------|---------|--------|---------|
1361+
| Channel | 248 ns | 0 | baseline |
1362+
| go-lock-free-ring (1 shard) | 114 ns | 1 | 2.2x |
1363+
| **Our SPSC Ring (unguarded)** | **36.5 ns** | **0** | **6.8x** |
1364+
1365+
> For pure SPSC, our simple ring buffer wins due to minimal overhead and zero allocations. (NOTE(review): the Channel figure here, 248 ns, differs from the ~128 ns reported in §4.2.2 — presumably a separate run or machine; reconcile or annotate before publishing.)
1366+
1367+
##### MPSC (N Producers → 1 Consumer)
1368+
1369+
| Producers | Channel | go-lock-free-ring | Speedup |
1370+
|-----------|---------|-------------------|---------|
1371+
| 4 | 35.3 µs | 539 ns | **65x** |
1372+
| 8 | 47.1 µs | 464 ns | **101x** |
1373+
1374+
> The sharded design of go-lock-free-ring eliminates lock contention, providing a **65-101x** speedup.
1375+
1376+
##### Choosing the Right Queue
1377+
1378+
| Pattern | Best Choice | Why |
1379+
|---------|-------------|-----|
1380+
| 1 producer, 1 consumer | Our SPSC Ring | Fastest, zero allocs |
1381+
| N producers, 1 consumer | go-lock-free-ring | Sharding eliminates contention |
1382+
| Simple/infrequent | Channel | Simplicity matters more |
1383+
11631384
### 4.3 `internal/tick/tick_bench_test.go`
11641385

11651386
```go
@@ -1392,79 +1613,30 @@ func BenchmarkCombined_CancelTick_Optimized(b *testing.B) {
13921613

13931614
> **Why this matters:** Isolated benchmarks often show 10-20x speedups, but real loops have multiple operations. The combined benchmark shows the *actual* end-to-end improvement you'll see in production.
13941615

1395-
### 4.5 Two-Goroutine SPSC Pipeline Benchmark
1616+
### 4.5 Queue Benchmark Summary
13961617

1397-
The **most representative** benchmark for real Go systems—a producer/consumer pipeline:
1618+
> **Note:** The 2-goroutine SPSC pipeline and MPSC benchmarks are now documented in sections 4.2.2 and 4.2.3, respectively, as part of the comprehensive queue benchmark suite.
13981619

1399-
```go
1400-
// internal/combined/pipeline_bench_test.go
1401-
package combined_test
1620+
**Key benchmark commands:**
14021621

1403-
import (
1404-
"testing"
1405-
1406-
"github.com/randomizedcoder/some-go-benchmarks/internal/queue"
1407-
)
1408-
1409-
func BenchmarkPipeline_Channel(b *testing.B) {
1410-
q := queue.NewChannel[int](1024)
1411-
done := make(chan struct{})
1412-
1413-
// Consumer
1414-
go func() {
1415-
for {
1416-
select {
1417-
case <-done:
1418-
return
1419-
default:
1420-
q.Pop()
1421-
}
1422-
}
1423-
}()
1424-
1425-
b.ReportAllocs()
1426-
b.ResetTimer()
1427-
1428-
for i := 0; i < b.N; i++ {
1429-
for !q.Push(i) {
1430-
// Spin until push succeeds
1431-
}
1432-
}
1433-
1434-
b.StopTimer()
1435-
close(done)
1436-
}
1437-
1438-
func BenchmarkPipeline_RingBuffer(b *testing.B) {
1439-
q := queue.NewRingBuffer[int](1024)
1440-
done := make(chan struct{})
1622+
```bash
1623+
# Single-goroutine (baseline)
1624+
go test -bench=BenchmarkQueue -benchmem ./internal/queue
14411625
1442-
// Consumer (single goroutine - SPSC contract)
1443-
go func() {
1444-
for {
1445-
select {
1446-
case <-done:
1447-
return
1448-
default:
1449-
q.Pop()
1450-
}
1451-
}
1452-
}()
1626+
# 2-goroutine SPSC pipeline
1627+
go test -bench=BenchmarkPipeline -benchmem ./internal/combined
14531628
1454-
b.ReportAllocs()
1455-
b.ResetTimer()
1629+
# MPSC (multiple producers)
1630+
go test -bench=BenchmarkMPSC -benchmem ./internal/combined
1631+
```
14561632

1457-
// Producer (single goroutine - SPSC contract)
1458-
for i := 0; i < b.N; i++ {
1459-
for !q.Push(i) {
1460-
// Spin until push succeeds
1461-
}
1462-
}
1633+
**What these benchmarks reveal:**
14631634

1464-
b.StopTimer()
1465-
close(done)
1466-
}
1467-
```
1635+
| Pattern | Best Use Case |
1636+
|---------|---------------|
1637+
| Single goroutine | Testing raw queue overhead |
1638+
| SPSC (2 goroutines) | Classic producer/consumer pipelines |
1639+
| MPSC (N producers) | Fan-in patterns, worker pools |
14681640

14691641
### 4.6 Benchmark Methodology Validation
14701642

0 commit comments

Comments
 (0)