Skip to content

Commit 5250386

Browse files
committed
Address PR review: buffer pool, inline shifts, GGML refs, naming
- Use buffer pool + ConvertDType for weight unpacking and index conversion instead of ad-hoc allocations (unpackWeightsToBuffer, convertIndicesToInt64)
- Inline shift operations following binary ops pattern to eliminate per-element closure overhead (shiftLeftOp, shiftRightArithmeticOp, shiftRightLogicalUnsignedOp, shiftRightLogicalSignedOp)
- Rename parallelTileCount → quantizedDenseParallelTileCount
- Simplify numIndices calculation (last dim pre-validated as 1)
- Use tgtIsUint8 GoType check in exec_bitcast instead of Bits() < 8
- Add GGML format references and doc links to fused_ops.go
- Add "Follow Existing Patterns" guidance to AGENTS.md
1 parent 66fe00e commit 5250386

7 files changed

Lines changed: 214 additions & 106 deletions

File tree

.agents/AGENTS.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,13 @@ an error, to simplify the code. But everywhere else, use standard Go error handl
8282
- Use `any` instead of `interface{}`.
8383
- Organize tests in hierarchies using `t.Run()` to group related tests.
8484

85+
### Follow Existing Patterns
86+
87+
Before writing new code, read neighboring files in the same package to understand the established
88+
patterns (buffer management, dtype dispatch, parallelization, etc.). Reuse existing infrastructure
89+
rather than writing ad-hoc implementations. When in doubt, match the style and approach of the
90+
closest existing operation.
91+
8592
### Copyright Notes
8693

8794
Normal code files are prefixed with the following copyright line:

backends/fused_ops.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,7 @@ type FusedOps interface {
322322
// QuantizedEmbeddingLookup performs a quantized embedding lookup (row gather)
323323
// with on-the-fly dequantization.
324324
//
325-
// This is the quantized analogue of embedding lookup, inspired by
325+
// This is the quantized analogue of Gather for embedding lookups, inspired by
326326
// llama.cpp's ggml_get_rows. For now it is only implemented for the GGML
327327
// quantization scheme, but could be extended for others if/when needed.
328328
//

backends/simplego/exec_bitcast.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
package simplego
44

55
import (
6+
"reflect"
7+
68
"github.com/gomlx/gomlx/backends"
79
)
810

@@ -41,7 +43,8 @@ func execBitcast(backend *Backend, node *Node, inputs []*Buffer, inputsOwned []b
4143
// target use the same underlying Go storage type. Sub-byte types
4244
// (Int2, Uint2, Int4, Uint4) all store as []uint8.
4345
_, srcIsUint8 := src.flat.([]uint8)
44-
canReuse = srcIsUint8 && targetDType.Bits() < 8
46+
tgtIsUint8 := targetDType.GoType().Kind() == reflect.Uint8
47+
canReuse = srcIsUint8 && tgtIsUint8
4548
}
4649
}
4750
if canReuse {

backends/simplego/exec_fused_quantized.go

Lines changed: 75 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77

88
"github.com/gomlx/gomlx/backends"
99
"github.com/gomlx/gomlx/pkg/core/dtypes"
10+
"github.com/gomlx/gomlx/pkg/core/shapes"
1011
"github.com/pkg/errors"
1112
)
1213

@@ -73,9 +74,16 @@ func execFusedQuantizedDense(backend *Backend, node *Node, inputs []*Buffer, inp
7374
zeroPoints = zeroPointsBuf.flat.([]float32)
7475
}
7576

76-
// For packed sub-byte weights (from Bitcast), unpack nibbles before processing.
77-
// Packed buffers have len(flat) < shape.Size() (2 nibbles per byte).
78-
wFlat := unpackWeightsToInt8(wBuf)
77+
// For packed sub-byte weights (from Bitcast), unpack nibbles via the buffer pool
78+
// and ConvertDType infrastructure. Non-sub-byte types pass through unchanged.
79+
unpackedBuf, unpackedPooled, err := unpackWeightsToBuffer(backend, wBuf)
80+
if err != nil {
81+
return nil, err
82+
}
83+
if unpackedPooled {
84+
defer backend.putBuffer(unpackedBuf)
85+
}
86+
wFlat := unpackedBuf.flat
7987

8088
switch data.scheme {
8189
case backends.QuantNF4:
@@ -105,22 +113,37 @@ func execFusedQuantizedDense(backend *Backend, node *Node, inputs []*Buffer, inp
105113
return output, nil
106114
}
107115

108-
// unpackWeightsToInt8 unpacks sub-byte weight data (Int4, Uint4) from packed
109-
// []byte storage into []int8 (one value per element) for the matmul kernel.
110-
// For non-sub-byte types, returns the flat data as-is.
111-
func unpackWeightsToInt8(wBuf *Buffer) any {
112-
var unpackFn unpackNibblesFn
116+
// unpackWeightsToBuffer unpacks sub-byte weight data (Int4, Uint4) into a pooled
117+
// buffer using the ConvertDType infrastructure. For non-sub-byte types, returns the
118+
// original buffer unchanged.
119+
//
120+
// Returns the (possibly new) buffer, whether it was allocated from the pool
121+
// (caller must putBuffer), and any error.
122+
func unpackWeightsToBuffer(backend *Backend, wBuf *Buffer) (*Buffer, bool, error) {
123+
var targetDType dtypes.DType
113124
switch wBuf.shape.DType {
114-
case dtypes.Uint4:
115-
unpackFn = unpackUint4Nibbles
116125
case dtypes.Int4:
117-
unpackFn = unpackInt4Nibbles
126+
targetDType = dtypes.Int8
127+
case dtypes.Uint4:
128+
targetDType = dtypes.Uint8
118129
default:
119-
return wBuf.flat
130+
return wBuf, false, nil
120131
}
121-
unpacked := make([]int8, wBuf.shape.Size())
122-
unpackFn(wBuf.flat.([]byte), unpacked)
123-
return unpacked
132+
133+
outBuf, err := backend.getBuffer(targetDType, wBuf.shape.Size())
134+
if err != nil {
135+
return nil, false, err
136+
}
137+
outBuf.shape = shapes.Make(targetDType, wBuf.shape.Dimensions...)
138+
139+
convertFnAny, err := convertDTypePairMap.Get(wBuf.shape.DType, targetDType)
140+
if err != nil {
141+
backend.putBuffer(outBuf)
142+
return nil, false, err
143+
}
144+
convertFn := convertFnAny.(convertFnType)
145+
convertFn(wBuf, outBuf)
146+
return outBuf, true, nil
124147
}
125148

126149
// execQuantizedEmbeddingLookup performs quantized embedding lookup.
@@ -146,50 +169,61 @@ func execQuantizedEmbeddingLookup(backend *Backend, node *Node, inputs []*Buffer
146169
return nil, err
147170
}
148171

149-
numIndices := indicesBuf.shape.Size() / indicesBuf.shape.Dimensions[indicesBuf.shape.Rank()-1]
172+
// Last dim is pre-validated to be 1, so total elements == number of indices.
173+
numIndices := indicesBuf.shape.Size()
150174

151-
indices, err := quantGatherIntSliceOfFlat(indicesBuf.flat, numIndices)
175+
// Convert indices to int64 via the buffer pool and ConvertDType infrastructure.
176+
idxBuf, idxPooled, err := convertIndicesToInt64(backend, indicesBuf)
152177
if err != nil {
153178
return nil, errors.Wrapf(err, "QuantizedEmbeddingLookup")
154179
}
155-
vocabSize := dataBuf.shape.Dimensions[0]
156-
for i, rowIdx := range indices {
180+
if idxPooled {
181+
defer backend.putBuffer(idxBuf)
182+
}
183+
indices := idxBuf.flat.([]int64)
184+
185+
vocabSize := int64(dataBuf.shape.Dimensions[0])
186+
for i, rowIdx := range indices[:numIndices] {
157187
if rowIdx < 0 || rowIdx >= vocabSize {
158188
return nil, errors.Errorf("QuantizedEmbeddingLookup: index %d out of range [0, %d)", rowIdx, vocabSize)
159189
}
160-
rowData := dataBytes[rowIdx*bytesPerRow : (rowIdx+1)*bytesPerRow]
190+
rowStart := rowIdx * int64(bytesPerRow)
191+
rowData := dataBytes[rowStart : rowStart+int64(bytesPerRow)]
161192
dequantFn(rowData, out[i*K:(i+1)*K])
162193
}
163194

164195
return output, nil
165196
}
166197

167-
// quantGatherIntSliceOfFlat converts a flat index slice ([]int32, []int64, or []int) to []int.
168-
func quantGatherIntSliceOfFlat(flat any, n int) ([]int, error) {
169-
switch s := flat.(type) {
170-
case []int32:
171-
return convertToIntSlice(s, n), nil
172-
case []int64:
173-
return convertToIntSlice(s, n), nil
174-
case []int:
175-
return s[:n], nil
176-
default:
177-
return nil, errors.Errorf("unsupported indices type %T", flat)
198+
// convertIndicesToInt64 converts an integer index buffer to int64 via the buffer
199+
// pool and ConvertDType infrastructure. If the buffer is already int64, it is
200+
// returned as-is.
201+
//
202+
// Returns the (possibly new) buffer, whether it was allocated from the pool
203+
// (caller must putBuffer), and any error.
204+
func convertIndicesToInt64(backend *Backend, indicesBuf *Buffer) (*Buffer, bool, error) {
205+
if indicesBuf.shape.DType == dtypes.Int64 {
206+
return indicesBuf, false, nil
178207
}
179-
}
208+
outBuf, err := backend.getBuffer(dtypes.Int64, indicesBuf.shape.Size())
209+
if err != nil {
210+
return nil, false, err
211+
}
212+
outBuf.shape = shapes.Make(dtypes.Int64, indicesBuf.shape.Dimensions...)
180213

181-
// convertToIntSlice converts the first n elements of an integer slice to []int.
182-
func convertToIntSlice[T int32 | int64](s []T, n int) []int {
183-
out := make([]int, n)
184-
for i := range n {
185-
out[i] = int(s[i])
214+
convertFnAny, err := convertDTypePairMap.Get(indicesBuf.shape.DType, dtypes.Int64)
215+
if err != nil {
216+
backend.putBuffer(outBuf)
217+
return nil, false, err
186218
}
187-
return out
219+
convertFn := convertFnAny.(convertFnType)
220+
convertFn(indicesBuf, outBuf)
221+
return outBuf, true, nil
188222
}
189223

190-
// parallelTileCount returns the number of parallel work units that
224+
// quantizedDenseParallelTileCount returns the number of parallel work units that
191225
// quantizedDenseParallel will dispatch for the given dimensions.
192-
func parallelTileCount(backend *Backend, M, K, N int) int {
226+
func quantizedDenseParallelTileCount(backend *Backend, M, K, N int) int {
193227
totalWork := M * K * N
194228
if backend == nil || !backend.workers.IsEnabled() || totalWork <= minParallelizeChunk {
195229
return M
@@ -202,7 +236,7 @@ func parallelTileCount(backend *Backend, M, K, N int) int {
202236
}
203237

204238
// quantizedDenseParallel parallelizes over M rows, or tiles over N columns when M=1.
205-
// workerIdx is a dense index in [0, parallelTileCount) identifying the work unit.
239+
// workerIdx is a dense index in [0, quantizedDenseParallelTileCount) identifying the work unit.
206240
func quantizedDenseParallel(backend *Backend, M, K, N int, rowFn func(workerIdx, m, nStart, nEnd int)) {
207241
totalWork := M * K * N
208242
if backend == nil || !backend.workers.IsEnabled() || totalWork <= minParallelizeChunk {

backends/simplego/exec_fused_quantized_ggml.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ func quantizedDenseGGML(backend *Backend, x []float32, weights []uint8, bias, ou
103103
}
104104

105105
// Pre-allocate per-worker scratch buffers to avoid heap allocation per tile invocation.
106-
numWorkers := parallelTileCount(backend, M, K, N)
106+
numWorkers := quantizedDenseParallelTileCount(backend, M, K, N)
107107
scratchBufs := make([][]float32, numWorkers)
108108
for i := range scratchBufs {
109109
scratchBufs[i] = make([]float32, K)

0 commit comments

Comments (0)