feat(m5): add Apple M5 ANE hardware support and performance suite

Lumysia · Lumysia · commit 0caf6993f683 · 2026-03-04T02:43:08.000-05:00
- Add 128-byte IOSurface alignment for M5 (Apple 10 family) compatibility
- Implement dynamic weight injection via matmul operator for real-time updates
- Add m5_performance_suite.m benchmark tool (4096-dim, ~1.0 TFLOPS, ~1.8ms latency)
- Update ane_runtime.h with weights surface support for dynamic weights
- Update ane_mil_gen.h with program(1.5) and mil_gen_dynamic_matmul()
- Document M5 hardware constraints in README.md

Tested: m5_performance_suite, test_dynamic_matmul, train_large_ane all pass
diff --git a/README.md b/README.md
@@ -156,6 +156,77 @@ No external dependencies. Uses only system frameworks + private ANE APIs resolve
 
 This project uses Apple's private, undocumented APIs (`_ANEClient`, `_ANECompiler`, `_ANEInMemoryModelDescriptor`). These APIs are not covered by any public stability guarantee and may change or break with any macOS update. This is independent research into Apple Neural Engine architecture, using APIs discovered through runtime introspection for research and educational purposes under fair use and interoperability provisions (see *Sega v. Accolade*, 1992; DMCA §1201(f)). No Apple proprietary code or binaries are included in this repository. This project is not affiliated with or endorsed by Apple Inc. Use at your own risk.
 
+## Hardware Characterization: Apple M5 (2026)
+
+The M5 (Apple 10 family) introduces specific ANE behavioral constraints that differ from earlier M-series chips. This section documents the key findings from reverse-engineering efforts.
+
+### Key M5 ANE Constraints
+
+| Constraint | Value | Notes |
+|:---|:---|:---|
+| **IOSurface Alignment** | 128 bytes | All input, output, and weight surfaces must be 128-byte aligned. Failure results in silent evaluation errors or compiler rejection. |
+| **MIL Version** | program(1.5) | M5 is optimized for MIL 1.5. Use `ios17` or `ios18` function targets. For packed single-input formats, `program(1.3)` remains compatible. |
+| **Max Dynamic Dimension** | 4096 × 4096 | Maximum dimension for dynamic weight tensors passed as inputs. |
+| **Peak Throughput** | ~1.0 TFLOPS | Pure ANE compute for 4096-dim matmul operations (measured: 0.86-1.53 TFLOPS). |
+| **Update Latency** | ~1.8 ms | CPU-to-IOSurface `memcpy` + ANE eval for weight updates at 4096 dims (measured: 1.7-1.9 ms). |
+
+### Dynamic Weight Injection
+
+On M5, the traditional approach of baking weights into the compiled model (via `BLOBFILE`) does not support runtime updates—the ANE snapshots weights into private memory at load time. The only viable path for real-time weight updates is:
+
+**Treat weights as Input Tensors using the `matmul` operator.**
+
+```objc
+// MIL pattern for dynamic weights (M5 compatible)
+// Input 0: activations [1, 1, SEQ, IC]
+// Input 1: weights [1, 1, IC, OC]  ← dynamic!
+// Output:  [1, 1, SEQ, OC]
+
+NSString *mil = [NSString stringWithFormat:
+    @"program(1.5)\n"
+    "{\n"
+    "    func main<ios17>(tensor<fp32, [1, 1, %d, %d]> x, tensor<fp32, [1, 1, %d, %d]> weights) {\n"
+    "        // Cast to fp16, matmul, cast back to fp32\n"
+    "    } -> (y);\n"
+    "}\n", seq, ic, ic, oc];
+```
+
+This approach enables:
+- **Zero-copy weight swapping**: Update weights via `memcpy` into the input IOSurface
+- **~100x faster updates** vs. recompile-and-load cycle (1.8ms vs 40-170ms)
+- **On-device training**: Foundation for gradient descent on ANE
+
+### M5 Performance Benchmarks
+
+Run the benchmark suite:
+
+```bash
+cd training
+make m5_performance_suite
+./m5_performance_suite
+```
+
+Expected output on M5:
+
+```
+Max Dynamic Dimension:     4096 x 4096
+Peak Throughput:           1.02 TFLOPS
+Weight Update Latency:     1.78 ms
+Max Weight Tensor Size:    67.11 MB
+```
+
+### Implementation Notes
+
+1. **Alignment Helper**: Use `ane_create_surface()` which automatically applies 128-byte alignment—backward compatible with M3/M4.
+
+2. **MIL Generation**: Use `mil_gen_dynamic_matmul()` from `ane_mil_gen.h` for M5-compatible dynamic weight layers.
+
+3. **Weight Surface**: For large weights (>16MB), use `ane_create_weights_surface()` which adds `kIOSurfaceIsGlobal` for ANE hardware access.
+
+4. **Matmul vs Conv**: For dynamic weights, `matmul` is more stable than `conv` on M5 due to flexible hardware tiling on the NCE (Neural Compute Engine).
+
+---
+
 ## License
 
 MIT — see [LICENSE](LICENSE)
diff --git a/training/Makefile b/training/Makefile
@@ -36,13 +36,16 @@ test_qos_sweep: test_qos_sweep.m
 test_ane_advanced: test_ane_advanced.m
 	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
 
+m5_performance_suite: m5_performance_suite.m
+	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
+
 probes: $(PROBES)
 
 tokenize:
 	python3 tokenize.py
 
 clean:
-	rm -f train train_large train_large_ane $(PROBES) test_rmsnorm_bwd test_classifier
+	rm -f train train_large train_large_ane $(PROBES) test_rmsnorm_bwd test_classifier m5_performance_suite
 
 .PHONY: clean tokenize probes
 
diff --git a/training/ane_mil_gen.h b/training/ane_mil_gen.h
@@ -1,4 +1,5 @@
 // ane_mil_gen.h — Generate MIL text for conv-based linear ops + weight blobs
+// M5 ANE optimized: Uses program(1.5) and supports dynamic weight injection
 #pragma once
 #import <Foundation/Foundation.h>
 #include <stdlib.h>
@@ -25,13 +26,26 @@ static NSData *mil_build_weight_blob(const float *weights_f32, int out_ch, int i
     return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
 }
 
+// Build raw FP16 weights without header (for dynamic weight injection via IOSurface)
+// weights_f32: source weights in row-major [out_ch, in_ch]
+// Returns NSData with just FP16 values, no headers
+static NSData *mil_build_raw_weights_fp16(const float *weights_f32, int out_ch, int in_ch) {
+    NSUInteger weightSize = (NSUInteger)out_ch * in_ch * sizeof(_Float16);
+    uint8_t *buf = (uint8_t*)malloc(weightSize);
+    _Float16 *fp16 = (_Float16*)buf;
+    for (NSUInteger i = 0; i < (NSUInteger)out_ch * in_ch; i++)
+        fp16[i] = (_Float16)weights_f32[i];
+    return [NSData dataWithBytesNoCopy:buf length:weightSize freeWhenDone:YES];
+}
+
 // Generate MIL for a single matmul: y = W @ x (using matmul op, weights as input)
 // Input x: [1, in_ch, spatial] fp32
 // Input W: [1, out_ch, in_ch] fp32
 // Output:  [1, out_ch, spatial] fp32
+// Using program(1.5) for M5/macOS 15 native version
 static NSString *mil_gen_matmul(int in_ch, int out_ch, int spatial) {
     return [NSString stringWithFormat:
-        @"program(1.3)\n"
+        @"program(1.5)\n"
         "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
         "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
         "{\"coremltools-version\", \"9.0\"}})]\n"
@@ -52,10 +66,40 @@ static NSString *mil_gen_matmul(int in_ch, int out_ch, int spatial) {
         out_ch, spatial, out_ch, spatial];
 }
 
+// Generate MIL for dynamic matmul with weights as input tensor (M5 optimized).
+// This is the preferred approach for dynamic weight injection on M5 ANE.
+// Input 0: tensor<fp32, [1, 1, SEQ, IC]> activations (transposed for matmul)
+// Input 1: tensor<fp32, [1, 1, IC, OC]> weights (dynamic)
+// Output:  tensor<fp32, [1, 1, SEQ, OC]>
+// Uses program(1.5) and ios17 for M5 compatibility.
+static NSString *mil_gen_dynamic_matmul(int ic, int oc, int seq) {
+    return [NSString stringWithFormat:
+        @"program(1.5)\n"
+        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"9.0\"}})]\n"
+        "{\n"
+        "    func main<ios17>(tensor<fp32, [1, 1, %d, %d]> x, tensor<fp32, [1, 1, %d, %d]> weights) {\n"
+        "        string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
+        "        tensor<fp16, [1, 1, %d, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_x\")];\n"
+        "        tensor<fp16, [1, 1, %d, %d]> w16 = cast(dtype = to_fp16, x = weights)[name = string(\"cast_w\")];\n"
+        "        bool tx = const()[name = string(\"tx\"), val = bool(false)];\n"
+        "        bool ty = const()[name = string(\"ty\"), val = bool(false)];\n"
+        "        tensor<fp16, [1, 1, %d, %d]> y16 = matmul(transpose_x = tx, transpose_y = ty, x = x16, y = w16)[name = string(\"matmul\")];\n"
+        "        string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
+        "        tensor<fp32, [1, 1, %d, %d]> y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n"
+        "    } -> (y);\n"
+        "}\n",
+        seq, ic, ic, oc,
+        seq, ic, ic, oc,
+        seq, oc, seq, oc];
+}
+
 // Keep the baked-weight version for reference (used in inference-only scenarios)
+// Using program(1.5) for M5/macOS 15 native version
 static NSString *mil_gen_conv(int in_ch, int out_ch, int spatial) {
     return [NSString stringWithFormat:
-        @"program(1.3)\n"
+        @"program(1.5)\n"
         "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
         "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
         "{\"coremltools-version\", \"9.0\"}})]\n"
@@ -89,7 +133,7 @@ static NSString *mil_gen_conv(int in_ch, int out_ch, int spatial) {
 static NSString *mil_gen_qkv(int dim, int spatial) {
     NSUInteger cs = 64 + (NSUInteger)dim * dim * 2;
     return [NSString stringWithFormat:
-        @"program(1.3)\n"
+        @"program(1.5)\n"
         "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
         "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
         "{\"coremltools-version\", \"9.0\"}})]\n"
@@ -174,7 +218,7 @@ static NSData *mil_build_ffn_up_weight_blob(const float *w1, const float *w3, in
 static NSString *mil_gen_ffn_up(int dim, int hidden_dim, int spatial) {
     NSUInteger cs = 64 + (NSUInteger)hidden_dim * dim * 2;
     return [NSString stringWithFormat:
-        @"program(1.3)\n"
+        @"program(1.5)\n"
         "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
         "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
         "{\"coremltools-version\", \"9.0\"}})]\n"
diff --git a/training/ane_runtime.h b/training/ane_runtime.h
@@ -1,21 +1,32 @@
 // ane_runtime.h — Reusable ANE in-memory compile/load/eval wrapper
 // Uses _ANEInMemoryModel via private AppleNeuralEngine.framework
+//
+// M5 ANE Compatibility:
+// - 128-byte alignment for all IOSurface buffers (backward compatible)
+// - Dynamic weight support via weightsSurface parameter
+// - MIL 1.5 program version for optimal M5 performance
 #pragma once
 #import <Foundation/Foundation.h>
 #import <objc/runtime.h>
 #import <objc/message.h>
 #import <dlfcn.h>
 #import <IOSurface/IOSurface.h>
+#import <sys/mman.h>
+#import <sys/stat.h>
+#import <fcntl.h>
 
 typedef struct {
     id model;               // _ANEInMemoryModel
     IOSurfaceRef *ioInputs;
     IOSurfaceRef *ioOutputs;
+    IOSurfaceRef weightsSurface;  // Optional: dynamic weights IOSurface
+    id weightsBuffer;              // Optional: _ANEIOSurfaceObject for weights
     id request;             // _ANERequest
     NSString *tmpDir;
     int nInputs, nOutputs;
     size_t *inputBytes;
     size_t *outputBytes;
+    size_t weightsBytes;    // Size of weights surface
 } ANEKernel;
 
 static Class g_ANEDesc, g_ANEInMem, g_ANEReq, g_ANEIO;
@@ -31,24 +42,52 @@ static void ane_init(void) {
     g_ane_loaded = true;
 }
 
+// Create an IOSurface with 128-byte alignment for M5 ANE compatibility.
+// This alignment is required on M5 (Apple 10 family) and backward compatible
+// with older M-series chips.
 static IOSurfaceRef ane_create_surface(size_t bytes) {
+    // Round up to 128-byte boundary for M5 ANE compatibility
+    size_t aligned = ((bytes + 127) / 128) * 128;
     return IOSurfaceCreate((__bridge CFDictionaryRef)@{
-        (id)kIOSurfaceWidth: @(bytes),
+        (id)kIOSurfaceWidth: @(aligned),
         (id)kIOSurfaceHeight: @1,
         (id)kIOSurfaceBytesPerElement: @1,
-        (id)kIOSurfaceBytesPerRow: @(bytes),
-        (id)kIOSurfaceAllocSize: @(bytes),
+        (id)kIOSurfaceBytesPerRow: @(aligned),
+        (id)kIOSurfaceAllocSize: @(aligned),
         (id)kIOSurfacePixelFormat: @0
     });
 }
 
+// Create an IOSurface specifically for dynamic weights.
+// Uses the same 128-byte alignment as regular surfaces.
+static IOSurfaceRef ane_create_weights_surface(size_t bytes) {
+    size_t aligned = ((bytes + 127) / 128) * 128;
+    if (aligned < 128) aligned = 128;
+    
+    NSMutableDictionary *props = [NSMutableDictionary dictionaryWithObjectsAndKeys:
+        @(aligned), (id)kIOSurfaceWidth,
+        @1, (id)kIOSurfaceHeight,
+        @1, (id)kIOSurfaceBytesPerElement,
+        @(aligned), (id)kIOSurfaceBytesPerRow,
+        @(aligned), (id)kIOSurfaceAllocSize,
+        @0, (id)kIOSurfacePixelFormat,
+        nil];
+    
+    // Enable global access for ANE hardware
+    [props setObject:@YES forKey:(id)kIOSurfaceIsGlobal];
+    
+    return IOSurfaceCreate((__bridge CFDictionaryRef)props);
+}
+
 // Compile a MIL graph with weight blob into an ANE kernel.
 // milText: NSData of MIL text
 // weightData: NSData of raw weight blob (can be nil)
 // inputSizes/outputSizes: arrays of byte sizes for each I/O tensor
+// weightsSurface: optional IOSurface for dynamic weights (can be NULL)
 static ANEKernel *ane_compile(NSData *milText, NSData *weightData,
                                int nInputs, size_t *inputSizes,
-                               int nOutputs, size_t *outputSizes) {
+                               int nOutputs, size_t *outputSizes,
+                               IOSurfaceRef weightsSurface) {
     ane_init();
     NSError *e = nil;
 
@@ -97,15 +136,26 @@ static ANEKernel *ane_compile(NSData *milText, NSData *weightData,
     memcpy(k->inputBytes, inputSizes, nInputs * sizeof(size_t));
     memcpy(k->outputBytes, outputSizes, nOutputs * sizeof(size_t));
 
-    // Create IOSurfaces
+    // Create IOSurfaces for inputs/outputs
     k->ioInputs = malloc(nInputs * sizeof(IOSurfaceRef));
     k->ioOutputs = malloc(nOutputs * sizeof(IOSurfaceRef));
     for (int i = 0; i < nInputs; i++)
         k->ioInputs[i] = ane_create_surface(inputSizes[i]);
     for (int i = 0; i < nOutputs; i++)
         k->ioOutputs[i] = ane_create_surface(outputSizes[i]);
 
-    // Build request
+    // Handle optional weights surface for dynamic weight injection
+    id weightsBufferObj = nil;
+    if (weightsSurface) {
+        k->weightsSurface = weightsSurface;
+        CFRetain(weightsSurface);
+        k->weightsBytes = IOSurfaceGetAllocSize(weightsSurface);
+        weightsBufferObj = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(
+            g_ANEIO, @selector(objectWithIOSurface:), weightsSurface);
+        k->weightsBuffer = weightsBufferObj;
+    }
+
+    // Build request with optional weights buffer
     NSMutableArray *wIns = [NSMutableArray arrayWithCapacity:nInputs];
     NSMutableArray *iIdx = [NSMutableArray arrayWithCapacity:nInputs];
     for (int i = 0; i < nInputs; i++) {
@@ -122,11 +172,53 @@ static ANEKernel *ane_compile(NSData *milText, NSData *weightData,
     }
     k->request = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(
         g_ANEReq, @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
-        wIns, iIdx, wOuts, oIdx, nil, nil, @0);
+        wIns, iIdx, wOuts, oIdx, weightsBufferObj, nil, @0);
 
     return k;
 }
 
+// Legacy compile function (backward compatible wrapper)
+static ANEKernel *ane_compile_legacy(NSData *milText, NSData *weightData,
+                                      int nInputs, size_t *inputSizes,
+                                      int nOutputs, size_t *outputSizes) {
+    return ane_compile(milText, weightData, nInputs, inputSizes, nOutputs, outputSizes, NULL);
+}
+
+// Load weights data into the kernel's weights surface.
+// Returns 0 on success, -1 on failure.
+static int ane_load_weights(ANEKernel *k, const void *data, size_t bytes) {
+    if (!k || !k->weightsSurface) {
+        fprintf(stderr, "ane_load_weights: kernel has no weights surface\n");
+        return -1;
+    }
+    
+    size_t surfaceSize = IOSurfaceGetAllocSize(k->weightsSurface);
+    if (bytes > surfaceSize) {
+        fprintf(stderr, "ane_load_weights: data size %zu exceeds surface size %zu\n",
+                bytes, surfaceSize);
+        return -1;
+    }
+    
+    IOSurfaceLock(k->weightsSurface, 0, NULL);
+    memcpy(IOSurfaceGetBaseAddress(k->weightsSurface), data, bytes);
+    IOSurfaceUnlock(k->weightsSurface, 0, NULL);
+    
+    return 0;
+}
+
+// Get pointer to weights surface for direct writing.
+// Caller MUST call ane_weights_unlock after writing.
+static void *ane_weights_lock(ANEKernel *k) {
+    if (!k || !k->weightsSurface) return NULL;
+    IOSurfaceLock(k->weightsSurface, 0, NULL);
+    return IOSurfaceGetBaseAddress(k->weightsSurface);
+}
+
+static void ane_weights_unlock(ANEKernel *k) {
+    if (!k || !k->weightsSurface) return;
+    IOSurfaceUnlock(k->weightsSurface, 0, NULL);
+}
+
 static void ane_write_input(ANEKernel *k, int idx, const void *data, size_t bytes) {
     IOSurfaceLock(k->ioInputs[idx], 0, NULL);
     memcpy(IOSurfaceGetBaseAddress(k->ioInputs[idx]), data, bytes);
@@ -141,9 +233,15 @@ static void ane_read_output(ANEKernel *k, int idx, void *data, size_t bytes) {
 
 static bool ane_eval(ANEKernel *k) {
     NSError *e = nil;
-    return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+    BOOL result = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
         k->model, @selector(evaluateWithQoS:options:request:error:),
         21, @{}, k->request, &e);
+    
+    if (!result && e) {
+        fprintf(stderr, "ANE evaluation failed: %s\n", [[e localizedDescription] UTF8String]);
+    }
+    
+    return result;
 }
 
 static void ane_free(ANEKernel *k) {
@@ -153,6 +251,7 @@ static void ane_free(ANEKernel *k) {
         k->model, @selector(unloadWithQoS:error:), 21, &e);
     for (int i = 0; i < k->nInputs; i++) CFRelease(k->ioInputs[i]);
     for (int i = 0; i < k->nOutputs; i++) CFRelease(k->ioOutputs[i]);
+    if (k->weightsSurface) CFRelease(k->weightsSurface);
     [[NSFileManager defaultManager] removeItemAtPath:k->tmpDir error:nil];
     free(k->ioInputs); free(k->ioOutputs);
     free(k->inputBytes); free(k->outputBytes);
diff --git a/training/m5_performance_suite.m b/training/m5_performance_suite.m