|
8 | 8 |
|
// Timebase for converting mach_absolute_time ticks; assumed to be filled in
// via mach_timebase_info() during startup elsewhere in this file — confirm.
static mach_timebase_info_data_t g_tb;

// Milliseconds represented by `t` raw mach ticks.
// ticks -> ns is t * numer / denom; ns -> ms divides by 1e6.
static double ticksToMs(uint64_t t) {
    double nanoseconds = (double)t * g_tb.numer / g_tb.denom;
    return nanoseconds / 1e6;
}

static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly
11 | 12 |
|
12 | 13 | NSData *buildWeightBlob(int ch, int depth) { |
13 | 14 | NSUInteger wsize = ch * ch * 2; |
|
27 | 28 |
|
// Generates the textual MIL (Model Intermediate Language) program for a chain
// of `depth` 1x1 conv layers over a [1, ch, 1, sp] activation, with fp16
// weights streamed from @model_path/weights/weight.bin.
// Two I/O flavors, selected by the global g_fp16_io:
//   - fp32 I/O (default): casts to fp16 on entry and back to fp32 on exit
//     (requires the `cast` op; M4+ native path)
//   - fp16 I/O (M1/M2 fallback): no cast ops at all; a final `identity`
//     merely binds the last conv result to the output name.
// Returns the full program text (NSMutableString, returned as NSString *).
NSString *genMIL(int ch, int sp, int depth) {
    NSMutableString *m = [NSMutableString string];
    // Program header: MIL spec version 1.0 with a minimal buildInfo dict.
    [m appendString:@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"];
    if (g_fp16_io) {
        // fp16 I/O path — no cast ops (M1/M2 compatible)
        [m appendFormat:@" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", ch, sp];
    } else {
        // fp32 I/O path — cast to/from fp16 internally (M4+ native)
        [m appendFormat:@" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n", ch, sp];
    }
    // Conv parameters shared by every layer: 1x1 kernel semantics — stride 1,
    // "valid" (zero) padding, dilation 1, a single group.
    [m appendString:
     @" tensor<string, []> c_pad_type_0 = const()[name = tensor<string, []>(\"c_pad_type_0\"), val = tensor<string, []>(\"valid\")];\n"
     @" tensor<int32, [2]> c_strides_0 = const()[name = tensor<string, []>(\"c_strides_0\"), val = tensor<int32, [2]>([1, 1])];\n"
     @" tensor<int32, [4]> c_pad_0 = const()[name = tensor<string, []>(\"c_pad_0\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
     @" tensor<int32, [2]> c_dilations_0 = const()[name = tensor<string, []>(\"c_dilations_0\"), val = tensor<int32, [2]>([1, 1])];\n"
     @" tensor<int32, []> c_groups_0 = const()[name = tensor<string, []>(\"c_groups_0\"), val = tensor<int32, []>(1)];\n"];
    // `prev` tracks the SSA name feeding the next conv layer.
    NSString *prev;
    if (g_fp16_io) {
        prev = @"x";
    } else {
        // Entry cast: fp32 input -> fp16 for the conv chain.
        [m appendString:@" tensor<string, []> x_to_fp16_dtype_0 = const()[name = tensor<string, []>(\"x_to_fp16_dtype_0\"), val = tensor<string, []>(\"fp16\")];\n"];
        [m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> x_to_fp16 = cast(dtype = x_to_fp16_dtype_0, x = x)[name = tensor<string, []>(\"cast_in\")];\n", ch, sp];
        prev = @"x_to_fp16";
    }
    // Per-layer stride inside the weight blob: ch*ch fp16 values (2 bytes
    // each) plus 64 bytes — presumably a per-section header/alignment; must
    // match the layout written by buildWeightBlob (TODO confirm).
    NSUInteger cs = 64 + ch*ch*2;
    for (int i = 0; i < depth; i++) {
        // W<i>: [ch, ch, 1, 1] fp16 weight tensor loaded lazily from the
        // external blob at offset 64 + i*cs.
        [m appendFormat:@" tensor<fp16, [%d, %d, 1, 1]> W%d = const()[name = tensor<string, []>(\"W%d\"), val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n",
         ch, ch, i, i, ch, ch, (unsigned long)(64 + i*cs)];
        NSString *out = [NSString stringWithFormat:@"c%d", i];
        // c<i> = conv(c<i-1>): shape is preserved ([1, ch, 1, sp]) since the
        // kernel is 1x1 with stride 1 and no padding.
        [m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> %@ = conv(dilations = c_dilations_0, groups = c_groups_0, pad = c_pad_0, pad_type = c_pad_type_0, strides = c_strides_0, weight = W%d, x = %@)[name = tensor<string, []>(\"%@\")];\n",
         ch, sp, out, i, prev, out];
        prev = out;
    }
    if (g_fp16_io) {
        // No cast op available on this path: identity just binds the final
        // conv output to the program output `c`.
        [m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> c = identity(x = %@)[name = tensor<string, []>(\"out\")];\n", ch, sp, prev];
        [m appendString:@" } -> (c);\n}\n"];
    } else {
        // Exit cast: fp16 chain result -> fp32 program output.
        [m appendString:@" tensor<string, []> to_fp32 = const()[name = tensor<string, []>(\"to_fp32\"), val = tensor<string, []>(\"fp32\")];\n"];
        [m appendFormat:@" tensor<fp32, [1, %d, 1, %d]> c = cast(dtype = to_fp32, x = %@)[name = tensor<string, []>(\"cast_out\")];\n", ch, sp, prev];
        [m appendString:@" } -> (c);\n}\n"];
    }
    return m;
}
54 | 72 |
|
|
68 | 86 | [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; |
69 | 87 | [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; |
70 | 88 | [wb writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; |
71 | | - if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(compileWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -3;} |
| 89 | + if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(compileWithQoS:options:error:),21,@{},&e)){ |
| 90 | + [fm removeItemAtPath:td error:nil]; |
| 91 | + if (!g_fp16_io) { |
| 92 | + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); |
| 93 | + g_fp16_io = 1; |
| 94 | + return bench(ch, sp, depth); |
| 95 | + } |
| 96 | + return -3; |
| 97 | + } |
72 | 98 | if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(loadWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -4;} |
73 | | - NSUInteger bytes=ch*sp*4; |
| 99 | + size_t bpe = g_fp16_io ? 2 : 4; |
| 100 | + NSUInteger bytes=ch*sp*bpe; |
74 | 101 | IOSurfaceRef ioI=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); |
75 | 102 | IOSurfaceRef ioO=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); |
76 | 103 | id wI=((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO,@selector(objectWithIOSurface:),ioI); |
|
0 commit comments