From 727272e57fb2a19d882d1e8b857c1bcb7576416e Mon Sep 17 00:00:00 2001 From: gulugulubing <413153391@qq.com> Date: Wed, 1 Apr 2026 17:15:39 -0600 Subject: [PATCH 1/6] adding drivers --- source/dcompute/driver/metal/bindings.d | 99 +++++++++++++++++++ source/dcompute/driver/metal/buffer.d | 43 +++++++++ source/dcompute/driver/metal/device.d | 35 +++++++ source/dcompute/driver/metal/encoder.d | 43 +++++++++ source/dcompute/driver/metal/package.d | 9 ++ source/dcompute/driver/metal/platform.d | 15 +++ source/dcompute/driver/metal/program.d | 92 ++++++++++++++++++ source/dcompute/driver/metal/queue.d | 121 ++++++++++++++++++++++++ source/dcompute/tests/add.metal | 11 +++ source/dcompute/tests/metal_test.d | 70 ++++++++++++++ 10 files changed, 538 insertions(+) create mode 100644 source/dcompute/driver/metal/bindings.d create mode 100644 source/dcompute/driver/metal/buffer.d create mode 100644 source/dcompute/driver/metal/device.d create mode 100644 source/dcompute/driver/metal/encoder.d create mode 100644 source/dcompute/driver/metal/package.d create mode 100644 source/dcompute/driver/metal/platform.d create mode 100644 source/dcompute/driver/metal/program.d create mode 100644 source/dcompute/driver/metal/queue.d create mode 100644 source/dcompute/tests/add.metal create mode 100644 source/dcompute/tests/metal_test.d diff --git a/source/dcompute/driver/metal/bindings.d b/source/dcompute/driver/metal/bindings.d new file mode 100644 index 0000000..2ce90f9 --- /dev/null +++ b/source/dcompute/driver/metal/bindings.d @@ -0,0 +1,99 @@ +module dcompute.driver.metal.bindings; + +import core.attribute : selector; + +alias NSUInteger = ulong; + +enum MTLResourceOptions : NSUInteger { + storageModeShared = 0, +} + +struct MTLSize { + NSUInteger width; + NSUInteger height; + NSUInteger depth; +} + +extern(Objective-C) interface NSObject { + void release(); +} + +extern(Objective-C) interface NSAutoreleasePool : NSObject { + void drain(); +} + +extern(Objective-C) interface NSString : NSObject {} + +extern(Objective-C) interface MTLBuffer : NSObject { + void* contents(); +} + +extern(Objective-C) interface MTLFunction : NSObject {} + +extern(Objective-C) interface MTLLibrary : NSObject { + @selector("newFunctionWithName:") + MTLFunction newFunctionWithName(NSString name); +} + +extern(Objective-C) interface MTLComputePipelineState : NSObject {} + +extern(Objective-C) interface MTLComputeCommandEncoder : NSObject { + @selector("setComputePipelineState:") + void setComputePipelineState(MTLComputePipelineState state); + + @selector("setBuffer:offset:atIndex:") + void setBuffer(MTLBuffer buffer, NSUInteger offset, NSUInteger index); + + @selector("dispatchThreads:threadsPerThreadgroup:") + void dispatchThreads(MTLSize threadsPerGrid, MTLSize threadsPerThreadgroup); + + void endEncoding(); +} + +extern(Objective-C) interface MTLCommandBuffer : NSObject { + MTLComputeCommandEncoder computeCommandEncoder(); + void commit(); + void waitUntilCompleted(); +} + +extern(Objective-C) interface MTLCommandQueue : NSObject { + MTLCommandBuffer commandBuffer(); +} + +extern(Objective-C) interface MTLDevice : NSObject { + MTLCommandQueue newCommandQueue(); + + @selector("newLibraryWithFile:error:") + MTLLibrary newLibraryWithFile(NSString filePath, void* error); + + @selector("newComputePipelineStateWithFunction:error:") + MTLComputePipelineState newComputePipelineStateWithFunction(MTLFunction function_, void* error); + + @selector("newBufferWithBytes:length:options:") + MTLBuffer newBufferWithBytes(const(void)* bytes, NSUInteger length, MTLResourceOptions options); + + @selector("newBufferWithLength:options:") + MTLBuffer newBufferWithLength(NSUInteger length, MTLResourceOptions options); +} + +extern(C) { + alias CFAllocatorRef = const(void)*; + alias CFStringEncoding = uint; + + pragma(mangle, "MTLCreateSystemDefaultDevice") + MTLDevice mtlCreateSystemDefaultDevice(); + + void* CFStringCreateWithCString(CFAllocatorRef alloc, const(char)* cStr, CFStringEncoding encoding); +} + +enum kCFStringEncodingUTF8 = 0x08000100; + +MTLSize mtlSize(NSUInteger w, NSUInteger h = 1, NSUInteger d = 1) { + return MTLSize(w, h, d); +} + +NSString nsString(const(char)[] text) { + auto cfStr = CFStringCreateWithCString(null, (text ~ "\0").ptr, kCFStringEncodingUTF8); + return cast(NSString) cfStr; +} + diff --git a/source/dcompute/driver/metal/buffer.d b/source/dcompute/driver/metal/buffer.d new file mode 100644 index 0000000..d9f0636 --- /dev/null +++ b/source/dcompute/driver/metal/buffer.d @@ -0,0 +1,43 @@ +module dcompute.driver.metal.buffer; + +import dcompute.driver.metal.device; +import dcompute.driver.metal.bindings; + +// No need for host<->device copy calls, since Metal buffers can be shared between CPU and GPU. +// Just create the buffer with storageModeShared and use contents() to get a pointer to the data for both host and device access. +struct Buffer(T) { + + // Store as void* to avoid emitting RTTI/classinfo for extern(Objective-C) interfaces. + private void* raw_; + + this(size_t elems) { + auto dev = defaultDevice().raw; + auto byteCount = cast(NSUInteger)(elems * T.sizeof); + raw = dev.newBufferWithLength(byteCount, MTLResourceOptions.storageModeShared); + } + + this(const(T)[] arr) { + auto dev = defaultDevice().raw; + auto byteCount = cast(NSUInteger)(arr.length * T.sizeof); + raw = dev.newBufferWithBytes(arr.ptr, byteCount, MTLResourceOptions.storageModeShared); + } + + @property MTLBuffer raw() { + return cast(MTLBuffer) raw_; + } + + @property void raw(MTLBuffer buf) { + raw_ = cast(void*) buf; + } + + void* contents() { + auto buf = raw(); + return (buf is null) ? null : buf.contents(); + } + + void release() { + auto buf = raw(); + if (buf !is null) buf.release(); + raw_ = null; + } +} diff --git a/source/dcompute/driver/metal/device.d b/source/dcompute/driver/metal/device.d new file mode 100644 index 0000000..0f7f1e8 --- /dev/null +++ b/source/dcompute/driver/metal/device.d @@ -0,0 +1,35 @@ +module dcompute.driver.metal.device; + +import dcompute.driver.metal.bindings; + +struct Device { + private void* raw_; + + @property MTLDevice raw() { + return cast(MTLDevice) raw_; + } + + @property void raw(MTLDevice device) { + raw_ = cast(void*) device; + } + + void release() { + auto dev = raw(); + if (dev !is null) dev.release(); + } +} + +__gshared void* g_defaultDevice; + +Device defaultDevice() { + Device dev; + if (g_defaultDevice is null) { + g_defaultDevice = cast(void*) mtlCreateSystemDefaultDevice(); + } + dev.raw = cast(MTLDevice) g_defaultDevice; + return dev; +} + +void setDefaultDevice(MTLDevice device) { + g_defaultDevice = cast(void*) device; +} diff --git a/source/dcompute/driver/metal/encoder.d b/source/dcompute/driver/metal/encoder.d new file mode 100644 index 0000000..e799b0b --- /dev/null +++ b/source/dcompute/driver/metal/encoder.d @@ -0,0 +1,43 @@ +module dcompute.driver.metal.encoder; + +import dcompute.driver.metal.buffer; +import dcompute.driver.metal.program; +import dcompute.driver.metal.bindings; + +struct Encoder { + private void* raw_; + + @property MTLComputeCommandEncoder raw() { + return cast(MTLComputeCommandEncoder) raw_; + } + + @property void raw(MTLComputeCommandEncoder enc) { + raw_ = cast(void*) enc; + } + + void setPipeline(Pipeline pipeline) { + auto enc = raw(); + if (enc !is null) enc.setComputePipelineState(pipeline.raw); + } + + void setBuffer(T)(Buffer!T buffer, NSUInteger offset, NSUInteger index) { + auto enc = raw(); + if (enc !is null) enc.setBuffer(buffer.raw, offset, index); + } + + void dispatchThreads(MTLSize threadsPerGrid, MTLSize threadsPerThreadgroup) { + auto enc = raw(); + if (enc !is null) enc.dispatchThreads(threadsPerGrid, threadsPerThreadgroup); + } + + void endEncoding() { + auto enc = raw(); + if (enc !is null) enc.endEncoding(); + } + + void release() { + auto enc = raw(); + if (enc !is null) enc.release(); + raw_ = null; + } +} diff --git a/source/dcompute/driver/metal/package.d b/source/dcompute/driver/metal/package.d new file mode 100644 index 0000000..b6903ee --- /dev/null +++ b/source/dcompute/driver/metal/package.d @@ -0,0 +1,9 @@ +module dcompute.driver.metal; + +public import dcompute.driver.metal.bindings : mtlSize; +public import dcompute.driver.metal.device; +public import dcompute.driver.metal.buffer; +public import dcompute.driver.metal.platform; +public import dcompute.driver.metal.queue; +public import dcompute.driver.metal.program; +public import dcompute.driver.metal.encoder; diff --git a/source/dcompute/driver/metal/platform.d b/source/dcompute/driver/metal/platform.d new file mode 100644 index 0000000..43383c8 --- /dev/null +++ b/source/dcompute/driver/metal/platform.d @@ -0,0 +1,15 @@ +module dcompute.driver.metal.platform; + +import dcompute.driver.metal.device; + +struct Platform { + static void initialise() { + // Metal has no explicit global init step in this backend. + } + + static Device[] getDevices() { + // Current implementation exposes only the default device. + // TODO: Enumerate all Metal devices (for multi-GPU systems). + return [defaultDevice()]; + } +} diff --git a/source/dcompute/driver/metal/program.d b/source/dcompute/driver/metal/program.d new file mode 100644 index 0000000..730f6e7 --- /dev/null +++ b/source/dcompute/driver/metal/program.d @@ -0,0 +1,92 @@ +module dcompute.driver.metal.program; + +import dcompute.driver.metal.device; +import dcompute.driver.metal.bindings; + +struct Library { + private void* raw_; + + @property MTLLibrary raw() { + return cast(MTLLibrary) raw_; + } + + @property void raw(MTLLibrary lib) { + raw_ = cast(void*) lib; + } + + void release() { + auto lib = raw(); + if (lib !is null) lib.release(); + raw_ = null; + } + + MTLFunction newFunction(const(char)[] name) { + auto lib = raw(); + return (lib is null) ? null : lib.newFunctionWithName(nsString(name)); + } +} + +struct Pipeline { + private void* raw_; + + @property MTLComputePipelineState raw() { + return cast(MTLComputePipelineState) raw_; + } + + @property void raw(MTLComputePipelineState pso) { + raw_ = cast(void*) pso; + } + + void release() { + auto pso = raw(); + if (pso !is null) pso.release(); + raw_ = null; + } +} + +struct Kernel { + Pipeline pipeline; + + void release() { + pipeline.release(); + } +} + +struct Program { + Device device; + Library library; + + static Program fromDefaultDevice() { + Program p; + p.device = defaultDevice(); + return p; + } + + Library loadLibrary(const(char)[] path) { + auto dev = device.raw; + library.raw = (dev is null) ? null : dev.newLibraryWithFile(nsString(path), null); + return library; + } + + Pipeline makePipeline(MTLFunction fn) { + Pipeline pso; + auto dev = device.raw; + pso.raw = (dev is null) ? null : dev.newComputePipelineStateWithFunction(fn, null); + return pso; + } + + Kernel getKernel(const(char)[] name) { + Kernel k; + auto lib = library.raw; + if (lib is null) return k; + auto fn = lib.newFunctionWithName(nsString(name)); + if (fn is null) return k; + k.pipeline = makePipeline(fn); + fn.release(); + return k; + } + + void release() { + library.release(); + } +} diff --git a/source/dcompute/driver/metal/queue.d b/source/dcompute/driver/metal/queue.d new file mode 100644 index 0000000..18310a1 --- /dev/null +++ b/source/dcompute/driver/metal/queue.d @@ -0,0 +1,121 @@ +module dcompute.driver.metal.queue; + +import dcompute.driver.metal.device; +import dcompute.driver.metal.encoder; +import dcompute.driver.metal.program; +import dcompute.driver.metal.bindings; +import dcompute.driver.metal.buffer; +import std.meta : allSatisfy; + +private enum isBuffer(T) = is(T == Buffer!U, U); + +struct CommandBuffer { + private void* raw_; + + @property MTLCommandBuffer raw() { + return cast(MTLCommandBuffer) raw_; + } + + @property void raw(MTLCommandBuffer buf) { + raw_ = cast(void*) buf; + } + + void commit() { + auto buf = raw(); + if (buf !is null) buf.commit(); + } + + void waitUntilCompleted() { + auto buf = raw(); + if (buf !is null) buf.waitUntilCompleted(); + } + + Encoder computeEncoder() { + Encoder enc; + auto buf = raw(); + enc.raw = (buf is null) ? null : buf.computeCommandEncoder(); + return enc; + } + + void release() { + auto buf = raw(); + if (buf !is null) buf.release(); + raw_ = null; + } +} + +struct Queue { + private void* raw_; + + this(Device dev) { + auto q = dev.raw.newCommandQueue(); + raw_ = cast(void*) q; + } + + @property MTLCommandQueue raw() { + return cast(MTLCommandQueue) raw_; + } + + @property void raw(MTLCommandQueue q) { + raw_ = cast(void*) q; + } + + CommandBuffer commandBuffer() { + CommandBuffer cb; + auto q = raw(); + cb.raw = (q is null) ? null : q.commandBuffer(); + return cb; + } + + void release() { + auto q = raw(); + if (q !is null) q.release(); + raw_ = null; + } + + void enqueue(Buffers...)(Pipeline pipeline, MTLSize grid, MTLSize group, Buffers buffers) + if (allSatisfy!(isBuffer, Buffers)) { + auto cmdBuffer = commandBuffer(); + if (cmdBuffer.raw is null) return; + auto encoder = cmdBuffer.computeEncoder(); + if (encoder.raw is null) { + cmdBuffer.release(); + return; + } + scope(exit) cmdBuffer.release(); + scope(exit) encoder.release(); + + encoder.setPipeline(pipeline); + foreach (i, ref buf; buffers) { + encoder.setBuffer(buf, 0, cast(NSUInteger) i); + } + encoder.dispatchThreads(grid, group); + encoder.endEncoding(); + cmdBuffer.commit(); + cmdBuffer.waitUntilCompleted(); + } + + + auto enqueue(Kernel kernel, MTLSize grid, MTLSize group) { + static struct Launch { + Queue q; + Kernel k; + MTLSize grid; + MTLSize group; + + this(Queue q, Kernel k, MTLSize grid, MTLSize group) { + this.q = q; + this.k = k; + this.grid = grid; + this.group = group; + } + + void opCall(Buffers...)(Buffers buffers) + if (allSatisfy!(isBuffer, Buffers)) { + q.enqueue(k.pipeline, grid, group, buffers); + } + } + + return Launch(this, kernel, grid, group); + } +} diff --git a/source/dcompute/tests/add.metal b/source/dcompute/tests/add.metal new file mode 100644 index 0000000..1579305 --- /dev/null +++ b/source/dcompute/tests/add.metal @@ -0,0 +1,11 @@ +#include +using namespace metal; + +kernel void vec_add( + device const float* a [[buffer(0)]], + device const float* b [[buffer(1)]], + device float* out [[buffer(2)]], + uint id [[thread_position_in_grid]]) +{ + out[id] = a[id] + b[id]; +} diff --git a/source/dcompute/tests/metal_test.d b/source/dcompute/tests/metal_test.d new file mode 100644 index 0000000..6687c81 --- /dev/null +++ b/source/dcompute/tests/metal_test.d @@ -0,0 +1,70 @@ +module dcompute.tests.metal_test; + +import std.math : fabs; +import std.stdio : writeln; +import dcompute.driver.metal; + +private int fail(const(char)[] msg) { + writeln(msg); + return 1; +} + +int main() { + enum n = 1024; + float[n] hostA; + float[n] hostB; + foreach (i; 0 .. n) { + hostA[i] = cast(float) i; + hostB[i] = cast(float) (i * 2); + } + + Platform.initialise(); + auto devices = Platform.getDevices(); + auto dev = devices[0]; + auto device = dev.raw; + if (device is null) return fail("FAIL: MTLCreateSystemDefaultDevice returned null."); + scope(exit) dev.release(); + + auto queue = Queue(dev); + if (queue.raw is null) return fail("FAIL: newCommandQueue returned null."); + scope(exit) queue.release(); + + auto program = Program.fromDefaultDevice(); + auto library = program.loadLibrary("add.metallib"); + if (library.raw is null) return fail("FAIL: cannot load add.metallib"); + scope(exit) program.release(); + + auto kernel = program.getKernel("vec_add"); + if (kernel.pipeline.raw is null) return fail("FAIL: newComputePipelineStateWithFunction failed."); + scope(exit) kernel.release(); + + Buffer!float aBuffer = Buffer!float(hostA[]); + Buffer!float bBuffer = Buffer!float(hostB[]); + Buffer!float outBuffer = Buffer!float(n); + if (aBuffer.raw is null || bBuffer.raw is null || outBuffer.raw is null) { + return fail("FAIL: buffer allocation failed."); + } + scope(exit) aBuffer.release(); + scope(exit) bBuffer.release(); + scope(exit) outBuffer.release(); + + queue.enqueue(kernel, mtlSize(n, 1, 1), mtlSize(64, 1, 1)) + (aBuffer, bBuffer, outBuffer); + + auto outPtr = cast(float*) outBuffer.contents(); + if (outPtr is null) return fail("FAIL: output buffer has null contents."); + + bool ok = true; + foreach (i; 0 .. n) { + auto expected = hostA[i] + hostB[i]; + if (fabs(outPtr[i] - expected) > 1e-6f) { + ok = false; + writeln("Mismatch at ", i, ": got=", outPtr[i], " expected=", expected); + break; + } + } + + writeln("Sample output: out[0]=", outPtr[0], ", out[1]=", outPtr[1], ", out[2]=", outPtr[2]); + writeln(ok ? "PASS: Metal vector add succeeded." : "FAIL: Metal vector add validation failed."); + return ok ? 0 : 2; +} From 8d4cb24c574c3ea5ffcba3d75d82026d6605c776 Mon Sep 17 00:00:00 2001 From: gulugulubing <413153391@qq.com> Date: Thu, 16 Apr 2026 10:31:39 -0600 Subject: [PATCH 2/6] add metal's index --- source/dcompute/std/index.d | 42 ++++++++++++++++++++++++++++--- source/dcompute/std/metal/index.d | 34 +++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 3 deletions(-) create mode 100644 source/dcompute/std/metal/index.d diff --git a/source/dcompute/std/index.d b/source/dcompute/std/index.d index 60abf6c..6ce73d3 100644 --- a/source/dcompute/std/index.d +++ b/source/dcompute/std/index.d @@ -2,8 +2,9 @@ import ldc.dcompute; -private import ocl = dcompute.std.opencl.index; -private import cuda = dcompute.std.cuda.index; +private import ocl = dcompute.std.opencl.index; +private import cuda = dcompute.std.cuda.index; +private import metal = dcompute.std.metal.index; /* Index Terminology @@ -46,6 +47,8 @@ struct GlobalDimension return ocl.get_global_size(0); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ntid_x()*cuda.nctaid_x(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.get_global_size_x(); else assert(0); } @@ -56,6 +59,8 @@ struct GlobalDimension return ocl.get_global_size(1); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ntid_y()*cuda.nctaid_y(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.get_global_size_y(); else assert(0); } @@ -66,6 +71,8 @@ struct GlobalDimension return ocl.get_global_size(2); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ntid_z()*cuda.nctaid_z(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.get_global_size_z(); else assert(0); } @@ -80,6 +87,8 @@ struct GlobalIndex return ocl.get_global_id(0); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ctaid_x()*cuda.ntid_x() + cuda.tid_x(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.get_global_id_x(); else assert(0); } @@ -90,6 +99,8 @@ struct GlobalIndex return ocl.get_global_id(1); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ctaid_y()*cuda.ntid_y() + cuda.tid_y(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.get_global_id_y(); else assert(0); } @@ -100,6 +111,8 @@ struct GlobalIndex return ocl.get_global_id(2); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ctaid_z()*cuda.ntid_z() + cuda.tid_z(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.get_global_id_z(); else assert(0); } @@ -139,6 +152,8 @@ struct GroupDimension return ocl.get_num_groups(0); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.nctaid_x(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.get_num_groups_x(); else assert(0); } @@ -149,6 +164,8 @@ struct GroupDimension return ocl.get_num_groups(1); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.nctaid_y(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.get_num_groups_y(); else assert(0); } @@ -159,6 +176,8 @@ struct GroupDimension return ocl.get_num_groups(2); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.nctaid_z(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.get_num_groups_z(); else assert(0); } @@ -173,6 +192,8 @@ struct GroupIndex return ocl.get_group_id(0); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ctaid_x(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.get_group_id_x(); else assert(0); } @@ -183,6 +204,8 @@ struct GroupIndex return ocl.get_group_id(1); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ctaid_y(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.get_group_id_y(); else assert(0); } @@ -193,6 +216,8 @@ struct GroupIndex return ocl.get_group_id(2); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ctaid_z(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.get_group_id_z(); else assert(0); } @@ -207,6 +232,8 @@ struct SharedDimension return ocl.get_local_size(0); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ntid_x(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.get_local_size_x(); else assert(0); } @@ -217,9 +244,10 @@ struct SharedDimension return ocl.get_local_size(1); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ntid_y(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.get_local_size_y(); else assert(0); - } pragma(inline,true); @property static size_t z()() @@ -228,6 +256,8 @@ struct SharedDimension return ocl.get_local_size(2); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ntid_z(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.get_local_size_z(); else assert(0); } @@ -242,6 +272,8 @@ struct SharedIndex return ocl.get_local_id(0); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.tid_x(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.get_local_id_x(); else assert(0); } @@ -252,6 +284,8 @@ struct SharedIndex return ocl.get_local_id(1); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.tid_y(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.get_local_id_y(); else assert(0); } @@ -262,6 +296,8 @@ struct SharedIndex return ocl.get_local_id(2); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.tid_z(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.get_local_id_z(); else assert(0); } diff --git a/source/dcompute/std/metal/index.d b/source/dcompute/std/metal/index.d new file mode 100644 index 0000000..fbf4644 --- /dev/null +++ b/source/dcompute/std/metal/index.d @@ -0,0 +1,34 @@ +@compute(CompileFor.deviceOnly) module dcompute.std.metal.index; + +import ldc.dcompute; +pure: nothrow: @nogc: + +// thread_position_in_grid → GlobalIndex +pragma(mangle, "air.thread_position_in_grid.x") uint get_global_id_x(); +pragma(mangle, "air.thread_position_in_grid.y") uint get_global_id_y(); +pragma(mangle, "air.thread_position_in_grid.z") uint get_global_id_z(); + +// thread_position_in_threadgroup → SharedIndex +pragma(mangle, "air.thread_position_in_threadgroup.x") uint get_local_id_x(); +pragma(mangle, "air.thread_position_in_threadgroup.y") uint get_local_id_y(); +pragma(mangle, "air.thread_position_in_threadgroup.z") uint get_local_id_z(); + +// threadgroup_position_in_grid → GroupIndex +pragma(mangle, "air.threadgroup_position_in_grid.x") uint get_group_id_x(); +pragma(mangle, "air.threadgroup_position_in_grid.y") uint get_group_id_y(); +pragma(mangle, "air.threadgroup_position_in_grid.z") uint get_group_id_z(); + +// threads_per_grid → GlobalDimension +pragma(mangle, "air.threads_per_grid.x") uint get_global_size_x(); +pragma(mangle, "air.threads_per_grid.y") uint get_global_size_y(); +pragma(mangle, "air.threads_per_grid.z") uint get_global_size_z(); + +// threads_per_threadgroup → SharedDimension +pragma(mangle, "air.threads_per_threadgroup.x") uint get_local_size_x(); +pragma(mangle, "air.threads_per_threadgroup.y") uint get_local_size_y(); +pragma(mangle, "air.threads_per_threadgroup.z") uint get_local_size_z(); + +// threadgroups_per_grid → GroupDimension +pragma(mangle, "air.threadgroups_per_grid.x") uint get_num_groups_x(); +pragma(mangle, "air.threadgroups_per_grid.y") uint get_num_groups_y(); +pragma(mangle, "air.threadgroups_per_grid.z") uint get_num_groups_z(); From 4ab05efb869502dabc51e2ff2dfb909ad83c28e2 Mon Sep 17 00:00:00 2001 From: gulugulubing <413153391@qq.com> Date: Thu, 7 May 2026 21:32:15 -0600 Subject: [PATCH 3/6] add threadIdx and sync --- source/dcompute/driver/metal/bindings.d | 3 +++ source/dcompute/driver/metal/encoder.d | 5 +++++ source/dcompute/std/sync.d | 3 +++ 3 files changed, 11 insertions(+) diff --git a/source/dcompute/driver/metal/bindings.d b/source/dcompute/driver/metal/bindings.d index 2ce90f9..776fe3d 100644 --- a/source/dcompute/driver/metal/bindings.d +++ b/source/dcompute/driver/metal/bindings.d @@ -44,6 +44,9 @@ extern(Objective-C) interface MTLComputeCommandEncoder : NSObject { @selector("setBuffer:offset:atIndex:") void setBuffer(MTLBuffer buffer, NSUInteger offset, NSUInteger index); + @selector("setThreadgroupMemoryLength:atIndex:") + void setThreadgroupMemoryLength(NSUInteger length, NSUInteger index); + @selector("dispatchThreads:threadsPerThreadgroup:") void dispatchThreads(MTLSize threadsPerGrid, MTLSize threadsPerThreadgroup); diff --git a/source/dcompute/driver/metal/encoder.d b/source/dcompute/driver/metal/encoder.d index e799b0b..cf49c59 100644 --- a/source/dcompute/driver/metal/encoder.d +++ b/source/dcompute/driver/metal/encoder.d @@ -25,6 +25,11 @@ struct Encoder { if (enc !is null) enc.setBuffer(buffer.raw, offset, index); } + void setThreadgroupMemoryLength(NSUInteger length, NSUInteger index) { + auto enc = raw(); + if (enc !is null) enc.setThreadgroupMemoryLength(length, index); + } + void dispatchThreads(MTLSize threadsPerGrid, MTLSize threadsPerThreadgroup) { auto enc = raw(); if (enc !is null) enc.dispatchThreads(threadsPerGrid, threadsPerThreadgroup); diff --git a/source/dcompute/std/sync.d b/source/dcompute/std/sync.d index 6e59bb4..71753cb 100644 --- a/source/dcompute/std/sync.d +++ b/source/dcompute/std/sync.d @@ -18,6 +18,9 @@ void barrier() cuda.barrier0(); } } + // Metal: use `dcompute.std.metal.sync.wg_barrier(0, 1)` — importing this + // module’s `barrier()` from a `@compute` kernel currently triggers + // dcompute semantic analysis on OCL/CUDA helpers (see LDC issue / follow-up). } void local_fence() From ef1dafb2528d857df22662aa1a81ff224a230c01 Mon Sep 17 00:00:00 2001 From: gulugulubing <413153391@qq.com> Date: Sun, 10 May 2026 10:00:03 -0600 Subject: [PATCH 4/6] add init sync and change index wrap style --- source/dcompute/std/metal/index.d | 36 +++++++++++++++---------------- source/dcompute/std/metal/sync.d | 28 ++++++++++++++++++++++++ source/dcompute/std/sync.d | 14 ++++++------ 3 files changed, 53 insertions(+), 25 deletions(-) create mode 100644 source/dcompute/std/metal/sync.d diff --git a/source/dcompute/std/metal/index.d b/source/dcompute/std/metal/index.d index fbf4644..9e7f8bb 100644 --- a/source/dcompute/std/metal/index.d +++ b/source/dcompute/std/metal/index.d @@ -4,31 +4,31 @@ import ldc.dcompute; pure: nothrow: @nogc: // thread_position_in_grid → GlobalIndex -pragma(mangle, "air.thread_position_in_grid.x") uint get_global_id_x(); -pragma(mangle, "air.thread_position_in_grid.y") uint get_global_id_y(); -pragma(mangle, "air.thread_position_in_grid.z") uint get_global_id_z(); +pragma(LDC_intrinsic, "air.thread_position_in_grid.x") uint get_global_id_x(); +pragma(LDC_intrinsic, "air.thread_position_in_grid.y") uint get_global_id_y(); +pragma(LDC_intrinsic, "air.thread_position_in_grid.z") uint get_global_id_z(); // thread_position_in_threadgroup → SharedIndex -pragma(mangle, "air.thread_position_in_threadgroup.x") uint get_local_id_x(); -pragma(mangle, "air.thread_position_in_threadgroup.y") uint get_local_id_y(); -pragma(mangle, "air.thread_position_in_threadgroup.z") uint get_local_id_z(); +pragma(LDC_intrinsic, "air.thread_position_in_threadgroup.x") uint get_local_id_x(); +pragma(LDC_intrinsic, "air.thread_position_in_threadgroup.y") uint get_local_id_y(); +pragma(LDC_intrinsic, "air.thread_position_in_threadgroup.z") uint get_local_id_z(); // threadgroup_position_in_grid → GroupIndex -pragma(mangle, "air.threadgroup_position_in_grid.x") uint get_group_id_x(); -pragma(mangle, "air.threadgroup_position_in_grid.y") uint get_group_id_y(); -pragma(mangle, "air.threadgroup_position_in_grid.z") uint get_group_id_z(); +pragma(LDC_intrinsic, "air.threadgroup_position_in_grid.x") uint get_group_id_x(); +pragma(LDC_intrinsic, "air.threadgroup_position_in_grid.y") uint get_group_id_y(); +pragma(LDC_intrinsic, "air.threadgroup_position_in_grid.z") uint get_group_id_z(); // threads_per_grid → GlobalDimension -pragma(mangle, "air.threads_per_grid.x") uint get_global_size_x(); -pragma(mangle, "air.threads_per_grid.y") uint get_global_size_y(); -pragma(mangle, "air.threads_per_grid.z") uint get_global_size_z(); +pragma(LDC_intrinsic, "air.threads_per_grid.x") uint get_global_size_x(); +pragma(LDC_intrinsic, "air.threads_per_grid.y") uint get_global_size_y(); +pragma(LDC_intrinsic, "air.threads_per_grid.z") uint get_global_size_z(); // threads_per_threadgroup → SharedDimension -pragma(mangle, "air.threads_per_threadgroup.x") uint get_local_size_x(); -pragma(mangle, "air.threads_per_threadgroup.y") uint get_local_size_y(); -pragma(mangle, "air.threads_per_threadgroup.z") uint get_local_size_z(); +pragma(LDC_intrinsic, "air.threads_per_threadgroup.x") uint get_local_size_x(); +pragma(LDC_intrinsic, "air.threads_per_threadgroup.y") uint get_local_size_y(); +pragma(LDC_intrinsic, "air.threads_per_threadgroup.z") uint get_local_size_z(); // threadgroups_per_grid → GroupDimension -pragma(mangle, "air.threadgroups_per_grid.x") uint get_num_groups_x(); -pragma(mangle, "air.threadgroups_per_grid.y") uint get_num_groups_y(); -pragma(mangle, "air.threadgroups_per_grid.z") uint get_num_groups_z(); +pragma(LDC_intrinsic, "air.threadgroups_per_grid.x") uint get_num_groups_x(); +pragma(LDC_intrinsic, "air.threadgroups_per_grid.y") uint get_num_groups_y(); +pragma(LDC_intrinsic, "air.threadgroups_per_grid.z") uint get_num_groups_z(); diff --git a/source/dcompute/std/metal/sync.d b/source/dcompute/std/metal/sync.d new file mode 100644 index 0000000..f8d4eba --- /dev/null +++ b/source/dcompute/std/metal/sync.d @@ -0,0 +1,28 @@ +@compute(CompileFor.deviceOnly) module dcompute.std.metal.sync; + +import ldc.dcompute; + +pure: nothrow: @nogc: + +alias mem_flags = uint; + +enum : mem_flags +{ + mem_none = 0, + mem_device = 1, + mem_threadgroup = 2, + mem_texture = 4, +} + +enum uint threadgroup_scope = 1; + +/// Threadgroup-wide execution barrier with optional memory fence flags. +/// Matches Julia Metal.jl `threadgroup_barrier(flag)` → `air.wg.barrier(flag, 1)`. +/// See Apple’s `MemoryFlags` / MSL `threadgroup_barrier`. +pragma(LDC_intrinsic, "air.wg.barrier") +void wg_barrier(uint mem_flags, uint execution_scope); + +void threadgroup_barrier()(mem_flags flags = mem_none) +{ + wg_barrier(flags, threadgroup_scope); +} diff --git a/source/dcompute/std/sync.d b/source/dcompute/std/sync.d index 71753cb..88d1ba6 100644 --- a/source/dcompute/std/sync.d +++ b/source/dcompute/std/sync.d @@ -3,24 +3,24 @@ import ldc.dcompute; import ldc.intrinsics; -import ocl = dcompute.std.opencl.sync; -import cuda = dcompute.std.cuda.sync; +import ocl = dcompute.std.opencl.sync; +import cuda = dcompute.std.cuda.sync; +import metal = dcompute.std.metal.sync; //suspends work-item execution until all work-items in the work-group have called the barrier -void barrier() +void barrier()() { if(__dcompute_reflect(ReflectTarget.OpenCL)) ocl.barrier(0); - if(__dcompute_reflect(ReflectTarget.CUDA)) { + else if(__dcompute_reflect(ReflectTarget.CUDA)) { static if (LLVM_atleast!21) { // >= LDC 1.42.0(LLVM 21) cuda.barrier_n(0); } else { cuda.barrier0(); } } - // Metal: use `dcompute.std.metal.sync.wg_barrier(0, 1)` — importing this - // module’s `barrier()` from a `@compute` kernel currently triggers - // dcompute semantic analysis on OCL/CUDA helpers (see LDC issue / follow-up). + else if(__dcompute_reflect(ReflectTarget.Metal)) + metal.threadgroup_barrier(metal.mem_none); } void local_fence() From 4b80239ef402248d32412c55f1795d84d069f27b Mon Sep 17 00:00:00 2001 From: gulugulubing <413153391@qq.com> Date: Sun, 10 May 2026 17:43:31 -0600 Subject: [PATCH 5/6] change index style --- source/dcompute/std/metal/index.d | 43 ++++++++++++++++++------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/source/dcompute/std/metal/index.d b/source/dcompute/std/metal/index.d index 9e7f8bb..ce6aec0 100644 --- a/source/dcompute/std/metal/index.d +++ b/source/dcompute/std/metal/index.d @@ -3,32 +3,39 @@ import ldc.dcompute; pure: nothrow: @nogc: +pragma(LDC_intrinsic, "air.get_global_id.i32") uint get_global_id(uint dim); +pragma(LDC_intrinsic, "air.get_local_id.i32") uint get_local_id(uint dim); +pragma(LDC_intrinsic, "air.get_group_id.i32") uint get_group_id(uint dim); +pragma(LDC_intrinsic, "air.get_global_size.i32") uint get_global_size(uint dim); +pragma(LDC_intrinsic, "air.get_local_size.i32") uint get_local_size(uint dim); +pragma(LDC_intrinsic, "air.get_num_groups.i32") uint get_num_groups(uint dim); + // thread_position_in_grid → GlobalIndex -pragma(LDC_intrinsic, "air.thread_position_in_grid.x") uint get_global_id_x(); -pragma(LDC_intrinsic, "air.thread_position_in_grid.y") uint get_global_id_y(); -pragma(LDC_intrinsic, "air.thread_position_in_grid.z") uint get_global_id_z(); +uint get_global_id_x()() { return get_global_id(0); } +uint get_global_id_y()() { return get_global_id(1); } +uint get_global_id_z()() { return get_global_id(2); } // thread_position_in_threadgroup → SharedIndex -pragma(LDC_intrinsic, "air.thread_position_in_threadgroup.x") uint get_local_id_x(); -pragma(LDC_intrinsic, "air.thread_position_in_threadgroup.y") uint get_local_id_y(); -pragma(LDC_intrinsic, "air.thread_position_in_threadgroup.z") uint get_local_id_z(); +uint get_local_id_x()() { return get_local_id(0); } +uint get_local_id_y()() { return get_local_id(1); } +uint get_local_id_z()() { return get_local_id(2); } // threadgroup_position_in_grid → GroupIndex -pragma(LDC_intrinsic, "air.threadgroup_position_in_grid.x") uint get_group_id_x(); -pragma(LDC_intrinsic, "air.threadgroup_position_in_grid.y") uint get_group_id_y(); -pragma(LDC_intrinsic, "air.threadgroup_position_in_grid.z") uint get_group_id_z(); +uint get_group_id_x()() { return get_group_id(0); } +uint get_group_id_y()() { return get_group_id(1); } +uint get_group_id_z()() { return get_group_id(2); } // threads_per_grid → GlobalDimension -pragma(LDC_intrinsic, "air.threads_per_grid.x") uint get_global_size_x(); -pragma(LDC_intrinsic, "air.threads_per_grid.y") uint get_global_size_y(); -pragma(LDC_intrinsic, "air.threads_per_grid.z") uint get_global_size_z(); +uint get_global_size_x()() { return get_global_size(0); } +uint get_global_size_y()() { return get_global_size(1); } +uint get_global_size_z()() { return get_global_size(2); } // threads_per_threadgroup → SharedDimension -pragma(LDC_intrinsic, "air.threads_per_threadgroup.x") uint get_local_size_x(); -pragma(LDC_intrinsic, "air.threads_per_threadgroup.y") uint get_local_size_y(); -pragma(LDC_intrinsic, "air.threads_per_threadgroup.z") uint get_local_size_z(); +uint get_local_size_x()() { return get_local_size(0); } +uint get_local_size_y()() { return get_local_size(1); } +uint get_local_size_z()() { return get_local_size(2); } // threadgroups_per_grid → GroupDimension -pragma(LDC_intrinsic, "air.threadgroups_per_grid.x") uint get_num_groups_x(); -pragma(LDC_intrinsic, "air.threadgroups_per_grid.y") uint get_num_groups_y(); -pragma(LDC_intrinsic, "air.threadgroups_per_grid.z") uint get_num_groups_z(); +uint get_num_groups_x()() { return get_num_groups(0); } +uint get_num_groups_y()() { return get_num_groups(1); } +uint get_num_groups_z()() { return get_num_groups(2); } From 169498836b4b251d95520a7f06043f43fb15d13f Mon Sep 17 00:00:00 2001 From: gulugulubing <413153391@qq.com> Date: Sun, 17 May 2026 12:20:32 -0600 Subject: [PATCH 6/6] support scalar parameter in kenerl and make enquee's api more flexible --- source/dcompute/driver/metal/bindings.d | 3 ++ source/dcompute/driver/metal/encoder.d | 5 +++ source/dcompute/driver/metal/queue.d | 44 ++++++++++++++++++++----- 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/source/dcompute/driver/metal/bindings.d b/source/dcompute/driver/metal/bindings.d index 776fe3d..5cde8de 100644 --- a/source/dcompute/driver/metal/bindings.d +++ b/source/dcompute/driver/metal/bindings.d @@ -44,6 +44,9 @@ extern(Objective-C) interface MTLComputeCommandEncoder : NSObject { @selector("setBuffer:offset:atIndex:") void setBuffer(MTLBuffer buffer, NSUInteger offset, NSUInteger index); + @selector("setBytes:length:atIndex:") + void setBytes(const(void)* bytes, NSUInteger length, NSUInteger index); + @selector("setThreadgroupMemoryLength:atIndex:") void setThreadgroupMemoryLength(NSUInteger length, NSUInteger index); diff --git a/source/dcompute/driver/metal/encoder.d b/source/dcompute/driver/metal/encoder.d index cf49c59..f7e1307 100644 --- a/source/dcompute/driver/metal/encoder.d +++ b/source/dcompute/driver/metal/encoder.d @@ -25,6 +25,11 @@ struct Encoder { if (enc !is null) enc.setBuffer(buffer.raw, offset, index); } + void setBytes(const(void)* bytes, NSUInteger length, NSUInteger index) { + auto enc = raw(); + if (enc !is null) enc.setBytes(bytes, length, index); + } + void setThreadgroupMemoryLength(NSUInteger length, NSUInteger index) { auto enc = raw(); if (enc !is null) enc.setThreadgroupMemoryLength(length, index); diff --git a/source/dcompute/driver/metal/queue.d b/source/dcompute/driver/metal/queue.d index 18310a1..cad271c 100644 --- a/source/dcompute/driver/metal/queue.d +++ b/source/dcompute/driver/metal/queue.d @@ -6,8 +6,21 @@ import dcompute.driver.metal.program; import dcompute.driver.metal.bindings; import dcompute.driver.metal.buffer; import std.meta : allSatisfy; +import std.traits : isNumeric, Unqual; -private enum isBuffer(T) = is(T == Buffer!U, U); +private enum isBuffer(T) = is(Unqual!T == Buffer!U, U); +private enum isThreadgroupMemory(T) = is(Unqual!T == ThreadgroupMemory); +private enum isScalarKernelArgument(T) = isNumeric!(Unqual!T); +private enum isKernelArgument(T) = + isBuffer!T || isThreadgroupMemory!T || isScalarKernelArgument!T; + +struct ThreadgroupMemory { + NSUInteger length; +} + +ThreadgroupMemory threadgroupMemory(NSUInteger length) { + return ThreadgroupMemory(length); +} struct CommandBuffer { private void* raw_; @@ -73,8 +86,8 @@ struct Queue { raw_ = null; } - void enqueue(Buffers...)(Pipeline pipeline, MTLSize grid, MTLSize group, Buffers buffers) - if (allSatisfy!(isBuffer, Buffers)) { + void enqueue(Args...)(Pipeline pipeline, MTLSize grid, MTLSize group, Args args) + if (allSatisfy!(isKernelArgument, Args)) { auto cmdBuffer = commandBuffer(); if (cmdBuffer.raw is null) return; auto encoder = cmdBuffer.computeEncoder(); @@ -86,8 +99,8 @@ struct Queue { scope(exit) encoder.release(); encoder.setPipeline(pipeline); - foreach (i, ref buf; buffers) { - encoder.setBuffer(buf, 0, cast(NSUInteger) i); + foreach (i, ref arg; args) { + encodeKernelArgument(encoder, arg, cast(NSUInteger) i); } encoder.dispatchThreads(grid, group); encoder.endEncoding(); @@ -110,12 +123,27 @@ struct Queue { this.group = group; } - void opCall(Buffers...)(Buffers buffers) - if (allSatisfy!(isBuffer, Buffers)) { - q.enqueue(k.pipeline, grid, group, buffers); + void opCall(Args...)(Args args) + if (allSatisfy!(isKernelArgument, Args)) { + q.enqueue(k.pipeline, grid, group, args); } } return Launch(this, kernel, grid, group); } } + +private void encodeKernelArgument(T)(ref Encoder encoder, ref T buffer, NSUInteger index) + if (isBuffer!T) { + encoder.setBuffer(buffer, 0, index); +} + +private void encodeKernelArgument(T)(ref Encoder encoder, ref T memory, NSUInteger index) + if (isThreadgroupMemory!T) { + encoder.setThreadgroupMemoryLength(memory.length, index); +} + +private void encodeKernelArgument(T)(ref Encoder encoder, ref T value, NSUInteger index) + if (isScalarKernelArgument!T) { + encoder.setBytes(&value, T.sizeof, index); +}