mlx/backend/metal/conv.cpp (5 changes: 3 additions & 2 deletions)

@@ -962,7 +962,9 @@ void depthwise_conv_2D_gpu(

  MTL::Size group_dims = MTL::Size(tc, tw, th);
  MTL::Size grid_dims = MTL::Size(
-      conv_params.C / tc, conv_params.oS[1] / tw, (conv_params.oS[0] / th) * N);
+      conv_params.C / tc,
+      (conv_params.oS[1] + tw - 1) / tw,
+      ((conv_params.oS[0] + th - 1) / th) * N);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}
@@ -986,7 +988,6 @@ void dispatch_conv_2D_gpu(
  if (C_per_group == 1 && O_per_group == 1 && is_kdil_one &&
      conv_params.wS[0] <= 7 && conv_params.wS[1] <= 7 &&
      conv_params.str[0] <= 2 && conv_params.str[1] <= 2 &&
-      conv_params.oS[0] % 8 == 0 && conv_params.oS[1] % 8 == 0 &&
      conv_params.wt_strides[1] == conv_params.wS[1] &&
      conv_params.C % 16 == 0 && conv_params.C == conv_params.O) {
    return depthwise_conv_2D_gpu(s, d, in, wt, out, conv_params);
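The two hunks above are a matched pair: the dispatch no longer requires the output spatial dims to be multiples of 8, so the grid sizing must round up instead of truncating. A minimal plain-C++ sketch of the ceil-division invariant (th and oS0 are illustrative stand-ins for the tile height and output extent, not names from the MLX sources):

#include <cassert>

int main() {
  for (int th = 1; th <= 8; ++th) {
    for (int oS0 = 1; oS0 <= 64; ++oS0) {
      int n_tiles = (oS0 + th - 1) / th; // same rounding as the diff
      assert(n_tiles * th >= oS0);       // every output row is covered
      assert((n_tiles - 1) * th < oS0);  // the last tile is never empty
    }
  }
  return 0;
}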
mlx/backend/metal/kernels/conv.metal (6 changes: 5 additions & 1 deletion)

@@ -206,7 +206,7 @@ template <typename T>

threadgroup T ins[TGH * TGW * TGC];

-  const int n_tgblocks_h = params.oS[0] / th;
+  const int n_tgblocks_h = (params.oS[0] + th - 1) / th;
const int n = tid.z / n_tgblocks_h;
const int tghid = tid.z % n_tgblocks_h;
const int oh = tghid * th + lid.z;
@@ -277,6 +277,10 @@ template <typename T>
}
threadgroup_barrier(mem_flags::mem_none);

+  if (oh >= params.oS[0] || ow >= params.oS[1]) {
+    return;
+  }
+
out += n * params.out_strides[0] + oh * params.out_strides[1] +
ow * params.out_strides[2];
out[c] = static_cast<T>(o);
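In the kernel, the same rounding feeds the tid.z decomposition, and the new bounds check discards the overhang threads of the last tile. Note that the guard sits after the threadgroup_barrier, so every thread in the group still reaches the barrier and its execution stays uniform. A plain-C++ sketch of the index math (N, oS0, th are illustrative values, not taken from the sources):

#include <cassert>

int main() {
  const int N = 3, oS0 = 15, th = 8; // 15 rows, tiles of 8: one tail tile
  const int n_tgblocks_h = (oS0 + th - 1) / th;
  for (int tidz = 0; tidz < n_tgblocks_h * N; ++tidz) {
    for (int lidz = 0; lidz < th; ++lidz) {
      const int n = tidz / n_tgblocks_h;     // batch image, as in the kernel
      const int tghid = tidz % n_tgblocks_h; // tile row within that image
      const int oh = tghid * th + lidz;      // output row this thread owns
      if (oh >= oS0) continue; // the new guard: tail threads write nothing
      assert(n >= 0 && n < N && oh < oS0);
    }
  }
  return 0;
}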
tests/gpu_tests.cpp (40 changes: 40 additions & 0 deletions)

@@ -521,3 +521,43 @@ TEST_CASE("test memory info") {
  clear_cache();
  CHECK_EQ(get_cache_memory(), 0);
}
+
+TEST_CASE("test gpu depthwise conv2d non-mod-8 spatial") {
+  // The depthwise Metal kernel is gated on C_per_group == 1, C == O,
+  // C % 16 == 0, kernel <= 7, and stride <= 2. Previously the dispatch also
+  // required the output spatial dims to be multiples of 8; non-multiples
+  // fell back to the gemm path. These cases exercise the tail-tile dispatch
+  // now handled by the depthwise kernel itself and verify GPU ≈ CPU.
+  struct Case {
+    int N, H, W, C, kH, kW;
+    std::pair<int, int> stride;
+    std::pair<int, int> padding;
+  };
+  std::vector<Case> cases = {
+      // Matches the issue reproducer resolutions, with a reduced batch size.
+      {1, 15, 15, 16, 3, 3, {1, 1}, {1, 1}},
+      {1, 30, 30, 32, 3, 3, {1, 1}, {1, 1}},
+      {1, 60, 60, 16, 3, 3, {1, 1}, {1, 1}},
+      // Non-square, with different alignment on the two spatial axes.
+      {1, 17, 30, 16, 5, 5, {1, 1}, {2, 2}},
+      {1, 8, 13, 16, 3, 3, {1, 1}, {1, 1}},
+      // Stride 2 with a non-mod-8 output.
+      {1, 19, 19, 32, 3, 3, {2, 2}, {1, 1}},
+  };
+
+  auto key = random::key(42);
+  for (const auto& c : cases) {
+    auto in = random::normal(
+        {c.N, c.H, c.W, c.C}, float32, 0.0f, 1.0f, key, Device::cpu);
+    auto wt = random::normal(
+        {c.C, c.kH, c.kW, 1}, float32, 0.0f, 1.0f, key, Device::cpu);
+    eval(in);
+    eval(wt);
+
+    auto out_cpu =
+        conv2d(in, wt, c.stride, c.padding, {1, 1}, c.C, Device::cpu);
+    auto out_gpu =
+        conv2d(in, wt, c.stride, c.padding, {1, 1}, c.C, Device::gpu);
+    CHECK(allclose(out_cpu, out_gpu, 1e-4, 1e-4).item<bool>());
+  }
+}
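For reference, a hedged standalone sketch of the first table entry, built only from calls that appear in the test above (the "mlx/mlx.h" umbrella include is an assumption): a 15x15 depthwise conv whose 15x15 output is not a multiple of the 8x8 tile, compared across devices.

#include "mlx/mlx.h"

using namespace mlx::core;

int main() {
  auto key = random::key(0);
  auto in = random::normal({1, 15, 15, 16}, float32, 0.0f, 1.0f, key, Device::cpu);
  auto wt = random::normal({16, 3, 3, 1}, float32, 0.0f, 1.0f, key, Device::cpu);
  auto cpu = conv2d(in, wt, {1, 1}, {1, 1}, {1, 1}, 16, Device::cpu);
  auto gpu = conv2d(in, wt, {1, 1}, {1, 1}, {1, 1}, 16, Device::gpu);
  return allclose(cpu, gpu, 1e-4, 1e-4).item<bool>() ? 0 : 1;
}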