Skip to content

Commit 2546c39

Browse files
author
RJ Ascani
committed
Add batch size validation and test coverage for depthwise conv2d
CMSIS-NN arm_depthwise_conv_wrapper_s8 only supports batch size 1. Add validation in both AOT pass (fail during compilation) and runtime (defensive check).

Add 6 test cases covering edge cases:
- Combined stride/padding/bias
- 1x1 kernels (common in mobile networks)
- Higher depth_multiplier (4)
- Asymmetric kernels (1x3)
- Asymmetric stride/padding
- Larger kernels (5x5)

Fix depthwise_conv2d_stride test to use batch size 1.
1 parent 2256067 commit 2546c39

File tree

3 files changed

+55
-1
lines changed

3 files changed

+55
-1
lines changed

backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,16 @@ bool validate_depthwise_conv2d_arguments(
3939
return false;
4040
}
4141

42+
// CMSIS-NN depthwise convolution only supports batch size of 1
43+
if (input.size(0) != 1) {
44+
ET_LOG(
45+
Error,
46+
"quantized_depthwise_conv2d_out: CMSIS-NN only supports batch size 1, got %zd",
47+
input.size(0));
48+
context.fail(Error::InvalidArgument);
49+
return false;
50+
}
51+
4252
// Validate weight is in IHWO layout: [1, H, W, C_OUT]
4353
if (weight.size(0) != 1) {
4454
ET_LOG(

backends/cortex_m/passes/convert_to_cortex_m_pass.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,14 @@ def _get_convolution_replacement(self, node) -> int:
207207
output_channels = weight_tensor.shape[0]
208208
input_channels = groups # For depthwise, groups == input_channels
209209

210+
# CMSIS-NN depthwise convolution only supports batch size of 1
211+
input_tensor = get_first_fake_tensor(x)
212+
batch_size = input_tensor.shape[0]
213+
if batch_size != 1:
214+
raise ValueError(
215+
f"Depthwise conv: CMSIS-NN only supports batch size 1, got {batch_size}"
216+
)
217+
210218
if output_channels % input_channels != 0:
211219
raise ValueError(
212220
f"Depthwise conv: output_channels ({output_channels}) must be "

backends/cortex_m/test/ops/test_conv.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ def forward(self, x):
238238
"depthwise_conv2d_stride": McuTestCase(
239239
model=CortexMDepthwiseConv2D(4, 4, 3, stride=2, groups=4),
240240
example_inputs=(
241-
ramp_tensor(-50, 50, (2, 4, 8, 8)).to(memory_format=torch.channels_last),
241+
ramp_tensor(-50, 50, (1, 4, 8, 8)).to(memory_format=torch.channels_last),
242242
),
243243
),
244244
"depthwise_conv2d_padding": McuTestCase(
@@ -253,6 +253,42 @@ def forward(self, x):
253253
ramp_tensor(-10, 10, (1, 3, 6, 6)).to(memory_format=torch.channels_last),
254254
),
255255
),
256+
"depthwise_conv2d_stride_padding_bias": McuTestCase(
257+
model=CortexMDepthwiseConv2DBias(4, 4, 3, stride=2, padding=1, groups=4),
258+
example_inputs=(
259+
ramp_tensor(0, 5, (1, 4, 8, 8)).to(memory_format=torch.channels_last),
260+
),
261+
),
262+
"depthwise_conv2d_1x1": McuTestCase(
263+
model=CortexMDepthwiseConv2D(4, 8, 1, groups=4),
264+
example_inputs=(
265+
ramp_tensor(0, 10, (1, 4, 8, 8)).to(memory_format=torch.channels_last),
266+
),
267+
),
268+
"depthwise_conv2d_multiplier_4": McuTestCase(
269+
model=CortexMDepthwiseConv2D(2, 8, 3, groups=2),
270+
example_inputs=(
271+
ramp_tensor(0, 10, (1, 2, 8, 8)).to(memory_format=torch.channels_last),
272+
),
273+
),
274+
"depthwise_conv2d_asymmetric_kernel": McuTestCase(
275+
model=CortexMDepthwiseConv2D(4, 4, (1, 3), groups=4),
276+
example_inputs=(
277+
ramp_tensor(0, 10, (1, 4, 8, 8)).to(memory_format=torch.channels_last),
278+
),
279+
),
280+
"depthwise_conv2d_asymmetric_stride": McuTestCase(
281+
model=CortexMDepthwiseConv2D(3, 3, 3, stride=(2, 1), padding=(1, 0), groups=3),
282+
example_inputs=(
283+
ramp_tensor(0, 10, (1, 3, 8, 8)).to(memory_format=torch.channels_last),
284+
),
285+
),
286+
"depthwise_conv2d_5x5": McuTestCase(
287+
model=CortexMDepthwiseConv2D(4, 4, 5, padding=2, groups=4),
288+
example_inputs=(
289+
ramp_tensor(0, 10, (1, 4, 8, 8)).to(memory_format=torch.channels_last),
290+
),
291+
),
256292
}
257293

258294

0 commit comments

Comments
 (0)