perf: optimize CPU deform_conv2d forward pass

developer0hye · claude · developer0hye · commit e653cad3a0df · 2026-03-15T23:41:34.000+09:00
Three changes to the CPU deformable convolution forward kernel:

1. Replace at::zeros with at::empty for the columns and out_buf buffers.
   deformable_im2col_kernel writes every element of the columns buffer,
   and out_buf is fully overwritten by addmm_ once beta=0 is used
   (change 2), so zero-initialization is wasted work.

2. Use addmm_ with beta=0 instead of the default beta=1. This avoids
   accumulating into uninitialized memory while preserving in-place
   operation (no extra allocation unlike at::mm).

3. Parallelize deformable_im2col_kernel with at::parallel_for. The
   im2col loop was the only single-threaded phase in the forward pass
   (GEMM is already parallelized by BLAS). Each loop iteration writes
   to a non-overlapping region of the columns buffer, so parallelization
   is safe.

Benchmark results on Apple M2 (CPU, float32):

  Config          Before (ms)   After (ms)    Change
  small-b1              9.76        2.44       -75%
  small-b8             91.77       33.88       -63%
  medium-b1           216.70       75.80       -65%
  medium-b8          1152.09      650.00       -44%
  large-b1            348.86      302.70       -13%
  large-b4           1342.75     1289.96        -4%

Signed-off-by: Yonghye Kwon <developer.0hye@gmail.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Yonghye Kwon <developer.0hye@gmail.com>
diff --git a/torchvision/csrc/ops/cpu/deform_conv2d_kernel.cpp b/torchvision/csrc/ops/cpu/deform_conv2d_kernel.cpp
@@ -68,6 +68,7 @@
 // https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp
 
 #include <ATen/ATen.h>
+#include <ATen/Parallel.h>
 #include <torch/library.h>
 
 namespace vision {
@@ -139,58 +140,60 @@ void deformable_im2col_kernel(
     int out_w,
     bool use_mask,
     scalar_t* columns) {
-  for (int index = 0; index != n; ++index) {
-    const int out_x = index % out_w;
-    const int out_y = (index / out_w) % out_h;
-    const int out_b = (index / (out_w * out_h)) % batch_sz;
-    const int in_c = index / (out_w * out_h * batch_sz);
-    const int out_c = in_c * weight_h * weight_w;
+  at::parallel_for(0, n, 0, [&](int64_t begin, int64_t end) {
+    for (int64_t index = begin; index != end; ++index) {
+      const int out_x = index % out_w;
+      const int out_y = (index / out_w) % out_h;
+      const int out_b = (index / (out_w * out_h)) % batch_sz;
+      const int in_c = index / (out_w * out_h * batch_sz);
+      const int out_c = in_c * weight_h * weight_w;
 
-    int c_per_offset_grp = n_in_channels / n_offset_grps;
-    const int grp_idx = in_c / c_per_offset_grp;
+      int c_per_offset_grp = n_in_channels / n_offset_grps;
+      const int grp_idx = in_c / c_per_offset_grp;
 
-    auto columns_ptr = columns +
-        (out_c * (batch_sz * out_h * out_w) + out_b * (out_h * out_w) +
-         out_y * out_w + out_x);
+      auto columns_ptr = columns +
+          (out_c * (batch_sz * out_h * out_w) + out_b * (out_h * out_w) +
+           out_y * out_w + out_x);
 
-    auto input_ptr = input +
-        (out_b * (n_in_channels * height * width) + in_c * (height * width));
+      auto input_ptr = input +
+          (out_b * (n_in_channels * height * width) + in_c * (height * width));
 
-    auto offset_ptr = offset +
-        (out_b * n_offset_grps + grp_idx) * 2 * weight_h * weight_w * out_h *
-            out_w;
+      auto offset_ptr = offset +
+          (out_b * n_offset_grps + grp_idx) * 2 * weight_h * weight_w * out_h *
+              out_w;
 
-    auto mask_ptr = mask;
-    if (use_mask) {
-      mask_ptr += (out_b * n_offset_grps + grp_idx) * weight_h * weight_w *
-          out_h * out_w;
-    }
-
-    for (int i = 0; i < weight_h; ++i) {
-      for (int j = 0; j < weight_w; ++j) {
-        const int mask_idx = i * weight_w + j;
-        const int offset_idx = 2 * mask_idx;
+      auto mask_ptr = mask;
+      if (use_mask) {
+        mask_ptr += (out_b * n_offset_grps + grp_idx) * weight_h * weight_w *
+            out_h * out_w;
+      }
 
-        scalar_t mask_value = 1;
-        if (use_mask) {
-          mask_value =
-              mask_ptr[mask_idx * (out_h * out_w) + out_y * out_w + out_x];
+      for (int i = 0; i < weight_h; ++i) {
+        for (int j = 0; j < weight_w; ++j) {
+          const int mask_idx = i * weight_w + j;
+          const int offset_idx = 2 * mask_idx;
+
+          scalar_t mask_value = 1;
+          if (use_mask) {
+            mask_value =
+                mask_ptr[mask_idx * (out_h * out_w) + out_y * out_w + out_x];
+          }
+
+          const scalar_t offset_h =
+              offset_ptr[offset_idx * (out_h * out_w) + out_y * out_w + out_x];
+          const scalar_t offset_w = offset_ptr
+              [(offset_idx + 1) * (out_h * out_w) + out_y * out_w + out_x];
+          const scalar_t y =
+              (out_y * stride_h - pad_h) + i * dilation_h + offset_h;
+          const scalar_t x =
+              (out_x * stride_w - pad_w) + j * dilation_w + offset_w;
+          *columns_ptr =
+              mask_value * bilinear_interpolate(input_ptr, height, width, y, x);
+          columns_ptr += batch_sz * out_h * out_w;
         }
-
-        const scalar_t offset_h =
-            offset_ptr[offset_idx * (out_h * out_w) + out_y * out_w + out_x];
-        const scalar_t offset_w = offset_ptr
-            [(offset_idx + 1) * (out_h * out_w) + out_y * out_w + out_x];
-        const scalar_t y =
-            (out_y * stride_h - pad_h) + i * dilation_h + offset_h;
-        const scalar_t x =
-            (out_x * stride_w - pad_w) + j * dilation_w + offset_w;
-        *columns_ptr =
-            mask_value * bilinear_interpolate(input_ptr, height, width, y, x);
-        columns_ptr += batch_sz * out_h * out_w;
       }
     }
-  }
+  });
 }
 
 void deformable_im2col(
@@ -1013,7 +1016,7 @@ at::Tensor deform_conv2d_forward_kernel(
          out_w});
   }
 
-  at::Tensor out_buf = at::zeros(
+  at::Tensor out_buf = at::empty(
       {batch_sz / n_parallel_imgs,
        out_channels,
        n_parallel_imgs * out_h,
@@ -1035,7 +1038,7 @@ at::Tensor deform_conv2d_forward_kernel(
        weight_c.size(3)});
 
   // Sample points and perform convolution
-  auto columns = at::zeros(
+  auto columns = at::empty(
       {n_in_channels * weight_h * weight_w, n_parallel_imgs * out_h * out_w},
       input_c.options());
   for (int b = 0; b < batch_sz / n_parallel_imgs; b++) {
@@ -1064,10 +1067,9 @@ at::Tensor deform_conv2d_forward_kernel(
     columns = columns.view(
         {n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)});
     for (int g = 0; g < n_weight_grps; g++) {
-      out_buf[b][g] = out_buf[b][g]
-                          .flatten(1)
-                          .addmm_(weight_c[g].flatten(1), columns[g])
-                          .view_as(out_buf[b][g]);
+      out_buf[b][g]
+          .flatten(1)
+          .addmm_(weight_c[g].flatten(1), columns[g], 0, 1);
     }
     columns =
         columns.view({columns.size(0) * columns.size(1), columns.size(2)});