darktable-org · TurboGit · May 22, 2026 · May 22, 2026
diff --git a/dev-doc/IOP_Module_API.md b/dev-doc/IOP_Module_API.md
@@ -465,7 +465,14 @@ If the module can use the GPU, implement `process_cl()` wrapped in `#ifdef HAVE_
 
 ## Tiling Support
 
-If `IOP_FLAGS_ALLOW_TILING` is set, implement `tiling_callback()` to report memory requirements:
+If `IOP_FLAGS_ALLOW_TILING` is set, the pixelpipe is allowed to process a piece in tiling mode. If some parameter settings don't allow tiling, override this in `commit_params()`.
+
+Memory requirements and tile alignment are reported by `tiling_callback()`; if a module does not provide one, the defaults from `default_tiling_callback()` are used.
+
+Whenever a module may exceed the requirements defined in `default_tiling_callback()`, or requires special alignment, it should provide its own `tiling_callback()` for three reasons:
+1. The tiling process will not allocate more memory than granted.
+2. The OpenCL code path will not be tried if requirements are too high, thus avoiding costly late fallbacks to the CPU path.
+3. Tile stitching will be correct with respect to alignment.
 
 | Field | Purpose |
 |-------|---------|
@@ -475,13 +482,15 @@ If `IOP_FLAGS_ALLOW_TILING` is set, implement `tiling_callback()` to report memo
 | `overlap` | Pixels of overlap between adjacent tiles (for spatial filters) |
 | `align` | Tile origin alignment (1 = none, other values only for special algorithms) |
 
+
+An example:
 ```c
 void tiling_callback(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece,
                      const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out,
                      dt_develop_tiling_t *tiling)
 {
-  tiling->factor = 2.5f;     // input + 1.5× temp buffers
-  tiling->factor_cl = 2.5f;
+  tiling->factor = 2.5f;     // input + output + 2 single channel temp buffers
+  tiling->factor_cl = 3.75f; // as above but we need an additional rgb buffer plus a single channel buffer for a mask
   tiling->maxbuf = 1.0f;
   tiling->maxbuf_cl = 1.0f;
   tiling->overhead = 0;

diff --git a/src/common/opencl.c b/src/common/opencl.c
@@ -325,6 +325,22 @@ gboolean dt_opencl_use_pinned_memory(const int devid)
   return (!_cldev_running(devid)) ? FALSE : cl->dev[devid].pinned_memory;
 }
 
+gboolean dt_opencl_unified_memory(const int devid)
+{
+  dt_opencl_t *cl = darktable.opencl;
+  return (!_cldev_running(devid)) ? FALSE : cl->dev[devid].unified_memory; // FALSE unless the device is running
+}
+
+/* This defines an additional alignment requirement for the OpenCL image width.
+   It can have a strong effect on processing speed. Reasonable values are
+   powers of 2. Set to 1 for no effect.
+   FIXME: possibly make this device-specific.
+*/
+unsigned int dt_opencl_tiling_align(const int devid)
+{
+  return 4; // devid is currently ignored: the same value is returned for every device
+}
+
 void dt_opencl_write_device_config(const int devid)
 {
   if(devid <= DT_DEVICE_CPU) return;

diff --git a/src/common/opencl.h b/src/common/opencl.h
@@ -607,6 +607,8 @@ gboolean dt_opencl_read_device_config(const int devid);
 gboolean dt_opencl_avoid_atomics(const int devid);
 void dt_opencl_micro_nap(const int devid);
 gboolean dt_opencl_use_pinned_memory(const int devid);
+gboolean dt_opencl_unified_memory(const int devid);
+unsigned int dt_opencl_tiling_align(const int devid);
 
 G_END_DECLS
 
@@ -644,7 +646,7 @@ static inline void dt_opencl_cleanup(dt_opencl_t *cl)
 }
 static inline gboolean dt_opencl_finish(const int devid)
 {
-  return -1;
+  return DT_OPENCL_DEFAULT_ERROR;
 }
 static inline gboolean dt_opencl_finish_sync_pipe(const int devid,
                                                   const int pipetype)
@@ -653,49 +655,49 @@ static inline gboolean dt_opencl_finish_sync_pipe(const int devid,
 }
 static inline int dt_opencl_lock_device(const int pipetype)
 {
-  return -1;
+  return DT_OPENCL_DEFAULT_ERROR;
 }
 static inline void dt_opencl_unlock_device(const int dev)
 {
 }
 static inline int dt_opencl_create_kernel(const int program,
                                           const char *name)
 {
-  return -1;
+  return DT_OPENCL_DEFAULT_ERROR;
 }
 static inline void dt_opencl_free_kernel(const int kernel)
 {
 }
 static inline int dt_opencl_get_max_work_item_sizes(const int dev,
                                                     size_t *sizes)
 {
-  return -1;
+  return DT_OPENCL_DEFAULT_ERROR;
 }
 static inline int dt_opencl_get_work_group_limits(const int dev,
                                                   size_t *sizes,
                                                   size_t *workgroupsize,
                                                   unsigned long *localmemsize)
 {
-  return -1;
+  return DT_OPENCL_DEFAULT_ERROR;
 }
 static inline int dt_opencl_get_kernel_work_group_size(const int dev,
                                                        const int kernel,
                                                        size_t *kernelworkgroupsize)
 {
-  return -1;
+  return DT_OPENCL_DEFAULT_ERROR;
 }
 static inline int dt_opencl_enqueue_kernel_2d(const int dev,
                                               const int kernel,
                                               const size_t *sizes)
 {
-  return -1;
+  return DT_OPENCL_DEFAULT_ERROR;
 }
 static inline int dt_opencl_enqueue_kernel_2d_with_local(const int dev,
                                                          const int kernel,
                                                          const size_t *sizes,
                                                          const size_t *local)
 {
-  return -1;
+  return DT_OPENCL_DEFAULT_ERROR;
 }
 static inline gboolean dt_opencl_is_enabled(void)
 {