Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions dev-doc/IOP_Module_API.md
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,14 @@ If the module can use the GPU, implement `process_cl()` wrapped in `#ifdef HAVE_

## Tiling Support

If `IOP_FLAGS_ALLOW_TILING` is set, implement `tiling_callback()` to report memory requirements:
If `IOP_FLAGS_ALLOW_TILING` is set, the pixelpipe is allowed to process a piece in tiling mode. If some parameters don't allow tiling, override this in `commit_params()`.

Memory requirements and tile alignment are calculated by `tiling_callback()`; if a module does not provide one, the defaults from `default_tiling_callback()` are used.

Whenever a module may exceed the requirements defined in `default_tiling_callback()`, or requires special alignment, a specific `tiling_callback()` should be provided, for three reasons:

1. The tiling process will not allocate more memory than granted.
2. The OpenCL code path will not be tried if requirements are too high, thus avoiding costly late fallbacks to the CPU path.
3. Tile stitching will be correct for alignment.

| Field | Purpose |
|-------|---------|
Expand All @@ -475,13 +482,15 @@ If `IOP_FLAGS_ALLOW_TILING` is set, implement `tiling_callback()` to report memo
| `overlap` | Pixels of overlap between adjacent tiles (for spatial filters) |
| `align` | Tile origin alignment (1 = none, other values only for special algorithms) |


An example:
```c
void tiling_callback(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece,
const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out,
dt_develop_tiling_t *tiling)
{
tiling->factor = 2.5f; // input + 1.5× temp buffers
tiling->factor_cl = 2.5f;
tiling->factor = 2.5f; // input + output + 2 single channel temp buffers
tiling->factor_cl = 3.75f; // as above but we need an additional rgb buffer plus a single channel buffer for a mask
tiling->maxbuf = 1.0f;
tiling->maxbuf_cl = 1.0f;
tiling->overhead = 0;
Expand Down
16 changes: 16 additions & 0 deletions src/common/opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,22 @@ gboolean dt_opencl_use_pinned_memory(const int devid)
return (!_cldev_running(devid)) ? FALSE : cl->dev[devid].pinned_memory;
}

/* Report the unified_memory flag recorded for OpenCL device `devid`.
   Returns FALSE whenever _cldev_running() rejects the device; otherwise
   the value stored in darktable.opencl->dev[devid].unified_memory. */
gboolean dt_opencl_unified_memory(const int devid)
{
  dt_opencl_t *opencl = darktable.opencl;

  // guard: only look at the per-device entry for a running device
  if(!_cldev_running(devid))
    return FALSE;

  return opencl->dev[devid].unified_memory;
}

/* Additional alignment requirement for the OpenCL image width.
   The value can have a strong effect on processing speed. Reasonable
   values are powers of 2; a value of 1 means no extra alignment.
   `devid` is currently unused, the same value is returned for every device.
   FIXME we can possibly fix this per device
*/
unsigned int dt_opencl_tiling_align(const int devid)
{
  (void)devid;
  const unsigned int width_alignment = 4;
  return width_alignment;
}

void dt_opencl_write_device_config(const int devid)
{
if(devid <= DT_DEVICE_CPU) return;
Expand Down
18 changes: 10 additions & 8 deletions src/common/opencl.h
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,8 @@ gboolean dt_opencl_read_device_config(const int devid);
gboolean dt_opencl_avoid_atomics(const int devid);
void dt_opencl_micro_nap(const int devid);
gboolean dt_opencl_use_pinned_memory(const int devid);
/* returns the unified_memory flag of the device, FALSE if the device is not running */
gboolean dt_opencl_unified_memory(const int devid);
/* additional alignment requirement for the OpenCL image width */
unsigned int dt_opencl_tiling_align(const int devid);

G_END_DECLS

Expand Down Expand Up @@ -644,7 +646,7 @@ static inline void dt_opencl_cleanup(dt_opencl_t *cl)
}
/* Stub for dt_opencl_finish(): ignores `devid` and unconditionally returns
   DT_OPENCL_DEFAULT_ERROR.
   NOTE(review): presumably the variant used when OpenCL support is compiled
   out -- confirm against the enclosing preprocessor block.
   NOTE(review): the return type is gboolean while the value is an error
   constant -- confirm what callers expect here. */
static inline gboolean dt_opencl_finish(const int devid)
{
  return DT_OPENCL_DEFAULT_ERROR;
}
static inline gboolean dt_opencl_finish_sync_pipe(const int devid,
const int pipetype)
Expand All @@ -653,49 +655,49 @@ static inline gboolean dt_opencl_finish_sync_pipe(const int devid,
}
/* Stub for dt_opencl_lock_device(): ignores `pipetype` and unconditionally
   returns DT_OPENCL_DEFAULT_ERROR, i.e. no device is ever handed out.
   NOTE(review): presumably the variant used when OpenCL support is compiled
   out -- confirm against the enclosing preprocessor block. */
static inline int dt_opencl_lock_device(const int pipetype)
{
  return DT_OPENCL_DEFAULT_ERROR;
}
/* No-op stub for dt_opencl_unlock_device(): accepts `dev` and does nothing. */
static inline void dt_opencl_unlock_device(const int dev)
{
  (void)dev;
}
/* Stub for dt_opencl_create_kernel(): ignores `program` and `name` and
   unconditionally returns DT_OPENCL_DEFAULT_ERROR instead of a kernel id.
   NOTE(review): presumably the variant used when OpenCL support is compiled
   out -- confirm against the enclosing preprocessor block. */
static inline int dt_opencl_create_kernel(const int program,
                                          const char *name)
{
  return DT_OPENCL_DEFAULT_ERROR;
}
/* No-op stub for dt_opencl_free_kernel(): accepts `kernel` and does nothing. */
static inline void dt_opencl_free_kernel(const int kernel)
{
  (void)kernel;
}
/* Stub for dt_opencl_get_max_work_item_sizes(): ignores `dev`, leaves
   `sizes` untouched and unconditionally returns DT_OPENCL_DEFAULT_ERROR.
   NOTE(review): presumably the variant used when OpenCL support is compiled
   out -- confirm against the enclosing preprocessor block. */
static inline int dt_opencl_get_max_work_item_sizes(const int dev,
                                                    size_t *sizes)
{
  return DT_OPENCL_DEFAULT_ERROR;
}
/* Stub for dt_opencl_get_work_group_limits(): ignores `dev`, writes none of
   the output parameters (`sizes`, `workgroupsize`, `localmemsize`) and
   unconditionally returns DT_OPENCL_DEFAULT_ERROR.
   NOTE(review): presumably the variant used when OpenCL support is compiled
   out -- confirm against the enclosing preprocessor block. */
static inline int dt_opencl_get_work_group_limits(const int dev,
                                                  size_t *sizes,
                                                  size_t *workgroupsize,
                                                  unsigned long *localmemsize)
{
  return DT_OPENCL_DEFAULT_ERROR;
}
/* Stub for dt_opencl_get_kernel_work_group_size(): ignores `dev` and
   `kernel`, leaves `kernelworkgroupsize` untouched and unconditionally
   returns DT_OPENCL_DEFAULT_ERROR.
   NOTE(review): presumably the variant used when OpenCL support is compiled
   out -- confirm against the enclosing preprocessor block. */
static inline int dt_opencl_get_kernel_work_group_size(const int dev,
                                                       const int kernel,
                                                       size_t *kernelworkgroupsize)
{
  return DT_OPENCL_DEFAULT_ERROR;
}
/* Stub for dt_opencl_enqueue_kernel_2d(): ignores `dev`, `kernel` and
   `sizes`, enqueues nothing and unconditionally returns
   DT_OPENCL_DEFAULT_ERROR.
   NOTE(review): presumably the variant used when OpenCL support is compiled
   out -- confirm against the enclosing preprocessor block. */
static inline int dt_opencl_enqueue_kernel_2d(const int dev,
                                              const int kernel,
                                              const size_t *sizes)
{
  return DT_OPENCL_DEFAULT_ERROR;
}
/* Stub for dt_opencl_enqueue_kernel_2d_with_local(): ignores `dev`,
   `kernel`, `sizes` and `local`, enqueues nothing and unconditionally
   returns DT_OPENCL_DEFAULT_ERROR.
   NOTE(review): presumably the variant used when OpenCL support is compiled
   out -- confirm against the enclosing preprocessor block. */
static inline int dt_opencl_enqueue_kernel_2d_with_local(const int dev,
                                                         const int kernel,
                                                         const size_t *sizes,
                                                         const size_t *local)
{
  return DT_OPENCL_DEFAULT_ERROR;
}
static inline gboolean dt_opencl_is_enabled(void)
{
Expand Down
Loading
Loading