From d7c766cc19581c3068c8e5f9eb881f94bc39278f Mon Sep 17 00:00:00 2001
From: Daniel Lu
Date: Mon, 18 May 2026 14:19:07 -0700
Subject: [PATCH] Prefault OpenCL readback destinations

---
Review notes (text between the "---" line and the diffstat is discarded by
`git am`, so nothing here becomes part of the commit):

- dt_opencl_copy_device_to_host() now calls the new static helper
  _opencl_prefault_host_write_pages() on the destination buffer and then
  delegates to dt_opencl_read_host_from_device_rowpitch(), which carries the
  function's former body.
- The helper reads and writes back one byte per page (plus the final byte)
  through a volatile pointer. It returns without touching memory on _WIN32,
  when `host` is NULL or `bytes` is 0, when the device is not running, when
  cl->fastcl or cl->dev[devid].unified_memory is unset, and when the buffer
  is smaller than one page. The page size comes from sysconf(_SC_PAGESIZE),
  with 4096 as the fallback.
- The row pitch is now computed as `bpp * width` in int and passed through a
  `const int rowpitch` parameter, whereas the removed line used
  `(size_t)width * bpp`. TODO(review): confirm the narrower type is
  acceptable for all callers.
- Only dt_opencl_copy_device_to_host() prefaults; a direct call to
  dt_opencl_read_host_from_device_rowpitch() does not.
- TODO(review): the file-local prototype for
  dt_opencl_read_host_from_device_rowpitch() is redundant if a project header
  already declares it - verify.

 src/common/opencl.c | 56 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/src/common/opencl.c b/src/common/opencl.c
index 58d716aeba38..4e0bbdc9989a 100644
--- a/src/common/opencl.c
+++ b/src/common/opencl.c
@@ -46,6 +46,9 @@
 #include
 #include
 #include
+#if !defined(_WIN32)
+#include <unistd.h>
+#endif
 #include
 
 static const char *_opencl_get_vendor_by_id(unsigned int id);
@@ -71,6 +74,14 @@ static char *_ascii_str_canonical(const char *in, char *out, int maxlen);
 
 static char *_strsep(char **stringp, const char *delim);
 
+static void _opencl_prefault_host_write_pages(const int devid, void *host, const size_t bytes);
+int dt_opencl_read_host_from_device_rowpitch(const int devid,
+                                             void *host,
+                                             void *device,
+                                             const int width,
+                                             const int height,
+                                             const int rowpitch);
+
 /** read scheduling profile for config variables */
 static dt_opencl_scheduling_profile_t _opencl_get_scheduling_profile(void);
 
@@ -2870,6 +2881,49 @@ int dt_opencl_copy_device_to_host(const int devid,
                                   const int width,
                                   const int height,
                                   const int bpp)
+{
+  if(host && width > 0 && height > 0 && bpp > 0)
+    _opencl_prefault_host_write_pages(devid, host, (size_t)width * height * bpp);
+
+  return dt_opencl_read_host_from_device_rowpitch(devid, host, device,
+                                                  width, height, bpp * width);
+}
+
+static void _opencl_prefault_host_write_pages(const int devid, void *host, const size_t bytes)
+{
+#if !defined(_WIN32)
+  if(!host || bytes == 0) return;
+  if(!_cldev_running(devid)) return;
+
+  dt_opencl_t *cl = darktable.opencl;
+  if(!cl->fastcl || !cl->dev[devid].unified_memory) return;
+
+  const long page_size = sysconf(_SC_PAGESIZE);
+  const size_t step = page_size > 0 ? (size_t)page_size : 4096;
+  if(bytes < step) return;
+
+  volatile unsigned char *p = (volatile unsigned char *)host;
+
+  // NVIDIA OpenCL on unified-memory systems can be extremely slow when a
+  // blocking read faults large cold host buffers from inside the driver.
+  // Touch the destination pages first so the driver sees committed memory.
+  for(size_t offset = 0; offset < bytes; offset += step)
+    p[offset] = p[offset];
+
+  p[bytes - 1] = p[bytes - 1];
+#else
+  (void)devid;
+  (void)host;
+  (void)bytes;
+#endif
+}
+
+int dt_opencl_read_host_from_device_rowpitch(const int devid,
+                                             void *host,
+                                             void *device,
+                                             const int width,
+                                             const int height,
+                                             const int rowpitch)
 {
   if(!_cldev_running(devid))
     return DT_OPENCL_NODEVICE;
@@ -2877,7 +2931,7 @@ int dt_opencl_copy_device_to_host(const int devid,
   const size_t region[2] = { width, height };
   // blocking.
   return dt_opencl_read_host_from_device_raw(devid, host, device, CLIMG_ORIGIN,
-                                             region, (size_t)width * bpp, TRUE);
+                                             region, rowpitch, TRUE);
 }
 
 int dt_opencl_read_host_from_device_raw(const int devid,