From 632f46d75df5dfbfaecb30f605b06e25cf253a51 Mon Sep 17 00:00:00 2001
From: Marijn Suijten <marijn@traverseresearch.nl>
Date: Mon, 23 Oct 2023 22:03:43 +0200
Subject: [PATCH 1/4] lanczos3: Mark filter weights as `uniform`

Before:

    Downsample `square_test.png` using ispc_downsampler
                        time:   [43.438 ms 43.468 ms 43.500 ms]

After:

    Downsample `square_test.png` using ispc_downsampler
                        time:   [29.891 ms 29.922 ms 29.953 ms]
                        change: [-31.246% -31.162% -31.077%] (p = 0.00 < 0.05)
---
 src/ispc/kernels/lanczos3.ispc | 43 +++++++++++++++-------------------
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/src/ispc/kernels/lanczos3.ispc b/src/ispc/kernels/lanczos3.ispc
index 6767450..b23cddb 100644
--- a/src/ispc/kernels/lanczos3.ispc
+++ b/src/ispc/kernels/lanczos3.ispc
@@ -1,18 +1,18 @@
 #include "image.ispc"
 
-#define M_PI 3.14159265358979
+const uniform float M_PI = 3.14159265358979;
 
-static inline float clean(float t)
+static inline uniform float clean(uniform float t)
 {
-    const float EPSILON = .0000125f;
+    const uniform float EPSILON = .0000125f;
     if (abs(t) < EPSILON)
         return 0.0f;
-    return (float)t;
+    return t;
 }
 
-static inline float sinc(float x)
+static inline uniform float sinc(uniform float x)
 {
-    x = (x * M_PI);
+    x = x * M_PI;
 
     // if ((x < 0.01f) && (x > -0.01f))
     //     return 1.0f + x * x * (-1.0f / 6.0f + x * x * 1.0f / 120.0f);
@@ -20,7 +20,7 @@ static inline float sinc(float x)
     return sin(x) / x;
 }
 
-static inline float lanczos3_filter(float t)
+static inline uniform float lanczos3_filter(uniform float t)
 {
     t = abs(t);
 
@@ -30,19 +30,14 @@ static inline float lanczos3_filter(float t)
         return 0.0f;
 }
 
-static inline float frac(float f) {
-    float absf = abs(f);
-    return absf - floor(absf);
-}
-
 static inline float byte_to_float(uint b) {
     //return floatbits(0x3f800000 | (b << (23 - 8))) - 1.0;
     return (float)b;
 }
 
-static inline uint8<4> resample_internal(uniform Image src_image, float<2> uv, uniform uint8 num_channels) {
+static inline uint8<4> resample_internal(const uniform Image src_image, const float<2> uv, const uniform uint8 num_channels) {
     float<4> col = 0.0;
-    float weight = 0.0;
+    uniform float weight = 0.0;
     // Truncate floating point coordinate to integer:
     const int<2> src_coord = uv * src_image.size;
 
@@ -52,22 +47,22 @@ static inline uint8<4> resample_internal(uniform Image src_image, float<2> uv, u
     // right and bottom of the target pixel.
     for (uniform int x = -3; x < 3; x++) {
         for (uniform int y = -3; y < 3; y++) {
-            float wx = lanczos3_filter((uniform float)x + 0.5);
-            float wy = lanczos3_filter((uniform float)y + 0.5);
+            const uniform float wx = lanczos3_filter((uniform float)x + 0.5);
+            const uniform float wy = lanczos3_filter((uniform float)y + 0.5);
+            const uniform float w = wx * wy;
+            const uniform int<2> texel_offset = {x, y};
 
-            float w = wx * wy;
-            int<2> texel_offset = {x, y};
             int<2> src_kernel_coord = src_coord + texel_offset;
             // TODO: Let the user specify a boundary mode!
             // https://github.com/Traverse-Research/ispc-downsampler/issues/25#issuecomment-1584915050
             src_kernel_coord.x = clamp(src_kernel_coord.x, 0, src_image.size.x - 1);
             src_kernel_coord.y = clamp(src_kernel_coord.y, 0, src_image.size.y - 1);
 
-            int addr = (src_kernel_coord.x + src_kernel_coord.y * src_image.size.x) * num_channels;
+            const int addr = (src_kernel_coord.x + src_kernel_coord.y * src_image.size.x) * num_channels;
 
             float<4> texel;
 
-            const float inv_255 = rcp(255.0);
+            const uniform float inv_255 = rcp(255.0);
 
             if (num_channels == 3) {
                 texel.x = byte_to_float(src_image.data[addr + 0]) * inv_255;
@@ -89,9 +84,9 @@ static inline uint8<4> resample_internal(uniform Image src_image, float<2> uv, u
 }
 
 export void resample(uniform uint32 width, uniform uint32 height, uniform uint32 stride, uniform uint8 num_channels, uniform uint32 target_width, uniform uint32 target_height, uniform const uint8 src_data[], uniform uint8 out_data[]) {
-    uniform Image src = {src_data, {width, height}};
-    uniform float<2> target_size = {(float)target_width, (float)target_height};
-    uniform float<2> inv_target_size = 1.0f / target_size;
+    const uniform Image src = {src_data, {width, height}};
+    const uniform float<2> target_size = {(float)target_width, (float)target_height};
+    const uniform float<2> inv_target_size = 1.0f / target_size;
 
     foreach_tiled (y = 0 ... target_height, x = 0 ... target_width) {
         float<2> uv = {x, y};
@@ -100,7 +95,7 @@ export void resample(uniform uint32 width, uniform uint32 height, uniform uint32
         // Convert to uniform space:
         uv *= inv_target_size;
 
-        uint8<4> s = resample_internal(src, uv, num_channels);
+        const uint8<4> s = resample_internal(src, uv, num_channels);
 
         for (uniform int i = 0; i < num_channels; i++)
             out_data[(x + y * target_width) * num_channels + i] = s[i];

From 0106556d02c99d620e96d9995118343a88f63ef0 Mon Sep 17 00:00:00 2001
From: Marijn Suijten <marijn@traverseresearch.nl>
Date: Mon, 23 Oct 2023 22:45:00 +0200
Subject: [PATCH 2/4] Perform `degamma` and `gamma` conversions on user request

This crate can't assume that the input and output are linear, nor did it
correct for that in the `test` example, where the output from `stb_image`
is clearly nonlinear (its "docs" don't state this, but it is visible from
`stb_image` not linearizing JPG and PNG inputs and applying a gamma of
1/2.2 when converting HDR to LDR).  While we could ask users to pre-
correct for this and return linear output to them, it is more efficient
to do it within the downsampling algorithm that already runs over all
the pixels, and (more importantly!) requiring these parameters in the
input forces the caller to think about it.
---
 benches/basic.rs               |  11 ++-
 examples/test.rs               |  10 ++-
 src/ispc/downsample_ispc.rs    | 125 +++++++++++++++++++++++++++++++--
 src/ispc/kernels/image.ispc    |   2 +-
 src/ispc/kernels/lanczos3.ispc |  78 ++++++++++++++------
 src/lib.rs                     |  60 ++++++++++++----
 6 files changed, 237 insertions(+), 49 deletions(-)

diff --git a/benches/basic.rs b/benches/basic.rs
index e2584df..5e68b0e 100644
--- a/benches/basic.rs
+++ b/benches/basic.rs
@@ -1,5 +1,5 @@
 use criterion::{criterion_group, criterion_main, Criterion};
-use ispc_downsampler::{downsample, Format, Image};
+use ispc_downsampler::{downsample, Format, Image, Parameters};
 use resize::{px::RGB, Type::Lanczos3};
 use stb_image::image::{load, LoadResult};
 use std::path::Path;
@@ -17,8 +17,15 @@ pub fn ispc_downsampler(c: &mut Criterion) {
         let target_width = (img.width / 4) as u32;
         let target_height = (img.height / 4) as u32;
 
+        let params = Parameters {
+            // Input stb Image is gamma-corrected (i.e. expects to be passed through a CRT with exponent 2.2)
+            degamma: true,
+            // Output image is PNG which must be stored with a gamma of 1/2.2
+            gamma: true,
+        };
+
         c.bench_function("Downsample `square_test.png` using ispc_downsampler", |b| {
-            b.iter(|| downsample(&src_img, target_width, target_height))
+            b.iter(|| downsample(&params, &src_img, target_width, target_height))
         });
     }
 }
diff --git a/examples/test.rs b/examples/test.rs
index 3d58186..fa1118a 100644
--- a/examples/test.rs
+++ b/examples/test.rs
@@ -1,5 +1,5 @@
 use image::{RgbImage, RgbaImage};
-use ispc_downsampler::{downsample, Format, Image};
+use ispc_downsampler::{downsample, Format, Image, Parameters};
 use stb_image::image::{load, LoadResult};
 use std::path::Path;
 use std::time::Instant;
@@ -26,7 +26,13 @@ fn main() {
 
             let now = Instant::now();
             println!("Downsampling started!");
-            let downsampled_pixels = downsample(&src_img, target_width, target_height);
+            let params = Parameters {
+                // Input stb Image is gamma-corrected (i.e. expects to be passed through a CRT with exponent 2.2)
+                degamma: true,
+                // Output image is PNG which must be stored with a gamma of 1/2.2
+                gamma: true,
+            };
+            let downsampled_pixels = downsample(&params, &src_img, target_width, target_height);
             println!("Finished downsampling in {:.2?}!", now.elapsed());
 
             std::fs::create_dir_all("example_outputs").unwrap();
diff --git a/src/ispc/downsample_ispc.rs b/src/ispc/downsample_ispc.rs
index 044bf5f..404c62b 100644
--- a/src/ispc/downsample_ispc.rs
+++ b/src/ispc/downsample_ispc.rs
@@ -2,16 +2,127 @@
 pub mod downsample_ispc {
 /* automatically generated by rust-bindgen 0.61.0 */
 
+#[repr(C)]
+#[repr(align(16))]
+#[derive(Debug, Copy, Clone)]
+pub struct uint32_t2 {
+    pub v: [u32; 2usize],
+}
+#[test]
+fn bindgen_test_layout_uint32_t2() {
+    const UNINIT: ::std::mem::MaybeUninit<uint32_t2> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<uint32_t2>(),
+        16usize,
+        concat!("Size of: ", stringify!(uint32_t2))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<uint32_t2>(),
+        16usize,
+        concat!("Alignment of ", stringify!(uint32_t2))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).v) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(uint32_t2),
+            "::",
+            stringify!(v)
+        )
+    );
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct Parameters {
+    pub degamma: bool,
+    pub gamma: bool,
+}
+#[test]
+fn bindgen_test_layout_Parameters() {
+    const UNINIT: ::std::mem::MaybeUninit<Parameters> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<Parameters>(),
+        2usize,
+        concat!("Size of: ", stringify!(Parameters))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<Parameters>(),
+        1usize,
+        concat!("Alignment of ", stringify!(Parameters))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).degamma) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(Parameters),
+            "::",
+            stringify!(degamma)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).gamma) as usize - ptr as usize },
+        1usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(Parameters),
+            "::",
+            stringify!(gamma)
+        )
+    );
+}
+#[repr(C)]
+#[repr(align(16))]
+#[derive(Debug, Copy, Clone)]
+pub struct Image {
+    pub data: *mut u8,
+    pub __bindgen_padding_0: u64,
+    pub size: uint32_t2,
+}
+#[test]
+fn bindgen_test_layout_Image() {
+    const UNINIT: ::std::mem::MaybeUninit<Image> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<Image>(),
+        32usize,
+        concat!("Size of: ", stringify!(Image))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<Image>(),
+        16usize,
+        concat!("Alignment of ", stringify!(Image))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).data) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(Image),
+            "::",
+            stringify!(data)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).size) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(Image),
+            "::",
+            stringify!(size)
+        )
+    );
+}
 extern "C" {
     pub fn resample(
-        width: u32,
-        height: u32,
-        stride: u32,
+        params: *const Parameters,
+        src: *const Image,
+        dst: *mut Image,
         num_channels: u8,
-        target_width: u32,
-        target_height: u32,
-        src_data: *const u8,
-        out_data: *mut u8,
     );
 }
 extern "C" {
diff --git a/src/ispc/kernels/image.ispc b/src/ispc/kernels/image.ispc
index c906c24..a37b260 100644
--- a/src/ispc/kernels/image.ispc
+++ b/src/ispc/kernels/image.ispc
@@ -1,4 +1,4 @@
 struct Image {
     uniform uint8* data;
-    uniform int<2> size;
+    uniform uint<2> size;
 };
diff --git a/src/ispc/kernels/lanczos3.ispc b/src/ispc/kernels/lanczos3.ispc
index b23cddb..a8133a7 100644
--- a/src/ispc/kernels/lanczos3.ispc
+++ b/src/ispc/kernels/lanczos3.ispc
@@ -1,6 +1,16 @@
 #include "image.ispc"
 
-const uniform float M_PI = 3.14159265358979;
+const uniform float M_PI = 3.14159265358979f;
+const uniform float GAMMA = 2.2f;
+
+struct Parameters {
+    // Whether to linearize the input before downsampling.  Assumes the input has a gamma of 1/2.2
+    // that needs to be linearized by applying exponent 2.2.
+    bool degamma;
+    // Whether to apply gamma (make the output nonlinear) to make it compatible with typical CRTs
+    // that have a gamma of 2.2, by giving linear values a gamma of 1/2.2.
+    bool gamma;
+};
 
 static inline uniform float clean(uniform float t)
 {
@@ -30,12 +40,28 @@ static inline uniform float lanczos3_filter(uniform float t)
         return 0.0f;
 }
 
-static inline float byte_to_float(uint b) {
-    //return floatbits(0x3f800000 | (b << (23 - 8))) - 1.0;
-    return (float)b;
+static inline float byte_to_float(uint8 b, uniform bool degamma) {
+    const uniform float inv_255 = rcp(255.0);
+
+    // floatbits(0x3f800000 | (b << (23 - 8))) - 1.0;
+    float d = (float)b * inv_255;
+
+    if (degamma) {
+        d = pow(d, GAMMA);
+    }
+
+    return d;
 }
 
-static inline uint8<4> resample_internal(const uniform Image src_image, const float<2> uv, const uniform uint8 num_channels) {
+static inline uint8 float_to_byte(float d, uniform bool gamma) {
+    if (gamma) {
+        d = pow(d, 1.0f / GAMMA);
+    }
+    int b = d * 255;
+    return clamp(b, 0, 255);
+}
+
+static inline uint8<4> resample_internal(const uniform Parameters params, const uniform Image src_image, const float<2> uv, const uniform uint8 num_channels) {
     float<4> col = 0.0;
     uniform float weight = 0.0;
     // Truncate floating point coordinate to integer:
@@ -61,18 +87,15 @@ static inline uint8<4> resample_internal(const uniform Image src_image, const fl
             const int addr = (src_kernel_coord.x + src_kernel_coord.y * src_image.size.x) * num_channels;
 
             float<4> texel;
-
-            const uniform float inv_255 = rcp(255.0);
-
             if (num_channels == 3) {
-                texel.x = byte_to_float(src_image.data[addr + 0]) * inv_255;
-                texel.y = byte_to_float(src_image.data[addr + 1]) * inv_255;
-                texel.z = byte_to_float(src_image.data[addr + 2]) * inv_255;
+                texel.x = byte_to_float(src_image.data[addr + 0], params.degamma);
+                texel.y = byte_to_float(src_image.data[addr + 1], params.degamma);
+                texel.z = byte_to_float(src_image.data[addr + 2], params.degamma);
             } else if (num_channels == 4) {
-                texel.x = byte_to_float(src_image.data[addr + 0]) * inv_255;
-                texel.y = byte_to_float(src_image.data[addr + 1]) * inv_255;
-                texel.z = byte_to_float(src_image.data[addr + 2]) * inv_255;
-                texel.w = byte_to_float(src_image.data[addr + 3]) * inv_255;
+                texel.x = byte_to_float(src_image.data[addr + 0], params.degamma);
+                texel.y = byte_to_float(src_image.data[addr + 1], params.degamma);
+                texel.z = byte_to_float(src_image.data[addr + 2], params.degamma);
+                texel.w = byte_to_float(src_image.data[addr + 3], params.degamma);
             }
 
             col += w * texel;
@@ -80,24 +103,33 @@ static inline uint8<4> resample_internal(const uniform Image src_image, const fl
         }
     }
     col /= weight;
-    return col * 255;
+
+    uint8<4> res;
+    for (uniform int i = 0; i < num_channels; i++)
+        res[i] = float_to_byte(col[i], params.gamma);
+
+    return res;
 }
 
-export void resample(uniform uint32 width, uniform uint32 height, uniform uint32 stride, uniform uint8 num_channels, uniform uint32 target_width, uniform uint32 target_height, uniform const uint8 src_data[], uniform uint8 out_data[]) {
-    const uniform Image src = {src_data, {width, height}};
-    const uniform float<2> target_size = {(float)target_width, (float)target_height};
-    const uniform float<2> inv_target_size = 1.0f / target_size;
+export void resample(
+    uniform const Parameters *uniform params,
+    uniform const Image *uniform src,
+    uniform Image *uniform dst,
+    // Passed separately because it should be the same between input and output:
+    uniform uint8 num_channels
+) {
+    const uniform float<2> inv_target_size = 1.0f / dst->size;
 
-    foreach_tiled (y = 0 ... target_height, x = 0 ... target_width) {
+    foreach_tiled (y = 0 ... dst->size.y, x = 0 ... dst->size.x) {
         float<2> uv = {x, y};
         // Use the center of each pixel, not the top-left:
         uv += 0.5f;
         // Convert to uniform space:
         uv *= inv_target_size;
 
-        const uint8<4> s = resample_internal(src, uv, num_channels);
+        const uint8<4> s = resample_internal(*params, *src, uv, num_channels);
 
         for (uniform int i = 0; i < num_channels; i++)
-            out_data[(x + y * target_width) * num_channels + i] = s[i];
+            dst->data[(x + y * dst->size.x) * num_channels + i] = s[i];
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 3f6d36b..add0655 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -35,6 +35,25 @@ impl<'a> Image<'a> {
     }
 }
 
+#[derive(Clone, Debug)]
+pub struct Parameters {
+    /// Whether to linearize the input before downsampling.  Assumes the input has a gamma of
+    /// `1/2.2` that needs to be linearized by applying exponent `2.2`.
+    pub degamma: bool,
+    /// Whether to apply gamma (make the output nonlinear) to make it compatible with typical CRTs
+    /// that have a gamma of `2.2`, by giving linear values a gamma of `1/2.2`.
+    pub gamma: bool,
+}
+
+impl Parameters {
+    fn to_ispc(&self) -> ispc::downsample_ispc::Parameters {
+        ispc::downsample_ispc::Parameters {
+            degamma: self.degamma,
+            gamma: self.gamma,
+        }
+    }
+}
+
 /// Scales the alpha to the downscaled texture to preserve the overall alpha coverage.
 ///
 /// If alpha cutoff is specified, any alpha value above it is considered visible of
@@ -70,25 +89,38 @@ pub fn scale_alpha_to_original_coverage(
 /// Runs the ISPC kernel on the source image, sampling it down to the `target_width` and `target_height`. Returns the downsampled pixel data as a `Vec<u8>`.
 ///
 /// Will panic if the target width or height are higher than that of the source image.
-pub fn downsample(src: &Image<'_>, target_width: u32, target_height: u32) -> Vec<u8> {
+pub fn downsample(
+    params: &Parameters,
+    src: &Image<'_>,
+    target_width: u32,
+    target_height: u32,
+) -> Vec<u8> {
     assert!(src.width >= target_width, "The width of the source image is less than the target's width. You are trying to upsample rather than downsample");
-    assert!(src.height >= target_height, "The width of the source image is less than the target's width. You are trying to upsample rather than downsample");
+    assert!(src.height >= target_height, "The height of the source image is less than the target's height. You are trying to upsample rather than downsample");
 
     let num_channels = src.format.num_channels();
+
+    let src = ispc::downsample_ispc::Image {
+        data: src.pixels.as_ptr() as *mut _,
+        __bindgen_padding_0: 0,
+        // TODO: Use the builtin type when ISPC 1.22 is released
+        // https://github.com/ispc/ispc/issues/2650
+        size: ispc::downsample_ispc::uint32_t2 {
+            v: [src.width, src.height],
+        },
+    };
+
     let mut output = vec![0; (target_width * target_height * num_channels as u32) as usize];
 
-    unsafe {
-        ispc::downsample_ispc::resample(
-            src.width,
-            src.height,
-            src.width,
-            num_channels,
-            target_width,
-            target_height,
-            src.pixels.as_ptr(),
-            output.as_mut_ptr(),
-        )
-    }
+    let mut dst = ispc::downsample_ispc::Image {
+        data: output.as_mut_ptr(),
+        __bindgen_padding_0: 0,
+        size: ispc::downsample_ispc::uint32_t2 {
+            v: [target_width, target_height],
+        },
+    };
+
+    unsafe { ispc::downsample_ispc::resample(&params.to_ispc(), &src, &mut dst, num_channels) }
 
     output
 }

From 8df604ae8db2d9d9acd2ce7824aaa4da5888d979 Mon Sep 17 00:00:00 2001
From: Marijn Suijten <marijn@traverseresearch.nl>
Date: Sat, 26 Aug 2023 11:21:07 +0200
Subject: [PATCH 3/4] WIP: Preprocess degamma

---
 src/ispc/kernels/lanczos3.ispc | 55 ++++++++++++++++------------------
 1 file changed, 26 insertions(+), 29 deletions(-)

diff --git a/src/ispc/kernels/lanczos3.ispc b/src/ispc/kernels/lanczos3.ispc
index a8133a7..0371160 100644
--- a/src/ispc/kernels/lanczos3.ispc
+++ b/src/ispc/kernels/lanczos3.ispc
@@ -2,6 +2,7 @@
 
 const uniform float M_PI = 3.14159265358979f;
 const uniform float GAMMA = 2.2f;
+const uniform float DEGAMMA = 1.0f / GAMMA;
 
 struct Parameters {
     // Whether to linearize the input before downsampling.  Assumes the input has a gamma of 1/2.2
@@ -40,28 +41,21 @@ static inline uniform float lanczos3_filter(uniform float t)
         return 0.0f;
 }
 
-static inline float byte_to_float(uint8 b, uniform bool degamma) {
+static inline float byte_to_float(uint8 b/*, uniform bool degamma*/) {
     const uniform float inv_255 = rcp(255.0);
-
     // floatbits(0x3f800000 | (b << (23 - 8))) - 1.0;
-    float d = (float)b * inv_255;
-
-    if (degamma) {
-        d = pow(d, GAMMA);
-    }
-
-    return d;
+    return (float)b * inv_255;
 }
 
-static inline uint8 float_to_byte(float d, uniform bool gamma) {
+static inline uint8 float_to_byte(float d, bool gamma) {
     if (gamma) {
-        d = pow(d, 1.0f / GAMMA);
+        d = pow(d, DEGAMMA);
     }
     int b = d * 255;
     return clamp(b, 0, 255);
 }
 
-static inline uint8<4> resample_internal(const uniform Parameters params, const uniform Image src_image, const float<2> uv, const uniform uint8 num_channels) {
+static inline float<4> resample_internal(const uniform Image src_image, const float<2> uv, const uniform uint8 num_channels) {
     float<4> col = 0.0;
     uniform float weight = 0.0;
     // Truncate floating point coordinate to integer:
@@ -87,28 +81,19 @@ static inline uint8<4> resample_internal(const uniform Parameters params, const
             const int addr = (src_kernel_coord.x + src_kernel_coord.y * src_image.size.x) * num_channels;
 
             float<4> texel;
-            if (num_channels == 3) {
-                texel.x = byte_to_float(src_image.data[addr + 0], params.degamma);
-                texel.y = byte_to_float(src_image.data[addr + 1], params.degamma);
-                texel.z = byte_to_float(src_image.data[addr + 2], params.degamma);
-            } else if (num_channels == 4) {
-                texel.x = byte_to_float(src_image.data[addr + 0], params.degamma);
-                texel.y = byte_to_float(src_image.data[addr + 1], params.degamma);
-                texel.z = byte_to_float(src_image.data[addr + 2], params.degamma);
-                texel.w = byte_to_float(src_image.data[addr + 3], params.degamma);
-            }
+            texel.x = byte_to_float(src_image.data[addr + 0]);
+            texel.y = byte_to_float(src_image.data[addr + 1]);
+            texel.z = byte_to_float(src_image.data[addr + 2]);
+            if (num_channels >= 4)
+                texel.w = byte_to_float(src_image.data[addr + 3]);
 
             col += w * texel;
             weight += w;
         }
     }
     col /= weight;
+    return col;
 
-    uint8<4> res;
-    for (uniform int i = 0; i < num_channels; i++)
-        res[i] = float_to_byte(col[i], params.gamma);
-
-    return res;
 }
 
 export void resample(
@@ -120,6 +105,18 @@ export void resample(
 ) {
     const uniform float<2> inv_target_size = 1.0f / dst->size;
 
+    if (params->degamma) {
+        foreach_tiled(y = 0 ... src->size.y, x = 0 ... src->size.x)
+        {
+            uint p = (x + y * src->size.x) * num_channels;
+            for (uniform int i = 0; i < num_channels; i++) {
+                uint c = p + i;
+                // TODO: This texture should be writeonly!
+                src->data[c] = float_to_byte(pow(byte_to_float(src->data[c]), GAMMA), false);
+            }
+        }
+    }
+
     foreach_tiled (y = 0 ... dst->size.y, x = 0 ... dst->size.x) {
         float<2> uv = {x, y};
         // Use the center of each pixel, not the top-left:
@@ -127,9 +124,9 @@ export void resample(
         // Convert to uniform space:
         uv *= inv_target_size;
 
-        const uint8<4> s = resample_internal(*params, *src, uv, num_channels);
+        const float<4> col = resample_internal(*src, uv, num_channels);
 
         for (uniform int i = 0; i < num_channels; i++)
-            dst->data[(x + y * dst->size.x) * num_channels + i] = s[i];
+            dst->data[(x + y * dst->size.x) * num_channels + i] = float_to_byte(col[i], params->gamma);
     }
 }

From 0be3323eed2f7245a51d48f8dfb3a30cb2b08043 Mon Sep 17 00:00:00 2001
From: Marijn Suijten <marijn@traverseresearch.nl>
Date: Tue, 21 Nov 2023 20:45:50 +0100
Subject: [PATCH 4/4] fixup! Perform `degamma` and `gamma` conversions on user
 request

---
 examples/test.rs               |  2 +-
 src/ispc/downsample_ispc.rs    | 44 +++++++++++++++++
 src/ispc/kernels/image.ispc    |  5 ++
 src/ispc/kernels/lanczos3.ispc | 90 +++++++++++++++++++++++-----------
 src/lib.rs                     | 25 ++++++++--
 5 files changed, 133 insertions(+), 33 deletions(-)

diff --git a/examples/test.rs b/examples/test.rs
index fa1118a..67f5a65 100644
--- a/examples/test.rs
+++ b/examples/test.rs
@@ -28,7 +28,7 @@ fn main() {
             println!("Downsampling started!");
             let params = Parameters {
                 // Input stb Image is gamma-corrected (i.e. expects to be passed through a CRT with exponent 2.2)
-                degamma: true,
+                degamma: false,
                 // Output image is PNG which must be stored with a gamma of 1/2.2
                 gamma: true,
             };
diff --git a/src/ispc/downsample_ispc.rs b/src/ispc/downsample_ispc.rs
index 404c62b..e79bce5 100644
--- a/src/ispc/downsample_ispc.rs
+++ b/src/ispc/downsample_ispc.rs
@@ -117,10 +117,54 @@ fn bindgen_test_layout_Image() {
         )
     );
 }
+#[repr(C)]
+#[repr(align(16))]
+#[derive(Debug, Copy, Clone)]
+pub struct FloatImage {
+    pub data: *mut f32,
+    pub __bindgen_padding_0: u64,
+    pub size: uint32_t2,
+}
+#[test]
+fn bindgen_test_layout_FloatImage() {
+    const UNINIT: ::std::mem::MaybeUninit<FloatImage> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<FloatImage>(),
+        32usize,
+        concat!("Size of: ", stringify!(FloatImage))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<FloatImage>(),
+        16usize,
+        concat!("Alignment of ", stringify!(FloatImage))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).data) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(FloatImage),
+            "::",
+            stringify!(data)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).size) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(FloatImage),
+            "::",
+            stringify!(size)
+        )
+    );
+}
 extern "C" {
     pub fn resample(
         params: *const Parameters,
         src: *const Image,
+        degamma: *mut FloatImage,
         dst: *mut Image,
         num_channels: u8,
     );
diff --git a/src/ispc/kernels/image.ispc b/src/ispc/kernels/image.ispc
index a37b260..66f5f16 100644
--- a/src/ispc/kernels/image.ispc
+++ b/src/ispc/kernels/image.ispc
@@ -2,3 +2,8 @@ struct Image {
     uniform uint8* data;
     uniform uint<2> size;
 };
+
+struct FloatImage {
+    uniform float* data;
+    uniform uint<2> size;
+};
diff --git a/src/ispc/kernels/lanczos3.ispc b/src/ispc/kernels/lanczos3.ispc
index 0371160..be0b23d 100644
--- a/src/ispc/kernels/lanczos3.ispc
+++ b/src/ispc/kernels/lanczos3.ispc
@@ -47,7 +47,7 @@ static inline float byte_to_float(uint8 b/*, uniform bool degamma*/) {
     return (float)b * inv_255;
 }
 
-static inline uint8 float_to_byte(float d, bool gamma) {
+static inline uint8 float_to_byte(float d, uniform bool gamma) {
     if (gamma) {
         d = pow(d, DEGAMMA);
     }
@@ -55,7 +55,45 @@ static inline uint8 float_to_byte(float d, bool gamma) {
     return clamp(b, 0, 255);
 }
 
-static inline float<4> resample_internal(const uniform Image src_image, const float<2> uv, const uniform uint8 num_channels) {
+template<typename IT>
+static float<4> sample_image(const uniform IT &image, const int<2> coord, const uniform uint8 num_channels) {
+    return 0.0f;
+}
+
+template<>
+static float<4> sample_image<Image>(const uniform Image &image, const int<2> coord, const uniform uint8 num_channels) {
+    float<4> col = 0.0;
+    int x = clamp(coord.x, 0, image.size.x - 1);
+    int y = clamp(coord.y, 0, image.size.y - 1);
+    int addr = (x + y * image.size.x) * num_channels;
+
+    col[0] = byte_to_float(image.data[addr + 0]);
+    col[1] = byte_to_float(image.data[addr + 1]);
+    col[2] = byte_to_float(image.data[addr + 2]);
+    if (num_channels == 4)
+        col[3] = byte_to_float(image.data[addr + 3]);
+
+    return col;
+}
+
+template<>
+static float<4> sample_image<FloatImage>(const uniform FloatImage &image, const int<2> coord, const uniform uint8 num_channels) {
+    float<4> col = 0.0;
+    int x = clamp(coord.x, 0, image.size.x - 1);
+    int y = clamp(coord.y, 0, image.size.y - 1);
+    int addr = (x + y * image.size.x) * num_channels;
+
+    col[0] = image.data[addr + 0];
+    col[1] = image.data[addr + 1];
+    col[2] = image.data[addr + 2];
+    if (num_channels == 4)
+        col[3] = image.data[addr + 3];
+
+    return col;
+}
+
+template<typename IT>
+static inline float<4> resample_internal(const uniform IT src_image, const float<2> uv, const uniform uint8 num_channels) {
     float<4> col = 0.0;
     uniform float weight = 0.0;
     // Truncate floating point coordinate to integer:
@@ -72,61 +110,57 @@ static inline float<4> resample_internal(const uniform Image src_image, const fl
             const uniform float w = wx * wy;
             const uniform int<2> texel_offset = {x, y};
 
-            int<2> src_kernel_coord = src_coord + texel_offset;
+            int<2> c = src_coord + texel_offset;
             // TODO: Let the user specify a boundary mode!
             // https://github.com/Traverse-Research/ispc-downsampler/issues/25#issuecomment-1584915050
-            src_kernel_coord.x = clamp(src_kernel_coord.x, 0, src_image.size.x - 1);
-            src_kernel_coord.y = clamp(src_kernel_coord.y, 0, src_image.size.y - 1);
-
-            const int addr = (src_kernel_coord.x + src_kernel_coord.y * src_image.size.x) * num_channels;
+            // TODO: Find out why clamping here, rather than inside sample_image(), lets coordinates go below 0
+            // and segfaults. Until that is understood the clamp lives in sample_image().
+            // c.x = clamp(c.x, 0, src_image.size.x - 1);
+            // c.y = clamp(c.y, 0, src_image.size.y - 1);
 
-            float<4> texel;
-            texel.x = byte_to_float(src_image.data[addr + 0]);
-            texel.y = byte_to_float(src_image.data[addr + 1]);
-            texel.z = byte_to_float(src_image.data[addr + 2]);
-            if (num_channels >= 4)
-                texel.w = byte_to_float(src_image.data[addr + 3]);
-
-            col += w * texel;
             weight += w;
+            col += w * sample_image<IT>(src_image, c, num_channels);
         }
     }
     col /= weight;
     return col;
-
 }
 
 export void resample(
-    uniform const Parameters *uniform params,
-    uniform const Image *uniform src,
-    uniform Image *uniform dst,
+    uniform const Parameters &params,
+    uniform const Image &src,
+    uniform FloatImage &degamma, // Float copy of src; only written and sampled when params.degamma is set.
+    uniform Image &dst,
     // Passed separately because it should be the same between input and output:
     uniform uint8 num_channels
 ) {
-    const uniform float<2> inv_target_size = 1.0f / dst->size;
+    const uniform float<2> inv_target_size = 1.0f / dst.size;
 
-    if (params->degamma) {
-        foreach_tiled(y = 0 ... src->size.y, x = 0 ... src->size.x)
+    if (params.degamma) { // First pass: fill degamma from src, one value per channel.
+        foreach_tiled(y = 0 ... src.size.y, x = 0 ... src.size.x)
         {
-            uint p = (x + y * src->size.x) * num_channels;
+            uint p = (x + y * src.size.x) * num_channels;
             for (uniform int i = 0; i < num_channels; i++) {
                 uint c = p + i;
-                // TODO: This texture should be writeonly!
-                src->data[c] = float_to_byte(pow(byte_to_float(src->data[c]), GAMMA), false);
+                degamma.data[c] = pow(byte_to_float(src.data[c]), GAMMA); // Stored as float; src is not written.
             }
         }
     }
 
-    foreach_tiled (y = 0 ... dst->size.y, x = 0 ... dst->size.x) {
+    foreach_tiled (y = 0 ... dst.size.y, x = 0 ... dst.size.x) {
         float<2> uv = {x, y};
         // Use the center of each pixel, not the top-left:
         uv += 0.5f;
         // Convert to uniform space:
         uv *= inv_target_size;
 
-        const float<4> col = resample_internal(*src, uv, num_channels);
+        float<4> col;
+        if (params.degamma) // Sample the float copy filled above; otherwise sample src directly.
+            col = resample_internal(degamma, uv, num_channels);
+        else
+            col = resample_internal(src, uv, num_channels);
 
         for (uniform int i = 0; i < num_channels; i++)
-            dst->data[(x + y * dst->size.x) * num_channels + i] = float_to_byte(col[i], params->gamma);
+            dst.data[(x + y * dst.size.x) * num_channels + i] = float_to_byte(col[i], params.gamma);
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index add0655..fd03711 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -100,16 +100,25 @@ pub fn downsample(
 
     let num_channels = src.format.num_channels();
 
-    let src = ispc::downsample_ispc::Image {
+    let src_raw = ispc::downsample_ispc::Image {
         data: src.pixels.as_ptr() as *mut _,
         __bindgen_padding_0: 0,
-        // TODO: Use the builtin type when ISPC 1.22 is released
-        // https://github.com/ispc/ispc/issues/2650
         size: ispc::downsample_ispc::uint32_t2 {
             v: [src.width, src.height],
         },
     };
 
+    // Owns the degamma'd pixels; declared outside the closure so the buffer outlives `resample`.
+    let degamma_len = (src.width * src.height * num_channels as u32) as usize;
+    let mut degamma_pixels = vec![0f32; if params.degamma { degamma_len } else { 0 }];
+    let mut degamma = params.degamma.then(|| ispc::downsample_ispc::FloatImage {
+        data: degamma_pixels.as_mut_ptr(),
+        __bindgen_padding_0: 0,
+        size: ispc::downsample_ispc::uint32_t2 {
+            v: [src.width, src.height],
+        },
+    });
+
     let mut output = vec![0; (target_width * target_height * num_channels as u32) as usize];
 
     let mut dst = ispc::downsample_ispc::Image {
@@ -120,7 +129,15 @@ pub fn downsample(
         },
     };
 
-    unsafe { ispc::downsample_ispc::resample(&params.to_ispc(), &src, &mut dst, num_channels) }
+    unsafe {
+        ispc::downsample_ispc::resample(
+            &params.to_ispc(),
+            &src_raw,
+            degamma.as_mut().map_or(std::ptr::null_mut(), |x| x),
+            &mut dst,
+            num_channels,
+        )
+    }
 
     output
 }