diff --git a/benches/basic.rs b/benches/basic.rs
index e2584df..5e68b0e 100644
--- a/benches/basic.rs
+++ b/benches/basic.rs
@@ -1,5 +1,5 @@
 use criterion::{criterion_group, criterion_main, Criterion};
-use ispc_downsampler::{downsample, Format, Image};
+use ispc_downsampler::{downsample, Format, Image, Parameters};
 use resize::{px::RGB, Type::Lanczos3};
 use stb_image::image::{load, LoadResult};
 use std::path::Path;
@@ -17,8 +17,15 @@ pub fn ispc_downsampler(c: &mut Criterion) {
         let target_width = (img.width / 4) as u32;
         let target_height = (img.height / 4) as u32;
 
+        let params = Parameters {
+            // Input stb Image is gamma-corrected (i.e. expects to be passed through a CRT with exponent 2.2)
+            degamma: true,
+            // Output image is PNG which must be stored with a gamma of 1/2.2
+            gamma: true,
+        };
+
         c.bench_function("Downsample `square_test.png` using ispc_downsampler", |b| {
-            b.iter(|| downsample(&src_img, target_width, target_height))
+            b.iter(|| downsample(&params, &src_img, target_width, target_height))
         });
     }
 }
diff --git a/examples/test.rs b/examples/test.rs
index 3d58186..67f5a65 100644
--- a/examples/test.rs
+++ b/examples/test.rs
@@ -1,5 +1,5 @@
 use image::{RgbImage, RgbaImage};
-use ispc_downsampler::{downsample, Format, Image};
+use ispc_downsampler::{downsample, Format, Image, Parameters};
 use stb_image::image::{load, LoadResult};
 use std::path::Path;
 use std::time::Instant;
@@ -26,7 +26,13 @@ fn main() {
 
             let now = Instant::now();
             println!("Downsampling started!");
-            let downsampled_pixels = downsample(&src_img, target_width, target_height);
+            let params = Parameters {
+                // Input stb Image is gamma-corrected (i.e. expects to be passed through a CRT with exponent 2.2)
+                degamma: true,
+                // Output image is PNG which must be stored with a gamma of 1/2.2
+                gamma: true,
+            };
+            let downsampled_pixels = downsample(&params, &src_img, target_width, target_height);
             println!("Finished downsampling in {:.2?}!", now.elapsed());
 
             std::fs::create_dir_all("example_outputs").unwrap();
diff --git a/src/ispc/downsample_ispc.rs b/src/ispc/downsample_ispc.rs
index 044bf5f..e79bce5 100644
--- a/src/ispc/downsample_ispc.rs
+++ b/src/ispc/downsample_ispc.rs
@@ -2,16 +2,171 @@
 pub mod downsample_ispc {
 /* automatically generated by rust-bindgen 0.61.0 */
 
+#[repr(C)]
+#[repr(align(16))]
+#[derive(Debug, Copy, Clone)]
+pub struct uint32_t2 {
+    pub v: [u32; 2usize],
+}
+#[test]
+fn bindgen_test_layout_uint32_t2() {
+    const UNINIT: ::std::mem::MaybeUninit<uint32_t2> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<uint32_t2>(),
+        16usize,
+        concat!("Size of: ", stringify!(uint32_t2))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<uint32_t2>(),
+        16usize,
+        concat!("Alignment of ", stringify!(uint32_t2))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).v) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(uint32_t2),
+            "::",
+            stringify!(v)
+        )
+    );
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct Parameters {
+    pub degamma: bool,
+    pub gamma: bool,
+}
+#[test]
+fn bindgen_test_layout_Parameters() {
+    const UNINIT: ::std::mem::MaybeUninit<Parameters> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<Parameters>(),
+        2usize,
+        concat!("Size of: ", stringify!(Parameters))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<Parameters>(),
+        1usize,
+        concat!("Alignment of ", stringify!(Parameters))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).degamma) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(Parameters),
+            "::",
+            stringify!(degamma)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).gamma) as usize - ptr as usize },
+        1usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(Parameters),
+            "::",
+            stringify!(gamma)
+        )
+    );
+}
+#[repr(C)]
+#[repr(align(16))]
+#[derive(Debug, Copy, Clone)]
+pub struct Image {
+    pub data: *mut u8,
+    pub __bindgen_padding_0: u64,
+    pub size: uint32_t2,
+}
+#[test]
+fn bindgen_test_layout_Image() {
+    const UNINIT: ::std::mem::MaybeUninit<Image> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<Image>(),
+        32usize,
+        concat!("Size of: ", stringify!(Image))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<Image>(),
+        16usize,
+        concat!("Alignment of ", stringify!(Image))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).data) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(Image),
+            "::",
+            stringify!(data)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).size) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(Image),
+            "::",
+            stringify!(size)
+        )
+    );
+}
+#[repr(C)]
+#[repr(align(16))]
+#[derive(Debug, Copy, Clone)]
+pub struct FloatImage {
+    pub data: *mut f32,
+    pub __bindgen_padding_0: u64,
+    pub size: uint32_t2,
+}
+#[test]
+fn bindgen_test_layout_FloatImage() {
+    const UNINIT: ::std::mem::MaybeUninit<FloatImage> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<FloatImage>(),
+        32usize,
+        concat!("Size of: ", stringify!(FloatImage))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<FloatImage>(),
+        16usize,
+        concat!("Alignment of ", stringify!(FloatImage))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).data) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(FloatImage),
+            "::",
+            stringify!(data)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).size) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(FloatImage),
+            "::",
+            stringify!(size)
+        )
+    );
+}
 extern "C" {
     pub fn resample(
-        width: u32,
-        height: u32,
-        stride: u32,
+        params: *const Parameters,
+        src: *const Image,
+        degamma: *mut FloatImage,
+        dst: *mut Image,
         num_channels: u8,
-        target_width: u32,
-        target_height: u32,
-        src_data: *const u8,
-        out_data: *mut u8,
     );
 }
 extern "C" {
diff --git a/src/ispc/kernels/image.ispc b/src/ispc/kernels/image.ispc
index c906c24..66f5f16 100644
--- a/src/ispc/kernels/image.ispc
+++ b/src/ispc/kernels/image.ispc
@@ -1,4 +1,9 @@
 struct Image {
     uniform uint8* data;
-    uniform int<2> size;
+    uniform uint<2> size;
+};
+
+struct FloatImage { // Same layout as Image, but holds float samples instead of bytes
+    uniform float* data;
+    uniform uint<2> size; // (width, height) in pixels; the kernels read it as .x and .y
 };
diff --git a/src/ispc/kernels/lanczos3.ispc b/src/ispc/kernels/lanczos3.ispc
index 6767450..be0b23d 100644
--- a/src/ispc/kernels/lanczos3.ispc
+++ b/src/ispc/kernels/lanczos3.ispc
@@ -1,18 +1,29 @@
 #include "image.ispc"
 
-#define M_PI 3.14159265358979
-
-static inline float clean(float t)
+const uniform float M_PI = 3.14159265358979f;
+const uniform float GAMMA = 2.2f; // Exponent that linearizes gamma-encoded input samples
+const uniform float DEGAMMA = 1.0f / GAMMA; // Exponent that gamma-encodes linear output samples
+
+struct Parameters {
+    // Whether to linearize the input before downsampling.  Assumes the input is encoded with a
+    // gamma of 1/2.2; resample() decodes it with exponent 2.2 into the float scratch image.
+    bool degamma;
+    // Whether to re-apply a gamma of 1/2.2 to the filtered result as it is written to the
+    // destination, making it suitable for displays that expect gamma-2.2 encoded data.
+    bool gamma;
+};
+
+static inline uniform float clean(uniform float t)
 {
-    const float EPSILON = .0000125f;
+    const uniform float EPSILON = .0000125f;
     if (abs(t) < EPSILON)
         return 0.0f;
-    return (float)t;
+    return t;
 }
 
-static inline float sinc(float x)
+static inline uniform float sinc(uniform float x)
 {
-    x = (x * M_PI);
+    x = x * M_PI;
 
     // if ((x < 0.01f) && (x > -0.01f))
     //     return 1.0f + x * x * (-1.0f / 6.0f + x * x * 1.0f / 120.0f);
@@ -20,7 +31,7 @@ static inline float sinc(float x)
     return sin(x) / x;
 }
 
-static inline float lanczos3_filter(float t)
+static inline uniform float lanczos3_filter(uniform float t)
 {
     t = abs(t);
 
@@ -30,19 +41,61 @@ static inline float lanczos3_filter(float t)
         return 0.0f;
 }
 
-static inline float frac(float f) {
-    float absf = abs(f);
-    return absf - floor(absf);
+static inline float byte_to_float(uint8 b) {
+    // Maps [0, 255] to [0, 1] by multiplying with a reciprocal that is computed once (uniform).
+    const uniform float scale = rcp(255.0);
+    const float value = (float)b;
+    return value * scale;
+}
+
+static inline uint8 float_to_byte(float d, uniform bool gamma) {
+    // Ringing can push d outside [0, 1] and pow() of a negative base is NaN, so clamp first.
+    d = clamp(d, 0.0f, 1.0f);
+    if (gamma)
+        d = pow(d, DEGAMMA);
+    // Round to nearest rather than truncate, so values a hair below n/255 still map to n.
+    return clamp((int)(d * 255.0f + 0.5f), 0, 255);
+}
+
+template<typename IT> // Primary template; real sampling lives in the Image/FloatImage specializations
+static float<4> sample_image(const uniform IT &image, const int<2> coord, const uniform uint8 num_channels) {
+    return 0.0f; // Fallback for image types without a specialization: every channel is zero
+}
+
+template<> // Fetches one texel from an 8-bit image, converted to floats in [0, 1]
+static float<4> sample_image<Image>(const uniform Image &image, const int<2> coord, const uniform uint8 num_channels) {
+    float<4> col = 0.0;
+    int x = clamp(coord.x, 0, image.size.x - 1); // Clamp-to-edge addressing
+    int y = clamp(coord.y, 0, image.size.y - 1);
+    int addr = (x + y * image.size.x) * num_channels;
+    // Three channels are always read; the fourth only for 4-channel images (it stays 0 otherwise).
+    col[0] = byte_to_float(image.data[addr + 0]);
+    col[1] = byte_to_float(image.data[addr + 1]);
+    col[2] = byte_to_float(image.data[addr + 2]);
+    if (num_channels == 4)
+        col[3] = byte_to_float(image.data[addr + 3]);
+
+    return col;
+}
 
-static inline float byte_to_float(uint b) {
-    //return floatbits(0x3f800000 | (b << (23 - 8))) - 1.0;
-    return (float)b;
+template<> // Fetches one texel from a float image; samples are returned exactly as stored
+static float<4> sample_image<FloatImage>(const uniform FloatImage &image, const int<2> coord, const uniform uint8 num_channels) {
+    float<4> col = 0.0;
+    int x = clamp(coord.x, 0, image.size.x - 1); // Clamp-to-edge addressing
+    int y = clamp(coord.y, 0, image.size.y - 1);
+    int addr = (x + y * image.size.x) * num_channels;
+    // Three channels are always read; the fourth only for 4-channel images (it stays 0 otherwise).
+    col[0] = image.data[addr + 0];
+    col[1] = image.data[addr + 1];
+    col[2] = image.data[addr + 2];
+    if (num_channels == 4)
+        col[3] = image.data[addr + 3];
+
+    return col;
 }
 
-static inline uint8<4> resample_internal(uniform Image src_image, float<2> uv, uniform uint8 num_channels) {
+template<typename IT>
+static inline float<4> resample_internal(const uniform IT src_image, const float<2> uv, const uniform uint8 num_channels) {
     float<4> col = 0.0;
-    float weight = 0.0;
+    uniform float weight = 0.0;
     // Truncate floating point coordinate to integer:
     const int<2> src_coord = uv * src_image.size;
 
@@ -52,57 +105,62 @@ static inline uint8<4> resample_internal(uniform Image src_image, float<2> uv, u
     // right and bottom of the target pixel.
     for (uniform int x = -3; x < 3; x++) {
         for (uniform int y = -3; y < 3; y++) {
-            float wx = lanczos3_filter((uniform float)x + 0.5);
-            float wy = lanczos3_filter((uniform float)y + 0.5);
+            const uniform float wx = lanczos3_filter((uniform float)x + 0.5);
+            const uniform float wy = lanczos3_filter((uniform float)y + 0.5);
+            const uniform float w = wx * wy;
+            const uniform int<2> texel_offset = {x, y};
 
-            float w = wx * wy;
-            int<2> texel_offset = {x, y};
-            int<2> src_kernel_coord = src_coord + texel_offset;
+            int<2> c = src_coord + texel_offset;
             // TODO: Let the user specify a boundary mode!
             // https://github.com/Traverse-Research/ispc-downsampler/issues/25#issuecomment-1584915050
-            src_kernel_coord.x = clamp(src_kernel_coord.x, 0, src_image.size.x - 1);
-            src_kernel_coord.y = clamp(src_kernel_coord.y, 0, src_image.size.y - 1);
-
-            int addr = (src_kernel_coord.x + src_kernel_coord.y * src_image.size.x) * num_channels;
-
-            float<4> texel;
-
-            const float inv_255 = rcp(255.0);
+            // TODO: For some obscure reason this must happen in sample_image() or the whole thing segfaults because
+            // values become <0 !?!?
+            // c.x = clamp(c.x, 0, src_image.size.x - 1);
+            // c.y = clamp(c.y, 0, src_image.size.y - 1);
 
-            if (num_channels == 3) {
-                texel.x = byte_to_float(src_image.data[addr + 0]) * inv_255;
-                texel.y = byte_to_float(src_image.data[addr + 1]) * inv_255;
-                texel.z = byte_to_float(src_image.data[addr + 2]) * inv_255;
-            } else if (num_channels == 4) {
-                texel.x = byte_to_float(src_image.data[addr + 0]) * inv_255;
-                texel.y = byte_to_float(src_image.data[addr + 1]) * inv_255;
-                texel.z = byte_to_float(src_image.data[addr + 2]) * inv_255;
-                texel.w = byte_to_float(src_image.data[addr + 3]) * inv_255;
-            }
-
-            col += w * texel;
             weight += w;
+            col += w * sample_image<IT>(src_image, c, num_channels);
         }
     }
     col /= weight;
-    return col * 255;
+    return col;
 }
 
-export void resample(uniform uint32 width, uniform uint32 height, uniform uint32 stride, uniform uint8 num_channels, uniform uint32 target_width, uniform uint32 target_height, uniform const uint8 src_data[], uniform uint8 out_data[]) {
-    uniform Image src = {src_data, {width, height}};
-    uniform float<2> target_size = {(float)target_width, (float)target_height};
-    uniform float<2> inv_target_size = 1.0f / target_size;
+export void resample(
+    uniform const Parameters &params,
+    uniform const Image &src,
+    uniform FloatImage &degamma, // Scratch for the linearized source; only used if params.degamma
+    uniform Image &dst,
+    // Passed separately because it should be the same between input and output:
+    uniform uint8 num_channels
+) {
+    const uniform float<2> inv_target_size = 1.0f / dst.size;
+    // Optional first pass: decode the whole source into the float scratch image.
+    if (params.degamma) {
+        foreach_tiled (y = 0 ... src.size.y, x = 0 ... src.size.x) {
+            uint p = (x + y * src.size.x) * num_channels;
+            for (uniform int i = 0; i < num_channels; i++) {
+                const float v = byte_to_float(src.data[p + i]);
+                // Only the colour channels are gamma-encoded; channel 3 (alpha) is already linear.
+                degamma.data[p + i] = i < 3 ? pow(v, GAMMA) : v;
+            }
+        }
+    }
 
-    foreach_tiled (y = 0 ... target_height, x = 0 ... target_width) {
+    foreach_tiled (y = 0 ... dst.size.y, x = 0 ... dst.size.x) {
         float<2> uv = {x, y};
         // Use the center of each pixel, not the top-left:
         uv += 0.5f;
         // Convert to uniform space:
         uv *= inv_target_size;
 
-        uint8<4> s = resample_internal(src, uv, num_channels);
+        float<4> col;
+        if (params.degamma)
+            col = resample_internal(degamma, uv, num_channels);
+        else
+            col = resample_internal(src, uv, num_channels);
 
         for (uniform int i = 0; i < num_channels; i++)
-            out_data[(x + y * target_width) * num_channels + i] = s[i];
+            dst.data[(x + y * dst.size.x) * num_channels + i] = float_to_byte(col[i], params.gamma && i < 3); // alpha stays linear
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 3f6d36b..fd03711 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -35,6 +35,25 @@ impl<'a> Image<'a> {
     }
 }
 
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
+pub struct Parameters {
+    /// Whether to linearize the input before downsampling.  Assumes the input has a gamma of
+    /// `1/2.2` that needs to be linearized by applying exponent `2.2`.
+    pub degamma: bool,
+    /// Whether to apply gamma (make the output nonlinear) to make it compatible with typical CRTs
+    /// that have a gamma of `2.2`, by giving linear values a gamma of `1/2.2`.
+    pub gamma: bool,
+}
+
+impl Parameters {
+    /// Builds the `#[repr(C)]` mirror of these flags that the ISPC `resample()` kernel takes by
+    /// pointer.
+    fn to_ispc(&self) -> ispc::downsample_ispc::Parameters {
+        let &Self { degamma, gamma } = self;
+        ispc::downsample_ispc::Parameters { degamma, gamma }
+    }
+}
+
 /// Scales the alpha to the downscaled texture to preserve the overall alpha coverage.
 ///
 /// If alpha cutoff is specified, any alpha value above it is considered visible of
@@ -70,23 +89,53 @@ pub fn scale_alpha_to_original_coverage(
 /// Runs the ISPC kernel on the source image, sampling it down to the `target_width` and `target_height`. Returns the downsampled pixel data as a `Vec<u8>`.
 ///
 /// Will panic if the target width or height are higher than that of the source image.
-pub fn downsample(src: &Image<'_>, target_width: u32, target_height: u32) -> Vec<u8> {
+pub fn downsample(
+    params: &Parameters,
+    src: &Image<'_>,
+    target_width: u32,
+    target_height: u32,
+) -> Vec<u8> {
     assert!(src.width >= target_width, "The width of the source image is less than the target's width. You are trying to upsample rather than downsample");
-    assert!(src.height >= target_height, "The width of the source image is less than the target's width. You are trying to upsample rather than downsample");
+    assert!(src.height >= target_height, "The height of the source image is less than the target's height. You are trying to upsample rather than downsample");
 
     let num_channels = src.format.num_channels();
+
+    let src_raw = ispc::downsample_ispc::Image {
+        data: src.pixels.as_ptr() as *mut _,
+        __bindgen_padding_0: 0,
+        size: ispc::downsample_ispc::uint32_t2 {
+            v: [src.width, src.height],
+        },
+    };
+
+    // Owned here (not inside the closure) so the buffer outlives the `resample()` call below.
+    let degamma_len = src.width as usize * src.height as usize * num_channels as usize;
+    let mut degamma_pixels = vec![0f32; if params.degamma { degamma_len } else { 0 }];
+    let mut degamma = params.degamma.then(|| ispc::downsample_ispc::FloatImage {
+        data: degamma_pixels.as_mut_ptr(),
+        __bindgen_padding_0: 0,
+        size: ispc::downsample_ispc::uint32_t2 {
+            v: [src.width, src.height],
+        },
+    });
+
     let mut output = vec![0; (target_width * target_height * num_channels as u32) as usize];
 
+    let mut dst = ispc::downsample_ispc::Image {
+        data: output.as_mut_ptr(),
+        __bindgen_padding_0: 0,
+        size: ispc::downsample_ispc::uint32_t2 {
+            v: [target_width, target_height],
+        },
+    };
+
     unsafe {
         ispc::downsample_ispc::resample(
-            src.width,
-            src.height,
-            src.width,
+            &params.to_ispc(),
+            &src_raw,
+            degamma.as_mut().map_or(std::ptr::null_mut(), |x| x),
+            &mut dst,
             num_channels,
-            target_width,
-            target_height,
-            src.pixels.as_ptr(),
-            output.as_mut_ptr(),
         )
     }