diff --git a/README.md b/README.md
index 4203d15..a795f7c 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ The crate comes with the bindings and precompiled libraries for Windows, Linux a
 
 ## Usage
 
-Create a new `ispc_downsampler::Image` from a slice of the texture's pixels, the dimensions of the source image, and the format it is in. Currently only works with RGB8 and RGBA8 textures.
+Create a new `ispc_downsampler::Image` from a slice of the texture's pixels, the dimensions of the source image, and the format it is in.
 Call `ispc_downsampler::downsample` with the source image, and the target dimension for downsampled image. The function will return a `Vec<u8>` with the pixels of the downsampled image in the same format as the source image.
 
 #### Example
diff --git a/examples/test.rs b/examples/test.rs
index 2b7ba25..00e3291 100644
--- a/examples/test.rs
+++ b/examples/test.rs
@@ -1,4 +1,4 @@
-use image::{RgbImage, RgbaImage};
+use image::{GrayAlphaImage, GrayImage, RgbImage, RgbaImage};
 use ispc_downsampler::{downsample_with_custom_scale, Format, Image};
 use stb_image::image::{load, LoadResult};
 use std::path::Path;
@@ -11,10 +11,18 @@ fn main() {
         LoadResult::ImageU8(img) => {
             assert!(!img.data.is_empty());
 
-            let src_fmt = if img.data.len() / (img.width * img.height) == 4 {
+            let num_channels = img.data.len() / (img.width * img.height);
+
+            let src_fmt = if num_channels == 4 {
                 Format::Rgba8
-            } else {
+            } else if num_channels == 3 {
                 Format::Rgb8
+            } else if num_channels == 2 {
+                Format::Rg8
+            } else if num_channels == 1 {
+                Format::R8
+            } else {
+                panic!("We expect a number of channels in the [1, 4] range");
             };
 
             println!("Loaded image!");
@@ -32,6 +40,22 @@ fn main() {
 
             std::fs::create_dir_all("example_outputs").unwrap();
             match src_fmt {
+                Format::R8 => {
+                    let save_image =
+                        GrayImage::from_vec(target_width, target_height, downsampled_pixels)
+                            .unwrap();
+                    save_image
+                        .save("example_outputs/square_test_result.png")
+                        .unwrap()
+                }
+                Format::Rg8 => {
+                    let save_image =
+                        GrayAlphaImage::from_vec(target_width, target_height, downsampled_pixels)
+                            .unwrap();
+                    save_image
+                        .save("example_outputs/square_test_result.png")
+                        .unwrap()
+                }
                 Format::Rgba8 | Format::Srgba8 => {
                     let save_image =
                         RgbaImage::from_vec(target_width, target_height, downsampled_pixels)
diff --git a/src/ispc/downsample_ispcx86_64-pc-windows-msvc.lib b/src/ispc/downsample_ispcx86_64-pc-windows-msvc.lib
index 14b8010..4ed6e7d 100644
Binary files a/src/ispc/downsample_ispcx86_64-pc-windows-msvc.lib and b/src/ispc/downsample_ispcx86_64-pc-windows-msvc.lib differ
diff --git a/src/ispc/kernels/lanczos3.ispc b/src/ispc/kernels/lanczos3.ispc
index 456bfc0..32fbe45 100644
--- a/src/ispc/kernels/lanczos3.ispc
+++ b/src/ispc/kernels/lanczos3.ispc
@@ -90,6 +90,34 @@ struct SampleWeights {
     uniform const WeightCollection* horizontal_weights;
 };
 
+uint8<1> sample_1_channel(const uniform uint8* varying pixel_ptr) { // Loads the single channel byte at pixel_ptr and returns it as a uint8<1>.
+    const uniform uint8<1>* pixel_ptr1 = (const uniform uint8<1>*)(pixel_ptr); // Reinterpret the byte pointer as a pointer to a 1-wide short vector.
+    varying uint8<1> dst = {0}; // Zero-initialised; overwritten by the load on the next line.
+    dst = *pixel_ptr1;
+    return dst;
+}
+
+void clean_and_write_1_channel(varying float<1> color, uniform uint8* varying pixel_ptr) { // Stores the single channel of color into pixel_ptr[0].
+    pixel_ptr[0] = clamp(color[0], 0.0f, 255.0f); // Clamp to the 0-255 range before the store into the uint8 destination.
+}
+
+uint8<2> sample_2_channels(const uniform uint8* varying pixel_ptr) { // Loads the two channel bytes starting at pixel_ptr and returns them as a uint8<2>.
+    // Memory reinterpretation to read all channels at once rather than one-by-one.
+    // While testing, this proved more performant than reading one-by-one.
+    const uniform uint8<2>* pixel_ptr2 = (const uniform uint8<2>*)(pixel_ptr);
+    varying uint8<2> dst = {0, 0}; // Zero-initialised; overwritten by the load on the next line.
+    dst = *pixel_ptr2;
+    return dst;
+}
+
+void clean_and_write_2_channels(varying float<2> color, uniform uint8* varying pixel_ptr) { // Stores both channels of color into pixel_ptr[0] and pixel_ptr[1].
+    // The final color is a sum of source values, each multiplied by the weight of its respective pixel.
+    // Floating-point rounding in that sum can leave the final color slightly outside the 0-255 range.
+    // Storing such a value in a uint8 would underflow/overflow, so each channel is clamped first.
+    pixel_ptr[0] = clamp(color[0], 0.0f, 255.0f);
+    pixel_ptr[1] = clamp(color[1], 0.0f, 255.0f);
+}
+
 uint8<3> sample_3_channels(const uniform uint8* varying pixel_ptr) {
     // Memory reinterpretation to read all channels at once rather than one-by-one.
     // While testing, this proved more performant than reading one-by-one.
@@ -136,6 +164,8 @@ void resample_with_cached_weights(uniform uint32 num_channels, uniform uint32 sr
         uint32 num_horizontal_weights = horizontal_weight_collection->weight_counts[x];
         float* horizontal_weights = horizontal_weight_collection->values[x];
 
+        float<1> color1 = {0.0f};
+        float<2> color2 = {0.0f, 0.0f};
         float<3> color3 = {0.0f, 0.0f, 0.0f};
         float<4> color4 = {0.0f, 0.0f, 0.0f, 0.0f};
         for (uint32 i = 0; i < num_horizontal_weights; i++) {
@@ -143,7 +173,11 @@ void resample_with_cached_weights(uniform uint32 num_channels, uniform uint32 sr
             uint32 src_x = src_width_start + i;
             uint64 src_read_address = (y * src_width + src_x) * num_channels;
 
-            if (num_channels == 3)
+            if (num_channels == 1)
+                color1 += sample_1_channel(src_data + src_read_address) * weight;
+            else if (num_channels == 2)
+                color2 += sample_2_channels(src_data + src_read_address) * weight;
+            else if (num_channels == 3)
                 color3 += sample_3_channels(src_data + src_read_address) * weight;
             else
                 color4 += sample_4_channels(src_data + src_read_address) * weight;
@@ -151,7 +185,11 @@ void resample_with_cached_weights(uniform uint32 num_channels, uniform uint32 sr
 
         uint64 scratch_write_address = (y * target_width + x) * num_channels;
 
-        if (num_channels == 3)
+        if (num_channels == 1)
+            clean_and_write_1_channel(color1, scratch_space + scratch_write_address);
+        else if (num_channels == 2)
+            clean_and_write_2_channels(color2, scratch_space + scratch_write_address);
+        else if (num_channels == 3)
             clean_and_write_3_channels(color3, scratch_space + scratch_write_address);
         else
             clean_and_write_4_channels(color4, scratch_space + scratch_write_address);
@@ -163,6 +201,9 @@ void resample_with_cached_weights(uniform uint32 num_channels, uniform uint32 sr
         uint32 src_height_start = vertical_weight_collection->starts[y];
         uint32 num_vertical_weights = vertical_weight_collection->weight_counts[y];
         float* vertical_weights = vertical_weight_collection->values[y];
+
+        float<1> color1 = {0.0f};
+        float<2> color2 = {0.0f, 0.0f};
         float<3> color3 = {0.0f, 0.0f, 0.0f};
         float<4> color4 = {0.0f, 0.0f, 0.0f, 0.0f};
         for (uint32 i = 0; i < num_vertical_weights; i++) {
@@ -173,7 +214,11 @@ void resample_with_cached_weights(uniform uint32 num_channels, uniform uint32 sr
             uniform uint8<3>* varying scratch_pixel_ptr = (uniform uint8<3>* varying)(scratch_space + scratch_read_address);
             uint8<3> scratch_color = *scratch_pixel_ptr;
 
-            if (num_channels == 3)
+            if (num_channels == 1)
+                color1 += sample_1_channel(scratch_space + scratch_read_address) * weight;
+            else if (num_channels == 2)
+                color2 += sample_2_channels(scratch_space + scratch_read_address) * weight;
+            else if (num_channels == 3)
                 color3 += sample_3_channels(scratch_space + scratch_read_address) * weight;
             else
                 color4 += sample_4_channels(scratch_space + scratch_read_address) * weight;
@@ -181,7 +226,12 @@ void resample_with_cached_weights(uniform uint32 num_channels, uniform uint32 sr
 
         uint64 out_write_address = (y * target_width + x) * num_channels;
         assert(out_write_address < target_height * target_width * num_channels);
-        if (num_channels == 3)
+
+        if (num_channels == 1)
+            clean_and_write_1_channel(color1, out_data + out_write_address);
+        else if (num_channels == 2)
+            clean_and_write_2_channels(color2, out_data + out_write_address);
+        else if (num_channels == 3)
             clean_and_write_3_channels(color3, out_data + out_write_address);
         else
             clean_and_write_4_channels(color4, out_data + out_write_address);
diff --git a/src/lib.rs b/src/lib.rs
index 28eb9a5..72474b6 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -7,6 +7,8 @@ mod ispc;
 
 #[derive(Clone, Copy, Eq, PartialEq, Debug)]
 pub enum Format {
+    R8,
+    Rg8,
     Rgb8,
     Srgb8,
     Rgba8,
@@ -15,6 +17,8 @@ pub enum Format {
 impl Format {
     fn num_channels(&self) -> u8 {
         match self {
+            Self::R8 => 1,
+            Self::Rg8 => 2,
             Self::Rgb8 | Self::Srgb8 => 3,
             Self::Rgba8 | Self::Srgba8 => 4,
         }