From c5abf112165439485e0f28d85f3d344903d5e554 Mon Sep 17 00:00:00 2001 From: TroyHernandez Date: Fri, 20 Feb 2026 11:33:12 -0600 Subject: [PATCH] Add inpaint() and inpaint_sd21() for mask-based image editing SD 2.1 inpainting pipeline: encode image, add noise, denoise with mask blending at each DDIM step. preprocess_mask() helper loads, resizes, and binarizes masks to latent dimensions. Mask convention: white=1=inpaint, black=0=keep. --- NAMESPACE | 5 + R/inpaint.R | 90 +++++++++++++++ R/inpaint_sd21.R | 245 +++++++++++++++++++++++++++++++++++++++++ man/inpaint.Rd | 26 +++++ man/inpaint_sd21.Rd | 79 +++++++++++++ man/preprocess_mask.Rd | 32 ++++++ 6 files changed, 477 insertions(+) create mode 100644 R/inpaint.R create mode 100644 R/inpaint_sd21.R create mode 100644 man/inpaint.Rd create mode 100644 man/inpaint_sd21.Rd create mode 100644 man/preprocess_mask.Rd diff --git a/NAMESPACE b/NAMESPACE index 8133d0b..c57850d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -28,6 +28,8 @@ export(gemma3_config_ltx2) export(gemma3_text_model) export(gemma3_tokenizer) export(img2img) +export(inpaint) +export(inpaint_sd21) export(int4_linear) export(int4_linear_from_quantized) export(is_blackwell_gpu) @@ -69,6 +71,7 @@ export(pack_video_latents) export(per_channel_rms_norm) export(post_quant_conv) export(preprocess_image) +export(preprocess_mask) export(quant_conv) export(quantize_int4) export(quantize_ltx2_transformer) @@ -103,3 +106,5 @@ export(vocab_size) export(vram_report) S3method(print,bpe_tokenizer) + +importFrom(utils,head) diff --git a/R/inpaint.R b/R/inpaint.R new file mode 100644 index 0000000..cd353fb --- /dev/null +++ b/R/inpaint.R @@ -0,0 +1,90 @@ +#' Inpaint an image using a diffusion model +#' +#' Generates a new image by inpainting masked regions of an input image guided +#' by a text prompt. Unmasked regions are preserved from the original image. +#' +#' @param input_image Path to the input image (.jpg/.jpeg/.png) or a 3D array. 
+#' @param mask_image Path to the mask image or a matrix/array. White (1) = inpaint, +#' Black (0) = keep. +#' @param prompt Text prompt to guide the inpainting. +#' @param model_name Name of the model to use (currently "sd21"). +#' @param ... Additional parameters passed to the model-specific inpainting function. +#' +#' @return A list containing the generated image array and metadata. +#' @export +inpaint <- function( + input_image, + mask_image, + prompt, + model_name = "sd21", + ... +) { + switch(model_name, + "sd21" = inpaint_sd21(input_image, mask_image, prompt, ...), + stop("Unsupported model for inpainting: ", model_name) + ) +} + +#' Preprocess a mask for inpainting +#' +#' Loads, resizes, and binarizes a mask image for use in the inpainting pipeline. +#' The mask is resized to latent space dimensions (height/8, width/8). +#' +#' @param mask_input Path to a mask image (.jpg/.jpeg/.png), a matrix, or a 3D array. +#' White (1) = inpaint region, Black (0) = keep region. +#' @param height Target image height in pixels (will be divided by 8 for latent space). +#' @param width Target image width in pixels (will be divided by 8 for latent space). +#' @param device Target device ("cpu" or "cuda"). +#' @param dtype Torch dtype for the output tensor. +#' +#' @return Torch tensor of shape [1, 1, height/8, width/8] with values 0 or 1. 
+#' @export +preprocess_mask <- function( + mask_input, + height, + width, + device = "cpu", + dtype = torch::torch_float32() +) { + # Load mask image + if (is.character(mask_input)) { + if (grepl("\\.jpg$|\\.jpeg$", mask_input, ignore.case = TRUE)) { + mask <- jpeg::readJPEG(mask_input) + } else if (grepl("\\.png$", mask_input, ignore.case = TRUE)) { + mask <- png::readPNG(mask_input) + } else { + stop("Unsupported mask format: only .jpg/.jpeg/.png allowed") + } + } else if (is.matrix(mask_input) || is.array(mask_input)) { + mask <- mask_input + } else { + stop("mask_input must be a file path, matrix, or array") + } + + # Convert to single-channel if multi-channel (use first channel or average) + if (length(dim(mask)) == 3) { + # If RGBA, drop alpha + if (dim(mask)[3] == 4) { + mask <- mask[,, 1:3] + } + # Average RGB channels to get single-channel mask + mask <- apply(mask, c(1, 2), mean) + } + + # Convert to tensor [1, 1, H, W] + mask_tensor <- torch::torch_tensor(mask)$unsqueeze(1)$unsqueeze(1) + + # Resize to latent dimensions (H/8, W/8) + latent_h <- as.integer(height / 8) + latent_w <- as.integer(width / 8) + mask_tensor <- torch::nnf_interpolate( + mask_tensor, + size = c(latent_h, latent_w), + mode = "nearest" + ) + + # Binarize: threshold at 0.5 (white=1=inpaint, black=0=keep) + mask_tensor <- (mask_tensor > 0.5)$to(dtype = dtype, device = torch::torch_device(device)) + + mask_tensor +} diff --git a/R/inpaint_sd21.R b/R/inpaint_sd21.R new file mode 100644 index 0000000..980e25c --- /dev/null +++ b/R/inpaint_sd21.R @@ -0,0 +1,245 @@ +#' Inpaint an image using Stable Diffusion 2.1 +#' +#' Generates a new image by inpainting masked regions of an input image guided +#' by a text prompt. Uses the standard SD 2.1 pipeline with mask blending at +#' each denoising step. +#' +#' @param input_image Path to the input image (.jpg/.jpeg/.png) or a 3D array. +#' @param mask_image Path to the mask image or a matrix/array. White (1) = inpaint, +#' Black (0) = keep. 
+#' @param prompt Text prompt to guide the inpainting. +#' @param negative_prompt Optional negative prompt. +#' @param img_dim Dimension of the output image (default: 512). +#' @param pipeline Optional pre-loaded pipeline. If NULL, loaded automatically. +#' @param devices A named list of devices for each model component, or "auto". +#' @param unet_dtype_str Data type for the UNet (e.g., "float16", "float32"). +#' @param download_models Logical indicating whether to download models if not found. +#' @param num_inference_steps Number of diffusion steps (default: 50). +#' @param strength Strength of the transformation (default: 0.8). Higher values +#' change the masked region more. +#' @param guidance_scale Scale for classifier-free guidance (default: 7.5). +#' @param seed Random seed for reproducibility. +#' @param save_file Logical indicating whether to save the generated image. +#' @param filename Optional filename for saving the image. +#' @param metadata_path Path to save metadata. +#' @param use_native_decoder Logical; if TRUE, uses native R torch decoder. +#' @param use_native_text_encoder Logical; if TRUE, uses native R torch text encoder. +#' @param use_native_unet Logical; if TRUE, uses native R torch UNet. +#' @param ... Additional arguments. +#' +#' @return A list containing the generated image array and metadata. +#' @export +inpaint_sd21 <- function( + input_image, + mask_image, + prompt, + negative_prompt = NULL, + img_dim = 512, + pipeline = NULL, + devices = "auto", + unet_dtype_str = NULL, + download_models = FALSE, + num_inference_steps = 50, + strength = 0.8, + guidance_scale = 7.5, + seed = NULL, + save_file = TRUE, + filename = NULL, + metadata_path = NULL, + use_native_decoder = FALSE, + use_native_text_encoder = FALSE, + use_native_unet = FALSE, + ... +) { + model_name <- "sd21" + num_train_timesteps <- 1000 + + # Handle "auto" devices + if (identical(devices, "auto")) { + devices <- auto_devices(model_name) + } + + # 1. 
Get models + m2d <- models2devices(model_name, devices = devices, unet_dtype_str = NULL, + download_models = download_models) + devices <- m2d$devices + unet_dtype <- m2d$unet_dtype + device_cpu <- m2d$device_cpu + device_cuda <- m2d$device_cuda + + if (is.null(pipeline)) { + pipeline <- load_pipeline(model_name = model_name, m2d = m2d, + i2i = TRUE, + unet_dtype_str = unet_dtype_str, + use_native_decoder = use_native_decoder, + use_native_text_encoder = use_native_text_encoder, + use_native_unet = use_native_unet) + } + + # Start timing + start_time <- proc.time() + + # 2. Encode input image to latents + image_tensor <- preprocess_image(input_image, width = img_dim, height = img_dim, + device = torch::torch_device(devices$encoder)) + message("Encoding image...") + encoded <- pipeline$encoder(image_tensor) + conv_latents <- quant_conv(encoded, dtype = unet_dtype, device = devices$unet) + + latents_mean <- conv_latents[, 1:4, , ] + init_latents <- latents_mean$to(dtype = unet_dtype, + device = torch::torch_device(devices$unet)) * 0.18215 + + # 3. Preprocess mask to latent dimensions + message("Preprocessing mask...") + mask_latent <- preprocess_mask(mask_image, height = img_dim, width = img_dim, + device = devices$unet, dtype = unet_dtype) + + # 4. Compute noise timestep from strength + t_strength <- as.integer(strength * num_train_timesteps) + schedule <- ddim_scheduler_create(num_train_timesteps = 1000, + num_inference_steps = num_inference_steps, + beta_schedule = "scaled_linear", + device = torch::torch_device(devices$unet)) + + all_inference_timesteps <- schedule$timesteps + timestep_idx <- which.min(abs(all_inference_timesteps - t_strength)) + timestep_start <- all_inference_timesteps[timestep_idx] + timesteps <- all_inference_timesteps[timestep_idx:length(all_inference_timesteps)] + + # 5. 
Add noise to latents + message("Adding noise to latent image...") + if (!is.null(seed)) { + set.seed(seed) + torch::torch_manual_seed(seed = seed) + } + noise <- torch::torch_randn_like(init_latents) + noised_latents <- scheduler_add_noise(original_latents = init_latents, + noise = noise, + timestep = timestep_start, + scheduler_obj = schedule) + latents <- noised_latents$to(dtype = unet_dtype, + device = torch::torch_device(devices$unet)) + + # 6. Process text prompt + message("Processing prompt...") + tokens <- CLIPTokenizer(prompt) + prompt_embed <- pipeline$text_encoder(tokens) + + if (is.null(negative_prompt)) { + empty_tokens <- CLIPTokenizer("") + } else { + empty_tokens <- CLIPTokenizer(negative_prompt) + } + empty_prompt_embed <- pipeline$text_encoder(empty_tokens) + + empty_prompt_embed <- empty_prompt_embed$to(dtype = unet_dtype, + device = torch::torch_device(devices$unet)) + prompt_embed <- prompt_embed$to(dtype = unet_dtype, + device = torch::torch_device(devices$unet)) + + # 7. 
Denoising loop with mask blending + message("Inpainting...") + pb <- utils::txtProgressBar(min = 0, max = length(timesteps), style = 3) + torch::with_no_grad({ + for (i in seq_along(timesteps)) { + timestep <- torch::torch_tensor(timesteps[i], + dtype = torch::torch_long(), + device = torch::torch_device(devices$unet)) + + # CFG: get conditional and unconditional predictions + noise_pred_uncond <- pipeline$unet(latents, timestep, empty_prompt_embed) + noise_pred_cond <- pipeline$unet(latents, timestep, prompt_embed) + + noise_pred <- noise_pred_uncond + guidance_scale * + (noise_pred_cond - noise_pred_uncond) + + # DDIM step + latents <- ddim_scheduler_step(model_output = noise_pred, + timestep = timestep, + sample = latents, + schedule = schedule, + prediction_type = "v_prediction", + device = devices$unet) + latents <- latents$to(dtype = unet_dtype, + device = torch::torch_device(devices$unet)) + + # Mask blending: keep original in unmasked regions + # mask=1 means inpaint (use denoised), mask=0 means keep (use original) + # Re-noise original latents to current timestep for seamless blending + if (i < length(timesteps)) { + next_timestep <- timesteps[i + 1] + original_at_t <- scheduler_add_noise( + original_latents = init_latents, + noise = noise, + timestep = next_timestep, + scheduler_obj = schedule) + original_at_t <- original_at_t$to(dtype = unet_dtype, + device = torch::torch_device(devices$unet)) + } else { + # Final step: use clean original latents + original_at_t <- init_latents + } + latents <- latents * mask_latent + original_at_t * (1 - mask_latent) + + utils::setTxtProgressBar(pb, i) + } + }) + close(pb) + + # 8. 
Decode latents to image + scaled_latent <- latents / 0.18215 + scaled_latent <- scaled_latent$to(dtype = torch::torch_float32(), + device = torch::torch_device(devices$decoder)) + message("Decoding image...") + decoded_output <- pipeline$decoder(scaled_latent) + img <- decoded_output$cpu() + + if (length(img$shape) == 4) { + img <- img$squeeze(1) + } + + img <- img$permute(c(2, 3, 1)) + img <- (img + 1) / 2 + img <- torch::torch_clamp(img, min = 0, max = 1) + img_array <- as.array(img) + + # Save if requested + if (save_file) { + if (is.null(filename)) { + filename <- filename_from_prompt(prompt, datetime = TRUE) + } + message("Saving image to ", filename) + save_image(img = img_array, filename) + } else { + if (interactive()) { + grid::grid.raster(img_array) + } + } + + # Save metadata + metadata <- list( + prompt = prompt, + negative_prompt = negative_prompt, + width = img_dim, + height = img_dim, + num_inference_steps = num_inference_steps, + strength = strength, + guidance_scale = guidance_scale, + seed = seed, + model = model_name, + mode = "inpaint" + ) + if (!is.null(metadata_path)) { + utils::write.csv(metadata, file = metadata_path, row.names = FALSE) + message("Metadata saved to: ", metadata_path) + } + + elapsed <- proc.time() - start_time + message(sprintf("Inpainting completed in %.2f seconds", elapsed[3])) + + list( + image = img_array, + metadata = metadata + ) +} diff --git a/man/inpaint.Rd b/man/inpaint.Rd new file mode 100644 index 0000000..8542c1e --- /dev/null +++ b/man/inpaint.Rd @@ -0,0 +1,26 @@ +% tinyrox says don't edit this manually, but it can't stop you! +\name{inpaint} +\alias{inpaint} +\title{Inpaint an image using a diffusion model} +\usage{ +inpaint(input_image, mask_image, prompt, model_name = "sd21", ...) +} +\arguments{ +\item{input_image}{Path to the input image (.jpg/.jpeg/.png) or a 3D array.} + +\item{mask_image}{Path to the mask image or a matrix/array. 
White (1) = inpaint, +Black (0) = keep.} + +\item{prompt}{Text prompt to guide the inpainting.} + +\item{model_name}{Name of the model to use (currently "sd21").} + +\item{...}{Additional parameters passed to the model-specific inpainting function.} +} +\value{ +A list containing the generated image array and metadata. +} +\description{ +Generates a new image by inpainting masked regions of an input image guided +by a text prompt. Unmasked regions are preserved from the original image. +} diff --git a/man/inpaint_sd21.Rd b/man/inpaint_sd21.Rd new file mode 100644 index 0000000..c7ad1a4 --- /dev/null +++ b/man/inpaint_sd21.Rd @@ -0,0 +1,79 @@ +% tinyrox says don't edit this manually, but it can't stop you! +\name{inpaint_sd21} +\alias{inpaint_sd21} +\title{Inpaint an image using Stable Diffusion 2.1} +\usage{ +inpaint_sd21( + input_image, + mask_image, + prompt, + negative_prompt = NULL, + img_dim = 512, + pipeline = NULL, + devices = "auto", + unet_dtype_str = NULL, + download_models = FALSE, + num_inference_steps = 50, + strength = 0.8, + guidance_scale = 7.5, + seed = NULL, + save_file = TRUE, + filename = NULL, + metadata_path = NULL, + use_native_decoder = FALSE, + use_native_text_encoder = FALSE, + use_native_unet = FALSE, + ... +) +} +\arguments{ +\item{input_image}{Path to the input image (.jpg/.jpeg/.png) or a 3D array.} + +\item{mask_image}{Path to the mask image or a matrix/array. White (1) = inpaint, +Black (0) = keep.} + +\item{prompt}{Text prompt to guide the inpainting.} + +\item{negative_prompt}{Optional negative prompt.} + +\item{img_dim}{Dimension of the output image (default: 512).} + +\item{pipeline}{Optional pre-loaded pipeline. 
If NULL, loaded automatically.}
+
+\item{devices}{A named list of devices for each model component, or "auto".}
+
+\item{unet_dtype_str}{Data type for the UNet (e.g., "float16", "float32").}
+
+\item{download_models}{Logical indicating whether to download models if not found.}
+
+\item{num_inference_steps}{Number of diffusion steps (default: 50).}
+
+\item{strength}{Strength of the transformation (default: 0.8). Higher values
+change the masked region more.}
+
+\item{guidance_scale}{Scale for classifier-free guidance (default: 7.5).}
+
+\item{seed}{Random seed for reproducibility.}
+
+\item{save_file}{Logical indicating whether to save the generated image.}
+
+\item{filename}{Optional filename for saving the image.}
+
+\item{metadata_path}{Path to save metadata.}
+
+\item{use_native_decoder}{Logical; if TRUE, uses native R torch decoder.}
+
+\item{use_native_text_encoder}{Logical; if TRUE, uses native R torch text encoder.}
+
+\item{use_native_unet}{Logical; if TRUE, uses native R torch UNet.}
+
+\item{...}{Additional arguments.}
+}
+\value{
+A list containing the generated image array and metadata.
+}
+\description{
+Generates a new image by inpainting masked regions of an input image guided
+by a text prompt. Uses the standard SD 2.1 pipeline with mask blending at
+each denoising step.
+}
diff --git a/man/preprocess_mask.Rd b/man/preprocess_mask.Rd
new file mode 100644
index 0000000..bfa7801
--- /dev/null
+++ b/man/preprocess_mask.Rd
@@ -0,0 +1,32 @@
+% tinyrox says don't edit this manually, but it can't stop you!
+\name{preprocess_mask}
+\alias{preprocess_mask}
+\title{Preprocess a mask for inpainting}
+\usage{
+preprocess_mask(
+  mask_input,
+  height,
+  width,
+  device = "cpu",
+  dtype = torch::torch_float32()
+)
+}
+\arguments{
+\item{mask_input}{Path to a mask image (.jpg/.jpeg/.png), a matrix, or a 3D array. 
+White (1) = inpaint region, Black (0) = keep region.} + +\item{height}{Target image height in pixels (will be divided by 8 for latent space).} + +\item{width}{Target image width in pixels (will be divided by 8 for latent space).} + +\item{device}{Target device ("cpu" or "cuda").} + +\item{dtype}{Torch dtype for the output tensor.} +} +\value{ +Torch tensor of shape [1, 1, height/8, width/8] with values 0 or 1. +} +\description{ +Loads, resizes, and binarizes a mask image for use in the inpainting pipeline. +The mask is resized to latent space dimensions (height/8, width/8). +}