From 80784fbe313a02dacb8fb28b838d16bf0f812bdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Tue, 2 Dec 2025 23:28:27 +0100 Subject: [PATCH 01/12] add Flux.2 VAE proj matrix for previews --- latent-preview.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/latent-preview.h b/latent-preview.h index 97409a7d8..61c9434d5 100644 --- a/latent-preview.h +++ b/latent-preview.h @@ -91,6 +91,41 @@ const float flux_latent_rgb_proj[16][3] = { {-0.111849f, -0.055589f, -0.032361f}}; float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f}; +const float flux_2_latent_rgb_proj[32][3] = { + {0.000736f, -0.008385f, -0.019710f}, + {-0.001352f, -0.016392f, 0.020693f}, + {-0.006376f, 0.002428f, 0.036736f}, + {0.039384f, 0.074167f, 0.119789f}, + {0.007464f, -0.005705f, -0.004734f}, + {-0.004086f, 0.005287f, -0.000409f}, + {-0.032835f, 0.050802f, -0.028120f}, + {-0.003158f, -0.000835f, 0.000406f}, + {-0.112840f, -0.084337f, -0.023083f}, + {0.001462f, -0.006656f, 0.000549f}, + {-0.009980f, -0.007480f, 0.009702f}, + {0.032540f, 0.000214f, -0.061388f}, + {0.011023f, 0.000694f, 0.007143f}, + {-0.001468f, -0.006723f, -0.001678f}, + {-0.005921f, -0.010320f, -0.003907f}, + {-0.028434f, 0.027584f, 0.018457f}, + {0.014349f, 0.011523f, 0.000441f}, + {0.009874f, 0.003081f, 0.001507f}, + {0.002218f, 0.005712f, 0.001563f}, + {0.053010f, -0.019844f, 0.008683f}, + {-0.002507f, 0.005384f, 0.000938f}, + {-0.002177f, -0.011366f, 0.003559f}, + {-0.000261f, 0.015121f, -0.003240f}, + {-0.003944f, -0.002083f, 0.005043f}, + {-0.009138f, 0.011336f, 0.003781f}, + {0.011429f, 0.003985f, -0.003855f}, + {0.010518f, -0.005586f, 0.010131f}, + {0.007883f, 0.002912f, -0.001473f}, + {-0.003318f, -0.003160f, 0.003684f}, + {-0.034560f, -0.008740f, 0.012996f}, + {0.000166f, 0.001079f, -0.012153f}, + {0.017772f, 0.000937f, -0.011953f}}; +float flux_2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f}; + // This one was taken straight from // https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303 // (MiT Licence) From beef32251e5be8ad21a2ccfeaf284d3fe3e01a98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Tue, 2 Dec 2025 23:29:36 +0100 Subject: [PATCH 02/12] Enable flux.2 proj for preview with flux model --- latent-preview.h | 4 ++-- stable-diffusion.cpp | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/latent-preview.h b/latent-preview.h index 61c9434d5..0f9f27380 100644 --- a/latent-preview.h +++ b/latent-preview.h @@ -91,7 +91,7 @@ const float flux_latent_rgb_proj[16][3] = { {-0.111849f, -0.055589f, -0.032361f}}; float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f}; -const float flux_2_latent_rgb_proj[32][3] = { +const float flux2_latent_rgb_proj[32][3] = { {0.000736f, -0.008385f, -0.019710f}, {-0.001352f, -0.016392f, 0.020693f}, {-0.006376f, 0.002428f, 0.036736f}, @@ -124,7 +124,7 @@ const float flux_2_latent_rgb_proj[32][3] = { {-0.034560f, -0.008740f, 0.012996f}, {0.000166f, 0.001079f, -0.012153f}, {0.017772f, 0.000937f, -0.011953f}}; -float flux_2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f}; +float flux2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f}; // This one was taken straight from // https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303 diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 73065610d..2a1a7e9dc 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1349,6 +1349,11 @@ class StableDiffusionGGML { // unknown model return; } + } else if (dim == 32) { + if (sd_version_is_flux2(version)) { + latent_rgb_proj = flux2_latent_rgb_proj; + latent_rgb_bias = flux2_latent_rgb_bias; + } } else if (dim == 16) { // 16 channels VAE -> Flux or SD3 From 77e4620a897b2bed282c76af7dabd77534117f68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Tue, 2 Dec 2025 23:29:41 +0100 Subject: [PATCH 03/12] support Flux.2 patched latents for proj preview --- stable-diffusion.cpp | 125 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 123 insertions(+), 2 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 2a1a7e9dc..eca523a38 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1337,8 +1337,72 @@ class StableDiffusionGGML { uint32_t dim = latents->ne[ggml_n_dims(latents) - 1]; if (preview_mode == PREVIEW_PROJ) { - const float(*latent_rgb_proj)[channel] = nullptr; - float* latent_rgb_bias = nullptr; + int64_t patch_sz = 1; + if (sd_version_is_flux2(version)) { + patch_sz = 2; + } + if (patch_sz != 1) { + // unshuffle latents + const int64_t N = latents->ne[3]; + const int64_t C_in = latents->ne[2]; + const int64_t H_in = latents->ne[1]; + const int64_t W_in = latents->ne[0]; + + const int64_t C_out = C_in / (patch_sz * patch_sz); + const int64_t H_out = H_in * patch_sz; + const int64_t W_out = W_in * patch_sz; + + const char* src_ptr = (char*)latents->data; + size_t elem_size = latents->nb[0]; + + std::vector dst_buffer(N * C_out * H_out * W_out * elem_size); + char* dst_base = dst_buffer.data(); + + size_t dst_stride_w = elem_size; + size_t dst_stride_h = dst_stride_w * W_out; + size_t dst_stride_c = dst_stride_h * H_out; + size_t dst_stride_n = dst_stride_c * C_out; + + size_t dst_step_w = dst_stride_w * patch_sz; + size_t dst_step_h = dst_stride_h * patch_sz; + + for (int64_t n = 0; n < N; ++n) { + for (int64_t c = 0; c < C_in; ++c) { + int64_t c_out = c / (patch_sz * patch_sz); + int64_t rem = c % (patch_sz * patch_sz); + int64_t py = rem / patch_sz; + int64_t px = rem % patch_sz; + + char* dst_layer = dst_base + n * dst_stride_n + c_out * dst_stride_c + py * dst_stride_h + px * dst_stride_w; + + for (int64_t y = 0; y < H_in; ++y) { + char* dst_row = dst_layer + y * dst_step_h; + + for (int64_t x = 0; x < W_in; ++x) { + memcpy(dst_row + x * dst_step_w, src_ptr, elem_size); + src_ptr += elem_size; + } + } + } + } + + memcpy(latents->data, dst_buffer.data(), dst_buffer.size()); + + latents->ne[0] = W_out; + latents->ne[1] = H_out; + latents->ne[2] = C_out; + + latents->nb[0] = dst_stride_w; + latents->nb[1] = dst_stride_h; + latents->nb[2] = dst_stride_c; + latents->nb[3] = dst_stride_n; + + width = W_out; + height = H_out; + dim = C_out; + } + const float (*latent_rgb_proj)[channel] = nullptr; + float* latent_rgb_bias = nullptr; if (dim == 48) { if (sd_version_is_wan(version)) { @@ -1408,6 +1472,63 @@ class StableDiffusionGGML { step_callback(step, frames, images, is_noisy, step_callback_data); free(data); free(images); + + if (patch_sz != 1) { + // restore shuffled latents + const int64_t N = latents->ne[3]; + const int64_t C_in = latents->ne[2]; + const int64_t H_in = latents->ne[1]; + const int64_t W_in = latents->ne[0]; + + const int64_t C_out = C_in * patch_sz * patch_sz; + const int64_t H_out = H_in / patch_sz; + const int64_t W_out = W_in / patch_sz; + + const char* src_base = (char*)latents->data; + const size_t elem_size = latents->nb[0]; + + const size_t src_stride_w = latents->nb[0]; + const size_t src_stride_h = latents->nb[1]; + const size_t src_stride_c = latents->nb[2]; + const size_t src_stride_n = latents->nb[3]; + + std::vector dst_buffer(N * C_out * H_out * W_out * elem_size); + char* dst_ptr = dst_buffer.data(); + + const size_t src_step_h = src_stride_h * patch_sz; + const size_t src_step_w = src_stride_w * patch_sz; + + for (int64_t n = 0; n < N; ++n) { + for (int64_t c = 0; c < C_out; ++c) { + int64_t c_rem = c % (patch_sz * patch_sz); + int64_t c_in = c / (patch_sz * patch_sz); + int64_t py = c_rem / patch_sz; + int64_t px = c_rem % patch_sz; + + const char* src_layer = src_base + n * src_stride_n + c_in * src_stride_c + py * src_stride_h + px * src_stride_w; + + for (int64_t y = 0; y < H_out; ++y) { + const char* src_row = src_layer + y * src_step_h; + + for (int64_t x = 0; x < W_out; ++x) { + memcpy(dst_ptr, src_row + x * src_step_w, elem_size); + dst_ptr += elem_size; + } + } + } + } + + memcpy(latents->data, dst_buffer.data(), dst_buffer.size()); + + latents->ne[0] = W_out; + latents->ne[1] = H_out; + latents->ne[2] = C_out; + + latents->nb[0] = elem_size; + latents->nb[1] = latents->nb[0] * W_out; + latents->nb[2] = latents->nb[1] * H_out; + latents->nb[3] = latents->nb[2] * C_out; + } } else { if (preview_mode == PREVIEW_VAE) { process_latent_out(latents); From da8e95ebdeaba167de31aebf30e6c7d7d3183b48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Tue, 2 Dec 2025 23:29:43 +0100 Subject: [PATCH 04/12] move latent shuffle logic to latents-preview.h --- latent-preview.h | 127 +++++++++++++++++++++++++++++++++++++++++++ stable-diffusion.cpp | 116 +++------------------------------------ 2 files changed, 136 insertions(+), 107 deletions(-) diff --git a/latent-preview.h b/latent-preview.h index 0f9f27380..52c8c8c9f 100644 --- a/latent-preview.h +++ b/latent-preview.h @@ -163,6 +163,133 @@ const float sd_latent_rgb_proj[4][3] = { {-0.178022f, -0.200862f, -0.678514f}}; float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f}; +void unpatchify_latents(ggml_tensor* latents, int patch_size, char* dst_buf) { + const int64_t N = latents->ne[3]; + const int64_t C_in = latents->ne[2]; + const int64_t H_in = latents->ne[1]; + const int64_t W_in = latents->ne[0]; + + const int64_t C_out = C_in / (patch_size * patch_size); + const int64_t H_out = H_in * patch_size; + const int64_t W_out = W_in * patch_size; + + const char* src_ptr = (char*)latents->data; + size_t elem_size = latents->nb[0]; + + bool alloc_dst_buf = dst_buf == nullptr; + size_t dst_buf_size = latents->nb[3]; + if (alloc_dst_buf) { + dst_buf = (char*)malloc(dst_buf_size); + } + + size_t dst_stride_w = elem_size; + size_t dst_stride_h = dst_stride_w * W_out; + size_t dst_stride_c = dst_stride_h * H_out; + size_t dst_stride_n = dst_stride_c * C_out; + + size_t dst_step_w = dst_stride_w * patch_size; + size_t dst_step_h = dst_stride_h * patch_size; + + for (int64_t n = 0; n < N; ++n) { + for (int64_t c = 0; c < C_in; ++c) { + int64_t c_out = c / (patch_size * patch_size); + int64_t rem = c % (patch_size * patch_size); + int64_t py = rem / patch_size; + int64_t px = rem % patch_size; + + char* dst_layer = dst_buf + n * dst_stride_n + c_out * dst_stride_c + py * dst_stride_h + px * dst_stride_w; + + for (int64_t y = 0; y < H_in; ++y) { + char* dst_row = dst_layer + y * dst_step_h; + + for (int64_t x = 0; x < W_in; ++x) { + memcpy(dst_row + x * dst_step_w, src_ptr, elem_size); + src_ptr += elem_size; + } + } + } + } + + memcpy(latents->data, dst_buf, dst_buf_size); + + latents->ne[0] = W_out; + latents->ne[1] = H_out; + latents->ne[2] = C_out; + + latents->nb[0] = dst_stride_w; + latents->nb[1] = dst_stride_h; + latents->nb[2] = dst_stride_c; + latents->nb[3] = dst_stride_n; + if (alloc_dst_buf) { + free(dst_buf); + } +} + +void repatchify_latents(ggml_tensor* latents, int patch_size, char* dst_buf) { + const int64_t N = latents->ne[3]; + const int64_t C_in = latents->ne[2]; + const int64_t H_in = latents->ne[1]; + const int64_t W_in = latents->ne[0]; + + const int64_t C_out = C_in * patch_size * patch_size; + const int64_t H_out = H_in / patch_size; + const int64_t W_out = W_in / patch_size; + + const char* src_base = (char*)latents->data; + const size_t elem_size = latents->nb[0]; + + const size_t src_stride_w = latents->nb[0]; + const size_t src_stride_h = latents->nb[1]; + const size_t src_stride_c = latents->nb[2]; + const size_t src_stride_n = latents->nb[3]; + + bool alloc_dst_buf = dst_buf == nullptr; + size_t dst_buf_size = src_stride_n; + if (alloc_dst_buf) { + dst_buf = (char*)malloc(dst_buf_size); + } + + char* dst_ptr = dst_buf; + + const size_t src_step_h = src_stride_h * patch_size; + const size_t src_step_w = src_stride_w * patch_size; + + for (int64_t n = 0; n < N; ++n) { + for (int64_t c = 0; c < C_out; ++c) { + int64_t c_rem = c % (patch_size * patch_size); + int64_t c_in = c / (patch_size * patch_size); + int64_t py = c_rem / patch_size; + int64_t px = c_rem % patch_size; + + const char* src_layer = src_base + n * src_stride_n + c_in * src_stride_c + py * src_stride_h + px * src_stride_w; + + for (int64_t y = 0; y < H_out; ++y) { + const char* src_row = src_layer + y * src_step_h; + + for (int64_t x = 0; x < W_out; ++x) { + memcpy(dst_ptr, src_row + x * src_step_w, elem_size); + dst_ptr += elem_size; + } + } + } + } + + memcpy(latents->data, dst_buf, dst_buf_size); + + latents->ne[0] = W_out; + latents->ne[1] = H_out; + latents->ne[2] = C_out; + + latents->nb[0] = elem_size; + latents->nb[1] = latents->nb[0] * W_out; + latents->nb[2] = latents->nb[1] * H_out; + latents->nb[3] = latents->nb[2] * C_out; + + if (alloc_dst_buf) { + free(dst_buf); + } +} + void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) { size_t buffer_head = 0; for (int k = 0; k < frames; k++) { diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index eca523a38..362f0a9a9 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1343,63 +1343,14 @@ class StableDiffusionGGML { } if (patch_sz != 1) { // unshuffle latents - const int64_t N = latents->ne[3]; - const int64_t C_in = latents->ne[2]; - const int64_t H_in = latents->ne[1]; - const int64_t W_in = latents->ne[0]; + std::vector dst_buffer(latents->nb[GGML_MAX_DIMS-1]); + char* dst_buf = dst_buffer.data(); - const int64_t C_out = C_in / (patch_sz * patch_sz); - const int64_t H_out = H_in * patch_sz; - const int64_t W_out = W_in * patch_sz; + unpatchify_latents(latents, patch_sz, dst_buf); - const char* src_ptr = (char*)latents->data; - size_t elem_size = latents->nb[0]; - - std::vector dst_buffer(N * C_out * H_out * W_out * elem_size); - char* dst_base = dst_buffer.data(); - - size_t dst_stride_w = elem_size; - size_t dst_stride_h = dst_stride_w * W_out; - size_t dst_stride_c = dst_stride_h * H_out; - size_t dst_stride_n = dst_stride_c * C_out; - - size_t dst_step_w = dst_stride_w * patch_sz; - size_t dst_step_h = dst_stride_h * patch_sz; - - for (int64_t n = 0; n < N; ++n) { - for (int64_t c = 0; c < C_in; ++c) { - int64_t c_out = c / (patch_sz * patch_sz); - int64_t rem = c % (patch_sz * patch_sz); - int64_t py = rem / patch_sz; - int64_t px = rem % patch_sz; - - char* dst_layer = dst_base + n * dst_stride_n + c_out * dst_stride_c + py * dst_stride_h + px * dst_stride_w; - - for (int64_t y = 0; y < H_in; ++y) { - char* dst_row = dst_layer + y * dst_step_h; - - for (int64_t x = 0; x < W_in; ++x) { - memcpy(dst_row + x * dst_step_w, src_ptr, elem_size); - src_ptr += elem_size; - } - } - } - } - - memcpy(latents->data, dst_buffer.data(), dst_buffer.size()); - - latents->ne[0] = W_out; - latents->ne[1] = H_out; - latents->ne[2] = C_out; - - latents->nb[0] = dst_stride_w; - latents->nb[1] = dst_stride_h; - latents->nb[2] = dst_stride_c; - latents->nb[3] = dst_stride_n; - - width = W_out; - height = H_out; - dim = C_out; + width = latents->ne[0]; + height = latents->ne[1]; + dim = latents->ne[ggml_n_dims(latents) - 1]; } const float (*latent_rgb_proj)[channel] = nullptr; float* latent_rgb_bias = nullptr; @@ -1475,59 +1426,10 @@ class StableDiffusionGGML { if (patch_sz != 1) { // restore shuffled latents - const int64_t N = latents->ne[3]; - const int64_t C_in = latents->ne[2]; - const int64_t H_in = latents->ne[1]; - const int64_t W_in = latents->ne[0]; - - const int64_t C_out = C_in * patch_sz * patch_sz; - const int64_t H_out = H_in / patch_sz; - const int64_t W_out = W_in / patch_sz; - - const char* src_base = (char*)latents->data; - const size_t elem_size = latents->nb[0]; - - const size_t src_stride_w = latents->nb[0]; - const size_t src_stride_h = latents->nb[1]; - const size_t src_stride_c = latents->nb[2]; - const size_t src_stride_n = latents->nb[3]; - - std::vector dst_buffer(N * C_out * H_out * W_out * elem_size); - char* dst_ptr = dst_buffer.data(); - - const size_t src_step_h = src_stride_h * patch_sz; - const size_t src_step_w = src_stride_w * patch_sz; - - for (int64_t n = 0; n < N; ++n) { - for (int64_t c = 0; c < C_out; ++c) { - int64_t c_rem = c % (patch_sz * patch_sz); - int64_t c_in = c / (patch_sz * patch_sz); - int64_t py = c_rem / patch_sz; - int64_t px = c_rem % patch_sz; - - const char* src_layer = src_base + n * src_stride_n + c_in * src_stride_c + py * src_stride_h + px * src_stride_w; - - for (int64_t y = 0; y < H_out; ++y) { - const char* src_row = src_layer + y * src_step_h; - - for (int64_t x = 0; x < W_out; ++x) { - memcpy(dst_ptr, src_row + x * src_step_w, elem_size); - dst_ptr += elem_size; - } - } - } - } - - memcpy(latents->data, dst_buffer.data(), dst_buffer.size()); - - latents->ne[0] = W_out; - latents->ne[1] = H_out; - latents->ne[2] = C_out; + std::vector dst_buffer(latents->nb[GGML_MAX_DIMS-1]); + char* dst_buf = dst_buffer.data(); - latents->nb[0] = elem_size; - latents->nb[1] = latents->nb[0] * W_out; - latents->nb[2] = latents->nb[1] * H_out; - latents->nb[3] = latents->nb[2] * C_out; + repatchify_latents(latents, patch_sz, dst_buf); } } else { if (preview_mode == PREVIEW_VAE) { From c054c2367801275f1c1c3b67060a18f84c905c04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Tue, 2 Dec 2025 23:29:44 +0100 Subject: [PATCH 05/12] refactor preview_latent_video to support flux.2 patchified latents --- latent-preview.h | 156 ++++++++----------------------------------- stable-diffusion.cpp | 129 ++++++++++++++++------------------- 2 files changed, 86 insertions(+), 199 deletions(-) diff --git a/latent-preview.h b/latent-preview.h index 52c8c8c9f..8354a35e0 100644 --- a/latent-preview.h +++ b/latent-preview.h @@ -163,143 +163,43 @@ const float sd_latent_rgb_proj[4][3] = { {-0.178022f, -0.200862f, -0.678514f}}; float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f}; -void unpatchify_latents(ggml_tensor* latents, int patch_size, char* dst_buf) { - const int64_t N = latents->ne[3]; - const int64_t C_in = latents->ne[2]; - const int64_t H_in = latents->ne[1]; - const int64_t W_in = latents->ne[0]; - const int64_t C_out = C_in / (patch_size * patch_size); - const int64_t H_out = H_in * patch_size; - const int64_t W_out = W_in * patch_size; - - const char* src_ptr = (char*)latents->data; - size_t elem_size = latents->nb[0]; - - bool alloc_dst_buf = dst_buf == nullptr; - size_t dst_buf_size = latents->nb[3]; - if (alloc_dst_buf) { - dst_buf = (char*)malloc(dst_buf_size); - } - - size_t dst_stride_w = elem_size; - size_t dst_stride_h = dst_stride_w * W_out; - size_t dst_stride_c = dst_stride_h * H_out; - size_t dst_stride_n = dst_stride_c * C_out; - - size_t dst_step_w = dst_stride_w * patch_size; - size_t dst_step_h = dst_stride_h * patch_size; - - for (int64_t n = 0; n < N; ++n) { - for (int64_t c = 0; c < C_in; ++c) { - int64_t c_out = c / (patch_size * patch_size); - int64_t rem = c % (patch_size * patch_size); - int64_t py = rem / patch_size; - int64_t px = rem % patch_size; - - char* dst_layer = dst_buf + n * dst_stride_n + c_out * dst_stride_c + py * dst_stride_h + px * dst_stride_w; - - for (int64_t y = 0; y < H_in; ++y) { - char* dst_row = dst_layer + y * dst_step_h; - - for (int64_t x = 0; x < W_in; ++x) { - memcpy(dst_row + x * dst_step_w, src_ptr, elem_size); - src_ptr += elem_size; - } - } - } - } - - memcpy(latents->data, dst_buf, dst_buf_size); - - latents->ne[0] = W_out; - latents->ne[1] = H_out; - latents->ne[2] = C_out; - - latents->nb[0] = dst_stride_w; - latents->nb[1] = dst_stride_h; - latents->nb[2] = dst_stride_c; - latents->nb[3] = dst_stride_n; - if (alloc_dst_buf) { - free(dst_buf); - } -} - -void repatchify_latents(ggml_tensor* latents, int patch_size, char* dst_buf) { - const int64_t N = latents->ne[3]; - const int64_t C_in = latents->ne[2]; - const int64_t H_in = latents->ne[1]; - const int64_t W_in = latents->ne[0]; - - const int64_t C_out = C_in * patch_size * patch_size; - const int64_t H_out = H_in / patch_size; - const int64_t W_out = W_in / patch_size; - - const char* src_base = (char*)latents->data; - const size_t elem_size = latents->nb[0]; - - const size_t src_stride_w = latents->nb[0]; - const size_t src_stride_h = latents->nb[1]; - const size_t src_stride_c = latents->nb[2]; - const size_t src_stride_n = latents->nb[3]; +void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) { + size_t buffer_head = 0; - bool alloc_dst_buf = dst_buf == nullptr; - size_t dst_buf_size = src_stride_n; - if (alloc_dst_buf) { - dst_buf = (char*)malloc(dst_buf_size); + uint32_t latent_width = latents->ne[0]; + uint32_t latent_height = latents->ne[1]; + uint32_t dim = latents->ne[ggml_n_dims(latents) - 1]; + uint32_t frames = 1; + if (ggml_n_dims(latents) == 4) { + frames = latents->ne[2]; } - char* dst_ptr = dst_buf; - - const size_t src_step_h = src_stride_h * patch_size; - const size_t src_step_w = src_stride_w * patch_size; + uint32_t rgb_width = latent_width * patch_size; + uint32_t rgb_height = latent_height * patch_size; - for (int64_t n = 0; n < N; ++n) { - for (int64_t c = 0; c < C_out; ++c) { - int64_t c_rem = c % (patch_size * patch_size); - int64_t c_in = c / (patch_size * patch_size); - int64_t py = c_rem / patch_size; - int64_t px = c_rem % patch_size; + uint32_t unpatched_dim = dim / (patch_size * patch_size); - const char* src_layer = src_base + n * src_stride_n + c_in * src_stride_c + py * src_stride_h + px * src_stride_w; - - for (int64_t y = 0; y < H_out; ++y) { - const char* src_row = src_layer + y * src_step_h; - - for (int64_t x = 0; x < W_out; ++x) { - memcpy(dst_ptr, src_row + x * src_step_w, elem_size); - dst_ptr += elem_size; + for (int k = 0; k < frames; k++) { + for (int rgb_x = 0; rgb_x < rgb_width; rgb_x++) { + for (int rgb_y = 0; rgb_y < rgb_height; rgb_y++) { + int latent_x = rgb_x / patch_size; + int latent_y = rgb_y / patch_size; + + int channel_offset = 0; + if (patch_size > 1) { + channel_offset = ((rgb_y % patch_size) * patch_size + (rgb_x % patch_size)); } - } - } - } - - memcpy(latents->data, dst_buf, dst_buf_size); - - latents->ne[0] = W_out; - latents->ne[1] = H_out; - latents->ne[2] = C_out; - latents->nb[0] = elem_size; - latents->nb[1] = latents->nb[0] * W_out; - latents->nb[2] = latents->nb[1] * H_out; - latents->nb[3] = latents->nb[2] * C_out; + size_t latent_id = (latent_x * latents->nb[0] + latent_y * latents->nb[1] + k * latents->nb[2]); - if (alloc_dst_buf) { - free(dst_buf); - } -} + // should be incremented by 1 for each pixel + size_t pixel_id = k * rgb_width * rgb_height + rgb_y * rgb_width + rgb_x; -void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) { - size_t buffer_head = 0; - for (int k = 0; k < frames; k++) { - for (int j = 0; j < height; j++) { - for (int i = 0; i < width; i++) { - size_t latent_id = (i * latents->nb[0] + j * latents->nb[1] + k * latents->nb[2]); float r = 0, g = 0, b = 0; if (latent_rgb_proj != nullptr) { - for (int d = 0; d < dim; d++) { - float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]); + for (int d = 0; d < unpatched_dim; d++) { + float value = *(float*)((char*)latents->data + latent_id + (d * patch_size * patch_size + channel_offset) * latents->nb[ggml_n_dims(latents) - 1]); r += value * latent_rgb_proj[d][0]; g += value * latent_rgb_proj[d][1]; b += value * latent_rgb_proj[d][2]; @@ -326,9 +226,9 @@ void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const fl g = g >= 0 ? g <= 1 ? g : 1 : 0; b = b >= 0 ? b <= 1 ? b : 1 : 0; - buffer[buffer_head++] = (uint8_t)(r * 255); - buffer[buffer_head++] = (uint8_t)(g * 255); - buffer[buffer_head++] = (uint8_t)(b * 255); + buffer[pixel_id * 3 + 0] = (uint8_t)(r * 255); + buffer[pixel_id * 3 + 1] = (uint8_t)(g * 255); + buffer[pixel_id * 3 + 2] = (uint8_t)(b * 255); } } } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 362f0a9a9..ec1d38e64 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -406,8 +406,8 @@ class StableDiffusionGGML { offload_params_to_cpu, tensor_storage_map); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map); + offload_params_to_cpu, + tensor_storage_map); } else if (sd_version_is_flux(version)) { bool is_chroma = false; for (auto pair : tensor_storage_map) { @@ -448,10 +448,10 @@ class StableDiffusionGGML { tensor_storage_map, version); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - version, - sd_ctx_params->chroma_use_dit_mask); + offload_params_to_cpu, + tensor_storage_map, + version, + sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_wan(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, @@ -460,10 +460,10 @@ class StableDiffusionGGML { 1, true); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "model.diffusion_model", - version); + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model", + version); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { high_noise_diffusion_model = std::make_shared(backend, offload_params_to_cpu, @@ -492,20 +492,20 @@ class StableDiffusionGGML { "", enable_vision); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "model.diffusion_model", - version); + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model", + version); } else if (sd_version_is_z_image(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "model.diffusion_model", - version); + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model", + version); } else { // SD1.x SD2.x SDXL if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { cond_stage_model = std::make_shared(clip_backend, @@ -1338,24 +1338,16 @@ class StableDiffusionGGML { if (preview_mode == PREVIEW_PROJ) { int64_t patch_sz = 1; - if (sd_version_is_flux2(version)) { - patch_sz = 2; - } - if (patch_sz != 1) { - // unshuffle latents - std::vector dst_buffer(latents->nb[GGML_MAX_DIMS-1]); - char* dst_buf = dst_buffer.data(); - - unpatchify_latents(latents, patch_sz, dst_buf); - - width = latents->ne[0]; - height = latents->ne[1]; - dim = latents->ne[ggml_n_dims(latents) - 1]; - } const float (*latent_rgb_proj)[channel] = nullptr; float* latent_rgb_bias = nullptr; - if (dim == 48) { + if (dim == 128) { + if (sd_version_is_flux2(version)) { + latent_rgb_proj = flux2_latent_rgb_proj; + latent_rgb_bias = flux2_latent_rgb_bias; + patch_sz = 2; + } + } else if (dim == 48) { if (sd_version_is_wan(version)) { latent_rgb_proj = wan_22_latent_rgb_proj; latent_rgb_bias = wan_22_latent_rgb_bias; @@ -1413,24 +1405,19 @@ class StableDiffusionGGML { frames = latents->ne[2]; } - uint8_t* data = (uint8_t*)malloc(frames * width * height * channel * sizeof(uint8_t)); + uint32_t img_width = width * patch_sz; + uint32_t img_height = height * patch_sz; - preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, width, height, frames, dim); + uint8_t* data = (uint8_t*)malloc(frames * img_width * img_height * channel * sizeof(uint8_t)); + + preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, patch_sz); sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t)); for (int i = 0; i < frames; i++) { - images[i] = {width, height, channel, data + i * width * height * channel}; + images[i] = {img_width, img_height, channel, data + i * img_width * img_height * channel}; } step_callback(step, frames, images, is_noisy, step_callback_data); free(data); free(images); - - if (patch_sz != 1) { - // restore shuffled latents - std::vector dst_buffer(latents->nb[GGML_MAX_DIMS-1]); - char* dst_buf = dst_buffer.data(); - - repatchify_latents(latents, patch_sz, dst_buf); - } } else { if (preview_mode == PREVIEW_VAE) { process_latent_out(latents); @@ -1970,12 +1957,12 @@ class StableDiffusionGGML { -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f, 0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f}; latents_std_vec = { - 0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f, - 0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f, - 0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f, - 0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f, - 0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f, - 0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f}; + 0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f, + 0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f, + 0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f, + 0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f, + 0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f, + 0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f}; } else if (latent->ne[channel_dim] == 128) { // flux2 latents_mean_vec = {-0.0676f, -0.0715f, -0.0753f, -0.0745f, 0.0223f, 0.0180f, 0.0142f, 0.0184f, @@ -1995,22 +1982,22 @@ class StableDiffusionGGML { -0.0511f, -0.0603f, -0.0478f, -0.0524f, -0.0227f, -0.0274f, -0.0154f, -0.0255f, -0.0572f, -0.0565f, -0.0518f, -0.0496f, 0.0116f, 0.0054f, 0.0163f, 0.0104f}; latents_std_vec = { - 1.8029f, 1.7786f, 1.7868f, 1.7837f, 1.7717f, 1.7590f, 1.7610f, 1.7479f, - 1.7336f, 1.7373f, 1.7340f, 1.7343f, 1.8626f, 1.8527f, 1.8629f, 1.8589f, - 1.7593f, 1.7526f, 1.7556f, 1.7583f, 1.7363f, 1.7400f, 1.7355f, 1.7394f, - 1.7342f, 1.7246f, 1.7392f, 1.7304f, 1.7551f, 1.7513f, 1.7559f, 1.7488f, - 1.8449f, 1.8454f, 1.8550f, 1.8535f, 1.8240f, 1.7813f, 1.7854f, 1.7945f, - 1.8047f, 1.7876f, 1.7695f, 1.7676f, 1.7782f, 1.7667f, 1.7925f, 1.7848f, - 1.7579f, 1.7407f, 1.7483f, 1.7368f, 1.7961f, 1.7998f, 1.7920f, 1.7925f, - 1.7780f, 1.7747f, 1.7727f, 1.7749f, 1.7526f, 1.7447f, 1.7657f, 1.7495f, - 1.7775f, 1.7720f, 1.7813f, 1.7813f, 1.8162f, 1.8013f, 1.8023f, 1.8033f, - 1.7527f, 1.7331f, 1.7563f, 1.7482f, 1.7610f, 1.7507f, 1.7681f, 1.7613f, - 1.7665f, 1.7545f, 1.7828f, 1.7726f, 1.7896f, 1.7999f, 1.7864f, 1.7760f, - 1.7613f, 1.7625f, 1.7560f, 1.7577f, 1.7783f, 1.7671f, 1.7810f, 1.7799f, - 1.7201f, 1.7068f, 1.7265f, 1.7091f, 1.7793f, 1.7578f, 1.7502f, 1.7455f, - 1.7587f, 1.7500f, 1.7525f, 1.7362f, 1.7616f, 1.7572f, 1.7444f, 1.7430f, - 1.7509f, 1.7610f, 1.7634f, 1.7612f, 1.7254f, 1.7135f, 1.7321f, 1.7226f, - 1.7664f, 1.7624f, 1.7718f, 1.7664f, 1.7457f, 1.7441f, 1.7569f, 1.7530f}; + 1.8029f, 1.7786f, 1.7868f, 1.7837f, 1.7717f, 1.7590f, 1.7610f, 1.7479f, + 1.7336f, 1.7373f, 1.7340f, 1.7343f, 1.8626f, 1.8527f, 1.8629f, 1.8589f, + 1.7593f, 1.7526f, 1.7556f, 1.7583f, 1.7363f, 1.7400f, 1.7355f, 1.7394f, + 1.7342f, 1.7246f, 1.7392f, 1.7304f, 1.7551f, 1.7513f, 1.7559f, 1.7488f, + 1.8449f, 1.8454f, 1.8550f, 1.8535f, 1.8240f, 1.7813f, 1.7854f, 1.7945f, + 1.8047f, 1.7876f, 1.7695f, 1.7676f, 1.7782f, 1.7667f, 1.7925f, 1.7848f, + 1.7579f, 1.7407f, 1.7483f, 1.7368f, 1.7961f, 1.7998f, 1.7920f, 1.7925f, + 1.7780f, 1.7747f, 1.7727f, 1.7749f, 1.7526f, 1.7447f, 1.7657f, 1.7495f, + 1.7775f, 1.7720f, 1.7813f, 1.7813f, 1.8162f, 1.8013f, 1.8023f, 1.8033f, + 1.7527f, 1.7331f, 1.7563f, 1.7482f, 1.7610f, 1.7507f, 1.7681f, 1.7613f, + 1.7665f, 1.7545f, 1.7828f, 1.7726f, 1.7896f, 1.7999f, 1.7864f, 1.7760f, + 1.7613f, 1.7625f, 1.7560f, 1.7577f, 1.7783f, 1.7671f, 1.7810f, 1.7799f, + 1.7201f, 1.7068f, 1.7265f, 1.7091f, 1.7793f, 1.7578f, 1.7502f, 1.7455f, + 1.7587f, 1.7500f, 1.7525f, 1.7362f, 1.7616f, 1.7572f, 1.7444f, 1.7430f, + 1.7509f, 1.7610f, 1.7634f, 1.7612f, 1.7254f, 1.7135f, 1.7321f, 1.7226f, + 1.7664f, 1.7624f, 1.7718f, 1.7664f, 1.7457f, 1.7441f, 1.7569f, 1.7530f}; } } @@ -2122,12 +2109,12 @@ class StableDiffusionGGML { } ggml_tensor* vae_encode(ggml_context* work_ctx, ggml_tensor* x, bool encode_video = false) { - int64_t t0 = ggml_time_ms(); - ggml_tensor* result = nullptr; + int64_t t0 = ggml_time_ms(); + ggml_tensor* result = nullptr; const int vae_scale_factor = get_vae_scale_factor(); int W = x->ne[0] / vae_scale_factor; int H = x->ne[1] / vae_scale_factor; - int C = get_latent_channel(); + int C = get_latent_channel(); if (vae_tiling_params.enabled && !encode_video) { // TODO wan2.2 vae support? int ne2; @@ -2252,8 +2239,8 @@ class StableDiffusionGGML { const int vae_scale_factor = get_vae_scale_factor(); int64_t W = x->ne[0] * vae_scale_factor; int64_t H = x->ne[1] * vae_scale_factor; - int64_t C = 3; - ggml_tensor* result = nullptr; + int64_t C = 3; + ggml_tensor* result = nullptr; if (decode_video) { int T = x->ne[2]; if (sd_version_is_wan(version)) { From e47c8c4e68645cdff3c4c2c78275e9e252a3d6b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 5 Dec 2025 20:47:23 +0100 Subject: [PATCH 06/12] Support LongCat Image model --- flux.hpp | 44 +++++++++++++++++++++------- ggml_extend.hpp | 69 ++++++++++++++++++++++++++++++++++++++++++++ model.cpp | 29 ++++++++++++------- model.h | 11 ++++++- name_conversion.cpp | 8 ++++- stable-diffusion.cpp | 56 +++++++++++++++++++++-------------- 6 files changed, 173 insertions(+), 44 deletions(-) diff --git a/flux.hpp b/flux.hpp index f0c65e3d7..c8b27e444 100644 --- a/flux.hpp +++ b/flux.hpp @@ -90,10 +90,15 @@ namespace Flux { SelfAttention(int64_t dim, int64_t num_heads = 8, bool qkv_bias = false, - bool proj_bias = true) + bool proj_bias = true, + bool diffusers_style = false) : num_heads(num_heads) { int64_t head_dim = dim / num_heads; - blocks["qkv"] = std::shared_ptr(new Linear(dim, dim * 3, qkv_bias)); + if(diffusers_style) { + blocks["qkv"] = std::shared_ptr(new SplitLinear(dim, {dim, dim, dim}, qkv_bias)); + } else { + blocks["qkv"] = std::shared_ptr(new Linear(dim, dim * 3, qkv_bias)); + } blocks["norm"] = std::shared_ptr(new QKNorm(head_dim)); blocks["proj"] = std::shared_ptr(new Linear(dim, dim, proj_bias)); } @@ -210,7 +215,8 @@ namespace Flux { bool prune_mod = false, bool share_modulation = false, bool mlp_proj_bias = true, - bool use_mlp_silu_act = false) + bool use_mlp_silu_act = false, + bool diffusers_style = false) : idx(idx), prune_mod(prune_mod), use_mlp_silu_act(use_mlp_silu_act) { int64_t mlp_hidden_dim = hidden_size * mlp_ratio; int64_t mlp_mult_factor = use_mlp_silu_act ? 2 : 1; @@ -219,7 +225,7 @@ namespace Flux { blocks["img_mod"] = std::shared_ptr(new Modulation(hidden_size, true)); } blocks["img_norm1"] = std::shared_ptr(new LayerNorm(hidden_size, 1e-6f, false)); - blocks["img_attn"] = std::shared_ptr(new SelfAttention(hidden_size, num_heads, qkv_bias, mlp_proj_bias)); + blocks["img_attn"] = std::shared_ptr(new SelfAttention(hidden_size, num_heads, qkv_bias, mlp_proj_bias, diffusers_style)); blocks["img_norm2"] = std::shared_ptr(new LayerNorm(hidden_size, 1e-6f, false)); blocks["img_mlp.0"] = std::shared_ptr(new Linear(hidden_size, mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias)); @@ -230,7 +236,7 @@ namespace Flux { blocks["txt_mod"] = std::shared_ptr(new Modulation(hidden_size, true)); } blocks["txt_norm1"] = std::shared_ptr(new LayerNorm(hidden_size, 1e-6f, false)); - blocks["txt_attn"] = std::shared_ptr(new SelfAttention(hidden_size, num_heads, qkv_bias, mlp_proj_bias)); + blocks["txt_attn"] = std::shared_ptr(new SelfAttention(hidden_size, num_heads, qkv_bias, mlp_proj_bias, diffusers_style)); blocks["txt_norm2"] = std::shared_ptr(new LayerNorm(hidden_size, 1e-6f, false)); blocks["txt_mlp.0"] = std::shared_ptr(new Linear(hidden_size, mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias)); @@ -383,6 +389,7 @@ namespace Flux { int idx = 0; bool use_mlp_silu_act; int64_t mlp_mult_factor; + bool diffusers_style = false; public: SingleStreamBlock(int64_t hidden_size, @@ -393,7 +400,8 @@ namespace Flux { bool prune_mod = false, bool share_modulation = false, bool mlp_proj_bias = true, - bool use_mlp_silu_act = false) + bool use_mlp_silu_act = false, + bool diffusers_style = false) : hidden_size(hidden_size), num_heads(num_heads), idx(idx), prune_mod(prune_mod), use_mlp_silu_act(use_mlp_silu_act) { int64_t head_dim = hidden_size / num_heads; float scale = qk_scale; @@ -405,8 +413,11 @@ namespace Flux { if (use_mlp_silu_act) { mlp_mult_factor = 2; } - - blocks["linear1"] = std::shared_ptr(new Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias)); + if (diffusers_style) { + blocks["linear1"] = std::shared_ptr(new SplitLinear(hidden_size, {hidden_size, hidden_size, hidden_size, mlp_hidden_dim * mlp_mult_factor}, mlp_proj_bias)); + } else { + blocks["linear1"] = std::shared_ptr(new Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias)); + } blocks["linear2"] = std::shared_ptr(new Linear(hidden_size + mlp_hidden_dim, hidden_size, mlp_proj_bias)); blocks["norm"] = std::shared_ptr(new QKNorm(head_dim)); blocks["pre_norm"] = std::shared_ptr(new LayerNorm(hidden_size, 1e-6f, false)); @@ -729,6 +740,7 @@ namespace Flux { bool use_mlp_silu_act = false; float ref_index_scale = 1.f; ChromaRadianceParams chroma_radiance_params; + bool diffusers_style = false; }; struct Flux : public GGMLBlock { @@ -770,7 +782,8 @@ namespace Flux { params.is_chroma, params.share_modulation, !params.disable_bias, - params.use_mlp_silu_act); + params.use_mlp_silu_act, + params.diffusers_style); } for (int i = 0; i < params.depth_single_blocks; i++) { @@ -782,7 +795,8 @@ namespace Flux { params.is_chroma, params.share_modulation, !params.disable_bias, - params.use_mlp_silu_act); + params.use_mlp_silu_act, + params.diffusers_style); } if (params.version == VERSION_CHROMA_RADIANCE) { @@ -1222,6 +1236,9 @@ namespace Flux { flux_params.share_modulation = true; flux_params.ref_index_scale = 10.f; flux_params.use_mlp_silu_act = true; + } else if (sd_version_is_longcat(version)) { + flux_params.context_in_dim = 3584; + flux_params.vec_in_dim = 0; } for (auto pair : tensor_storage_map) { std::string tensor_name = pair.first; @@ -1231,6 +1248,9 @@ namespace Flux { // not schnell flux_params.guidance_embed = true; } + if (tensor_name.find("model.diffusion_model.single_blocks.0.linear1.weight.1") == std::string::npos) { + flux_params.diffusers_style = true; + } if (tensor_name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) { // Chroma flux_params.is_chroma = true; @@ -1260,6 +1280,10 @@ namespace Flux { LOG_INFO("Flux guidance is disabled (Schnell mode)"); } + if (flux_params.diffusers_style) { + LOG_INFO("Using diffusers-style naming"); + } + flux = Flux(flux_params); flux.init(params_ctx, tensor_storage_map, prefix); } diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 92dd3b8b6..31ae18a5a 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2159,6 +2159,75 @@ class Linear : public UnaryBlock { } }; +class SplitLinear : public Linear { +protected: + int64_t in_features; + std::vector out_features_vec; + bool bias; + bool force_f32; + bool force_prec_f32; + float scale; + std::string prefix; + + void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + this->prefix = prefix; + enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32); + if (in_features % ggml_blck_size(wtype) != 0 || force_f32) { + wtype = GGML_TYPE_F32; + } + params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features_vec[0]); + for (int i = 1; i < out_features_vec.size(); i++) { + // most likely same type as the first weight + params["weight." + std::to_string(i)] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features_vec[i]); + } + if (bias) { + enum ggml_type wtype = GGML_TYPE_F32; + params["bias"] = ggml_new_tensor_1d(ctx, wtype, out_features_vec[0]); + for (int i = 1; i < out_features_vec.size(); i++) { + params["bias." + std::to_string(i)] = ggml_new_tensor_1d(ctx, wtype, out_features_vec[i]); + } + } + } + +public: + SplitLinear(int64_t in_features, + std::vector out_features_vec, + bool bias = true, + bool force_f32 = false, + bool force_prec_f32 = false, + float scale = 1.f) + : Linear(in_features, out_features_vec[0], bias, force_f32, force_prec_f32, scale), + in_features(in_features), + out_features_vec(out_features_vec), + bias(bias), + force_f32(force_f32), + force_prec_f32(force_prec_f32), + scale(scale) {} + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + struct ggml_tensor* w = params["weight"]; + struct ggml_tensor* b = nullptr; + if (bias) { + b = params["bias"]; + } + // concat all weights and biases together + for (int i = 1; i < out_features_vec.size(); i++) { + w = ggml_concat(ctx->ggml_ctx, w, params["weight." + std::to_string(i)], 1); + if (bias) { + b = ggml_concat(ctx->ggml_ctx, b, params["bias." + std::to_string(i)], 0); + } + } + if (ctx->weight_adapter) { + WeightAdapter::ForwardParams forward_params; + forward_params.op_type = WeightAdapter::ForwardParams::op_type_t::OP_LINEAR; + forward_params.linear.force_prec_f32 = force_prec_f32; + forward_params.linear.scale = scale; + return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params); + } + return ggml_ext_linear(ctx->ggml_ctx, x, w, b, force_prec_f32, scale); + } +}; + __STATIC_INLINE__ bool support_get_rows(ggml_type wtype) { std::set allow_types = {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0}; if (allow_types.find(wtype) != allow_types.end()) { diff --git a/model.cpp b/model.cpp index 2b74d349f..0ebc626d5 100644 --- a/model.cpp +++ b/model.cpp @@ -1042,7 +1042,7 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s } SDVersion ModelLoader::get_sd_version() { - TensorStorage token_embedding_weight, input_block_weight; + TensorStorage token_embedding_weight, input_block_weight, context_ebedding_weight; bool has_multiple_encoders = false; bool is_unet = false; @@ -1056,7 +1056,7 @@ SDVersion ModelLoader::get_sd_version() { for (auto& [name, tensor_storage] : tensor_storage_map) { if (!(is_xl)) { - if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) { + if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos || tensor_storage.name.find("model.diffusion_model.single_transformer_blocks.") != std::string::npos) { is_flux = true; } if (tensor_storage.name.find("model.diffusion_model.nerf_final_layer_conv.") != std::string::npos) { @@ -1120,6 +1120,9 @@ SDVersion ModelLoader::get_sd_version() { tensor_storage.name == "unet.conv_in.weight") { input_block_weight = tensor_storage; } + if (tensor_storage.name == "model.diffusion_model.txt_in.weight" || tensor_storage.name == "model.diffusion_model.context_embedder.weight") { + context_ebedding_weight = tensor_storage; + } } if (is_wan) { LOG_DEBUG("patch_embedding_channels %d", patch_embedding_channels); @@ -1147,16 +1150,20 @@ SDVersion ModelLoader::get_sd_version() { } if (is_flux) { - if (input_block_weight.ne[0] == 384) { - return VERSION_FLUX_FILL; - } - if (input_block_weight.ne[0] == 128) { - return VERSION_FLUX_CONTROLS; - } - if (input_block_weight.ne[0] == 196) { - return VERSION_FLEX_2; + if (context_ebedding_weight.ne[0] == 3584) { + return VERSION_LONGCAT; + } else { + if (input_block_weight.ne[0] == 384) { + return VERSION_FLUX_FILL; + } + if (input_block_weight.ne[0] == 128) { + return VERSION_FLUX_CONTROLS; + } + if (input_block_weight.ne[0] == 196) { + return VERSION_FLEX_2; + } + return VERSION_FLUX; } - return VERSION_FLUX; } if (token_embedding_weight.ne[0] == 768) { diff --git a/model.h b/model.h index e2ff26c49..1751d7bf3 100644 --- a/model.h +++ b/model.h @@ -45,6 +45,7 @@ enum SDVersion { VERSION_QWEN_IMAGE, VERSION_FLUX2, VERSION_Z_IMAGE, + VERSION_LONGCAT, VERSION_COUNT, }; @@ -124,6 +125,13 @@ static inline bool sd_version_is_z_image(SDVersion version) { return false; } +static inline bool sd_version_is_longcat(SDVersion version) { + if (version == VERSION_LONGCAT) { + return true; + } + return false; +} + static inline bool sd_version_is_inpaint(SDVersion version) { if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || @@ -141,7 +149,8 @@ static inline bool sd_version_is_dit(SDVersion version) { sd_version_is_sd3(version) || sd_version_is_wan(version) || sd_version_is_qwen_image(version) || - sd_version_is_z_image(version)) { + sd_version_is_z_image(version) || + sd_version_is_longcat(version)) { return true; } return false; diff --git a/name_conversion.cpp b/name_conversion.cpp index 8b521486d..1a37dd25c 100644 --- a/name_conversion.cpp +++ b/name_conversion.cpp @@ -508,6 +508,12 @@ std::string convert_diffusers_dit_to_original_flux(std::string name) { static std::unordered_map flux_name_map; if (flux_name_map.empty()) { + // --- time_embed (longcat) --- + flux_name_map["time_embed.timestep_embedder.linear_1.weight"] = "time_in.in_layer.weight"; + flux_name_map["time_embed.timestep_embedder.linear_1.bias"] = "time_in.in_layer.bias"; + flux_name_map["time_embed.timestep_embedder.linear_2.weight"] = "time_in.out_layer.weight"; + flux_name_map["time_embed.timestep_embedder.linear_2.bias"] = "time_in.out_layer.bias"; + // --- time_text_embed --- flux_name_map["time_text_embed.timestep_embedder.linear_1.weight"] = "time_in.in_layer.weight"; flux_name_map["time_text_embed.timestep_embedder.linear_1.bias"] = "time_in.in_layer.bias"; @@ -660,7 +666,7 @@ std::string convert_diffusion_model_name(std::string name, std::string prefix, S name = convert_diffusers_unet_to_original_sdxl(name); } else if (sd_version_is_sd3(version)) { name = convert_diffusers_dit_to_original_sd3(name); - } else if (sd_version_is_flux(version) || sd_version_is_flux2(version)) { + } else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version)) { name = convert_diffusers_dit_to_original_flux(name); } else if (sd_version_is_z_image(version)) { name = convert_diffusers_dit_to_original_lumina2(name); diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index e554f0926..68b0f974e 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -46,6 +46,7 @@ const char* model_version_to_str[] = { "Qwen Image", "Flux.2", "Z-Image", + "Longcat", }; const char* sampling_methods_str[] = { @@ -378,7 +379,7 @@ class StableDiffusionGGML { } else if (sd_version_is_sd3(version)) { scale_factor = 1.5305f; shift_factor = 0.0609f; - } else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) { + } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_longcat(version)) { scale_factor = 0.3611f; shift_factor = 0.1159f; } else if (sd_version_is_wan(version) || @@ -406,8 +407,8 @@ class StableDiffusionGGML { offload_params_to_cpu, tensor_storage_map); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map); + offload_params_to_cpu, + tensor_storage_map); } else if (sd_version_is_flux(version)) { bool is_chroma = false; for (auto pair : tensor_storage_map) { @@ -448,10 +449,23 @@ class StableDiffusionGGML { tensor_storage_map, version); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - version, - sd_ctx_params->chroma_use_dit_mask); + offload_params_to_cpu, + tensor_storage_map, + version, + sd_ctx_params->chroma_use_dit_mask); + } else if (sd_version_is_longcat(version)) { + bool enable_vision = false; + cond_stage_model = std::make_shared(clip_backend, + offload_params_to_cpu, + tensor_storage_map, + version, + "", + enable_vision); + diffusion_model = std::make_shared(backend, + offload_params_to_cpu, + tensor_storage_map, + version, + sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_wan(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, @@ -460,10 +474,10 @@ class StableDiffusionGGML { 1, true); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "model.diffusion_model", - version); + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model", + version); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { high_noise_diffusion_model = std::make_shared(backend, offload_params_to_cpu, @@ -492,20 +506,20 @@ class StableDiffusionGGML { "", enable_vision); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "model.diffusion_model", - version); + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model", + version); } else if (sd_version_is_z_image(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "model.diffusion_model", - version); + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model", + version); } else { // SD1.x SD2.x SDXL if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { cond_stage_model = std::make_shared(clip_backend, @@ -817,7 +831,7 @@ class StableDiffusionGGML { flow_shift = 3.f; } } - } else if (sd_version_is_flux(version)) { + } else if (sd_version_is_flux(version) || sd_version_is_longcat(version)) { pred_type = FLUX_FLOW_PRED; if (flow_shift == INFINITY) { flow_shift = 1.0f; // TODO: validate @@ -1334,7 +1348,7 @@ class StableDiffusionGGML { if (sd_version_is_sd3(version)) { latent_rgb_proj = sd3_latent_rgb_proj; latent_rgb_bias = sd3_latent_rgb_bias; - } else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) { + } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_longcat(version)) { latent_rgb_proj = flux_latent_rgb_proj; latent_rgb_bias = flux_latent_rgb_bias; } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) { From 2c449046b91a61ec674b6d34e06e2ee31daf1797 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 5 Dec 2025 20:58:53 +0100 Subject: [PATCH 07/12] temp fix cuda error on quant concat for splitlinear --- ggml_extend.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 31ae18a5a..4370a88f9 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2200,7 +2200,7 @@ class SplitLinear : public Linear { in_features(in_features), out_features_vec(out_features_vec), bias(bias), - force_f32(force_f32), + force_f32(true), force_prec_f32(force_prec_f32), scale(scale) {} From 00071aa7f5e64cd9540dfa6684a9049aa1d9382d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 6 Dec 2025 02:43:46 +0100 Subject: [PATCH 08/12] pre-patchify --- flux.hpp | 1 + stable-diffusion.cpp | 36 ++++++++++++++++++++++++++++++------ vae.hpp | 4 ++-- 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/flux.hpp b/flux.hpp index c8b27e444..0cedb787c 100644 --- a/flux.hpp +++ b/flux.hpp @@ -1239,6 +1239,7 @@ namespace Flux { } else if (sd_version_is_longcat(version)) { flux_params.context_in_dim = 3584; flux_params.vec_in_dim = 0; + flux_params.patch_size = 1; } for (auto pair : tensor_storage_map) { std::string tensor_name = pair.first; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index b23de473f..ced0775bd 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -449,10 +449,23 @@ class StableDiffusionGGML { tensor_storage_map, version); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - version, - sd_ctx_params->chroma_use_dit_mask); + offload_params_to_cpu, + tensor_storage_map, + version, + sd_ctx_params->chroma_use_dit_mask); + } else if (sd_version_is_longcat(version)) { + bool enable_vision = false; + cond_stage_model = std::make_shared(clip_backend, + offload_params_to_cpu, + tensor_storage_map, + version, + "", + enable_vision); + diffusion_model = std::make_shared(backend, + offload_params_to_cpu, + tensor_storage_map, + version, + sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_wan(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, @@ -827,6 +840,9 @@ class StableDiffusionGGML { flow_shift = 1.15f; } } + if(sd_version_is_longcat(version)) { + flow_shift = 3.0f; + } } } else if (sd_version_is_flux2(version)) { pred_type = FLUX2_FLOW_PRED; @@ -1325,7 +1341,13 @@ class StableDiffusionGGML { if (sd_version_is_flux2(version)) { latent_rgb_proj = flux2_latent_rgb_proj; latent_rgb_bias = flux2_latent_rgb_bias; - patch_sz = 2; + patch_sz = 2; + } + } else if (dim == 64) { + if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_longcat(version)) { + latent_rgb_proj = flux_latent_rgb_proj; + latent_rgb_bias = flux_latent_rgb_bias; + patch_sz = 2; } } else if (dim == 48) { if (sd_version_is_wan(version)) { @@ -1896,7 +1918,7 @@ class StableDiffusionGGML { int vae_scale_factor = 8; if (version == VERSION_WAN2_2_TI2V) { vae_scale_factor = 16; - } else if (sd_version_is_flux2(version)) { + } else if (sd_version_is_flux2(version) || sd_version_is_longcat(version)) { vae_scale_factor = 16; } else if (version == VERSION_CHROMA_RADIANCE) { vae_scale_factor = 1; @@ -1913,6 +1935,8 @@ class StableDiffusionGGML { latent_channel = 3; } else if (sd_version_is_flux2(version)) { latent_channel = 128; + } else if (sd_version_is_longcat(version)) { + latent_channel = 64; } else { latent_channel = 16; } diff --git a/vae.hpp b/vae.hpp index ad5db1b57..740a5655b 100644 --- a/vae.hpp +++ b/vae.hpp @@ -553,7 +553,7 @@ class AutoencodingEngine : public GGMLBlock { struct ggml_tensor* decode(GGMLRunnerContext* ctx, struct ggml_tensor* z) { // z: [N, z_channels, h, w] - if (sd_version_is_flux2(version)) { + if (sd_version_is_flux2(version) || sd_version_is_longcat(version)) { // [N, C*p*p, h, w] -> [N, C, h*p, w*p] int64_t p = 2; @@ -592,7 +592,7 @@ class AutoencodingEngine : public GGMLBlock { auto quant_conv = std::dynamic_pointer_cast(blocks["quant_conv"]); z = quant_conv->forward(ctx, z); // [N, 2*embed_dim, h/8, w/8] } - if (sd_version_is_flux2(version)) { + if (sd_version_is_flux2(version) || sd_version_is_longcat(version)) { z = ggml_ext_chunk(ctx->ggml_ctx, z, 2, 2)[0]; // [N, C, H, W] -> [N, C*p*p, H/p, W/p] From 7711efb4fb3a04072f62d1f26039d8152e376cb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 6 Dec 2025 02:44:20 +0100 Subject: [PATCH 09/12] longcat rope ids --- conditioner.hpp | 11 +++++++++++ flux.hpp | 7 +++---- ggml_extend.hpp | 26 +++++++++++++++++--------- rope.hpp | 35 +++++++++++++++++++++++++---------- 4 files changed, 56 insertions(+), 23 deletions(-) diff --git a/conditioner.hpp b/conditioner.hpp index 403120d9b..60ce2bc30 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -1825,6 +1825,17 @@ struct LLMEmbedder : public Conditioner { prompt_attn_range.second = prompt.size(); prompt += "[/INST]"; + } else if (sd_version_is_longcat(version)) { + prompt_template_encode_start_idx = 36; + // prompt_template_encode_end_idx = 5; + + prompt = "<|im_start|>system\nAs an image captioning expert, generate a descriptive text prompt based on an image content, suitable for input to a text-to-image model.<|im_end|>\n<|im_start|>user\n"; + + prompt_attn_range.first = static_cast(prompt.size()); + prompt += conditioner_params.text; + prompt_attn_range.second = static_cast(prompt.size()); + + prompt += "<|im_end|>\n<|im_start|>assistant\n"; } else { prompt_template_encode_start_idx = 34; diff --git a/flux.hpp b/flux.hpp index 0cedb787c..15795f058 100644 --- a/flux.hpp +++ b/flux.hpp @@ -1282,7 +1282,7 @@ namespace Flux { } if (flux_params.diffusers_style) { - LOG_INFO("Using diffusers-style naming"); + LOG_INFO("Using diffusers-style attention blocks"); } flux = Flux(flux_params); @@ -1388,7 +1388,6 @@ namespace Flux { for (int i = 0; i < ref_latents.size(); i++) { ref_latents[i] = to_backend(ref_latents[i]); } - pe_vec = Rope::gen_flux_pe(x->ne[1], x->ne[0], flux_params.patch_size, @@ -1398,9 +1397,9 @@ namespace Flux { sd_version_is_flux2(version) ? true : increase_ref_index, flux_params.ref_index_scale, flux_params.theta, - flux_params.axes_dim); + flux_params.axes_dim, + sd_version_is_longcat(version)); int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2; - // LOG_DEBUG("pos_len %d", pos_len); auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len); // pe->data = pe_vec.data(); // print_ggml_tensor(pe); diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 4370a88f9..fc571b1cc 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2200,7 +2200,7 @@ class SplitLinear : public Linear { in_features(in_features), out_features_vec(out_features_vec), bias(bias), - force_f32(true), + force_f32(force_f32), force_prec_f32(force_prec_f32), scale(scale) {} @@ -2210,21 +2210,29 @@ class SplitLinear : public Linear { if (bias) { b = params["bias"]; } - // concat all weights and biases together - for (int i = 1; i < out_features_vec.size(); i++) { - w = ggml_concat(ctx->ggml_ctx, w, params["weight." + std::to_string(i)], 1); - if (bias) { - b = ggml_concat(ctx->ggml_ctx, b, params["bias." + std::to_string(i)], 0); - } - } if (ctx->weight_adapter) { + // concat all weights and biases together so it runs in one linear layer + for (int i = 1; i < out_features_vec.size(); i++) { + w = ggml_concat(ctx->ggml_ctx, w, params["weight." + std::to_string(i)], 1); + if (bias) { + b = ggml_concat(ctx->ggml_ctx, b, params["bias." + std::to_string(i)], 0); + } + } WeightAdapter::ForwardParams forward_params; forward_params.op_type = WeightAdapter::ForwardParams::op_type_t::OP_LINEAR; forward_params.linear.force_prec_f32 = force_prec_f32; forward_params.linear.scale = scale; return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params); } - return ggml_ext_linear(ctx->ggml_ctx, x, w, b, force_prec_f32, scale); + auto x0 = ggml_ext_linear(ctx->ggml_ctx, x, w, b, force_prec_f32, scale); + for (int i = 1; i < out_features_vec.size(); i++) { + auto wi = params["weight." + std::to_string(i)]; + auto bi = bias ? params["bias." + std::to_string(i)] : nullptr; + auto xi = ggml_ext_linear(ctx->ggml_ctx, x, wi, bi, force_prec_f32, scale); + x0 = ggml_concat(ctx->ggml_ctx, x0, xi, 0); + } + + return x0; } }; diff --git a/rope.hpp b/rope.hpp index 7a35926eb..5739f409d 100644 --- a/rope.hpp +++ b/rope.hpp @@ -82,7 +82,16 @@ namespace Rope { return txt_ids; } - __STATIC_INLINE__ std::vector> gen_flux_img_ids(int h, + __STATIC_INLINE__ std::vector> gen_longcat_txt_ids(int bs, int context_len, int axes_dim_num) { + auto txt_ids = std::vector>(bs * context_len, std::vector(axes_dim_num, 0.0f)); + for (int i = 0; i < bs * context_len; i++) { + txt_ids[i][1] = (i % context_len); + txt_ids[i][2] = (i % context_len); + } + return txt_ids; + } + + __STATIC_INLINE__ std::vector> gen_flux_img_ids(int h, int w, int patch_size, int bs, @@ -92,7 +101,6 @@ namespace Rope { int w_offset = 0) { int h_len = (h + (patch_size / 2)) / patch_size; int w_len = (w + (patch_size / 2)) / patch_size; - std::vector> img_ids(h_len * w_len, std::vector(axes_dim_num, 0.0)); std::vector row_ids = linspace(h_offset, h_len - 1 + h_offset, h_len); @@ -167,13 +175,14 @@ namespace Rope { __STATIC_INLINE__ std::vector> gen_refs_ids(int patch_size, int bs, int axes_dim_num, + int start_index, const std::vector& ref_latents, bool increase_ref_index, float ref_index_scale) { std::vector> ids; uint64_t curr_h_offset = 0; uint64_t curr_w_offset = 0; - int index = 1; + int index = start_index; for (ggml_tensor* ref : ref_latents) { uint64_t h_offset = 0; uint64_t w_offset = 0; @@ -213,13 +222,17 @@ namespace Rope { int context_len, const std::vector& ref_latents, bool increase_ref_index, - float ref_index_scale) { - auto txt_ids = gen_flux_txt_ids(bs, context_len, axes_dim_num); - auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num); + float ref_index_scale, + bool is_longcat) { + int start_index = is_longcat ? 1 : 0; + + auto txt_ids = is_longcat ? gen_longcat_txt_ids(bs, context_len, axes_dim_num) : gen_flux_txt_ids(bs, context_len, axes_dim_num); + int offset = is_longcat ? context_len : 0; + auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num, start_index, offset, offset); auto ids = concat_ids(txt_ids, img_ids, bs); if (ref_latents.size() > 0) { - auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, ref_index_scale); + auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, start_index + 1, ref_latents, increase_ref_index, ref_index_scale); ids = concat_ids(ids, refs_ids, bs); } return ids; @@ -235,7 +248,8 @@ namespace Rope { bool increase_ref_index, float ref_index_scale, int theta, - const std::vector& axes_dim) { + const std::vector& axes_dim, + bool is_longcat) { std::vector> ids = gen_flux_ids(h, w, patch_size, @@ -244,7 +258,8 @@ namespace Rope { context_len, ref_latents, increase_ref_index, - ref_index_scale); + ref_index_scale, + is_longcat); return embed_nd(ids, bs, theta, axes_dim); } @@ -269,7 +284,7 @@ namespace Rope { auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num); auto ids = concat_ids(txt_ids_repeated, img_ids, bs); if (ref_latents.size() > 0) { - auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, 1.f); + auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, 1, ref_latents, increase_ref_index, 1.f); ids = concat_ids(ids, refs_ids, bs); } return ids; From 535543a152ca84f66b16b91c1d0194265efff97a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 6 Dec 2025 03:47:52 +0100 Subject: [PATCH 10/12] Fix diffusers_style detection --- flux.hpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/flux.hpp b/flux.hpp index 15795f058..bedd2e35a 100644 --- a/flux.hpp +++ b/flux.hpp @@ -88,19 +88,19 @@ namespace Flux { public: SelfAttention(int64_t dim, - int64_t num_heads = 8, - bool qkv_bias = false, - bool proj_bias = true, - bool diffusers_style = false) + int64_t num_heads = 8, + bool qkv_bias = false, + bool proj_bias = true, + bool diffusers_style = false) : num_heads(num_heads) { int64_t head_dim = dim / num_heads; - if(diffusers_style) { - blocks["qkv"] = std::shared_ptr(new SplitLinear(dim, {dim, dim, dim}, qkv_bias)); + if (diffusers_style) { + blocks["qkv"] = std::shared_ptr(new SplitLinear(dim, {dim, dim, dim}, qkv_bias)); } else { - blocks["qkv"] = std::shared_ptr(new Linear(dim, dim * 3, qkv_bias)); + blocks["qkv"] = std::shared_ptr(new Linear(dim, dim * 3, qkv_bias)); } - blocks["norm"] = std::shared_ptr(new QKNorm(head_dim)); - blocks["proj"] = std::shared_ptr(new Linear(dim, dim, proj_bias)); + blocks["norm"] = std::shared_ptr(new QKNorm(head_dim)); + blocks["proj"] = std::shared_ptr(new Linear(dim, dim, proj_bias)); } std::vector pre_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) { @@ -739,8 +739,8 @@ namespace Flux { bool share_modulation = false; bool use_mlp_silu_act = false; float ref_index_scale = 1.f; + bool diffusers_style = false; ChromaRadianceParams chroma_radiance_params; - bool diffusers_style = false; }; struct Flux : public GGMLBlock { @@ -1249,7 +1249,7 @@ namespace Flux { // not schnell flux_params.guidance_embed = true; } - if (tensor_name.find("model.diffusion_model.single_blocks.0.linear1.weight.1") == std::string::npos) { + if (tensor_name.find("model.diffusion_model.single_blocks.0.linear1.weight.1") != std::string::npos) { flux_params.diffusers_style = true; } if (tensor_name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) { @@ -1398,9 +1398,9 @@ namespace Flux { flux_params.ref_index_scale, flux_params.theta, flux_params.axes_dim, - sd_version_is_longcat(version)); + sd_version_is_longcat(version)); int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2; - auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len); + auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len); // pe->data = pe_vec.data(); // print_ggml_tensor(pe); // pe->data = nullptr; From 61b0dcf06f9e235078e7a7f9e32574d61949add1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 6 Dec 2025 16:05:58 +0100 Subject: [PATCH 11/12] Flux: simplify when patch_size is 1 --- flux.hpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/flux.hpp b/flux.hpp index bedd2e35a..ba1a3776b 100644 --- a/flux.hpp +++ b/flux.hpp @@ -843,6 +843,11 @@ namespace Flux { int64_t C = x->ne[2]; int64_t H = x->ne[1]; int64_t W = x->ne[0]; + if (params.patch_size == 1) { + x = ggml_reshape_3d(ctx, x, H * W, C, N); // [N, C, H*W] + x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); // [N, H*W, C] + return x; + } int64_t p = params.patch_size; int64_t h = H / params.patch_size; int64_t w = W / params.patch_size; @@ -877,6 +882,12 @@ namespace Flux { int64_t W = w * params.patch_size; int64_t p = params.patch_size; + if (params.patch_size == 1) { + x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); // [N, C, H*W] + x = ggml_reshape_4d(ctx, x, W, H, C, N); // [N, C, H, W] + return x; + } + GGML_ASSERT(C * p * p == x->ne[0]); x = ggml_reshape_4d(ctx, x, p * p, C, w * h, N); // [N, h*w, C, p*p] From deaf939bbade022ef151ea70642a2a1bc5f8de7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 6 Dec 2025 16:06:32 +0100 Subject: [PATCH 12/12] correct rope offset for image tokens --- rope.hpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/rope.hpp b/rope.hpp index 5739f409d..b487292dc 100644 --- a/rope.hpp +++ b/rope.hpp @@ -178,10 +178,11 @@ namespace Rope { int start_index, const std::vector& ref_latents, bool increase_ref_index, - float ref_index_scale) { + float ref_index_scale, + int base_offset = 0) { std::vector> ids; - uint64_t curr_h_offset = 0; - uint64_t curr_w_offset = 0; + uint64_t curr_h_offset = base_offset; + uint64_t curr_w_offset = base_offset; int index = start_index; for (ggml_tensor* ref : ref_latents) { uint64_t h_offset = 0; @@ -224,15 +225,15 @@ namespace Rope { bool increase_ref_index, float ref_index_scale, bool is_longcat) { - int start_index = is_longcat ? 1 : 0; + int x_index = is_longcat ? 1 : 0; auto txt_ids = is_longcat ? gen_longcat_txt_ids(bs, context_len, axes_dim_num) : gen_flux_txt_ids(bs, context_len, axes_dim_num); - int offset = is_longcat ? context_len : 0; - auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num, start_index, offset, offset); + int offset = is_longcat ? context_len + 1 : 0; + auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num, x_index, offset, offset); auto ids = concat_ids(txt_ids, img_ids, bs); if (ref_latents.size() > 0) { - auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, start_index + 1, ref_latents, increase_ref_index, ref_index_scale); + auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, x_index + 1, ref_latents, increase_ref_index, ref_index_scale, offset); ids = concat_ids(ids, refs_ids, bs); } return ids;