From 80784fbe313a02dacb8fb28b838d16bf0f812bdb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Tue, 2 Dec 2025 23:28:27 +0100
Subject: [PATCH 01/12] add Flux.2 VAE proj matrix for previews

---
 latent-preview.h | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/latent-preview.h b/latent-preview.h
index 97409a7d8..61c9434d5 100644
--- a/latent-preview.h
+++ b/latent-preview.h
@@ -91,6 +91,41 @@ const float flux_latent_rgb_proj[16][3] = {
     {-0.111849f, -0.055589f, -0.032361f}};
 float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f};
 
+const float flux_2_latent_rgb_proj[32][3] = {
+    {0.000736f, -0.008385f, -0.019710f},
+    {-0.001352f, -0.016392f, 0.020693f},
+    {-0.006376f, 0.002428f, 0.036736f},
+    {0.039384f, 0.074167f, 0.119789f},
+    {0.007464f, -0.005705f, -0.004734f},
+    {-0.004086f, 0.005287f, -0.000409f},
+    {-0.032835f, 0.050802f, -0.028120f},
+    {-0.003158f, -0.000835f, 0.000406f},
+    {-0.112840f, -0.084337f, -0.023083f},
+    {0.001462f, -0.006656f, 0.000549f},
+    {-0.009980f, -0.007480f, 0.009702f},
+    {0.032540f, 0.000214f, -0.061388f},
+    {0.011023f, 0.000694f, 0.007143f},
+    {-0.001468f, -0.006723f, -0.001678f},
+    {-0.005921f, -0.010320f, -0.003907f},
+    {-0.028434f, 0.027584f, 0.018457f},
+    {0.014349f, 0.011523f, 0.000441f},
+    {0.009874f, 0.003081f, 0.001507f},
+    {0.002218f, 0.005712f, 0.001563f},
+    {0.053010f, -0.019844f, 0.008683f},
+    {-0.002507f, 0.005384f, 0.000938f},
+    {-0.002177f, -0.011366f, 0.003559f},
+    {-0.000261f, 0.015121f, -0.003240f},
+    {-0.003944f, -0.002083f, 0.005043f},
+    {-0.009138f, 0.011336f, 0.003781f},
+    {0.011429f, 0.003985f, -0.003855f},
+    {0.010518f, -0.005586f, 0.010131f},
+    {0.007883f, 0.002912f, -0.001473f},
+    {-0.003318f, -0.003160f, 0.003684f},
+    {-0.034560f, -0.008740f, 0.012996f},
+    {0.000166f, 0.001079f, -0.012153f},
+    {0.017772f, 0.000937f, -0.011953f}};
+float flux_2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f};
+
 // This one was taken straight from
 // https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303
 // (MiT Licence)

From beef32251e5be8ad21a2ccfeaf284d3fe3e01a98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Tue, 2 Dec 2025 23:29:36 +0100
Subject: [PATCH 02/12] Enable Flux.2 proj for preview with Flux.2 models

---
 latent-preview.h     | 4 ++--
 stable-diffusion.cpp | 5 +++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/latent-preview.h b/latent-preview.h
index 61c9434d5..0f9f27380 100644
--- a/latent-preview.h
+++ b/latent-preview.h
@@ -91,7 +91,7 @@ const float flux_latent_rgb_proj[16][3] = {
     {-0.111849f, -0.055589f, -0.032361f}};
 float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f};
 
-const float flux_2_latent_rgb_proj[32][3] = {
+const float flux2_latent_rgb_proj[32][3] = {
     {0.000736f, -0.008385f, -0.019710f},
     {-0.001352f, -0.016392f, 0.020693f},
     {-0.006376f, 0.002428f, 0.036736f},
@@ -124,7 +124,7 @@ const float flux_2_latent_rgb_proj[32][3] = {
     {-0.034560f, -0.008740f, 0.012996f},
     {0.000166f, 0.001079f, -0.012153f},
     {0.017772f, 0.000937f, -0.011953f}};
-float flux_2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f};
+float flux2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f};
 
 // This one was taken straight from
 // https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 73065610d..2a1a7e9dc 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1349,6 +1349,11 @@ class StableDiffusionGGML {
                     // unknown model
                     return;
                 }
+            } else if (dim == 32) {
+                if (sd_version_is_flux2(version)) {
+                    latent_rgb_proj = flux2_latent_rgb_proj;
+                    latent_rgb_bias = flux2_latent_rgb_bias;
+                }
             } else if (dim == 16) {
                 // 16 channels VAE -> Flux or SD3
 

From 77e4620a897b2bed282c76af7dabd77534117f68 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Tue, 2 Dec 2025 23:29:41 +0100
Subject: [PATCH 03/12] support Flux.2 patched latents for proj preview

---
 stable-diffusion.cpp | 125 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 123 insertions(+), 2 deletions(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 2a1a7e9dc..eca523a38 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1337,8 +1337,72 @@ class StableDiffusionGGML {
         uint32_t dim           = latents->ne[ggml_n_dims(latents) - 1];
 
         if (preview_mode == PREVIEW_PROJ) {
-            const float(*latent_rgb_proj)[channel] = nullptr;
-            float* latent_rgb_bias                 = nullptr;
+            int64_t patch_sz = 1;
+            if (sd_version_is_flux2(version)) {
+                patch_sz = 2;
+            }
+            if (patch_sz != 1) {
+                // unshuffle latents
+                const int64_t N    = latents->ne[3];
+                const int64_t C_in = latents->ne[2];
+                const int64_t H_in = latents->ne[1];
+                const int64_t W_in = latents->ne[0];
+
+                const int64_t C_out = C_in / (patch_sz * patch_sz);
+                const int64_t H_out = H_in * patch_sz;
+                const int64_t W_out = W_in * patch_sz;
+
+                const char* src_ptr = (char*)latents->data;
+                size_t elem_size    = latents->nb[0];
+
+                std::vector<char> dst_buffer(N * C_out * H_out * W_out * elem_size);
+                char* dst_base = dst_buffer.data();
+
+                size_t dst_stride_w = elem_size;
+                size_t dst_stride_h = dst_stride_w * W_out;
+                size_t dst_stride_c = dst_stride_h * H_out;
+                size_t dst_stride_n = dst_stride_c * C_out;
+
+                size_t dst_step_w = dst_stride_w * patch_sz;
+                size_t dst_step_h = dst_stride_h * patch_sz;
+
+                for (int64_t n = 0; n < N; ++n) {
+                    for (int64_t c = 0; c < C_in; ++c) {
+                        int64_t c_out = c / (patch_sz * patch_sz);
+                        int64_t rem   = c % (patch_sz * patch_sz);
+                        int64_t py    = rem / patch_sz;
+                        int64_t px    = rem % patch_sz;
+
+                        char* dst_layer = dst_base + n * dst_stride_n + c_out * dst_stride_c + py * dst_stride_h + px * dst_stride_w;
+
+                        for (int64_t y = 0; y < H_in; ++y) {
+                            char* dst_row = dst_layer + y * dst_step_h;
+
+                            for (int64_t x = 0; x < W_in; ++x) {
+                                memcpy(dst_row + x * dst_step_w, src_ptr, elem_size);
+                                src_ptr += elem_size;
+                            }
+                        }
+                    }
+                }
+
+                memcpy(latents->data, dst_buffer.data(), dst_buffer.size());
+
+                latents->ne[0] = W_out;
+                latents->ne[1] = H_out;
+                latents->ne[2] = C_out;
+
+                latents->nb[0] = dst_stride_w;
+                latents->nb[1] = dst_stride_h;
+                latents->nb[2] = dst_stride_c;
+                latents->nb[3] = dst_stride_n;
+
+                width  = W_out;
+                height = H_out;
+                dim    = C_out;
+            }
+            const float (*latent_rgb_proj)[channel] = nullptr;
+            float* latent_rgb_bias                  = nullptr;
 
             if (dim == 48) {
                 if (sd_version_is_wan(version)) {
@@ -1408,6 +1472,63 @@ class StableDiffusionGGML {
             step_callback(step, frames, images, is_noisy, step_callback_data);
             free(data);
             free(images);
+
+            if (patch_sz != 1) {
+                // restore shuffled latents
+                const int64_t N        = latents->ne[3];
+                const int64_t C_in     = latents->ne[2];
+                const int64_t H_in     = latents->ne[1];
+                const int64_t W_in     = latents->ne[0];
+
+                const int64_t C_out = C_in * patch_sz * patch_sz;
+                const int64_t H_out   = H_in / patch_sz;
+                const int64_t W_out   = W_in / patch_sz;
+
+                const char* src_base   = (char*)latents->data;
+                const size_t elem_size = latents->nb[0];
+
+                const size_t src_stride_w = latents->nb[0];
+                const size_t src_stride_h = latents->nb[1];
+                const size_t src_stride_c = latents->nb[2];
+                const size_t src_stride_n = latents->nb[3];
+
+                std::vector<char> dst_buffer(N * C_out * H_out * W_out * elem_size);
+                char* dst_ptr = dst_buffer.data();
+
+                const size_t src_step_h = src_stride_h * patch_sz;
+                const size_t src_step_w = src_stride_w * patch_sz;
+
+                for (int64_t n = 0; n < N; ++n) {
+                    for (int64_t c = 0; c < C_out; ++c) {
+                        int64_t c_rem = c % (patch_sz * patch_sz);
+                        int64_t c_in  = c / (patch_sz * patch_sz);
+                        int64_t py    = c_rem / patch_sz;
+                        int64_t px    = c_rem % patch_sz;
+
+                        const char* src_layer = src_base + n * src_stride_n + c_in * src_stride_c + py * src_stride_h + px * src_stride_w;
+
+                        for (int64_t y = 0; y < H_out; ++y) {
+                            const char* src_row = src_layer + y * src_step_h;
+
+                            for (int64_t x = 0; x < W_out; ++x) {
+                                memcpy(dst_ptr, src_row + x * src_step_w, elem_size);
+                                dst_ptr += elem_size;
+                            }
+                        }
+                    }
+                }
+
+                memcpy(latents->data, dst_buffer.data(), dst_buffer.size());
+
+                latents->ne[0] = W_out;
+                latents->ne[1] = H_out;
+                latents->ne[2] = C_out;
+
+                latents->nb[0] = elem_size;
+                latents->nb[1] = latents->nb[0] * W_out;
+                latents->nb[2] = latents->nb[1] * H_out;
+                latents->nb[3] = latents->nb[2] * C_out;
+            }
         } else {
             if (preview_mode == PREVIEW_VAE) {
                 process_latent_out(latents);

From da8e95ebdeaba167de31aebf30e6c7d7d3183b48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Tue, 2 Dec 2025 23:29:43 +0100
Subject: [PATCH 04/12] move latent shuffle logic to latent-preview.h

---
 latent-preview.h     | 127 +++++++++++++++++++++++++++++++++++++++++++
 stable-diffusion.cpp | 116 +++------------------------------------
 2 files changed, 136 insertions(+), 107 deletions(-)

diff --git a/latent-preview.h b/latent-preview.h
index 0f9f27380..52c8c8c9f 100644
--- a/latent-preview.h
+++ b/latent-preview.h
@@ -163,6 +163,133 @@ const float sd_latent_rgb_proj[4][3] = {
     {-0.178022f, -0.200862f, -0.678514f}};
 float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f};
 
+void unpatchify_latents(ggml_tensor* latents, int patch_size, char* dst_buf) {
+    const int64_t N    = latents->ne[3];
+    const int64_t C_in = latents->ne[2];
+    const int64_t H_in = latents->ne[1];
+    const int64_t W_in = latents->ne[0];
+
+    const int64_t C_out = C_in / (patch_size * patch_size);
+    const int64_t H_out = H_in * patch_size;
+    const int64_t W_out = W_in * patch_size;
+
+    const char* src_ptr = (char*)latents->data;
+    size_t elem_size    = latents->nb[0];
+
+    bool alloc_dst_buf  = dst_buf == nullptr;
+    size_t dst_buf_size = latents->nb[3];
+    if (alloc_dst_buf) {
+        dst_buf = (char*)malloc(dst_buf_size);
+    }
+
+    size_t dst_stride_w = elem_size;
+    size_t dst_stride_h = dst_stride_w * W_out;
+    size_t dst_stride_c = dst_stride_h * H_out;
+    size_t dst_stride_n = dst_stride_c * C_out;
+
+    size_t dst_step_w = dst_stride_w * patch_size;
+    size_t dst_step_h = dst_stride_h * patch_size;
+
+    for (int64_t n = 0; n < N; ++n) {
+        for (int64_t c = 0; c < C_in; ++c) {
+            int64_t c_out = c / (patch_size * patch_size);
+            int64_t rem   = c % (patch_size * patch_size);
+            int64_t py    = rem / patch_size;
+            int64_t px    = rem % patch_size;
+
+            char* dst_layer = dst_buf + n * dst_stride_n + c_out * dst_stride_c + py * dst_stride_h + px * dst_stride_w;
+
+            for (int64_t y = 0; y < H_in; ++y) {
+                char* dst_row = dst_layer + y * dst_step_h;
+
+                for (int64_t x = 0; x < W_in; ++x) {
+                    memcpy(dst_row + x * dst_step_w, src_ptr, elem_size);
+                    src_ptr += elem_size;
+                }
+            }
+        }
+    }
+
+    memcpy(latents->data, dst_buf, dst_buf_size);
+
+    latents->ne[0] = W_out;
+    latents->ne[1] = H_out;
+    latents->ne[2] = C_out;
+
+    latents->nb[0] = dst_stride_w;
+    latents->nb[1] = dst_stride_h;
+    latents->nb[2] = dst_stride_c;
+    latents->nb[3] = dst_stride_n;
+    if (alloc_dst_buf) {
+        free(dst_buf);
+    }
+}
+
+void repatchify_latents(ggml_tensor* latents, int patch_size, char* dst_buf) {
+    const int64_t N     = latents->ne[3];
+    const int64_t C_in  = latents->ne[2];
+    const int64_t H_in  = latents->ne[1];
+    const int64_t W_in  = latents->ne[0];
+    
+    const int64_t C_out = C_in * patch_size * patch_size;
+    const int64_t H_out = H_in / patch_size;
+    const int64_t W_out = W_in / patch_size;
+
+    const char* src_base   = (char*)latents->data;
+    const size_t elem_size = latents->nb[0];
+
+    const size_t src_stride_w = latents->nb[0];
+    const size_t src_stride_h = latents->nb[1];
+    const size_t src_stride_c = latents->nb[2];
+    const size_t src_stride_n = latents->nb[3];
+
+    bool alloc_dst_buf  = dst_buf == nullptr;
+    size_t dst_buf_size = src_stride_n;
+    if (alloc_dst_buf) {
+        dst_buf = (char*)malloc(dst_buf_size);
+    }
+
+    char* dst_ptr = dst_buf;
+
+    const size_t src_step_h = src_stride_h * patch_size;
+    const size_t src_step_w = src_stride_w * patch_size;
+
+    for (int64_t n = 0; n < N; ++n) {
+        for (int64_t c = 0; c < C_out; ++c) {
+            int64_t c_rem = c % (patch_size * patch_size);
+            int64_t c_in  = c / (patch_size * patch_size);
+            int64_t py    = c_rem / patch_size;
+            int64_t px    = c_rem % patch_size;
+
+            const char* src_layer = src_base + n * src_stride_n + c_in * src_stride_c + py * src_stride_h + px * src_stride_w;
+
+            for (int64_t y = 0; y < H_out; ++y) {
+                const char* src_row = src_layer + y * src_step_h;
+
+                for (int64_t x = 0; x < W_out; ++x) {
+                    memcpy(dst_ptr, src_row + x * src_step_w, elem_size);
+                    dst_ptr += elem_size;
+                }
+            }
+        }
+    }
+
+    memcpy(latents->data, dst_buf, dst_buf_size);
+
+    latents->ne[0] = W_out;
+    latents->ne[1] = H_out;
+    latents->ne[2] = C_out;
+
+    latents->nb[0] = elem_size;
+    latents->nb[1] = latents->nb[0] * W_out;
+    latents->nb[2] = latents->nb[1] * H_out;
+    latents->nb[3] = latents->nb[2] * C_out;
+
+    if (alloc_dst_buf) {
+        free(dst_buf);
+    }
+}
+
 void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) {
     size_t buffer_head = 0;
     for (int k = 0; k < frames; k++) {
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index eca523a38..362f0a9a9 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1343,63 +1343,14 @@ class StableDiffusionGGML {
             }
             if (patch_sz != 1) {
                 // unshuffle latents
-                const int64_t N    = latents->ne[3];
-                const int64_t C_in = latents->ne[2];
-                const int64_t H_in = latents->ne[1];
-                const int64_t W_in = latents->ne[0];
+                std::vector<char> dst_buffer(latents->nb[GGML_MAX_DIMS-1]);
+                char* dst_buf = dst_buffer.data();
 
-                const int64_t C_out = C_in / (patch_sz * patch_sz);
-                const int64_t H_out = H_in * patch_sz;
-                const int64_t W_out = W_in * patch_sz;
+                unpatchify_latents(latents, patch_sz, dst_buf);
 
-                const char* src_ptr = (char*)latents->data;
-                size_t elem_size    = latents->nb[0];
-
-                std::vector<char> dst_buffer(N * C_out * H_out * W_out * elem_size);
-                char* dst_base = dst_buffer.data();
-
-                size_t dst_stride_w = elem_size;
-                size_t dst_stride_h = dst_stride_w * W_out;
-                size_t dst_stride_c = dst_stride_h * H_out;
-                size_t dst_stride_n = dst_stride_c * C_out;
-
-                size_t dst_step_w = dst_stride_w * patch_sz;
-                size_t dst_step_h = dst_stride_h * patch_sz;
-
-                for (int64_t n = 0; n < N; ++n) {
-                    for (int64_t c = 0; c < C_in; ++c) {
-                        int64_t c_out = c / (patch_sz * patch_sz);
-                        int64_t rem   = c % (patch_sz * patch_sz);
-                        int64_t py    = rem / patch_sz;
-                        int64_t px    = rem % patch_sz;
-
-                        char* dst_layer = dst_base + n * dst_stride_n + c_out * dst_stride_c + py * dst_stride_h + px * dst_stride_w;
-
-                        for (int64_t y = 0; y < H_in; ++y) {
-                            char* dst_row = dst_layer + y * dst_step_h;
-
-                            for (int64_t x = 0; x < W_in; ++x) {
-                                memcpy(dst_row + x * dst_step_w, src_ptr, elem_size);
-                                src_ptr += elem_size;
-                            }
-                        }
-                    }
-                }
-
-                memcpy(latents->data, dst_buffer.data(), dst_buffer.size());
-
-                latents->ne[0] = W_out;
-                latents->ne[1] = H_out;
-                latents->ne[2] = C_out;
-
-                latents->nb[0] = dst_stride_w;
-                latents->nb[1] = dst_stride_h;
-                latents->nb[2] = dst_stride_c;
-                latents->nb[3] = dst_stride_n;
-
-                width  = W_out;
-                height = H_out;
-                dim    = C_out;
+                width  = latents->ne[0];
+                height = latents->ne[1];
+                dim    = latents->ne[ggml_n_dims(latents) - 1];
             }
             const float (*latent_rgb_proj)[channel] = nullptr;
             float* latent_rgb_bias                  = nullptr;
@@ -1475,59 +1426,10 @@ class StableDiffusionGGML {
 
             if (patch_sz != 1) {
                 // restore shuffled latents
-                const int64_t N        = latents->ne[3];
-                const int64_t C_in     = latents->ne[2];
-                const int64_t H_in     = latents->ne[1];
-                const int64_t W_in     = latents->ne[0];
-
-                const int64_t C_out = C_in * patch_sz * patch_sz;
-                const int64_t H_out   = H_in / patch_sz;
-                const int64_t W_out   = W_in / patch_sz;
-
-                const char* src_base   = (char*)latents->data;
-                const size_t elem_size = latents->nb[0];
-
-                const size_t src_stride_w = latents->nb[0];
-                const size_t src_stride_h = latents->nb[1];
-                const size_t src_stride_c = latents->nb[2];
-                const size_t src_stride_n = latents->nb[3];
-
-                std::vector<char> dst_buffer(N * C_out * H_out * W_out * elem_size);
-                char* dst_ptr = dst_buffer.data();
-
-                const size_t src_step_h = src_stride_h * patch_sz;
-                const size_t src_step_w = src_stride_w * patch_sz;
-
-                for (int64_t n = 0; n < N; ++n) {
-                    for (int64_t c = 0; c < C_out; ++c) {
-                        int64_t c_rem = c % (patch_sz * patch_sz);
-                        int64_t c_in  = c / (patch_sz * patch_sz);
-                        int64_t py    = c_rem / patch_sz;
-                        int64_t px    = c_rem % patch_sz;
-
-                        const char* src_layer = src_base + n * src_stride_n + c_in * src_stride_c + py * src_stride_h + px * src_stride_w;
-
-                        for (int64_t y = 0; y < H_out; ++y) {
-                            const char* src_row = src_layer + y * src_step_h;
-
-                            for (int64_t x = 0; x < W_out; ++x) {
-                                memcpy(dst_ptr, src_row + x * src_step_w, elem_size);
-                                dst_ptr += elem_size;
-                            }
-                        }
-                    }
-                }
-
-                memcpy(latents->data, dst_buffer.data(), dst_buffer.size());
-
-                latents->ne[0] = W_out;
-                latents->ne[1] = H_out;
-                latents->ne[2] = C_out;
+                std::vector<char> dst_buffer(latents->nb[GGML_MAX_DIMS-1]);
+                char* dst_buf = dst_buffer.data();
 
-                latents->nb[0] = elem_size;
-                latents->nb[1] = latents->nb[0] * W_out;
-                latents->nb[2] = latents->nb[1] * H_out;
-                latents->nb[3] = latents->nb[2] * C_out;
+                repatchify_latents(latents, patch_sz, dst_buf);
             }
         } else {
             if (preview_mode == PREVIEW_VAE) {

From c054c2367801275f1c1c3b67060a18f84c905c04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Tue, 2 Dec 2025 23:29:44 +0100
Subject: [PATCH 05/12] refactor preview_latent_video to support flux.2
 patchified latents

---
 latent-preview.h     | 156 ++++++++-----------------------------------
 stable-diffusion.cpp | 129 ++++++++++++++++-------------------
 2 files changed, 86 insertions(+), 199 deletions(-)

diff --git a/latent-preview.h b/latent-preview.h
index 52c8c8c9f..8354a35e0 100644
--- a/latent-preview.h
+++ b/latent-preview.h
@@ -163,143 +163,43 @@ const float sd_latent_rgb_proj[4][3] = {
     {-0.178022f, -0.200862f, -0.678514f}};
 float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f};
 
-void unpatchify_latents(ggml_tensor* latents, int patch_size, char* dst_buf) {
-    const int64_t N    = latents->ne[3];
-    const int64_t C_in = latents->ne[2];
-    const int64_t H_in = latents->ne[1];
-    const int64_t W_in = latents->ne[0];
 
-    const int64_t C_out = C_in / (patch_size * patch_size);
-    const int64_t H_out = H_in * patch_size;
-    const int64_t W_out = W_in * patch_size;
-
-    const char* src_ptr = (char*)latents->data;
-    size_t elem_size    = latents->nb[0];
-
-    bool alloc_dst_buf  = dst_buf == nullptr;
-    size_t dst_buf_size = latents->nb[3];
-    if (alloc_dst_buf) {
-        dst_buf = (char*)malloc(dst_buf_size);
-    }
-
-    size_t dst_stride_w = elem_size;
-    size_t dst_stride_h = dst_stride_w * W_out;
-    size_t dst_stride_c = dst_stride_h * H_out;
-    size_t dst_stride_n = dst_stride_c * C_out;
-
-    size_t dst_step_w = dst_stride_w * patch_size;
-    size_t dst_step_h = dst_stride_h * patch_size;
-
-    for (int64_t n = 0; n < N; ++n) {
-        for (int64_t c = 0; c < C_in; ++c) {
-            int64_t c_out = c / (patch_size * patch_size);
-            int64_t rem   = c % (patch_size * patch_size);
-            int64_t py    = rem / patch_size;
-            int64_t px    = rem % patch_size;
-
-            char* dst_layer = dst_buf + n * dst_stride_n + c_out * dst_stride_c + py * dst_stride_h + px * dst_stride_w;
-
-            for (int64_t y = 0; y < H_in; ++y) {
-                char* dst_row = dst_layer + y * dst_step_h;
-
-                for (int64_t x = 0; x < W_in; ++x) {
-                    memcpy(dst_row + x * dst_step_w, src_ptr, elem_size);
-                    src_ptr += elem_size;
-                }
-            }
-        }
-    }
-
-    memcpy(latents->data, dst_buf, dst_buf_size);
-
-    latents->ne[0] = W_out;
-    latents->ne[1] = H_out;
-    latents->ne[2] = C_out;
-
-    latents->nb[0] = dst_stride_w;
-    latents->nb[1] = dst_stride_h;
-    latents->nb[2] = dst_stride_c;
-    latents->nb[3] = dst_stride_n;
-    if (alloc_dst_buf) {
-        free(dst_buf);
-    }
-}
-
-void repatchify_latents(ggml_tensor* latents, int patch_size, char* dst_buf) {
-    const int64_t N     = latents->ne[3];
-    const int64_t C_in  = latents->ne[2];
-    const int64_t H_in  = latents->ne[1];
-    const int64_t W_in  = latents->ne[0];
-    
-    const int64_t C_out = C_in * patch_size * patch_size;
-    const int64_t H_out = H_in / patch_size;
-    const int64_t W_out = W_in / patch_size;
-
-    const char* src_base   = (char*)latents->data;
-    const size_t elem_size = latents->nb[0];
-
-    const size_t src_stride_w = latents->nb[0];
-    const size_t src_stride_h = latents->nb[1];
-    const size_t src_stride_c = latents->nb[2];
-    const size_t src_stride_n = latents->nb[3];
+void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) {
+    size_t buffer_head = 0;
 
-    bool alloc_dst_buf  = dst_buf == nullptr;
-    size_t dst_buf_size = src_stride_n;
-    if (alloc_dst_buf) {
-        dst_buf = (char*)malloc(dst_buf_size);
+    uint32_t latent_width  = latents->ne[0];
+    uint32_t latent_height = latents->ne[1];
+    uint32_t dim           = latents->ne[ggml_n_dims(latents) - 1];
+    uint32_t frames        = 1;
+    if (ggml_n_dims(latents) == 4) {
+        frames = latents->ne[2];
     }
 
-    char* dst_ptr = dst_buf;
-
-    const size_t src_step_h = src_stride_h * patch_size;
-    const size_t src_step_w = src_stride_w * patch_size;
+    uint32_t rgb_width  = latent_width * patch_size;
+    uint32_t rgb_height = latent_height * patch_size;
 
-    for (int64_t n = 0; n < N; ++n) {
-        for (int64_t c = 0; c < C_out; ++c) {
-            int64_t c_rem = c % (patch_size * patch_size);
-            int64_t c_in  = c / (patch_size * patch_size);
-            int64_t py    = c_rem / patch_size;
-            int64_t px    = c_rem % patch_size;
+    uint32_t unpatched_dim = dim / (patch_size * patch_size);
 
-            const char* src_layer = src_base + n * src_stride_n + c_in * src_stride_c + py * src_stride_h + px * src_stride_w;
-
-            for (int64_t y = 0; y < H_out; ++y) {
-                const char* src_row = src_layer + y * src_step_h;
-
-                for (int64_t x = 0; x < W_out; ++x) {
-                    memcpy(dst_ptr, src_row + x * src_step_w, elem_size);
-                    dst_ptr += elem_size;
+    for (int k = 0; k < frames; k++) {
+        for (int rgb_x = 0; rgb_x < rgb_width; rgb_x++) {
+            for (int rgb_y = 0; rgb_y < rgb_height; rgb_y++) {
+                int latent_x = rgb_x / patch_size;
+                int latent_y = rgb_y / patch_size;
+
+                int channel_offset = 0;
+                if (patch_size > 1) {
+                    channel_offset = ((rgb_y % patch_size) * patch_size + (rgb_x % patch_size));
                 }
-            }
-        }
-    }
-
-    memcpy(latents->data, dst_buf, dst_buf_size);
-
-    latents->ne[0] = W_out;
-    latents->ne[1] = H_out;
-    latents->ne[2] = C_out;
 
-    latents->nb[0] = elem_size;
-    latents->nb[1] = latents->nb[0] * W_out;
-    latents->nb[2] = latents->nb[1] * H_out;
-    latents->nb[3] = latents->nb[2] * C_out;
+                size_t latent_id = (latent_x * latents->nb[0] + latent_y * latents->nb[1] + k * latents->nb[2]);
 
-    if (alloc_dst_buf) {
-        free(dst_buf);
-    }
-}
+                // should be incremented by 1 for each pixel
+                size_t pixel_id = k * rgb_width * rgb_height + rgb_y * rgb_width + rgb_x;
 
-void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) {
-    size_t buffer_head = 0;
-    for (int k = 0; k < frames; k++) {
-        for (int j = 0; j < height; j++) {
-            for (int i = 0; i < width; i++) {
-                size_t latent_id = (i * latents->nb[0] + j * latents->nb[1] + k * latents->nb[2]);
                 float r = 0, g = 0, b = 0;
                 if (latent_rgb_proj != nullptr) {
-                    for (int d = 0; d < dim; d++) {
-                        float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]);
+                    for (int d = 0; d < unpatched_dim; d++) {
+                        float value = *(float*)((char*)latents->data + latent_id + (d * patch_size * patch_size + channel_offset) * latents->nb[ggml_n_dims(latents) - 1]);
                         r += value * latent_rgb_proj[d][0];
                         g += value * latent_rgb_proj[d][1];
                         b += value * latent_rgb_proj[d][2];
@@ -326,9 +226,9 @@ void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const fl
                 g = g >= 0 ? g <= 1 ? g : 1 : 0;
                 b = b >= 0 ? b <= 1 ? b : 1 : 0;
 
-                buffer[buffer_head++] = (uint8_t)(r * 255);
-                buffer[buffer_head++] = (uint8_t)(g * 255);
-                buffer[buffer_head++] = (uint8_t)(b * 255);
+                buffer[pixel_id * 3 + 0] = (uint8_t)(r * 255);
+                buffer[pixel_id * 3 + 1] = (uint8_t)(g * 255);
+                buffer[pixel_id * 3 + 2] = (uint8_t)(b * 255);
             }
         }
     }
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 362f0a9a9..ec1d38e64 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -406,8 +406,8 @@ class StableDiffusionGGML {
                                                                      offload_params_to_cpu,
                                                                      tensor_storage_map);
                 diffusion_model  = std::make_shared<MMDiTModel>(backend,
-                                                               offload_params_to_cpu,
-                                                               tensor_storage_map);
+                                                                offload_params_to_cpu,
+                                                                tensor_storage_map);
             } else if (sd_version_is_flux(version)) {
                 bool is_chroma = false;
                 for (auto pair : tensor_storage_map) {
@@ -448,10 +448,10 @@ class StableDiffusionGGML {
                                                                  tensor_storage_map,
                                                                  version);
                 diffusion_model  = std::make_shared<FluxModel>(backend,
-                                                              offload_params_to_cpu,
-                                                              tensor_storage_map,
-                                                              version,
-                                                              sd_ctx_params->chroma_use_dit_mask);
+                                                               offload_params_to_cpu,
+                                                               tensor_storage_map,
+                                                               version,
+                                                               sd_ctx_params->chroma_use_dit_mask);
             } else if (sd_version_is_wan(version)) {
                 cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                     offload_params_to_cpu,
@@ -460,10 +460,10 @@ class StableDiffusionGGML {
                                                                     1,
                                                                     true);
                 diffusion_model  = std::make_shared<WanModel>(backend,
-                                                             offload_params_to_cpu,
-                                                             tensor_storage_map,
-                                                             "model.diffusion_model",
-                                                             version);
+                                                              offload_params_to_cpu,
+                                                              tensor_storage_map,
+                                                              "model.diffusion_model",
+                                                              version);
                 if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) {
                     high_noise_diffusion_model = std::make_shared<WanModel>(backend,
                                                                             offload_params_to_cpu,
@@ -492,20 +492,20 @@ class StableDiffusionGGML {
                                                                  "",
                                                                  enable_vision);
                 diffusion_model  = std::make_shared<QwenImageModel>(backend,
-                                                                   offload_params_to_cpu,
-                                                                   tensor_storage_map,
-                                                                   "model.diffusion_model",
-                                                                   version);
+                                                                    offload_params_to_cpu,
+                                                                    tensor_storage_map,
+                                                                    "model.diffusion_model",
+                                                                    version);
             } else if (sd_version_is_z_image(version)) {
                 cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
                                                                  offload_params_to_cpu,
                                                                  tensor_storage_map,
                                                                  version);
                 diffusion_model  = std::make_shared<ZImageModel>(backend,
-                                                                offload_params_to_cpu,
-                                                                tensor_storage_map,
-                                                                "model.diffusion_model",
-                                                                version);
+                                                                 offload_params_to_cpu,
+                                                                 tensor_storage_map,
+                                                                 "model.diffusion_model",
+                                                                 version);
             } else {  // SD1.x SD2.x SDXL
                 if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
                     cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
@@ -1338,24 +1338,16 @@ class StableDiffusionGGML {
 
         if (preview_mode == PREVIEW_PROJ) {
             int64_t patch_sz = 1;
-            if (sd_version_is_flux2(version)) {
-                patch_sz = 2;
-            }
-            if (patch_sz != 1) {
-                // unshuffle latents
-                std::vector<char> dst_buffer(latents->nb[GGML_MAX_DIMS-1]);
-                char* dst_buf = dst_buffer.data();
-
-                unpatchify_latents(latents, patch_sz, dst_buf);
-
-                width  = latents->ne[0];
-                height = latents->ne[1];
-                dim    = latents->ne[ggml_n_dims(latents) - 1];
-            }
             const float (*latent_rgb_proj)[channel] = nullptr;
             float* latent_rgb_bias                  = nullptr;
 
-            if (dim == 48) {
+            if (dim == 128) {
+                if (sd_version_is_flux2(version)) {
+                    latent_rgb_proj = flux2_latent_rgb_proj;
+                    latent_rgb_bias = flux2_latent_rgb_bias;
+                    patch_sz = 2;
+                }
+            } else if (dim == 48) {
                 if (sd_version_is_wan(version)) {
                     latent_rgb_proj = wan_22_latent_rgb_proj;
                     latent_rgb_bias = wan_22_latent_rgb_bias;
@@ -1413,24 +1405,19 @@ class StableDiffusionGGML {
                 frames = latents->ne[2];
             }
 
-            uint8_t* data = (uint8_t*)malloc(frames * width * height * channel * sizeof(uint8_t));
+            uint32_t img_width  = width * patch_sz;
+            uint32_t img_height = height * patch_sz;
 
-            preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, width, height, frames, dim);
+            uint8_t* data = (uint8_t*)malloc(frames * img_width * img_height * channel * sizeof(uint8_t));
+
+            preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, patch_sz);
             sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t));
             for (int i = 0; i < frames; i++) {
-                images[i] = {width, height, channel, data + i * width * height * channel};
+                images[i] = {img_width, img_height, channel, data + i * img_width * img_height * channel};
             }
             step_callback(step, frames, images, is_noisy, step_callback_data);
             free(data);
             free(images);
-
-            if (patch_sz != 1) {
-                // restore shuffled latents
-                std::vector<char> dst_buffer(latents->nb[GGML_MAX_DIMS-1]);
-                char* dst_buf = dst_buffer.data();
-
-                repatchify_latents(latents, patch_sz, dst_buf);
-            }
         } else {
             if (preview_mode == PREVIEW_VAE) {
                 process_latent_out(latents);
@@ -1970,12 +1957,12 @@ class StableDiffusionGGML {
                                 -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f,
                                 0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f};
             latents_std_vec  = {
-                 0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
-                 0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
-                 0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
-                 0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
-                 0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
-                 0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f};
+                0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
+                0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
+                0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
+                0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
+                0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
+                0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f};
         } else if (latent->ne[channel_dim] == 128) {
             // flux2
             latents_mean_vec = {-0.0676f, -0.0715f, -0.0753f, -0.0745f, 0.0223f, 0.0180f, 0.0142f, 0.0184f,
@@ -1995,22 +1982,22 @@ class StableDiffusionGGML {
                                 -0.0511f, -0.0603f, -0.0478f, -0.0524f, -0.0227f, -0.0274f, -0.0154f, -0.0255f,
                                 -0.0572f, -0.0565f, -0.0518f, -0.0496f, 0.0116f, 0.0054f, 0.0163f, 0.0104f};
             latents_std_vec  = {
-                 1.8029f, 1.7786f, 1.7868f, 1.7837f, 1.7717f, 1.7590f, 1.7610f, 1.7479f,
-                 1.7336f, 1.7373f, 1.7340f, 1.7343f, 1.8626f, 1.8527f, 1.8629f, 1.8589f,
-                 1.7593f, 1.7526f, 1.7556f, 1.7583f, 1.7363f, 1.7400f, 1.7355f, 1.7394f,
-                 1.7342f, 1.7246f, 1.7392f, 1.7304f, 1.7551f, 1.7513f, 1.7559f, 1.7488f,
-                 1.8449f, 1.8454f, 1.8550f, 1.8535f, 1.8240f, 1.7813f, 1.7854f, 1.7945f,
-                 1.8047f, 1.7876f, 1.7695f, 1.7676f, 1.7782f, 1.7667f, 1.7925f, 1.7848f,
-                 1.7579f, 1.7407f, 1.7483f, 1.7368f, 1.7961f, 1.7998f, 1.7920f, 1.7925f,
-                 1.7780f, 1.7747f, 1.7727f, 1.7749f, 1.7526f, 1.7447f, 1.7657f, 1.7495f,
-                 1.7775f, 1.7720f, 1.7813f, 1.7813f, 1.8162f, 1.8013f, 1.8023f, 1.8033f,
-                 1.7527f, 1.7331f, 1.7563f, 1.7482f, 1.7610f, 1.7507f, 1.7681f, 1.7613f,
-                 1.7665f, 1.7545f, 1.7828f, 1.7726f, 1.7896f, 1.7999f, 1.7864f, 1.7760f,
-                 1.7613f, 1.7625f, 1.7560f, 1.7577f, 1.7783f, 1.7671f, 1.7810f, 1.7799f,
-                 1.7201f, 1.7068f, 1.7265f, 1.7091f, 1.7793f, 1.7578f, 1.7502f, 1.7455f,
-                 1.7587f, 1.7500f, 1.7525f, 1.7362f, 1.7616f, 1.7572f, 1.7444f, 1.7430f,
-                 1.7509f, 1.7610f, 1.7634f, 1.7612f, 1.7254f, 1.7135f, 1.7321f, 1.7226f,
-                 1.7664f, 1.7624f, 1.7718f, 1.7664f, 1.7457f, 1.7441f, 1.7569f, 1.7530f};
+                1.8029f, 1.7786f, 1.7868f, 1.7837f, 1.7717f, 1.7590f, 1.7610f, 1.7479f,
+                1.7336f, 1.7373f, 1.7340f, 1.7343f, 1.8626f, 1.8527f, 1.8629f, 1.8589f,
+                1.7593f, 1.7526f, 1.7556f, 1.7583f, 1.7363f, 1.7400f, 1.7355f, 1.7394f,
+                1.7342f, 1.7246f, 1.7392f, 1.7304f, 1.7551f, 1.7513f, 1.7559f, 1.7488f,
+                1.8449f, 1.8454f, 1.8550f, 1.8535f, 1.8240f, 1.7813f, 1.7854f, 1.7945f,
+                1.8047f, 1.7876f, 1.7695f, 1.7676f, 1.7782f, 1.7667f, 1.7925f, 1.7848f,
+                1.7579f, 1.7407f, 1.7483f, 1.7368f, 1.7961f, 1.7998f, 1.7920f, 1.7925f,
+                1.7780f, 1.7747f, 1.7727f, 1.7749f, 1.7526f, 1.7447f, 1.7657f, 1.7495f,
+                1.7775f, 1.7720f, 1.7813f, 1.7813f, 1.8162f, 1.8013f, 1.8023f, 1.8033f,
+                1.7527f, 1.7331f, 1.7563f, 1.7482f, 1.7610f, 1.7507f, 1.7681f, 1.7613f,
+                1.7665f, 1.7545f, 1.7828f, 1.7726f, 1.7896f, 1.7999f, 1.7864f, 1.7760f,
+                1.7613f, 1.7625f, 1.7560f, 1.7577f, 1.7783f, 1.7671f, 1.7810f, 1.7799f,
+                1.7201f, 1.7068f, 1.7265f, 1.7091f, 1.7793f, 1.7578f, 1.7502f, 1.7455f,
+                1.7587f, 1.7500f, 1.7525f, 1.7362f, 1.7616f, 1.7572f, 1.7444f, 1.7430f,
+                1.7509f, 1.7610f, 1.7634f, 1.7612f, 1.7254f, 1.7135f, 1.7321f, 1.7226f,
+                1.7664f, 1.7624f, 1.7718f, 1.7664f, 1.7457f, 1.7441f, 1.7569f, 1.7530f};
         }
     }
 
@@ -2122,12 +2109,12 @@ class StableDiffusionGGML {
     }
 
     ggml_tensor* vae_encode(ggml_context* work_ctx, ggml_tensor* x, bool encode_video = false) {
-        int64_t t0          = ggml_time_ms();
-        ggml_tensor* result = nullptr;
+        int64_t t0                 = ggml_time_ms();
+        ggml_tensor* result        = nullptr;
         const int vae_scale_factor = get_vae_scale_factor();
         int W                      = x->ne[0] / vae_scale_factor;
         int H                      = x->ne[1] / vae_scale_factor;
-        int C               = get_latent_channel();
+        int C                      = get_latent_channel();
         if (vae_tiling_params.enabled && !encode_video) {
             // TODO wan2.2 vae support?
             int ne2;
@@ -2252,8 +2239,8 @@ class StableDiffusionGGML {
         const int vae_scale_factor = get_vae_scale_factor();
         int64_t W                  = x->ne[0] * vae_scale_factor;
         int64_t H                  = x->ne[1] * vae_scale_factor;
-        int64_t C           = 3;
-        ggml_tensor* result = nullptr;
+        int64_t C                  = 3;
+        ggml_tensor* result        = nullptr;
         if (decode_video) {
             int T = x->ne[2];
             if (sd_version_is_wan(version)) {

From e47c8c4e68645cdff3c4c2c78275e9e252a3d6b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Fri, 5 Dec 2025 20:47:23 +0100
Subject: [PATCH 06/12] Support LongCat Image model

---
 flux.hpp             | 44 +++++++++++++++++++++-------
 ggml_extend.hpp      | 69 ++++++++++++++++++++++++++++++++++++++++++++
 model.cpp            | 29 ++++++++++++-------
 model.h              | 11 ++++++-
 name_conversion.cpp  |  8 ++++-
 stable-diffusion.cpp | 56 +++++++++++++++++++++--------------
 6 files changed, 173 insertions(+), 44 deletions(-)

diff --git a/flux.hpp b/flux.hpp
index f0c65e3d7..c8b27e444 100644
--- a/flux.hpp
+++ b/flux.hpp
@@ -90,10 +90,15 @@ namespace Flux {
         SelfAttention(int64_t dim,
                       int64_t num_heads = 8,
                       bool qkv_bias     = false,
-                      bool proj_bias    = true)
+                      bool proj_bias    = true,
+                      bool diffusers_style = false)
             : num_heads(num_heads) {
             int64_t head_dim = dim / num_heads;
-            blocks["qkv"]    = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * 3, qkv_bias));
+            if (diffusers_style) {  // q, k and v are kept as three separate (dim -> dim) weights
+                blocks["qkv"] = std::shared_ptr<GGMLBlock>(new SplitLinear(dim, {dim, dim, dim}, qkv_bias));
+            } else {  // single fused projection (dim -> 3 * dim)
+                blocks["qkv"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * 3, qkv_bias));
+            }
             blocks["norm"]   = std::shared_ptr<GGMLBlock>(new QKNorm(head_dim));
             blocks["proj"]   = std::shared_ptr<GGMLBlock>(new Linear(dim, dim, proj_bias));
         }
@@ -210,7 +215,8 @@ namespace Flux {
                           bool prune_mod        = false,
                           bool share_modulation = false,
                           bool mlp_proj_bias    = true,
-                          bool use_mlp_silu_act = false)
+                          bool use_mlp_silu_act = false,
+                          bool diffusers_style  = false)
             : idx(idx), prune_mod(prune_mod), use_mlp_silu_act(use_mlp_silu_act) {
             int64_t mlp_hidden_dim  = hidden_size * mlp_ratio;
             int64_t mlp_mult_factor = use_mlp_silu_act ? 2 : 1;
@@ -219,7 +225,7 @@ namespace Flux {
                 blocks["img_mod"] = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, true));
             }
             blocks["img_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
-            blocks["img_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, mlp_proj_bias));
+            blocks["img_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, mlp_proj_bias, diffusers_style));
 
             blocks["img_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
             blocks["img_mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias));
@@ -230,7 +236,7 @@ namespace Flux {
                 blocks["txt_mod"] = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, true));
             }
             blocks["txt_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
-            blocks["txt_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, mlp_proj_bias));
+            blocks["txt_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, mlp_proj_bias, diffusers_style));
 
             blocks["txt_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
             blocks["txt_mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias));
@@ -383,6 +389,7 @@ namespace Flux {
         int idx = 0;
         bool use_mlp_silu_act;
         int64_t mlp_mult_factor;
+        bool diffusers_style = false;
 
     public:
         SingleStreamBlock(int64_t hidden_size,
@@ -393,7 +400,8 @@ namespace Flux {
                           bool prune_mod        = false,
                           bool share_modulation = false,
                           bool mlp_proj_bias    = true,
-                          bool use_mlp_silu_act = false)
+                          bool use_mlp_silu_act = false,
+                          bool diffusers_style  = false)
             : hidden_size(hidden_size), num_heads(num_heads), idx(idx), prune_mod(prune_mod), use_mlp_silu_act(use_mlp_silu_act) {
             int64_t head_dim = hidden_size / num_heads;
             float scale      = qk_scale;
@@ -405,8 +413,11 @@ namespace Flux {
             if (use_mlp_silu_act) {
                 mlp_mult_factor = 2;
             }
-
-            blocks["linear1"]  = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias));
+            if (diffusers_style) {
+                blocks["linear1"] = std::shared_ptr<GGMLBlock>(new SplitLinear(hidden_size, {hidden_size, hidden_size, hidden_size, mlp_hidden_dim * mlp_mult_factor}, mlp_proj_bias));
+            } else {
+                blocks["linear1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias));
+            }
             blocks["linear2"]  = std::shared_ptr<GGMLBlock>(new Linear(hidden_size + mlp_hidden_dim, hidden_size, mlp_proj_bias));
             blocks["norm"]     = std::shared_ptr<GGMLBlock>(new QKNorm(head_dim));
             blocks["pre_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
@@ -729,6 +740,7 @@ namespace Flux {
         bool use_mlp_silu_act       = false;
         float ref_index_scale       = 1.f;
         ChromaRadianceParams chroma_radiance_params;
+        bool diffusers_style = false;
     };
 
     struct Flux : public GGMLBlock {
@@ -770,7 +782,8 @@ namespace Flux {
                                                                                                    params.is_chroma,
                                                                                                    params.share_modulation,
                                                                                                    !params.disable_bias,
-                                                                                                   params.use_mlp_silu_act);
+                                                                                                   params.use_mlp_silu_act,
+                                                                                                   params.diffusers_style);
             }
 
             for (int i = 0; i < params.depth_single_blocks; i++) {
@@ -782,7 +795,8 @@ namespace Flux {
                                                                                                    params.is_chroma,
                                                                                                    params.share_modulation,
                                                                                                    !params.disable_bias,
-                                                                                                   params.use_mlp_silu_act);
+                                                                                                   params.use_mlp_silu_act,
+                                                                                                   params.diffusers_style);
             }
 
             if (params.version == VERSION_CHROMA_RADIANCE) {
@@ -1222,6 +1236,9 @@ namespace Flux {
                 flux_params.share_modulation = true;
                 flux_params.ref_index_scale  = 10.f;
                 flux_params.use_mlp_silu_act = true;
+            } else if (sd_version_is_longcat(version)) {
+                flux_params.context_in_dim = 3584;
+                flux_params.vec_in_dim     = 0;
             }
             for (auto pair : tensor_storage_map) {
                 std::string tensor_name = pair.first;
@@ -1231,6 +1248,9 @@ namespace Flux {
                     // not schnell
                     flux_params.guidance_embed = true;
                 }
+                if (tensor_name.find("model.diffusion_model.single_blocks.0.linear1.weight.1") != std::string::npos) {
+                    flux_params.diffusers_style = true;  // a split linear1 weight ("weight.1") exists, so build SplitLinear blocks
+                }
                 if (tensor_name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
                     // Chroma
                     flux_params.is_chroma = true;
@@ -1260,6 +1280,10 @@ namespace Flux {
                 LOG_INFO("Flux guidance is disabled (Schnell mode)");
             }
 
+            if (flux_params.diffusers_style) {
+                LOG_INFO("Using diffusers-style naming");
+            }
+
             flux = Flux(flux_params);
             flux.init(params_ctx, tensor_storage_map, prefix);
         }
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 92dd3b8b6..31ae18a5a 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -2159,6 +2159,75 @@ class Linear : public UnaryBlock {
     }
 };
 
+class SplitLinear : public Linear {  // Linear whose weight/bias are held as several tensors and concatenated in forward()
+protected:
+    int64_t in_features;  // NOTE(review): these fields repeat values also handed to the Linear base ctor - confirm the base copies are not usable instead
+    std::vector<int64_t> out_features_vec;  // output width of each split, in concatenation order
+    bool bias;
+    bool force_f32;
+    bool force_prec_f32;
+    float scale;
+    std::string prefix;  // tensor-name prefix captured in init_params() and passed to the weight adapter in forward()
+
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+        this->prefix         = prefix;
+        enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
+        if (in_features % ggml_blck_size(wtype) != 0 || force_f32) {
+            wtype = GGML_TYPE_F32;
+        }
+        params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features_vec[0]);
+        for (size_t i = 1; i < out_features_vec.size(); i++) {
+            // most likely same type as the first weight
+            params["weight." + std::to_string(i)] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features_vec[i]);
+        }
+        if (bias) {
+            const enum ggml_type bias_type = GGML_TYPE_F32;  // biases are always created as F32, independent of wtype
+            params["bias"]                 = ggml_new_tensor_1d(ctx, bias_type, out_features_vec[0]);
+            for (size_t i = 1; i < out_features_vec.size(); i++) {
+                params["bias." + std::to_string(i)] = ggml_new_tensor_1d(ctx, bias_type, out_features_vec[i]);
+            }
+        }
+    }
+
+public:
+    SplitLinear(int64_t in_features,
+                std::vector<int64_t> out_features_vec,  // must be non-empty: element 0 is read unconditionally
+                bool bias           = true,
+                bool force_f32      = false,
+                bool force_prec_f32 = false,
+                float scale         = 1.f)
+        : Linear(in_features, out_features_vec[0], bias, force_f32, force_prec_f32, scale),  // base is sized for the first split only
+          in_features(in_features),
+          out_features_vec(out_features_vec),
+          bias(bias),
+          force_f32(force_f32),
+          force_prec_f32(force_prec_f32),
+          scale(scale) {}
+
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {  // NOTE(review): presumably overrides Linear::forward - confirm and mark `override`
+        struct ggml_tensor* w = params["weight"];
+        struct ggml_tensor* b = nullptr;
+        if (bias) {
+            b = params["bias"];
+        }
+        // concat all weights and biases together
+        for (size_t i = 1; i < out_features_vec.size(); i++) {
+            w = ggml_concat(ctx->ggml_ctx, w, params["weight." + std::to_string(i)], 1);  // weights are joined along dim 1
+            if (bias) {
+                b = ggml_concat(ctx->ggml_ctx, b, params["bias." + std::to_string(i)], 0);  // biases are joined along dim 0
+            }
+        }
+        if (ctx->weight_adapter) {
+            WeightAdapter::ForwardParams forward_params;
+            forward_params.op_type               = WeightAdapter::ForwardParams::op_type_t::OP_LINEAR;
+            forward_params.linear.force_prec_f32 = force_prec_f32;
+            forward_params.linear.scale          = scale;
+            return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params);
+        }
+        return ggml_ext_linear(ctx->ggml_ctx, x, w, b, force_prec_f32, scale);
+    }
+};
+
 __STATIC_INLINE__ bool support_get_rows(ggml_type wtype) {
     std::set<ggml_type> allow_types = {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0};
     if (allow_types.find(wtype) != allow_types.end()) {
diff --git a/model.cpp b/model.cpp
index 2b74d349f..0ebc626d5 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1042,7 +1042,7 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s
 }
 
 SDVersion ModelLoader::get_sd_version() {
-    TensorStorage token_embedding_weight, input_block_weight;
+    TensorStorage token_embedding_weight, input_block_weight, context_ebedding_weight;
 
     bool has_multiple_encoders = false;
     bool is_unet               = false;
@@ -1056,7 +1056,7 @@ SDVersion ModelLoader::get_sd_version() {
 
     for (auto& [name, tensor_storage] : tensor_storage_map) {
         if (!(is_xl)) {
-            if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) {
+            if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos || tensor_storage.name.find("model.diffusion_model.single_transformer_blocks.") != std::string::npos) {
                 is_flux = true;
             }
             if (tensor_storage.name.find("model.diffusion_model.nerf_final_layer_conv.") != std::string::npos) {
@@ -1120,6 +1120,9 @@ SDVersion ModelLoader::get_sd_version() {
             tensor_storage.name == "unet.conv_in.weight") {
             input_block_weight = tensor_storage;
         }
+        if (tensor_storage.name == "model.diffusion_model.txt_in.weight" || tensor_storage.name == "model.diffusion_model.context_embedder.weight") {
+            context_ebedding_weight = tensor_storage;
+        }
     }
     if (is_wan) {
         LOG_DEBUG("patch_embedding_channels %d", patch_embedding_channels);
@@ -1147,16 +1150,20 @@ SDVersion ModelLoader::get_sd_version() {
     }
 
     if (is_flux) {
-        if (input_block_weight.ne[0] == 384) {
-            return VERSION_FLUX_FILL;
-        }
-        if (input_block_weight.ne[0] == 128) {
-            return VERSION_FLUX_CONTROLS;
-        }
-        if (input_block_weight.ne[0] == 196) {
-            return VERSION_FLEX_2;
+        if (context_ebedding_weight.ne[0] == 3584) {
+            return VERSION_LONGCAT;
+        } else {
+            if (input_block_weight.ne[0] == 384) {
+                return VERSION_FLUX_FILL;
+            }
+            if (input_block_weight.ne[0] == 128) {
+                return VERSION_FLUX_CONTROLS;
+            }
+            if (input_block_weight.ne[0] == 196) {
+                return VERSION_FLEX_2;
+            }
+            return VERSION_FLUX;
         }
-        return VERSION_FLUX;
     }
 
     if (token_embedding_weight.ne[0] == 768) {
diff --git a/model.h b/model.h
index e2ff26c49..1751d7bf3 100644
--- a/model.h
+++ b/model.h
@@ -45,6 +45,7 @@ enum SDVersion {
     VERSION_QWEN_IMAGE,
     VERSION_FLUX2,
     VERSION_Z_IMAGE,
+    VERSION_LONGCAT,
     VERSION_COUNT,
 };
 
@@ -124,6 +125,13 @@ static inline bool sd_version_is_z_image(SDVersion version) {
     return false;
 }
 
+static inline bool sd_version_is_longcat(SDVersion version) {  // true only when version is VERSION_LONGCAT
+    if (version == VERSION_LONGCAT) {
+        return true;
+    }
+    return false;
+}
+
 static inline bool sd_version_is_inpaint(SDVersion version) {
     if (version == VERSION_SD1_INPAINT ||
         version == VERSION_SD2_INPAINT ||
@@ -141,7 +149,8 @@ static inline bool sd_version_is_dit(SDVersion version) {
         sd_version_is_sd3(version) ||
         sd_version_is_wan(version) ||
         sd_version_is_qwen_image(version) ||
-        sd_version_is_z_image(version)) {
+        sd_version_is_z_image(version) ||
+        sd_version_is_longcat(version)) {
         return true;
     }
     return false;
diff --git a/name_conversion.cpp b/name_conversion.cpp
index 8b521486d..1a37dd25c 100644
--- a/name_conversion.cpp
+++ b/name_conversion.cpp
@@ -508,6 +508,12 @@ std::string convert_diffusers_dit_to_original_flux(std::string name) {
     static std::unordered_map<std::string, std::string> flux_name_map;
 
     if (flux_name_map.empty()) {
+        // --- time_embed (longcat) ---
+        flux_name_map["time_embed.timestep_embedder.linear_1.weight"] = "time_in.in_layer.weight";
+        flux_name_map["time_embed.timestep_embedder.linear_1.bias"]   = "time_in.in_layer.bias";
+        flux_name_map["time_embed.timestep_embedder.linear_2.weight"] = "time_in.out_layer.weight";
+        flux_name_map["time_embed.timestep_embedder.linear_2.bias"]   = "time_in.out_layer.bias";
+
         // --- time_text_embed ---
         flux_name_map["time_text_embed.timestep_embedder.linear_1.weight"] = "time_in.in_layer.weight";
         flux_name_map["time_text_embed.timestep_embedder.linear_1.bias"]   = "time_in.in_layer.bias";
@@ -660,7 +666,7 @@ std::string convert_diffusion_model_name(std::string name, std::string prefix, S
         name = convert_diffusers_unet_to_original_sdxl(name);
     } else if (sd_version_is_sd3(version)) {
         name = convert_diffusers_dit_to_original_sd3(name);
-    } else if (sd_version_is_flux(version) || sd_version_is_flux2(version)) {
+    } else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version)) {
         name = convert_diffusers_dit_to_original_flux(name);
     } else if (sd_version_is_z_image(version)) {
         name = convert_diffusers_dit_to_original_lumina2(name);
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index e554f0926..68b0f974e 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -46,6 +46,7 @@ const char* model_version_to_str[] = {
     "Qwen Image",
     "Flux.2",
     "Z-Image",
+    "Longcat",
 };
 
 const char* sampling_methods_str[] = {
@@ -378,7 +379,7 @@ class StableDiffusionGGML {
         } else if (sd_version_is_sd3(version)) {
             scale_factor = 1.5305f;
             shift_factor = 0.0609f;
-        } else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) {
+        } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_longcat(version)) {
             scale_factor = 0.3611f;
             shift_factor = 0.1159f;
         } else if (sd_version_is_wan(version) ||
@@ -406,8 +407,8 @@ class StableDiffusionGGML {
                                                                      offload_params_to_cpu,
                                                                      tensor_storage_map);
                 diffusion_model  = std::make_shared<MMDiTModel>(backend,
-                                                               offload_params_to_cpu,
-                                                               tensor_storage_map);
+                                                                offload_params_to_cpu,
+                                                                tensor_storage_map);
             } else if (sd_version_is_flux(version)) {
                 bool is_chroma = false;
                 for (auto pair : tensor_storage_map) {
@@ -448,10 +449,23 @@ class StableDiffusionGGML {
                                                                  tensor_storage_map,
                                                                  version);
                 diffusion_model  = std::make_shared<FluxModel>(backend,
-                                                              offload_params_to_cpu,
-                                                              tensor_storage_map,
-                                                              version,
-                                                              sd_ctx_params->chroma_use_dit_mask);
+                                                               offload_params_to_cpu,
+                                                               tensor_storage_map,
+                                                               version,
+                                                               sd_ctx_params->chroma_use_dit_mask);
+            } else if (sd_version_is_longcat(version)) {
+                bool enable_vision = false;
+                cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
+                                                                 offload_params_to_cpu,
+                                                                 tensor_storage_map,
+                                                                 version,
+                                                                 "",
+                                                                 enable_vision);
+                diffusion_model  = std::make_shared<FluxModel>(backend,
+                                                               offload_params_to_cpu,
+                                                               tensor_storage_map,
+                                                               version,
+                                                               sd_ctx_params->chroma_use_dit_mask);
             } else if (sd_version_is_wan(version)) {
                 cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                     offload_params_to_cpu,
@@ -460,10 +474,10 @@ class StableDiffusionGGML {
                                                                     1,
                                                                     true);
                 diffusion_model  = std::make_shared<WanModel>(backend,
-                                                             offload_params_to_cpu,
-                                                             tensor_storage_map,
-                                                             "model.diffusion_model",
-                                                             version);
+                                                              offload_params_to_cpu,
+                                                              tensor_storage_map,
+                                                              "model.diffusion_model",
+                                                              version);
                 if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) {
                     high_noise_diffusion_model = std::make_shared<WanModel>(backend,
                                                                             offload_params_to_cpu,
@@ -492,20 +506,20 @@ class StableDiffusionGGML {
                                                                  "",
                                                                  enable_vision);
                 diffusion_model  = std::make_shared<QwenImageModel>(backend,
-                                                                   offload_params_to_cpu,
-                                                                   tensor_storage_map,
-                                                                   "model.diffusion_model",
-                                                                   version);
+                                                                    offload_params_to_cpu,
+                                                                    tensor_storage_map,
+                                                                    "model.diffusion_model",
+                                                                    version);
             } else if (sd_version_is_z_image(version)) {
                 cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
                                                                  offload_params_to_cpu,
                                                                  tensor_storage_map,
                                                                  version);
                 diffusion_model  = std::make_shared<ZImageModel>(backend,
-                                                                offload_params_to_cpu,
-                                                                tensor_storage_map,
-                                                                "model.diffusion_model",
-                                                                version);
+                                                                 offload_params_to_cpu,
+                                                                 tensor_storage_map,
+                                                                 "model.diffusion_model",
+                                                                 version);
             } else {  // SD1.x SD2.x SDXL
                 if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
                     cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
@@ -817,7 +831,7 @@ class StableDiffusionGGML {
                             flow_shift = 3.f;
                         }
                     }
-                } else if (sd_version_is_flux(version)) {
+                } else if (sd_version_is_flux(version) || sd_version_is_longcat(version)) {
                     pred_type = FLUX_FLOW_PRED;
                     if (flow_shift == INFINITY) {
                         flow_shift = 1.0f;  // TODO: validate
@@ -1334,7 +1348,7 @@ class StableDiffusionGGML {
                 if (sd_version_is_sd3(version)) {
                     latent_rgb_proj = sd3_latent_rgb_proj;
                     latent_rgb_bias = sd3_latent_rgb_bias;
-                } else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) {
+                } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_longcat(version)) {
                     latent_rgb_proj = flux_latent_rgb_proj;
                     latent_rgb_bias = flux_latent_rgb_bias;
                 } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {

From 2c449046b91a61ec674b6d34e06e2ee31daf1797 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Fri, 5 Dec 2025 20:58:53 +0100
Subject: [PATCH 07/12] temp fix cuda error on quant concat for splitlinear

---
 ggml_extend.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 31ae18a5a..4370a88f9 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -2200,7 +2200,7 @@ class SplitLinear : public Linear {
           in_features(in_features),
           out_features_vec(out_features_vec),
           bias(bias),
-          force_f32(force_f32),
+          force_f32(true),
           force_prec_f32(force_prec_f32),
           scale(scale) {}
 

From 00071aa7f5e64cd9540dfa6684a9049aa1d9382d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Sat, 6 Dec 2025 02:43:46 +0100
Subject: [PATCH 08/12] pre-patchify

---
 flux.hpp             |  1 +
 stable-diffusion.cpp | 36 ++++++++++++++++++++++++++++++------
 vae.hpp              |  4 ++--
 3 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/flux.hpp b/flux.hpp
index c8b27e444..0cedb787c 100644
--- a/flux.hpp
+++ b/flux.hpp
@@ -1239,6 +1239,7 @@ namespace Flux {
             } else if (sd_version_is_longcat(version)) {
                 flux_params.context_in_dim = 3584;
                 flux_params.vec_in_dim     = 0;
+                flux_params.patch_size     = 1;
             }
             for (auto pair : tensor_storage_map) {
                 std::string tensor_name = pair.first;
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index b23de473f..ced0775bd 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -449,10 +449,23 @@ class StableDiffusionGGML {
                                                                  tensor_storage_map,
                                                                  version);
                 diffusion_model  = std::make_shared<FluxModel>(backend,
-                                                              offload_params_to_cpu,
-                                                              tensor_storage_map,
-                                                              version,
-                                                              sd_ctx_params->chroma_use_dit_mask);
+                                                               offload_params_to_cpu,
+                                                               tensor_storage_map,
+                                                               version,
+                                                               sd_ctx_params->chroma_use_dit_mask);
+            } else if (sd_version_is_longcat(version)) {
+                bool enable_vision = false;
+                cond_stage_model   = std::make_shared<LLMEmbedder>(clip_backend,
+                                                                   offload_params_to_cpu,
+                                                                   tensor_storage_map,
+                                                                   version,
+                                                                   "",
+                                                                   enable_vision);
+                diffusion_model    = std::make_shared<FluxModel>(backend,
+                                                                 offload_params_to_cpu,
+                                                                 tensor_storage_map,
+                                                                 version,
+                                                                 sd_ctx_params->chroma_use_dit_mask);
             } else if (sd_version_is_wan(version)) {
                 cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                     offload_params_to_cpu,
@@ -827,6 +840,9 @@ class StableDiffusionGGML {
                                 flow_shift = 1.15f;
                             }
                         }
+                        if(sd_version_is_longcat(version)) {
+                            flow_shift = 3.0f;
+                        }
                     }
                 } else if (sd_version_is_flux2(version)) {
                     pred_type = FLUX2_FLOW_PRED;
@@ -1325,7 +1341,13 @@ class StableDiffusionGGML {
                 if (sd_version_is_flux2(version)) {
                     latent_rgb_proj = flux2_latent_rgb_proj;
                     latent_rgb_bias = flux2_latent_rgb_bias;
-                    patch_sz = 2;
+                    patch_sz        = 2;
+                }
+            } else if (dim == 64) {
+                if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_longcat(version)) {
+                    latent_rgb_proj = flux_latent_rgb_proj;
+                    latent_rgb_bias = flux_latent_rgb_bias;
+                    patch_sz        = 2;
                 }
             } else if (dim == 48) {
                 if (sd_version_is_wan(version)) {
@@ -1896,7 +1918,7 @@ class StableDiffusionGGML {
         int vae_scale_factor = 8;
         if (version == VERSION_WAN2_2_TI2V) {
             vae_scale_factor = 16;
-        } else if (sd_version_is_flux2(version)) {
+        } else if (sd_version_is_flux2(version) || sd_version_is_longcat(version)) {
             vae_scale_factor = 16;
         } else if (version == VERSION_CHROMA_RADIANCE) {
             vae_scale_factor = 1;
@@ -1913,6 +1935,8 @@ class StableDiffusionGGML {
                 latent_channel = 3;
             } else if (sd_version_is_flux2(version)) {
                 latent_channel = 128;
+            } else if (sd_version_is_longcat(version)) {
+                latent_channel = 64;
             } else {
                 latent_channel = 16;
             }
diff --git a/vae.hpp b/vae.hpp
index ad5db1b57..740a5655b 100644
--- a/vae.hpp
+++ b/vae.hpp
@@ -553,7 +553,7 @@ class AutoencodingEngine : public GGMLBlock {
 
     struct ggml_tensor* decode(GGMLRunnerContext* ctx, struct ggml_tensor* z) {
         // z: [N, z_channels, h, w]
-        if (sd_version_is_flux2(version)) {
+        if (sd_version_is_flux2(version) || sd_version_is_longcat(version)) {
             // [N, C*p*p, h, w] -> [N, C, h*p, w*p]
             int64_t p = 2;
 
@@ -592,7 +592,7 @@ class AutoencodingEngine : public GGMLBlock {
             auto quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["quant_conv"]);
             z               = quant_conv->forward(ctx, z);  // [N, 2*embed_dim, h/8, w/8]
         }
-        if (sd_version_is_flux2(version)) {
+        if (sd_version_is_flux2(version) || sd_version_is_longcat(version)) {
             z = ggml_ext_chunk(ctx->ggml_ctx, z, 2, 2)[0];
 
             // [N, C, H, W] -> [N, C*p*p, H/p, W/p]

From 7711efb4fb3a04072f62d1f26039d8152e376cb9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Sat, 6 Dec 2025 02:44:20 +0100
Subject: [PATCH 09/12] longcat rope ids

---
 conditioner.hpp | 11 +++++++++++
 flux.hpp        |  7 +++----
 ggml_extend.hpp | 26 +++++++++++++++++---------
 rope.hpp        | 35 +++++++++++++++++++++++++----------
 4 files changed, 56 insertions(+), 23 deletions(-)

diff --git a/conditioner.hpp b/conditioner.hpp
index 403120d9b..60ce2bc30 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -1825,6 +1825,17 @@ struct LLMEmbedder : public Conditioner {
             prompt_attn_range.second = prompt.size();
 
             prompt += "[/INST]";
+        } else if (sd_version_is_longcat(version)) {
+            prompt_template_encode_start_idx = 36;
+            // prompt_template_encode_end_idx = 5;
+
+            prompt = "<|im_start|>system\nAs an image captioning expert, generate a descriptive text prompt based on an image content, suitable for input to a text-to-image model.<|im_end|>\n<|im_start|>user\n";
+
+            prompt_attn_range.first = static_cast<int>(prompt.size());
+            prompt += conditioner_params.text;
+            prompt_attn_range.second = static_cast<int>(prompt.size());
+
+            prompt += "<|im_end|>\n<|im_start|>assistant\n";
         } else {
             prompt_template_encode_start_idx = 34;
 
diff --git a/flux.hpp b/flux.hpp
index 0cedb787c..15795f058 100644
--- a/flux.hpp
+++ b/flux.hpp
@@ -1282,7 +1282,7 @@ namespace Flux {
             }
 
             if (flux_params.diffusers_style) {
-                LOG_INFO("Using diffusers-style naming");
+                LOG_INFO("Using diffusers-style attention blocks");
             }
 
             flux = Flux(flux_params);
@@ -1388,7 +1388,6 @@ namespace Flux {
             for (int i = 0; i < ref_latents.size(); i++) {
                 ref_latents[i] = to_backend(ref_latents[i]);
             }
-
             pe_vec      = Rope::gen_flux_pe(x->ne[1],
                                             x->ne[0],
                                             flux_params.patch_size,
@@ -1398,9 +1397,9 @@ namespace Flux {
                                        sd_version_is_flux2(version) ? true : increase_ref_index,
                                             flux_params.ref_index_scale,
                                             flux_params.theta,
-                                            flux_params.axes_dim);
+                                            flux_params.axes_dim,
+                                        sd_version_is_longcat(version));
             int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2;
-            // LOG_DEBUG("pos_len %d", pos_len);
             auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len);
             // pe->data = pe_vec.data();
             // print_ggml_tensor(pe);
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 4370a88f9..fc571b1cc 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -2200,7 +2200,7 @@ class SplitLinear : public Linear {
           in_features(in_features),
           out_features_vec(out_features_vec),
           bias(bias),
-          force_f32(true),
+          force_f32(force_f32),
           force_prec_f32(force_prec_f32),
           scale(scale) {}
 
@@ -2210,21 +2210,29 @@ class SplitLinear : public Linear {
         if (bias) {
             b = params["bias"];
         }
-        // concat all weights and biases together
-        for (int i = 1; i < out_features_vec.size(); i++) {
-            w = ggml_concat(ctx->ggml_ctx, w, params["weight." + std::to_string(i)], 1);
-            if (bias) {
-                b = ggml_concat(ctx->ggml_ctx, b, params["bias." + std::to_string(i)], 0);
-            }
-        }
         if (ctx->weight_adapter) {
+            // concat all weights and biases together so it runs in one linear layer
+            for (int i = 1; i < out_features_vec.size(); i++) {
+                w = ggml_concat(ctx->ggml_ctx, w, params["weight." + std::to_string(i)], 1);
+                if (bias) {
+                    b = ggml_concat(ctx->ggml_ctx, b, params["bias." + std::to_string(i)], 0);
+                }
+            }
             WeightAdapter::ForwardParams forward_params;
             forward_params.op_type               = WeightAdapter::ForwardParams::op_type_t::OP_LINEAR;
             forward_params.linear.force_prec_f32 = force_prec_f32;
             forward_params.linear.scale          = scale;
             return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params);
         }
-        return ggml_ext_linear(ctx->ggml_ctx, x, w, b, force_prec_f32, scale);
+        auto x0 = ggml_ext_linear(ctx->ggml_ctx, x, w, b, force_prec_f32, scale);
+        for (int i = 1; i < out_features_vec.size(); i++) {
+            auto wi = params["weight." + std::to_string(i)];
+            auto bi = bias ? params["bias." + std::to_string(i)] : nullptr;
+            auto xi = ggml_ext_linear(ctx->ggml_ctx, x, wi, bi, force_prec_f32, scale);
+            x0 = ggml_concat(ctx->ggml_ctx, x0, xi, 0);
+        }
+
+        return x0;
     }
 };
 
diff --git a/rope.hpp b/rope.hpp
index 7a35926eb..5739f409d 100644
--- a/rope.hpp
+++ b/rope.hpp
@@ -82,7 +82,16 @@ namespace Rope {
         return txt_ids;
     }
 
+    // LongCat text-token ids: axes 1 and 2 both hold the token's index within its prompt, all other axes stay 0 (needs axes_dim_num >= 3).
+    __STATIC_INLINE__ std::vector<std::vector<float>> gen_longcat_txt_ids(int bs, int context_len, int axes_dim_num) {
+        std::vector<std::vector<float>> txt_ids(bs * context_len, std::vector<float>(axes_dim_num, 0.0f));
+        for (int i = 0; i < bs * context_len; i++) {
+            txt_ids[i][1] = txt_ids[i][2] = static_cast<float>(i % context_len);
+        }
+        return txt_ids;
+    }
+
     __STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_img_ids(int h,
                                                                        int w,
                                                                        int patch_size,
                                                                        int bs,
@@ -92,7 +101,6 @@ namespace Rope {
                                                                        int w_offset = 0) {
         int h_len = (h + (patch_size / 2)) / patch_size;
         int w_len = (w + (patch_size / 2)) / patch_size;
-
         std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(axes_dim_num, 0.0));
 
         std::vector<float> row_ids = linspace<float>(h_offset, h_len - 1 + h_offset, h_len);
@@ -167,13 +175,14 @@ namespace Rope {
     __STATIC_INLINE__ std::vector<std::vector<float>> gen_refs_ids(int patch_size,
                                                                    int bs,
                                                                    int axes_dim_num,
+                                                                   int start_index,
                                                                    const std::vector<ggml_tensor*>& ref_latents,
                                                                    bool increase_ref_index,
                                                                    float ref_index_scale) {
         std::vector<std::vector<float>> ids;
         uint64_t curr_h_offset = 0;
         uint64_t curr_w_offset = 0;
-        int index              = 1;
+        int index              = start_index;
         for (ggml_tensor* ref : ref_latents) {
             uint64_t h_offset = 0;
             uint64_t w_offset = 0;
@@ -213,13 +222,17 @@ namespace Rope {
                                                                    int context_len,
                                                                    const std::vector<ggml_tensor*>& ref_latents,
                                                                    bool increase_ref_index,
-                                                                   float ref_index_scale) {
-        auto txt_ids = gen_flux_txt_ids(bs, context_len, axes_dim_num);
-        auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num);
+                                                                   float ref_index_scale,
+                                                                   bool is_longcat) {
+        int start_index = is_longcat ? 1 : 0;
+
+        auto txt_ids = is_longcat ? gen_longcat_txt_ids(bs, context_len, axes_dim_num) : gen_flux_txt_ids(bs, context_len, axes_dim_num);
+        int offset   = is_longcat ? context_len : 0;
+        auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num, start_index, offset, offset);
 
         auto ids = concat_ids(txt_ids, img_ids, bs);
         if (ref_latents.size() > 0) {
-            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, ref_index_scale);
+            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, start_index + 1, ref_latents, increase_ref_index, ref_index_scale);
             ids           = concat_ids(ids, refs_ids, bs);
         }
         return ids;
@@ -235,7 +248,8 @@ namespace Rope {
                                                      bool increase_ref_index,
                                                      float ref_index_scale,
                                                      int theta,
-                                                     const std::vector<int>& axes_dim) {
+                                                     const std::vector<int>& axes_dim,
+                                                     bool is_longcat) {
         std::vector<std::vector<float>> ids = gen_flux_ids(h,
                                                            w,
                                                            patch_size,
@@ -244,7 +258,8 @@ namespace Rope {
                                                            context_len,
                                                            ref_latents,
                                                            increase_ref_index,
-                                                           ref_index_scale);
+                                                           ref_index_scale,
+                                                           is_longcat);
         return embed_nd(ids, bs, theta, axes_dim);
     }
 
@@ -269,7 +284,7 @@ namespace Rope {
         auto img_ids     = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num);
         auto ids         = concat_ids(txt_ids_repeated, img_ids, bs);
         if (ref_latents.size() > 0) {
-            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, 1.f);
+            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, 1, ref_latents, increase_ref_index, 1.f);
             ids           = concat_ids(ids, refs_ids, bs);
         }
         return ids;

From 535543a152ca84f66b16b91c1d0194265efff97a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Sat, 6 Dec 2025 03:47:52 +0100
Subject: [PATCH 10/12] Fix diffusers_style detection

---
 flux.hpp | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/flux.hpp b/flux.hpp
index 15795f058..bedd2e35a 100644
--- a/flux.hpp
+++ b/flux.hpp
@@ -88,19 +88,19 @@ namespace Flux {
 
     public:
         SelfAttention(int64_t dim,
-                      int64_t num_heads = 8,
-                      bool qkv_bias     = false,
-                      bool proj_bias    = true,
-                    bool diffusers_style = false)
+                      int64_t num_heads    = 8,
+                      bool qkv_bias        = false,
+                      bool proj_bias       = true,
+                      bool diffusers_style = false)
             : num_heads(num_heads) {
             int64_t head_dim = dim / num_heads;
-            if(diffusers_style) {
-                blocks["qkv"]    = std::shared_ptr<GGMLBlock>(new SplitLinear(dim, {dim, dim, dim}, qkv_bias));
+            if (diffusers_style) {
+                blocks["qkv"] = std::shared_ptr<GGMLBlock>(new SplitLinear(dim, {dim, dim, dim}, qkv_bias));
             } else {
-                blocks["qkv"]    = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * 3, qkv_bias));
+                blocks["qkv"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * 3, qkv_bias));
             }
-            blocks["norm"]   = std::shared_ptr<GGMLBlock>(new QKNorm(head_dim));
-            blocks["proj"]   = std::shared_ptr<GGMLBlock>(new Linear(dim, dim, proj_bias));
+            blocks["norm"] = std::shared_ptr<GGMLBlock>(new QKNorm(head_dim));
+            blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim, proj_bias));
         }
 
         std::vector<struct ggml_tensor*> pre_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
@@ -739,8 +739,8 @@ namespace Flux {
         bool share_modulation       = false;
         bool use_mlp_silu_act       = false;
         float ref_index_scale       = 1.f;
+        bool diffusers_style        = false;
         ChromaRadianceParams chroma_radiance_params;
-        bool diffusers_style = false;
     };
 
     struct Flux : public GGMLBlock {
@@ -1249,7 +1249,7 @@ namespace Flux {
                     // not schnell
                     flux_params.guidance_embed = true;
                 }
-                if (tensor_name.find("model.diffusion_model.single_blocks.0.linear1.weight.1") == std::string::npos) {
+                if (tensor_name.find("model.diffusion_model.single_blocks.0.linear1.weight.1") != std::string::npos) {
                     flux_params.diffusers_style = true;
                 }
                 if (tensor_name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
@@ -1398,9 +1398,9 @@ namespace Flux {
                                             flux_params.ref_index_scale,
                                             flux_params.theta,
                                             flux_params.axes_dim,
-                                        sd_version_is_longcat(version));
+                                            sd_version_is_longcat(version));
             int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2;
-            auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len);
+            auto pe     = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len);
             // pe->data = pe_vec.data();
             // print_ggml_tensor(pe);
             // pe->data = nullptr;

From 61b0dcf06f9e235078e7a7f9e32574d61949add1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Sat, 6 Dec 2025 16:05:58 +0100
Subject: [PATCH 11/12] Flux: simplify when patch_size is 1

---
 flux.hpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/flux.hpp b/flux.hpp
index bedd2e35a..ba1a3776b 100644
--- a/flux.hpp
+++ b/flux.hpp
@@ -843,6 +843,11 @@ namespace Flux {
             int64_t C = x->ne[2];
             int64_t H = x->ne[1];
             int64_t W = x->ne[0];
+            if (params.patch_size == 1) {
+                x = ggml_reshape_3d(ctx, x, H * W, C, N);              // [N, C, H*W]
+                x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));  // [N, H*W, C]
+                return x;
+            }
             int64_t p = params.patch_size;
             int64_t h = H / params.patch_size;
             int64_t w = W / params.patch_size;
@@ -877,6 +882,12 @@ namespace Flux {
             int64_t W = w * params.patch_size;
             int64_t p = params.patch_size;
 
+            if (params.patch_size == 1) {
+                x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));  // [N, C, H*W]
+                x = ggml_reshape_4d(ctx, x, W, H, C, N);               // [N, C, H, W]
+                return x;
+            }
+
             GGML_ASSERT(C * p * p == x->ne[0]);
 
             x = ggml_reshape_4d(ctx, x, p * p, C, w * h, N);       // [N, h*w, C, p*p]

From deaf939bbade022ef151ea70642a2a1bc5f8de7d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Sat, 6 Dec 2025 16:06:32 +0100
Subject: [PATCH 12/12] correct rope offset for image tokens

---
 rope.hpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/rope.hpp b/rope.hpp
index 5739f409d..b487292dc 100644
--- a/rope.hpp
+++ b/rope.hpp
@@ -178,10 +178,11 @@ namespace Rope {
                                                                    int start_index,
                                                                    const std::vector<ggml_tensor*>& ref_latents,
                                                                    bool increase_ref_index,
-                                                                   float ref_index_scale) {
+                                                                   float ref_index_scale,
+                                                                    int base_offset = 0) {
         std::vector<std::vector<float>> ids;
-        uint64_t curr_h_offset = 0;
-        uint64_t curr_w_offset = 0;
+        uint64_t curr_h_offset = base_offset;
+        uint64_t curr_w_offset = base_offset;
         int index              = start_index;
         for (ggml_tensor* ref : ref_latents) {
             uint64_t h_offset = 0;
@@ -224,15 +225,15 @@ namespace Rope {
                                                                    bool increase_ref_index,
                                                                    float ref_index_scale,
                                                                    bool is_longcat) {
-        int start_index = is_longcat ? 1 : 0;
+        int x_index = is_longcat ? 1 : 0;
 
         auto txt_ids = is_longcat ? gen_longcat_txt_ids(bs, context_len, axes_dim_num) : gen_flux_txt_ids(bs, context_len, axes_dim_num);
-        int offset   = is_longcat ? context_len : 0;
-        auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num, start_index, offset, offset);
+        int offset   = is_longcat ? context_len + 1 : 0;
+        auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num, x_index, offset, offset);
 
         auto ids = concat_ids(txt_ids, img_ids, bs);
         if (ref_latents.size() > 0) {
-            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, start_index + 1, ref_latents, increase_ref_index, ref_index_scale);
+            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, x_index + 1, ref_latents, increase_ref_index, ref_index_scale, offset);
             ids           = concat_ids(ids, refs_ids, bs);
         }
         return ids;