Skip to content

Commit 5ef066a

Browse files
committed
fix: adjust timestep calculations for DDIM and TCD
In img2img, the number of steps corresponds only to the last precalculated sigma values (the tail of the schedule), but the timesteps used to index the internal alphas_cumprod and compvis_sigmas were being computed as if those steps covered the entire step range.
1 parent 710169d commit 5ef066a

File tree

2 files changed

+24
-9
lines changed

2 files changed

+24
-9
lines changed

denoiser.hpp

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -671,6 +671,7 @@ static void sample_k_diffusion(sample_method_t method,
671671
ggml_context* work_ctx,
672672
ggml_tensor* x,
673673
std::vector<float> sigmas,
674+
int initial_step,
674675
std::shared_ptr<RNG> rng,
675676
float eta) {
676677
size_t steps = sigmas.size() - 1;
@@ -1248,12 +1249,13 @@ static void sample_k_diffusion(sample_method_t method,
12481249
// - pred_sample_direction -> "direction pointing to
12491250
// x_t"
12501251
// - pred_prev_sample -> "x_t-1"
1251-
int timestep =
1252-
roundf(TIMESTEPS -
1253-
i * ((float)TIMESTEPS / steps)) -
1254-
1;
1252+
int timestep = TIMESTEPS - 1 -
1253+
(int)roundf((initial_step + i) *
1254+
(TIMESTEPS / float(initial_step + steps)));
12551255
// 1. get previous step value (=t-1)
1256-
int prev_timestep = timestep - TIMESTEPS / steps;
1256+
int prev_timestep = TIMESTEPS - 1 -
1257+
(int)roundf((initial_step + i + 1) *
1258+
(TIMESTEPS / float(initial_step + steps)));
12571259
// The sigma here is chosen to cause the
12581260
// CompVisDenoiser to produce t = timestep
12591261
float sigma = compvis_sigmas[timestep];
@@ -1425,9 +1427,14 @@ static void sample_k_diffusion(sample_method_t method,
14251427
// Analytic form for TCD timesteps
14261428
int timestep = TIMESTEPS - 1 -
14271429
(TIMESTEPS / original_steps) *
1428-
(int)floor(i * ((float)original_steps / steps));
1430+
(int)floor((initial_step + i) *
1431+
((float)original_steps / (initial_step + steps)));
14291432
// 1. get previous step value
1430-
int prev_timestep = i >= steps - 1 ? 0 : TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor((i + 1) * ((float)original_steps / steps));
1433+
int prev_timestep = i >= steps - 1 ? 0 :
1434+
TIMESTEPS - 1 -
1435+
(TIMESTEPS / original_steps) *
1436+
(int)floor((initial_step + i + 1) *
1437+
((float)original_steps / (initial_step + steps)));
14311438
// Here timestep_s is tau_n' in Algorithm 4. The _s
14321439
// notation appears to be that from C. Lu,
14331440
// "DPM-Solver: A Fast ODE Solver for Diffusion

stable-diffusion.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1480,6 +1480,7 @@ class StableDiffusionGGML {
14801480
int shifted_timestep,
14811481
sample_method_t method,
14821482
const std::vector<float>& sigmas,
1483+
int initial_step,
14831484
int start_merge_step,
14841485
SDCondition id_cond,
14851486
std::vector<ggml_tensor*> ref_latents = {},
@@ -1837,7 +1838,7 @@ class StableDiffusionGGML {
18371838
return denoised;
18381839
};
18391840

1840-
sample_k_diffusion(method, denoise, work_ctx, x, sigmas, sampler_rng, eta);
1841+
sample_k_diffusion(method, denoise, work_ctx, x, sigmas, initial_step, sampler_rng, eta);
18411842

18421843
if (easycache_enabled) {
18431844
size_t total_steps = sigmas.size() > 0 ? sigmas.size() - 1 : 0;
@@ -2762,6 +2763,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
27622763
int height,
27632764
enum sample_method_t sample_method,
27642765
const std::vector<float>& sigmas,
2766+
int initial_step,
27652767
int64_t seed,
27662768
int batch_count,
27672769
sd_image_t control_image,
@@ -3056,6 +3058,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
30563058
shifted_timestep,
30573059
sample_method,
30583060
sigmas,
3061+
initial_step,
30593062
start_merge_step,
30603063
id_cond,
30613064
ref_latents,
@@ -3173,6 +3176,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
31733176
sd_ctx->sd->get_image_seq_len(height, width),
31743177
sd_img_gen_params->sample_params.scheduler,
31753178
sd_ctx->sd->version);
3179+
int initial_step = 0;
31763180

31773181
ggml_tensor* init_latent = nullptr;
31783182
ggml_tensor* concat_latent = nullptr;
@@ -3185,7 +3189,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
31853189
t_enc--;
31863190
LOG_INFO("target t_enc is %zu steps", t_enc);
31873191
std::vector<float> sigma_sched;
3188-
sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end());
3192+
initial_step = sample_steps - t_enc - 1;
3193+
sigma_sched.assign(sigmas.begin() + initial_step, sigmas.end());
31893194
sigmas = sigma_sched;
31903195

31913196
ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
@@ -3373,6 +3378,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
33733378
height,
33743379
sample_method,
33753380
sigmas,
3381+
initial_step,
33763382
seed,
33773383
sd_img_gen_params->batch_count,
33783384
sd_img_gen_params->control_image,
@@ -3709,6 +3715,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
37093715
sd_vid_gen_params->high_noise_sample_params.shifted_timestep,
37103716
high_noise_sample_method,
37113717
high_noise_sigmas,
3718+
0,
37123719
-1,
37133720
{},
37143721
{},
@@ -3746,6 +3753,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
37463753
sd_vid_gen_params->sample_params.shifted_timestep,
37473754
sample_method,
37483755
sigmas,
3756+
0,
37493757
-1,
37503758
{},
37513759
{},

0 commit comments

Comments
 (0)