Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions examples/cli/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include <string.h>
#include <time.h>
#include <cctype>
#include <cstdlib>
#include <filesystem>
#include <functional>
#include <iostream>
Expand Down Expand Up @@ -655,6 +656,21 @@ int main(int argc, const char* argv[]) {
}
}

SDAudioPtr input_audio;
if (gen_params.init_audio_path.size() > 0) {
input_audio.reset(static_cast<sd_audio_t*>(malloc(sizeof(sd_audio_t))));
if (input_audio == nullptr) {
LOG_ERROR("malloc input audio failed");
return 1;
}
*input_audio = load_pcm_wav_from_file(gen_params.init_audio_path);
if (input_audio->data == nullptr || input_audio->sample_count == 0) {
LOG_ERROR("load audio from '%s' failed", gen_params.init_audio_path.c_str());
return 1;
}
gen_params.input_audio = input_audio.get();
}

if (gen_params.ref_image_paths.size() > 0) {
gen_params.ref_images.clear();
for (auto& path : gen_params.ref_image_paths) {
Expand Down
11 changes: 11 additions & 0 deletions examples/common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -870,6 +870,10 @@ ArgOptions SDGenerationParams::get_options() {
"--end-img",
"path to the end image, required by flf2v",
&end_image_path},
{"",
"--init-audio",
"path to the init audio WAV, for use with audio-to-video models",
&init_audio_path},
{"",
"--mask",
"path to the mask image",
Expand Down Expand Up @@ -2223,6 +2227,11 @@ bool SDGenerationParams::validate(SDMode mode) {
}
}

if (mode != VID_GEN && init_audio_path.length() > 0) {
LOG_ERROR("error: init audio (--init-audio) is only supported in vid_gen mode\n");
return false;
}

return true;
}

Expand Down Expand Up @@ -2362,6 +2371,7 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
params.clip_skip = clip_skip;
params.init_image = init_image.get();
params.end_image = end_image.get();
params.input_audio = input_audio;
params.control_frames = control_frame_views.empty() ? nullptr : control_frame_views.data();
params.control_frames_size = static_cast<int>(control_frame_views.size());
params.width = get_resolved_width();
Expand Down Expand Up @@ -2431,6 +2441,7 @@ std::string SDGenerationParams::to_string() const {
<< " batch_count: " << batch_count << ",\n"
<< " init_image_path: \"" << init_image_path << "\",\n"
<< " end_image_path: \"" << end_image_path << "\",\n"
<< " init_audio_path: \"" << init_audio_path << "\",\n"
<< " mask_image_path: \"" << mask_image_path << "\",\n"
<< " control_image_path: \"" << control_image_path << "\",\n"
<< " ref_image_paths: " << vec_str_to_string(ref_image_paths) << ",\n"
Expand Down
2 changes: 2 additions & 0 deletions examples/common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ struct SDGenerationParams {

std::string init_image_path;
std::string end_image_path;
std::string init_audio_path;
std::string mask_image_path;
std::string control_image_path;
std::vector<std::string> ref_image_paths;
Expand Down Expand Up @@ -268,6 +269,7 @@ struct SDGenerationParams {
SDImageOwner control_image;
std::vector<SDImageOwner> pm_id_images;
std::vector<SDImageOwner> control_frames;
const sd_audio_t* input_audio = nullptr;

// Backing storage for sd_img_gen_params_t view fields.
std::vector<sd_image_t> ref_image_views;
Expand Down
115 changes: 115 additions & 0 deletions examples/common/media_io.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,20 @@ uint32_t read_u32_le_bytes(const uint8_t* data) {
(static_cast<uint32_t>(data[3]) << 24);
}

// Decodes an unsigned 16-bit little-endian integer from the two bytes at `p`.
// `p` must point at (at least) two readable bytes.
uint16_t read_u16_le_bytes(const uint8_t* p) {
    const unsigned low_byte  = p[0];
    const unsigned high_byte = p[1];
    return static_cast<uint16_t>((high_byte << 8) | low_byte);
}

// Decodes a signed 24-bit little-endian integer from the three bytes at `p`
// and sign-extends it to 32 bits. `p` must point at (at least) three readable
// bytes. The result lies in [-8388608, 8388607].
int32_t read_s24_le_bytes(const uint8_t* p) {
    const uint32_t raw = static_cast<uint32_t>(p[0]) |
                         (static_cast<uint32_t>(p[1]) << 8) |
                         (static_cast<uint32_t>(p[2]) << 16);
    const bool is_negative = (raw & 0x00800000u) != 0;
    // Bit 23 is the sign bit of the 24-bit value: subtracting 2^24 from the
    // unsigned reading yields the two's-complement negative number.
    const int32_t magnitude = static_cast<int32_t>(raw);
    return is_negative ? magnitude - 0x01000000 : magnitude;
}

int stbi_ext_write_png_to_func(stbi_write_func* func,
void* context,
int x,
Expand Down Expand Up @@ -1374,3 +1388,104 @@ bool write_wav_to_file(const std::string& path,
file.write(reinterpret_cast<const char*>(pcm.data()), static_cast<std::streamsize>(pcm.size() * sizeof(int16_t)));
return file.good();
}

// Loads a RIFF/WAVE file and converts it to interleaved float samples.
//
// Supported encodings: integer PCM (format tag 1) at 8, 16, 24 or 32 bits and
// float (format tag 3) at 32 bits. Every sample is scaled and clamped into
// [-1, 1]; channels stay interleaved (index = frame * channels + channel).
//
// @param path  File to read. An empty path returns an empty result without
//              logging.
// @return      On success: sample_rate / channels / sample_count describe the
//              stream and `data` points at a malloc()-allocated buffer holding
//              sample_count * channels floats. On any failure an error is
//              logged and the returned struct is all zero with data == nullptr.
//
// NOTE(review): the buffer is allocated with malloc(); presumably callers
// release it through free_sd_audio() -- confirm against that function.
sd_audio_t load_pcm_wav_from_file(const std::string& path) {
    sd_audio_t audio = {0, 0, 0, nullptr};
    if (path.empty()) {
        return audio;
    }

    std::vector<uint8_t> wav;
    if (!read_binary_file_bytes(path.c_str(), wav)) {
        LOG_ERROR("load WAV from '%s' failed", path.c_str());
        return audio;
    }
    // 44 bytes = 12-byte RIFF/WAVE header + 8-byte chunk header with a 16-byte
    // "fmt " payload + 8-byte "data" chunk header.
    if (wav.size() < 44 || std::memcmp(wav.data(), "RIFF", 4) != 0 || std::memcmp(wav.data() + 8, "WAVE", 4) != 0) {
        LOG_ERROR("input audio file '%s' is not a RIFF/WAVE file", path.c_str());
        return audio;
    }

    uint16_t format          = 0;
    uint16_t channels        = 0;
    uint32_t sample_rate     = 0;
    uint16_t bits_per_sample = 0;
    const uint8_t* data      = nullptr;
    uint32_t data_size       = 0;

    // Walk the chunk list that follows the 12-byte RIFF/WAVE header. Each chunk
    // is a 4-byte id, a 4-byte little-endian payload size, then the payload.
    size_t pos = 12;
    while (pos + 8 <= wav.size()) {
        const uint8_t* chunk      = wav.data() + pos;
        const uint32_t chunk_size = read_u32_le_bytes(chunk + 4);
        const size_t chunk_data   = pos + 8;
        // Compare against the remaining byte count instead of computing
        // chunk_data + chunk_size, which could wrap when size_t is 32 bits.
        if (chunk_size > wav.size() - chunk_data) {
            break;
        }

        if (std::memcmp(chunk, "fmt ", 4) == 0 && chunk_size >= 16) {
            const uint8_t* fmt = wav.data() + chunk_data;
            format             = read_u16_le_bytes(fmt);
            channels           = read_u16_le_bytes(fmt + 2);
            sample_rate        = read_u32_le_bytes(fmt + 4);
            bits_per_sample    = read_u16_le_bytes(fmt + 14);
        } else if (std::memcmp(chunk, "data", 4) == 0) {
            data      = wav.data() + chunk_data;
            data_size = chunk_size;
        }
        // Odd-sized payloads are followed by one pad byte.
        pos = chunk_data + chunk_size + (chunk_size & 1);
    }

    if (data == nullptr || data_size == 0 || channels == 0 || sample_rate == 0) {
        LOG_ERROR("input WAV '%s' is missing fmt/data chunks", path.c_str());
        return audio;
    }
    if (format != 1 && format != 3) {
        LOG_ERROR("unsupported WAV format %u in '%s', only PCM and float WAV are supported",
                  static_cast<unsigned>(format),
                  path.c_str());
        return audio;
    }

    const uint16_t bytes_per_sample = static_cast<uint16_t>((bits_per_sample + 7) / 8);
    const uint32_t frame_bytes      = static_cast<uint32_t>(bytes_per_sample) * channels;
    if (bytes_per_sample == 0 || frame_bytes == 0 || data_size < frame_bytes) {
        LOG_ERROR("invalid WAV sample format in '%s'", path.c_str());
        return audio;
    }

    // Resolve the sample encoding once, before allocating anything, so an
    // unsupported bit depth is rejected up front rather than inside the
    // per-sample loop.
    enum class SampleEncoding { Unsupported,
                                Float32,
                                Uint8,
                                Int16,
                                Int24,
                                Int32 };
    SampleEncoding encoding = SampleEncoding::Unsupported;
    if (format == 3) {
        if (bits_per_sample == 32) {
            encoding = SampleEncoding::Float32;
        }
    } else {
        switch (bits_per_sample) {
            case 8:
                encoding = SampleEncoding::Uint8;
                break;
            case 16:
                encoding = SampleEncoding::Int16;
                break;
            case 24:
                encoding = SampleEncoding::Int24;
                break;
            case 32:
                encoding = SampleEncoding::Int32;
                break;
            default:
                break;
        }
    }
    if (encoding == SampleEncoding::Unsupported) {
        LOG_ERROR("unsupported WAV bit depth %u in '%s'",
                  static_cast<unsigned>(bits_per_sample),
                  path.c_str());
        return audio;
    }

    // Trailing bytes that do not form a complete frame are ignored.
    const uint64_t sample_count = data_size / frame_bytes;
    const size_t float_count    = static_cast<size_t>(sample_count) * channels;
    float* samples              = nullptr;
    // Guard the byte-size multiplication against size_t overflow.
    if (float_count <= static_cast<size_t>(-1) / sizeof(float)) {
        samples = static_cast<float*>(malloc(float_count * sizeof(float)));
    }
    if (samples == nullptr) {
        LOG_ERROR("failed to allocate sample buffer for WAV '%s'", path.c_str());
        return audio;
    }

    for (uint64_t i = 0; i < sample_count; ++i) {
        const uint8_t* frame = data + i * frame_bytes;
        for (uint16_t ch = 0; ch < channels; ++ch) {
            const uint8_t* src = frame + ch * bytes_per_sample;
            float sample       = 0.f;
            switch (encoding) {
                case SampleEncoding::Float32: {
                    // Decode the little-endian bit pattern explicitly so the
                    // result does not depend on host byte order.
                    const uint32_t bits = read_u32_le_bytes(src);
                    std::memcpy(&sample, &bits, sizeof(sample));
                    break;
                }
                case SampleEncoding::Uint8:
                    // 8-bit samples are unsigned with a midpoint of 128.
                    sample = (static_cast<int>(src[0]) - 128) / 128.f;
                    break;
                case SampleEncoding::Int16:
                    sample = static_cast<int16_t>(read_u16_le_bytes(src)) / 32768.f;
                    break;
                case SampleEncoding::Int24:
                    sample = read_s24_le_bytes(src) / 8388608.f;
                    break;
                case SampleEncoding::Int32:
                    sample = static_cast<int32_t>(read_u32_le_bytes(src)) / 2147483648.f;
                    break;
                case SampleEncoding::Unsupported:
                    // Rejected before the buffer was allocated.
                    break;
            }
            samples[i * channels + ch] = std::clamp(sample, -1.0f, 1.0f);
        }
    }

    audio.sample_rate  = sample_rate;
    audio.channels     = channels;
    audio.sample_count = sample_count;
    audio.data         = samples;
    return audio;
}
2 changes: 2 additions & 0 deletions examples/common/media_io.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,4 +110,6 @@ bool write_wav_to_file(const std::string& path,
uint32_t channels,
uint32_t sample_rate);

// Loads a RIFF/WAVE file (integer PCM or 32-bit float) from `path` and returns
// it as interleaved float samples clamped to [-1, 1]. On failure, or when
// `path` is empty, the returned sd_audio_t has data == nullptr.
sd_audio_t load_pcm_wav_from_file(const std::string& path);

#endif // __MEDIA_IO_H__
9 changes: 9 additions & 0 deletions examples/common/resource_owners.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,21 @@ struct UpscalerCtxDeleter {
}
};

// Deleter functor for std::unique_ptr<sd_audio_t>: passes any non-null pointer
// to free_sd_audio() and ignores null.
struct SDAudioDeleter {
    void operator()(sd_audio_t* audio) const {
        if (audio == nullptr) {
            return;
        }
        free_sd_audio(audio);
    }
};

// Owning pointer to a T whose cleanup is delegated to FreeDeleter.
template <typename T>
using FreeUniquePtr = std::unique_ptr<T, FreeDeleter>;

// Owning handles: each alias pairs a resource type with its deleter functor,
// so the resource is released when the handle goes out of scope.
using FilePtr = std::unique_ptr<FILE, FileCloser>;
using SDCtxPtr = std::unique_ptr<sd_ctx_t, SDCtxDeleter>;
using UpscalerCtxPtr = std::unique_ptr<upscaler_ctx_t, UpscalerCtxDeleter>;
using SDAudioPtr = std::unique_ptr<sd_audio_t, SDAudioDeleter>;

class SDImageOwner {
private:
Expand Down
1 change: 1 addition & 0 deletions include/stable-diffusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,7 @@ typedef struct {
int64_t seed;
int video_frames;
int fps;
const sd_audio_t* input_audio;
float vace_strength;
sd_tiling_params_t vae_tiling_params;
sd_cache_params_t cache;
Expand Down
Loading
Loading