From f985f4b5f9d1aa0da2fbee0a33dd9a6865029206 Mon Sep 17 00:00:00 2001
From: Wagner Bruna
Date: Sat, 6 Dec 2025 14:54:40 -0300
Subject: [PATCH 1/2] feat: support mmap for model loading

---
 examples/cli/main.cpp |   7 +++
 model.cpp             |  23 ++++++--
 model.h               |   5 +-
 stable-diffusion.cpp  |   3 +-
 stable-diffusion.h    |   1 +
 util.cpp              | 127 ++++++++++++++++++++++++++++++++++++++++++
 util.h                |  23 ++++++++
 7 files changed, 182 insertions(+), 7 deletions(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index bf42f5aa4..8803401c7 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -504,6 +504,7 @@ struct SDContextParams {
     rng_type_t rng_type = CUDA_RNG;
     rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
     bool offload_params_to_cpu = false;
+    bool use_mmap = false;
     bool control_net_cpu = false;
     bool clip_on_cpu = false;
     bool vae_on_cpu = false;
@@ -639,6 +640,10 @@ struct SDContextParams {
         {"",
          "--offload-to-cpu",
          "place the weights in RAM to save VRAM, and automatically load them into VRAM when needed",
          true, &offload_params_to_cpu},
+        {"",
+         "--use-mmap",
+         "use mmap to load weights",
+         true, &use_mmap},
         {"",
          "--control-net-cpu",
          "keep controlnet in cpu (for low vram)",
@@ -874,6 +879,7 @@ struct SDContextParams {
            << " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n"
            << " flow_shift: " << (std::isinf(flow_shift) ? "INF" : std::to_string(flow_shift)) << "\n"
            << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
+           << " use_mmap: " << (use_mmap ? "true" : "false") << ",\n"
            << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n"
            << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n"
            << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n"
@@ -924,6 +930,7 @@ struct SDContextParams {
            prediction,
            lora_apply_mode,
            offload_params_to_cpu,
+           use_mmap,
            clip_on_cpu,
            control_net_cpu,
            vae_on_cpu,
diff --git a/model.cpp b/model.cpp
index b314139c2..700cc0fc9 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1337,7 +1337,7 @@ std::string ModelLoader::load_umt5_tokenizer_json() {
     return json_str;
 }
 
-bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p) {
+bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool use_mmap) {
     int64_t process_time_ms = 0;
     std::atomic<int64_t> read_time_ms(0);
     std::atomic<int64_t> memcpy_time_ms(0);
@@ -1387,6 +1387,15 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
         }
     }
 
+    std::shared_ptr<MmapWrapper> mmapped;
+    if (use_mmap && !is_zip) {
+        LOG_DEBUG("using mmap for I/O");
+        mmapped = MmapWrapper::create(file_path);
+        if (!mmapped) {
+            LOG_WARN("failed to memory-map '%s'", file_path.c_str());
+        }
+    }
+
     int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size());
     if (n_threads < 1) {
         n_threads = 1;
@@ -1408,7 +1417,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                     failed = true;
                     return;
                 }
-            } else {
+            } else if (!mmapped) {
                 file.open(file_path, std::ios::binary);
                 if (!file.is_open()) {
                     LOG_ERROR("failed to open '%s'", file_path.c_str());
@@ -1461,6 +1470,11 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                     zip_entry_noallocread(zip, (void*)buf, n);
                 }
                 zip_entry_close(zip);
+            } else if (mmapped) {
+                if (!mmapped->copy_data(buf, n, tensor_storage.offset)) {
+                    LOG_ERROR("read tensor data failed: '%s'", file_path.c_str());
+                    failed = true;
+                }
             } else {
                 file.seekg(tensor_storage.offset);
                 file.read(buf, n);
@@ -1580,7 +1594,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
 
 bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
                                std::set<std::string> ignore_tensors,
-                               int n_threads) {
+                               int n_threads,
+                               bool use_mmap) {
     std::set<std::string> tensor_names_in_file;
     std::mutex tensor_names_mutex;
     auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
@@ -1623,7 +1638,7 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
         return true;
     };
 
-    bool success = load_tensors(on_new_tensor_cb, n_threads);
+    bool success = load_tensors(on_new_tensor_cb, n_threads, use_mmap);
     if (!success) {
         LOG_ERROR("load tensors from file failed");
         return false;
diff --git a/model.h b/model.h
index 71a22a8f9..c60c9ed61 100644
--- a/model.h
+++ b/model.h
@@ -308,10 +308,11 @@ class ModelLoader {
     std::map<ggml_type, uint32_t> get_vae_wtype_stat();
     String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
     void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = "");
-    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0);
+    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0, bool use_mmap = false);
     bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
                       std::set<std::string> ignore_tensors = {},
-                      int n_threads = 0);
+                      int n_threads = 0,
+                      bool use_mmap = false);
 
     std::vector<std::string> get_tensor_names() const {
         std::vector<std::string> names;
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index fe2a26ca3..14c7fb0c1 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -693,7 +693,7 @@ class StableDiffusionGGML {
         if (version == VERSION_SVD) {
             ignore_tensors.insert("conditioner.embedders.3");
         }
-        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->use_mmap);
         if (!success) {
             LOG_ERROR("load tensors from model loader failed");
             ggml_free(ctx);
@@ -2478,6 +2478,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_params->prediction = PREDICTION_COUNT;
     sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO;
     sd_ctx_params->offload_params_to_cpu = false;
+    sd_ctx_params->use_mmap = false;
     sd_ctx_params->keep_clip_on_cpu = false;
     sd_ctx_params->keep_control_net_on_cpu = false;
     sd_ctx_params->keep_vae_on_cpu = false;
diff --git a/stable-diffusion.h b/stable-diffusion.h
index e34cdec17..c985022a8 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -176,6 +176,7 @@ typedef struct {
     enum prediction_t prediction;
     enum lora_apply_mode_t lora_apply_mode;
     bool offload_params_to_cpu;
+    bool use_mmap;
     bool keep_clip_on_cpu;
     bool keep_control_net_on_cpu;
     bool keep_vae_on_cpu;
diff --git a/util.cpp b/util.cpp
index 4a59852e2..949b247ce 100644
--- a/util.cpp
+++ b/util.cpp
@@ -109,9 +109,78 @@ std::string get_full_path(const std::string& dir, const std::string& filename) {
     }
 }
 
+class MmapWrapperImpl : public MmapWrapper {
+public:
+    MmapWrapperImpl(void* data, size_t size, HANDLE hfile, HANDLE hmapping)
+        : MmapWrapper(data, size), hfile_(hfile), hmapping_(hmapping) {}
+
+    ~WindowsMmapWrapper() override {
+        if (data_) {
+            UnmapViewOfFile(data_);
+        }
+        if (hmapping_ != NULL) {
+            CloseHandle(mapping_handle_);
+        }
+        if (hfile_ != INVALID_HANDLE_VALUE) {
+            CloseHandle(file_handle_);
+        }
+    }
+
+private:
+    HANDLE hfile_;
+    HANDLE hmapping_;
+};
+
+std::shared_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
+    void* mapped_data = nullptr;
+    size_t file_size = 0;
+
+    HANDLE file_handle = CreateFileA(
+        filename.c_str(),
+        GENERIC_READ,
+        FILE_SHARE_READ,
+        NULL,
+        OPEN_EXISTING,
+        FILE_ATTRIBUTE_NORMAL,
+        NULL
+    );
+
+    if (file_handle == INVALID_HANDLE_VALUE) {
+        return nullptr;
+    }
+
+    LARGE_INTEGER size;
+    if (!GetFileSizeEx(file_handle, &size)) {
+        CloseHandle(file_handle);
+        return nullptr;
+    }
+
+    file_size = static_cast<size_t>(size.QuadPart);
+
+    HANDLE mapping_handle = CreateFileMapping(file_handle, NULL, PAGE_READONLY, 0, 0, NULL);
+
+    if (mapping_handle == NULL) {
+        CloseHandle(file_handle);
+        return nullptr;
+    }
+
+    mapped_data = MapViewOfFile(mapping_handle, FILE_MAP_READ, 0, 0, file_size);
+
+    if (mapped_data == NULL) {
+        CloseHandle(mapping_handle);
+        CloseHandle(file_handle);
+        return nullptr;
+    }
+
+    return std::make_shared<MmapWrapperImpl>(mapped_data, file_size, file_handle, mapping_handle);
+}
+
 #else  // Unix
 #include <dirent.h>
+#include <fcntl.h>
+#include <sys/mman.h>
 #include <sys/stat.h>
+#include <unistd.h>
 
 bool file_exists(const std::string& filename) {
     struct stat buffer;
@@ -143,8 +212,66 @@ std::string get_full_path(const std::string& dir, const std::string& filename) {
     return "";
 }
 
+class MmapWrapperImpl : public MmapWrapper {
+public:
+    MmapWrapperImpl(void* data, size_t size) : MmapWrapper(data, size) {}
+
+    ~MmapWrapperImpl() override {
+        if (data_) {
+            munmap(data_, size_);
+        }
+    }
+};
+
+std::shared_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
+
+    int file_descriptor = open(filename.c_str(), O_RDONLY);
+    if (file_descriptor == -1) {
+        return nullptr;
+    }
+
+    int mmap_flags = MAP_PRIVATE;
+
+#ifdef __linux__
+    // performance flags used by llama.cpp
+    //posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL);
+    //mmap_flags |= MAP_POPULATE;
+#endif
+
+    struct stat sb;
+    if (fstat(file_descriptor, &sb) == -1) {
+        close(file_descriptor);
+        return nullptr;
+    }
+
+    size_t file_size = sb.st_size;
+
+    void* mapped_data = mmap(NULL, file_size, PROT_READ, mmap_flags, file_descriptor, 0);
+
+    close(file_descriptor);
+
+    if (mapped_data == MAP_FAILED) {
+        return nullptr;
+    }
+
+#ifdef __linux__
+    // performance flags used by llama.cpp
+    //posix_madvise(mapped_data, file_size, POSIX_MADV_WILLNEED);
+#endif
+
+    return std::make_shared<MmapWrapperImpl>(mapped_data, file_size);
+}
+
 #endif
 
+bool MmapWrapper::copy_data(void* buf, size_t n, size_t offset) const {
+    if (offset >= size_ || n > (size_ - offset)) {
+        return false;
+    }
+    std::memcpy(buf, data() + offset, n);
+    return true;
+}
+
 // get_num_physical_cores is copy from
 // https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
 // LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE
diff --git a/util.h b/util.h
index 61ca9334a..79849872c 100644
--- a/util.h
+++ b/util.h
@@ -2,6 +2,7 @@
 #define __UTIL_H__
 
 #include <cstdint>
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -44,6 +45,28 @@ sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int
 
 sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int target_height);
 
+class MmapWrapper {
+public:
+    static std::shared_ptr<MmapWrapper> create(const std::string& filename);
+
+    virtual ~MmapWrapper() = default;
+
+    MmapWrapper(const MmapWrapper&) = delete;
+    MmapWrapper& operator=(const MmapWrapper&) = delete;
+    MmapWrapper(MmapWrapper&&) = delete;
+    MmapWrapper& operator=(MmapWrapper&&) = delete;
+
+    const uint8_t* data() const { return static_cast<const uint8_t*>(data_); }
+    size_t size() const { return size_; }
+    bool copy_data(void* buf, size_t n, size_t offset) const;
+
+protected:
+    MmapWrapper(void* data, size_t size)
+        : data_(data), size_(size) {}
+    void* data_ = nullptr;
+    size_t size_ = 0;
+};
+
 std::string path_join(const std::string& p1, const std::string& p2);
 std::vector<std::string> split_string(const std::string& str, char delimiter);
 void pretty_progress(int step, int steps, float time);

From db1592e9499e3116955fcc17a27a38391626cd42 Mon Sep 17 00:00:00 2001
From: Wagner Bruna
Date: Sat, 6 Dec 2025 16:56:57 -0300
Subject: [PATCH 2/2] fix a few obvious Windows build errors

---
 util.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/util.cpp b/util.cpp
index 949b247ce..c86c9f771 100644
--- a/util.cpp
+++ b/util.cpp
@@ -114,15 +114,15 @@ class MmapWrapperImpl : public MmapWrapper {
     MmapWrapperImpl(void* data, size_t size, HANDLE hfile, HANDLE hmapping)
         : MmapWrapper(data, size), hfile_(hfile), hmapping_(hmapping) {}
 
-    ~WindowsMmapWrapper() override {
+    ~MmapWrapperImpl() override {
         if (data_) {
             UnmapViewOfFile(data_);
         }
         if (hmapping_ != NULL) {
-            CloseHandle(mapping_handle_);
+            CloseHandle(hmapping_);
         }
         if (hfile_ != INVALID_HANDLE_VALUE) {
-            CloseHandle(file_handle_);
+            CloseHandle(hfile_);
         }
     }
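
For reference, a minimal caller-side sketch (not part of the patch series) of the MmapWrapper interface that util.h introduces above. The file name is a made-up placeholder; the error handling loosely mirrors what load_tensors does when use_mmap is enabled. create() hands back a std::shared_ptr, presumably so the same read-only mapping can be shared by the loader's worker threads.

// sketch only: map a file read-only and copy a slice out of the mapping
#include "util.h"

#include <cstdint>
#include <cstdio>
#include <memory>
#include <vector>

int main() {
    const char* path = "model.safetensors";  // placeholder path, not from the patch
    std::shared_ptr<MmapWrapper> mapping = MmapWrapper::create(path);
    if (!mapping) {
        // create() returns nullptr on any open/stat/map failure (both backends)
        std::fprintf(stderr, "failed to memory-map '%s'\n", path);
        return 1;
    }
    std::vector<uint8_t> buf(1024);
    // copy_data() bounds-checks offset and length against the mapped size
    if (!mapping->copy_data(buf.data(), buf.size(), 0)) {
        std::fprintf(stderr, "read out of range\n");
        return 1;
    }
    std::printf("mapped %zu bytes\n", mapping->size());
    return 0;
}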