From 65754fba411dd4959c2d7f6ab1b05d03ab76f84b Mon Sep 17 00:00:00 2001
From: rasbid <104773487+rasbid@users.noreply.github.com>
Date: Tue, 23 Sep 2025 12:26:22 +0300
Subject: [PATCH 1/2] Add detailed startup logging for llama-cli

---
 common/common.cpp   | 51 +++++++++++++++++++++++++++++++++++++++++++++
 tools/main/main.cpp | 25 ++++++++++++++++++++++
 2 files changed, 76 insertions(+)

diff --git a/common/common.cpp b/common/common.cpp
index 0c92d4d57dd..c78e61a634a 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -4,6 +4,7 @@
 #include "ggml.h"
 #include "gguf.h"
+#include "ggml-backend.h"
 
 #include "common.h"
 #include "log.h"
@@ -899,6 +900,28 @@ struct common_init_result common_init_from_params(common_params & params) {
     common_init_result iparams;
     auto mparams = common_model_params_to_llama(params);
 
+    LOG_INF("%s: begin model initialization (path: '%s')\n", __func__, params.model.path.c_str());
+    LOG_INF("%s: requested context %d, batch %d, ubatch %d, parallel %d\n",
+            __func__, params.n_ctx, params.n_batch, params.n_ubatch, params.n_parallel);
+    LOG_INF("%s: GPU layers: %d, main GPU: %d, split mode: %d\n",
+            __func__, params.n_gpu_layers, params.main_gpu, (int) params.split_mode);
+
+    if (!params.devices.empty()) {
+        for (size_t i = 0; i < params.devices.size(); ++i) {
+            ggml_backend_dev_t dev = params.devices[i];
+            LOG_INF("%s: offload device[%zu]: %s (%s)\n",
+                    __func__, i,
+                    ggml_backend_dev_name(dev),
+                    ggml_backend_dev_description(dev));
+        }
+    } else {
+        LOG_INF("%s: no explicit offload devices configured (default device selection applies)\n", __func__);
+    }
+
+    if (!params.lora_adapters.empty()) {
+        LOG_INF("%s: %zu LoRA adapter(s) requested\n", __func__, params.lora_adapters.size());
+    }
+
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
         LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
@@ -906,10 +929,19 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
+    {
+        char model_desc[512] = {0};
+        llama_model_desc(model, model_desc, sizeof(model_desc));
+        LOG_INF("%s: model loaded successfully: %s\n", __func__, model_desc);
+    }
+
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     auto cparams = common_context_params_to_llama(params);
 
+    LOG_INF("%s: creating context with %d threads (%d batch threads)\n",
+            __func__, cparams.n_threads, cparams.n_threads_batch);
+
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
@@ -918,12 +950,25 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
+    LOG_INF("%s: context created (ctx size: %d, batch size: %d, sequence max: %d)\n",
+            __func__, llama_n_ctx(lctx), cparams.n_batch, cparams.n_seq_max);
+
+    if (llama_model_has_encoder(model)) {
+        LOG_INF("%s: encoder component detected in model\n", __func__);
+    }
+    if (llama_model_has_decoder(model)) {
+        LOG_INF("%s: decoder component detected in model\n", __func__);
+    }
+
     if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
         LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
     }
 
     if (!params.control_vectors.empty()) {
+        LOG_INF("%s: loading %zu control vector(s) (layers %d -> %d)\n",
+                __func__, params.control_vectors.size(),
+                params.control_vector_layer_start, params.control_vector_layer_end);
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
         if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);
 
@@ -981,6 +1026,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
+        LOG_INF("%s: initializing LoRA adapter '%s' (scale %.3f)\n",
+                __func__, la.path.c_str(), la.scale);
         llama_adapter_lora_ptr lora;
         lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
         if (lora == nullptr) {
@@ -1000,6 +1047,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     }
 
     if (!params.lora_init_without_apply) {
+        LOG_INF("%s: applying active LoRA adapters (%zu)\n",
+                __func__, params.lora_adapters.size());
         common_set_adapter_lora(lctx, params.lora_adapters);
     }
 
@@ -1074,6 +1123,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     iparams.model.reset(model);
     iparams.context.reset(lctx);
 
+    LOG_INF("%s: model initialization complete\n", __func__);
+
     return iparams;
 }
 
diff --git a/tools/main/main.cpp b/tools/main/main.cpp
index 083fc0cf26c..9451a398c8f 100644
--- a/tools/main/main.cpp
+++ b/tools/main/main.cpp
@@ -86,12 +86,22 @@ static void sigint_handler(int signo) {
 int main(int argc, char ** argv) {
     common_params params;
     g_params = &params;
+    LOG_INF("llama-cli: parsing command line arguments\n");
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
         return 1;
     }
 
     common_init();
+    LOG_INF("llama-cli: initialization complete, preparing backend\n");
+
+    if (!params.model.path.empty()) {
+        LOG_INF("llama-cli: model path '%s' (alias: '%s')\n",
+                params.model.path.c_str(), params.model_alias.c_str());
+    }
+    LOG_INF("llama-cli: target context %d, batch %d, ubatch %d, gpu layers %d\n",
+            params.n_ctx, params.n_batch, params.n_ubatch, params.n_gpu_layers);
+
     auto & sparams = params.sampling;
 
     // save choice to use color for later
@@ -123,7 +133,10 @@ int main(int argc, char ** argv) {
 
     LOG_INF("%s: llama backend init\n", __func__);
     llama_backend_init();
+    LOG_INF("%s: llama backend initialized\n", __func__);
+
     llama_numa_init(params.numa);
+    LOG_INF("%s: NUMA strategy applied: %d\n", __func__, (int) params.numa);
 
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
@@ -142,16 +155,23 @@ int main(int argc, char ** argv) {
     model = llama_init.model.get();
     ctx = llama_init.context.get();
 
+    LOG_INF("%s: common_init_from_params returned (model ptr: %p, ctx ptr: %p)\n",
+            __func__, static_cast<void *>(model), static_cast<void *>(ctx));
+
     if (model == NULL) {
         LOG_ERR("%s: error: unable to load model\n", __func__);
         return 1;
     }
 
     auto * mem = llama_get_memory(ctx);
+    LOG_INF("%s: llama memory subsystem ready (%p)\n", __func__, static_cast<void *>(mem));
 
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     auto chat_templates = common_chat_templates_init(model, params.chat_template);
+    LOG_INF("%s: chat templates initialized (%s)\n",
+            __func__, chat_templates ? "available" : "none");
+
     LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
 
     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -172,6 +192,8 @@ int main(int argc, char ** argv) {
 
     struct ggml_threadpool * threadpool_batch = NULL;
     if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
+        LOG_INF("%s: creating batch threadpool with %d threads (paused=%d)\n",
+                __func__, tpp_batch.n_threads, tpp_batch.paused);
         threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
         if (!threadpool_batch) {
             LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
@@ -182,6 +204,8 @@ int main(int argc, char ** argv) {
         tpp.paused = true;
     }
 
+    LOG_INF("%s: creating main threadpool with %d threads (paused=%d)\n",
+            __func__, tpp.n_threads, tpp.paused);
     struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
     if (!threadpool) {
         LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
@@ -189,6 +213,7 @@ int main(int argc, char ** argv) {
     }
 
     llama_attach_threadpool(ctx, threadpool, threadpool_batch);
+    LOG_INF("%s: threadpools attached to llama context\n", __func__);
 
     const int n_ctx_train = llama_model_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);

From bafbd320e44cda3b8dc46d725d2c2073ed7407d6 Mon Sep 17 00:00:00 2001
From: rasbid <104773487+rasbid@users.noreply.github.com>
Date: Tue, 23 Sep 2025 13:01:33 +0300
Subject: [PATCH 2/2] Add detailed tensor streaming logs

---
 src/llama-model-loader.cpp | 50 ++++++++++++++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 5 deletions(-)

diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 8182a9adf53..22f706b8fa0 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -925,6 +925,9 @@ bool llama_model_loader::load_all_data(
         void * progress_callback_user_data) {
     GGML_ASSERT(size_data != 0 && "call init_mappings() first");
 
+    LLAMA_LOG_INFO("%s: begin weight streaming (%d tensors, %zu total bytes, use_mmap=%s, check_tensors=%s, buffers=%zu)\n",
+        __func__, n_tensors, size_data, use_mmap ? "true" : "false", check_tensors ? "true" : "false", bufs.size());
+
     std::vector<no_init<uint8_t>> read_buf;
     std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
 
@@ -1024,13 +1027,31 @@ bool llama_model_loader::load_all_data(
             continue;
         }
 
+        const char * tensor_name = ggml_get_name(cur);
+        const char * tensor_type = ggml_type_name(cur->type);
+        const std::string tensor_shape = llama_format_tensor_shape(cur);
+        const size_t tensor_size = ggml_nbytes(cur);
+        ggml_backend_buffer_type_t cur_buft = cur->buffer ? ggml_backend_buffer_get_type(cur->buffer) : nullptr;
+        const char * buf_type_name = cur_buft ? ggml_backend_buft_name(cur_buft) : "(none)";
+        const bool buffer_is_host = !cur->buffer || ggml_backend_buffer_is_host(cur->buffer);
+        ggml_backend_dev_t tensor_dev = nullptr;
+        if (cur_buft) {
+            tensor_dev = ggml_backend_buft_get_device(cur_buft);
+        }
+        const char * tensor_dev_name = tensor_dev ? ggml_backend_dev_name(tensor_dev) : (buffer_is_host ? "host" : "unknown");
+
+        LLAMA_LOG_INFO(
+            "%s: tensor '%s' (%s %s, %zu bytes) file[%u]@%zu -> buffer type %s (%s) on %s\n",
+            __func__, tensor_name, tensor_type, tensor_shape.c_str(), tensor_size, weight->idx, weight->offs,
+            buf_type_name, buffer_is_host ? "host" : "device", tensor_dev_name);
+
         if (progress_callback) {
             if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
                 return false;
             }
         }
 
-        size_t n_size = ggml_nbytes(cur);
+        size_t n_size = tensor_size;
 
         if (use_mmap) {
             const auto & mapping = mappings.at(weight->idx);
@@ -1076,19 +1097,32 @@ bool llama_model_loader::load_all_data(
                     file->seek(weight->offs, SEEK_SET);
 
                     size_t bytes_read = 0;
+                    const size_t chunk_count = (n_size + buffer_size - 1) / buffer_size;
+                    LLAMA_LOG_INFO(
+                        "%s: tensor '%s' async upload via %s (%zu chunks, chunk size %zu)\n",
+                        __func__, tensor_name, ggml_backend_name(upload_backend), chunk_count, buffer_size);
 
                     while (bytes_read < n_size) {
                         size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
 
-                        ggml_backend_event_synchronize(events[buffer_idx]);
-                        file->read_raw(host_ptrs[buffer_idx], read_iteration);
-                        ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
-                        ggml_backend_event_record(events[buffer_idx], upload_backend);
+                        const size_t staging_idx = buffer_idx;
+                        LLAMA_LOG_DEBUG("%s: tensor '%s' waiting for staging buffer %zu\n",
+                            __func__, tensor_name, staging_idx);
+                        ggml_backend_event_synchronize(events[staging_idx]);
+                        LLAMA_LOG_DEBUG("%s: tensor '%s' staging buffer %zu ready, reading %zu bytes (offset %zu/%zu)\n",
+                            __func__, tensor_name, staging_idx, read_iteration, bytes_read, n_size);
+                        file->read_raw(host_ptrs[staging_idx], read_iteration);
+                        ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[staging_idx], bytes_read, read_iteration);
+                        ggml_backend_event_record(events[staging_idx], upload_backend);
+                        LLAMA_LOG_DEBUG("%s: tensor '%s' submitted chunk %zu/%zu (%zu bytes) via staging buffer %zu\n",
+                            __func__, tensor_name, (bytes_read / buffer_size) + 1, chunk_count, read_iteration, staging_idx);
 
                         bytes_read += read_iteration;
                         ++buffer_idx;
                         buffer_idx %= n_buffers;
                     }
+                    LLAMA_LOG_INFO("%s: tensor '%s' async upload complete (%zu bytes)\n",
+                        __func__, tensor_name, n_size);
                 } else {
                     read_buf.resize(n_size);
                     file->seek(weight->offs, SEEK_SET);
@@ -1097,11 +1131,17 @@ bool llama_model_loader::load_all_data(
                     if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
                         throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
                    }
+                    LLAMA_LOG_INFO("%s: tensor '%s' synchronous upload to device buffer complete (%zu bytes)\n",
+                        __func__, tensor_name, n_size);
                 }
             }
         }
 
         size_done += n_size;
+        LLAMA_LOG_DEBUG("%s: cumulative load progress %.2f%% (%zu/%zu bytes)\n",
+            __func__, 100.0f * size_done / size_data, size_done, size_data);
+        LLAMA_LOG_INFO("%s: tensor '%s' load complete (cumulative %.2f%%)\n",
+            __func__, tensor_name, 100.0f * size_done / size_data);
     }
 
     // free temporary resources used for async uploads