rasbid · rasbid · Sep 23, 2025 · Sep 23, 2025
diff --git a/common/common.cpp b/common/common.cpp
@@ -4,6 +4,7 @@
 
 #include "ggml.h"
 #include "gguf.h"
+#include "ggml-backend.h"
 
 #include "common.h"
 #include "log.h"
@@ -899,17 +900,48 @@ struct common_init_result common_init_from_params(common_params & params) {
     common_init_result iparams;
     auto mparams = common_model_params_to_llama(params);
 
+    LOG_INF("%s: begin model initialization (path: '%s')\n", __func__, params.model.path.c_str());
+    LOG_INF("%s: requested context %d, batch %d, ubatch %d, parallel %d\n",
+            __func__, params.n_ctx, params.n_batch, params.n_ubatch, params.n_parallel);
+    LOG_INF("%s: GPU layers: %d, main GPU: %d, split mode: %d\n",
+            __func__, params.n_gpu_layers, params.main_gpu, (int) params.split_mode);
+
+    if (!params.devices.empty()) {
+        for (size_t i = 0; i < params.devices.size(); ++i) {
+            ggml_backend_dev_t dev = params.devices[i];
+            LOG_INF("%s: offload device[%zu]: %s (%s)\n",
+                    __func__, i,
+                    ggml_backend_dev_name(dev),
+                    ggml_backend_dev_description(dev));
+        }
+    } else {
+        LOG_INF("%s: no explicit offload devices configured (default device selection applies)\n", __func__);
+    }
+
+    if (!params.lora_adapters.empty()) {
+        LOG_INF("%s: %zu LoRA adapter(s) requested\n", __func__, params.lora_adapters.size());
+    }
+
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
         LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
             __func__, params.model.path.c_str());
         return iparams;
     }
 
+    {
+        char model_desc[512] = {0};
+        llama_model_desc(model, model_desc, sizeof(model_desc));
+        LOG_INF("%s: model loaded successfully: %s\n", __func__, model_desc);
+    }
+
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     auto cparams = common_context_params_to_llama(params);
 
+    LOG_INF("%s: creating context with %d threads (%d batch threads)\n",
+            __func__, cparams.n_threads, cparams.n_threads_batch);
+
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
@@ -918,12 +950,25 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
+    LOG_INF("%s: context created (ctx size: %d, batch size: %d, sequence max: %d)\n",
+            __func__, llama_n_ctx(lctx), cparams.n_batch, cparams.n_seq_max);
+
+    if (llama_model_has_encoder(model)) {
+        LOG_INF("%s: encoder component detected in model\n", __func__);
+    }
+    if (llama_model_has_decoder(model)) {
+        LOG_INF("%s: decoder component detected in model\n", __func__);
+    }
+
     if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
         LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
     }
 
     if (!params.control_vectors.empty()) {
+        LOG_INF("%s: loading %zu control vector(s) (layers %d -> %d)\n",
+                __func__, params.control_vectors.size(),
+                params.control_vector_layer_start, params.control_vector_layer_end);
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
         if (params.control_vector_layer_end   <= 0) params.control_vector_layer_end   = llama_model_n_layer(model);
 
@@ -981,6 +1026,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
+        LOG_INF("%s: initializing LoRA adapter '%s' (scale %.3f)\n",
+                __func__, la.path.c_str(), la.scale);
         llama_adapter_lora_ptr lora;
         lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
         if (lora == nullptr) {
@@ -1000,6 +1047,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     }
 
     if (!params.lora_init_without_apply) {
+        LOG_INF("%s: applying active LoRA adapters (%zu)\n",
+                __func__, params.lora_adapters.size());
         common_set_adapter_lora(lctx, params.lora_adapters);
     }
 
@@ -1074,6 +1123,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     iparams.model.reset(model);
     iparams.context.reset(lctx);
 
+    LOG_INF("%s: model initialization complete\n", __func__);
+
     return iparams;
 }
 

diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
@@ -925,6 +925,9 @@ bool llama_model_loader::load_all_data(
         void * progress_callback_user_data) {
     GGML_ASSERT(size_data != 0 && "call init_mappings() first");
 
+    LLAMA_LOG_INFO("%s: begin weight streaming (%d tensors, %zu total bytes, use_mmap=%s, check_tensors=%s, buffers=%zu)\n",
+        __func__, n_tensors, size_data, use_mmap ? "true" : "false", check_tensors ? "true" : "false", bufs.size());
+
     std::vector<no_init<uint8_t>> read_buf;
     std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
 
@@ -1024,13 +1027,31 @@ bool llama_model_loader::load_all_data(
             continue;
         }
 
+        const char * tensor_name  = ggml_get_name(cur);
+        const char * tensor_type  = ggml_type_name(cur->type);
+        const std::string tensor_shape = llama_format_tensor_shape(cur);
+        const size_t tensor_size = ggml_nbytes(cur);
+        ggml_backend_buffer_type_t cur_buft = cur->buffer ? ggml_backend_buffer_get_type(cur->buffer) : nullptr;
+        const char * buf_type_name = cur_buft ? ggml_backend_buft_name(cur_buft) : "(none)";
+        const bool buffer_is_host = !cur->buffer || ggml_backend_buffer_is_host(cur->buffer);
+        ggml_backend_dev_t tensor_dev = nullptr;
+        if (cur_buft) {
+            tensor_dev = ggml_backend_buft_get_device(cur_buft);
+        }
+        const char * tensor_dev_name = tensor_dev ? ggml_backend_dev_name(tensor_dev) : (buffer_is_host ? "host" : "unknown");
+
+        LLAMA_LOG_INFO(
+            "%s: tensor '%s' (%s %s, %zu bytes) file[%u]@%zu -> buffer type %s (%s) on %s\n",
+            __func__, tensor_name, tensor_type, tensor_shape.c_str(), tensor_size, weight->idx, weight->offs,
+            buf_type_name, buffer_is_host ? "host" : "device", tensor_dev_name);
+
         if (progress_callback) {
             if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
                 return false;
             }
         }
 
-        size_t n_size = ggml_nbytes(cur);
+        size_t n_size = tensor_size;
 
         if (use_mmap) {
             const auto & mapping = mappings.at(weight->idx);
@@ -1076,19 +1097,32 @@ bool llama_model_loader::load_all_data(
                     file->seek(weight->offs, SEEK_SET);
 
                     size_t bytes_read = 0;
+                    const size_t chunk_count = (n_size + buffer_size - 1) / buffer_size;
+                    LLAMA_LOG_INFO(
+                        "%s: tensor '%s' async upload via %s (%zu chunks, chunk size %zu)\n",
+                        __func__, tensor_name, ggml_backend_name(upload_backend), chunk_count, buffer_size);
 
                     while (bytes_read < n_size) {
                         size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
 
-                        ggml_backend_event_synchronize(events[buffer_idx]);
-                        file->read_raw(host_ptrs[buffer_idx], read_iteration);
-                        ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
-                        ggml_backend_event_record(events[buffer_idx], upload_backend);
+                        const size_t staging_idx = buffer_idx;
+                        LLAMA_LOG_DEBUG("%s: tensor '%s' waiting for staging buffer %zu\n",
+                            __func__, tensor_name, staging_idx);
+                        ggml_backend_event_synchronize(events[staging_idx]);
+                        LLAMA_LOG_DEBUG("%s: tensor '%s' staging buffer %zu ready, reading %zu bytes (offset %zu/%zu)\n",
+                            __func__, tensor_name, staging_idx, read_iteration, bytes_read, n_size);
+                        file->read_raw(host_ptrs[staging_idx], read_iteration);
+                        ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[staging_idx], bytes_read, read_iteration);
+                        ggml_backend_event_record(events[staging_idx], upload_backend);
+                        LLAMA_LOG_DEBUG("%s: tensor '%s' submitted chunk %zu/%zu (%zu bytes) via staging buffer %zu\n",
+                            __func__, tensor_name, (bytes_read / buffer_size) + 1, chunk_count, read_iteration, staging_idx);
 
                         bytes_read += read_iteration;
                         ++buffer_idx;
                         buffer_idx %= n_buffers;
                     }
+                    LLAMA_LOG_INFO("%s: tensor '%s' async upload complete (%zu bytes)\n",
+                        __func__, tensor_name, n_size);
                 } else {
                     read_buf.resize(n_size);
                     file->seek(weight->offs, SEEK_SET);
@@ -1097,11 +1131,17 @@ bool llama_model_loader::load_all_data(
                     if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
                         throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
                     }
+                    LLAMA_LOG_INFO("%s: tensor '%s' synchronous upload to device buffer complete (%zu bytes)\n",
+                        __func__, tensor_name, n_size);
                 }
             }
         }
 
         size_done += n_size;
+        LLAMA_LOG_DEBUG("%s: cumulative load progress %.2f%% (%zu/%zu bytes)\n",
+            __func__, 100.0f * size_done / size_data, size_done, size_data);
+        LLAMA_LOG_INFO("%s: tensor '%s' load complete (cumulative %.2f%%)\n",
+            __func__, tensor_name, 100.0f * size_done / size_data);
     }
 
     // free temporary resources used for async uploads

diff --git a/tools/main/main.cpp b/tools/main/main.cpp
@@ -86,12 +86,22 @@ static void sigint_handler(int signo) {
 int main(int argc, char ** argv) {
     common_params params;
     g_params = &params;
+    LOG_INF("llama-cli: parsing command line arguments\n");
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
         return 1;
     }
 
     common_init();
 
+    LOG_INF("llama-cli: initialization complete, preparing backend\n");
+
+    if (!params.model.path.empty()) {
+        LOG_INF("llama-cli: model path '%s' (alias: '%s')\n",
+                params.model.path.c_str(), params.model_alias.c_str());
+    }
+    LOG_INF("llama-cli: target context %d, batch %d, ubatch %d, gpu layers %d\n",
+            params.n_ctx, params.n_batch, params.n_ubatch, params.n_gpu_layers);
+
     auto & sparams = params.sampling;
 
     // save choice to use color for later
@@ -123,7 +133,10 @@ int main(int argc, char ** argv) {
     LOG_INF("%s: llama backend init\n", __func__);
 
     llama_backend_init();
+    LOG_INF("%s: llama backend initialized\n", __func__);
+
     llama_numa_init(params.numa);
+    LOG_INF("%s: NUMA strategy applied: %d\n", __func__, (int) params.numa);
 
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
@@ -142,16 +155,23 @@ int main(int argc, char ** argv) {
     model = llama_init.model.get();
     ctx = llama_init.context.get();
 
+    LOG_INF("%s: common_init_from_params returned (model ptr: %p, ctx ptr: %p)\n",
+            __func__, static_cast<void *>(model), static_cast<void *>(ctx));
+
     if (model == NULL) {
         LOG_ERR("%s: error: unable to load model\n", __func__);
         return 1;
     }
 
     auto * mem = llama_get_memory(ctx);
+    LOG_INF("%s: llama memory subsystem ready (%p)\n", __func__, static_cast<void *>(mem));
 
     const llama_vocab * vocab = llama_model_get_vocab(model);
     auto chat_templates = common_chat_templates_init(model, params.chat_template);
 
+    LOG_INF("%s: chat templates initialized (%s)\n",
+            __func__, chat_templates ? "available" : "none");
+
     LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
 
     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -172,6 +192,8 @@ int main(int argc, char ** argv) {
 
     struct ggml_threadpool * threadpool_batch = NULL;
     if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
+        LOG_INF("%s: creating batch threadpool with %d threads (paused=%d)\n",
+                __func__, tpp_batch.n_threads, tpp_batch.paused);
         threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
         if (!threadpool_batch) {
             LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
@@ -182,13 +204,16 @@ int main(int argc, char ** argv) {
         tpp.paused = true;
     }
 
+    LOG_INF("%s: creating main threadpool with %d threads (paused=%d)\n",
+            __func__, tpp.n_threads, tpp.paused);
     struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
     if (!threadpool) {
         LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
         return 1;
     }
 
     llama_attach_threadpool(ctx, threadpool, threadpool_batch);
+    LOG_INF("%s: threadpools attached to llama context\n", __func__);
 
     const int n_ctx_train = llama_model_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);