Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include "ggml.h"
#include "gguf.h"
#include "ggml-backend.h"

#include "common.h"
#include "log.h"
Expand Down Expand Up @@ -899,17 +900,48 @@ struct common_init_result common_init_from_params(common_params & params) {
common_init_result iparams;
auto mparams = common_model_params_to_llama(params);

LOG_INF("%s: begin model initialization (path: '%s')\n", __func__, params.model.path.c_str());
LOG_INF("%s: requested context %d, batch %d, ubatch %d, parallel %d\n",
__func__, params.n_ctx, params.n_batch, params.n_ubatch, params.n_parallel);
LOG_INF("%s: GPU layers: %d, main GPU: %d, split mode: %d\n",
__func__, params.n_gpu_layers, params.main_gpu, (int) params.split_mode);

if (!params.devices.empty()) {
for (size_t i = 0; i < params.devices.size(); ++i) {
ggml_backend_dev_t dev = params.devices[i];
LOG_INF("%s: offload device[%zu]: %s (%s)\n",
__func__, i,
ggml_backend_dev_name(dev),
ggml_backend_dev_description(dev));
}
} else {
LOG_INF("%s: no explicit offload devices configured (default device selection applies)\n", __func__);
}

if (!params.lora_adapters.empty()) {
LOG_INF("%s: %zu LoRA adapter(s) requested\n", __func__, params.lora_adapters.size());
}

llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
__func__, params.model.path.c_str());
return iparams;
}

{
char model_desc[512] = {0};
llama_model_desc(model, model_desc, sizeof(model_desc));
LOG_INF("%s: model loaded successfully: %s\n", __func__, model_desc);
}

const llama_vocab * vocab = llama_model_get_vocab(model);

auto cparams = common_context_params_to_llama(params);

LOG_INF("%s: creating context with %d threads (%d batch threads)\n",
__func__, cparams.n_threads, cparams.n_threads_batch);

llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
Expand All @@ -918,12 +950,25 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}

LOG_INF("%s: context created (ctx size: %d, batch size: %d, sequence max: %d)\n",
__func__, llama_n_ctx(lctx), cparams.n_batch, cparams.n_seq_max);

if (llama_model_has_encoder(model)) {
LOG_INF("%s: encoder component detected in model\n", __func__);
}
if (llama_model_has_decoder(model)) {
LOG_INF("%s: decoder component detected in model\n", __func__);
}

if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
params.ctx_shift = false;
}

if (!params.control_vectors.empty()) {
LOG_INF("%s: loading %zu control vector(s) (layers %d -> %d)\n",
__func__, params.control_vectors.size(),
params.control_vector_layer_start, params.control_vector_layer_end);
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);

Expand Down Expand Up @@ -981,6 +1026,8 @@ struct common_init_result common_init_from_params(common_params & params) {

// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
LOG_INF("%s: initializing LoRA adapter '%s' (scale %.3f)\n",
__func__, la.path.c_str(), la.scale);
llama_adapter_lora_ptr lora;
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
if (lora == nullptr) {
Expand All @@ -1000,6 +1047,8 @@ struct common_init_result common_init_from_params(common_params & params) {
}

if (!params.lora_init_without_apply) {
LOG_INF("%s: applying active LoRA adapters (%zu)\n",
__func__, params.lora_adapters.size());
common_set_adapter_lora(lctx, params.lora_adapters);
}

Expand Down Expand Up @@ -1074,6 +1123,8 @@ struct common_init_result common_init_from_params(common_params & params) {
iparams.model.reset(model);
iparams.context.reset(lctx);

LOG_INF("%s: model initialization complete\n", __func__);

return iparams;
}

Expand Down
50 changes: 45 additions & 5 deletions src/llama-model-loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -925,6 +925,9 @@ bool llama_model_loader::load_all_data(
void * progress_callback_user_data) {
GGML_ASSERT(size_data != 0 && "call init_mappings() first");

LLAMA_LOG_INFO("%s: begin weight streaming (%d tensors, %zu total bytes, use_mmap=%s, check_tensors=%s, buffers=%zu)\n",
__func__, n_tensors, size_data, use_mmap ? "true" : "false", check_tensors ? "true" : "false", bufs.size());

std::vector<no_init<uint8_t>> read_buf;
std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;

Expand Down Expand Up @@ -1024,13 +1027,31 @@ bool llama_model_loader::load_all_data(
continue;
}

const char * tensor_name = ggml_get_name(cur);
const char * tensor_type = ggml_type_name(cur->type);
const std::string tensor_shape = llama_format_tensor_shape(cur);
const size_t tensor_size = ggml_nbytes(cur);
ggml_backend_buffer_type_t cur_buft = cur->buffer ? ggml_backend_buffer_get_type(cur->buffer) : nullptr;
const char * buf_type_name = cur_buft ? ggml_backend_buft_name(cur_buft) : "(none)";
const bool buffer_is_host = !cur->buffer || ggml_backend_buffer_is_host(cur->buffer);
ggml_backend_dev_t tensor_dev = nullptr;
if (cur_buft) {
tensor_dev = ggml_backend_buft_get_device(cur_buft);
}
const char * tensor_dev_name = tensor_dev ? ggml_backend_dev_name(tensor_dev) : (buffer_is_host ? "host" : "unknown");

LLAMA_LOG_INFO(
"%s: tensor '%s' (%s %s, %zu bytes) file[%u]@%zu -> buffer type %s (%s) on %s\n",
__func__, tensor_name, tensor_type, tensor_shape.c_str(), tensor_size, weight->idx, weight->offs,
buf_type_name, buffer_is_host ? "host" : "device", tensor_dev_name);

if (progress_callback) {
if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
return false;
}
}

size_t n_size = ggml_nbytes(cur);
size_t n_size = tensor_size;

if (use_mmap) {
const auto & mapping = mappings.at(weight->idx);
Expand Down Expand Up @@ -1076,19 +1097,32 @@ bool llama_model_loader::load_all_data(
file->seek(weight->offs, SEEK_SET);

size_t bytes_read = 0;
const size_t chunk_count = (n_size + buffer_size - 1) / buffer_size;
LLAMA_LOG_INFO(
"%s: tensor '%s' async upload via %s (%zu chunks, chunk size %zu)\n",
__func__, tensor_name, ggml_backend_name(upload_backend), chunk_count, buffer_size);

while (bytes_read < n_size) {
size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);

ggml_backend_event_synchronize(events[buffer_idx]);
file->read_raw(host_ptrs[buffer_idx], read_iteration);
ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
ggml_backend_event_record(events[buffer_idx], upload_backend);
const size_t staging_idx = buffer_idx;
LLAMA_LOG_DEBUG("%s: tensor '%s' waiting for staging buffer %zu\n",
__func__, tensor_name, staging_idx);
ggml_backend_event_synchronize(events[staging_idx]);
LLAMA_LOG_DEBUG("%s: tensor '%s' staging buffer %zu ready, reading %zu bytes (offset %zu/%zu)\n",
__func__, tensor_name, staging_idx, read_iteration, bytes_read, n_size);
file->read_raw(host_ptrs[staging_idx], read_iteration);
ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[staging_idx], bytes_read, read_iteration);
ggml_backend_event_record(events[staging_idx], upload_backend);
LLAMA_LOG_DEBUG("%s: tensor '%s' submitted chunk %zu/%zu (%zu bytes) via staging buffer %zu\n",
__func__, tensor_name, (bytes_read / buffer_size) + 1, chunk_count, read_iteration, staging_idx);

bytes_read += read_iteration;
++buffer_idx;
buffer_idx %= n_buffers;
}
LLAMA_LOG_INFO("%s: tensor '%s' async upload complete (%zu bytes)\n",
__func__, tensor_name, n_size);
} else {
read_buf.resize(n_size);
file->seek(weight->offs, SEEK_SET);
Expand All @@ -1097,11 +1131,17 @@ bool llama_model_loader::load_all_data(
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
}
LLAMA_LOG_INFO("%s: tensor '%s' synchronous upload to device buffer complete (%zu bytes)\n",
__func__, tensor_name, n_size);
}
}
}

size_done += n_size;
LLAMA_LOG_DEBUG("%s: cumulative load progress %.2f%% (%zu/%zu bytes)\n",
__func__, 100.0f * size_done / size_data, size_done, size_data);
LLAMA_LOG_INFO("%s: tensor '%s' load complete (cumulative %.2f%%)\n",
__func__, tensor_name, 100.0f * size_done / size_data);
}

// free temporary resources used for async uploads
Expand Down
25 changes: 25 additions & 0 deletions tools/main/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,22 @@ static void sigint_handler(int signo) {
int main(int argc, char ** argv) {
common_params params;
g_params = &params;
LOG_INF("llama-cli: parsing command line arguments\n");
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
return 1;
}

common_init();

LOG_INF("llama-cli: initialization complete, preparing backend\n");

if (!params.model.path.empty()) {
LOG_INF("llama-cli: model path '%s' (alias: '%s')\n",
params.model.path.c_str(), params.model_alias.c_str());
}
LOG_INF("llama-cli: target context %d, batch %d, ubatch %d, gpu layers %d\n",
params.n_ctx, params.n_batch, params.n_ubatch, params.n_gpu_layers);

auto & sparams = params.sampling;

// save choice to use color for later
Expand Down Expand Up @@ -123,7 +133,10 @@ int main(int argc, char ** argv) {
LOG_INF("%s: llama backend init\n", __func__);

llama_backend_init();
LOG_INF("%s: llama backend initialized\n", __func__);

llama_numa_init(params.numa);
LOG_INF("%s: NUMA strategy applied: %d\n", __func__, (int) params.numa);

llama_model * model = nullptr;
llama_context * ctx = nullptr;
Expand All @@ -142,16 +155,23 @@ int main(int argc, char ** argv) {
model = llama_init.model.get();
ctx = llama_init.context.get();

LOG_INF("%s: common_init_from_params returned (model ptr: %p, ctx ptr: %p)\n",
__func__, static_cast<void *>(model), static_cast<void *>(ctx));

if (model == NULL) {
LOG_ERR("%s: error: unable to load model\n", __func__);
return 1;
}

auto * mem = llama_get_memory(ctx);
LOG_INF("%s: llama memory subsystem ready (%p)\n", __func__, static_cast<void *>(mem));

const llama_vocab * vocab = llama_model_get_vocab(model);
auto chat_templates = common_chat_templates_init(model, params.chat_template);

LOG_INF("%s: chat templates initialized (%s)\n",
__func__, chat_templates ? "available" : "none");

LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);

auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
Expand All @@ -172,6 +192,8 @@ int main(int argc, char ** argv) {

struct ggml_threadpool * threadpool_batch = NULL;
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
LOG_INF("%s: creating batch threadpool with %d threads (paused=%d)\n",
__func__, tpp_batch.n_threads, tpp_batch.paused);
threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
if (!threadpool_batch) {
LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
Expand All @@ -182,13 +204,16 @@ int main(int argc, char ** argv) {
tpp.paused = true;
}

LOG_INF("%s: creating main threadpool with %d threads (paused=%d)\n",
__func__, tpp.n_threads, tpp.paused);
struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
if (!threadpool) {
LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
return 1;
}

llama_attach_threadpool(ctx, threadpool, threadpool_batch);
LOG_INF("%s: threadpools attached to llama context\n", __func__);

const int n_ctx_train = llama_model_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
Expand Down