
Commit 8f6bddc

feat: Add more logs for llamaCPP and llama_server
Parent: fdd770d

3 files changed: +76 −45 lines


context/llama_server_context.h

Lines changed: 13 additions & 3 deletions
@@ -532,7 +532,7 @@ struct llama_server_context {
   std::tie(model, ctx) = llama_init_from_gpt_params(params);
   if (model == nullptr) {
-    LOG_ERROR_LLAMA("unable to load model", {{"model", params.model}});
+    LOG_ERROR_LLAMA("llama.cpp unable to load model", {{"model", params.model}});
     return false;
   }
@@ -551,6 +551,10 @@ struct llama_server_context {
     }
   }

+  if (ctx == nullptr) {
+    LOG_ERROR_LLAMA("Unable to get llama.cpp context", {});
+    return false;
+  }
   n_ctx = llama_n_ctx(ctx);

   add_bos_token = llama_should_add_bos_token(model);
@@ -578,7 +582,11 @@ struct llama_server_context {
     slots.push_back(slot);
   }

-  batch = llama_batch_init(n_ctx, 0, params.n_parallel);
+  try {
+    batch = llama_batch_init(n_ctx, 0, params.n_parallel);
+  } catch (const std::exception& e) {
+    LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata",
+                    {{"exception", e.what()}, {"n_tokens_alloc", n_ctx}, {"embd", 0}, {"n_seq_max", params.n_parallel}});
+  }

   // empty system prompt
   system_prompt = "";
@@ -1296,7 +1304,9 @@ struct llama_server_context {
     }

     if (queue_results[i].id == task_id) {
-      assert(queue_results[i].multitask_id == -1);
+      if (queue_results[i].multitask_id != -1) {
+        LOG_ERROR_LLAMA("Incorrect multitask ID", {{"task_id", task_id}});
+      }
       task_result res = queue_results[i];
       queue_results.erase(queue_results.begin() + i);
       return res;
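The LOG_ERROR_LLAMA calls added above take a message plus a braced list of key/value fields. Below is a minimal stand-in for that call shape, assuming the second argument is an nlohmann::json object (as in llama.cpp's server logging); the real macro in this repository may format and route the output differently.

// Minimal sketch of a LOG_ERROR_LLAMA(msg, {{key, value}, ...}) stand-in.
// Assumption: the extra-data argument builds an nlohmann::json object; this is
// not the project's actual macro, only an illustration of the call shape.
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

#define LOG_ERROR_LLAMA(MSG, ...)                                      \
  do {                                                                 \
    nlohmann::json extra_ = __VA_ARGS__;                               \
    std::cerr << "[ERROR] " << (MSG) << ' ' << extra_.dump() << '\n';  \
  } while (0)

int main() {
  std::string model = "/models/example.gguf";  // hypothetical path
  LOG_ERROR_LLAMA("llama.cpp unable to load model", {{"model", model}});
  // prints: [ERROR] llama.cpp unable to load model {"model":"/models/example.gguf"}
  LOG_ERROR_LLAMA("Unable to get llama.cpp context", {});
  // prints: [ERROR] Unable to get llama.cpp context null
  return 0;
}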

controllers/llamaCPP.cc

Lines changed: 58 additions & 42 deletions
@@ -50,6 +50,7 @@ std::shared_ptr<inferenceState> create_inference_state(llamaCPP* instance) {
 bool llamaCPP::CheckModelLoaded(
     std::function<void(const HttpResponsePtr&)>& callback) {
   if (!llama.model_loaded_external) {
+    LOG_ERROR << "Model has not been loaded";
     Json::Value jsonResp;
     jsonResp["message"] =
         "Model has not been loaded, please load model into nitro";
@@ -159,6 +160,7 @@ llamaCPP::~llamaCPP() {
 void llamaCPP::WarmupModel() {
   json pseudo;

+  LOG_INFO << "Warm-up model";
   pseudo["prompt"] = "Hello";
   pseudo["n_predict"] = 2;
   pseudo["stream"] = false;
@@ -187,6 +189,8 @@ void llamaCPP::InferenceImpl(
     inferences::ChatCompletionRequest&& completion,
     std::function<void(const HttpResponsePtr&)>& callback) {
   std::string formatted_output = pre_prompt;
+  int request_id = ++no_of_inference_requests;
+  LOG_INFO_REQUEST(request_id) << "Generating response for inference request";

   json data;
   json stopWords;
@@ -196,9 +200,9 @@ void llamaCPP::InferenceImpl(
   // Increase number of chats received and clean the prompt
   no_of_chats++;
   if (no_of_chats % clean_cache_threshold == 0) {
-    LOG_INFO << "Clean cache threshold reached!";
+    LOG_INFO_REQUEST(request_id) << "Clean cache threshold reached!";
     llama.kv_cache_clear();
-    LOG_INFO << "Cache cleaned";
+    LOG_INFO_REQUEST(request_id) << "Cache cleaned";
   }

   // Default values to enable auto caching
@@ -207,9 +211,7 @@ void llamaCPP::InferenceImpl(

   // Passing load value
   data["repeat_last_n"] = this->repeat_last_n;
-
-  LOG_INFO << "Messages:" << completion.messages.toStyledString();
-  LOG_INFO << "Stop:" << completion.stop.toStyledString();
+  LOG_INFO_REQUEST(request_id) << "Stop words:" << completion.stop.toStyledString();

   data["stream"] = completion.stream;
   data["n_predict"] = completion.max_tokens;
@@ -268,18 +270,18 @@ void llamaCPP::InferenceImpl(
           auto image_url = content_piece["image_url"]["url"].asString();
           std::string base64_image_data;
           if (image_url.find("http") != std::string::npos) {
-            LOG_INFO << "Remote image detected but not supported yet";
+            LOG_INFO_REQUEST(request_id) << "Remote image detected but not supported yet";
           } else if (image_url.find("data:image") != std::string::npos) {
-            LOG_INFO << "Base64 image detected";
+            LOG_INFO_REQUEST(request_id) << "Base64 image detected";
             base64_image_data = nitro_utils::extractBase64(image_url);
-            LOG_INFO << base64_image_data;
+            LOG_INFO_REQUEST(request_id) << base64_image_data;
           } else {
-            LOG_INFO << "Local image detected";
+            LOG_INFO_REQUEST(request_id) << "Local image detected";
             nitro_utils::processLocalImage(
                 image_url, [&](const std::string& base64Image) {
                   base64_image_data = base64Image;
                 });
-            LOG_INFO << base64_image_data;
+            LOG_INFO_REQUEST(request_id) << base64_image_data;
           }
           content_piece_image_data["data"] = base64_image_data;
@@ -306,7 +308,7 @@ void llamaCPP::InferenceImpl(
       }
     }
     formatted_output += ai_prompt;
-    LOG_INFO << formatted_output;
+    LOG_INFO_REQUEST(request_id) << formatted_output;
   }

   data["prompt"] = formatted_output;
@@ -322,35 +324,36 @@ void llamaCPP::InferenceImpl(
   bool is_streamed = data["stream"];
   // Enable full message debugging
 #ifdef DEBUG
-  LOG_INFO << "Current completion text";
-  LOG_INFO << formatted_output;
+  LOG_INFO_REQUEST(request_id) << "Current completion text";
+  LOG_INFO_REQUEST(request_id) << formatted_output;
 #endif

   if (is_streamed) {
+    LOG_INFO_REQUEST(request_id) << "Streamed, waiting for response";
     auto state = create_inference_state(this);
     auto chunked_content_provider =
-        [state, data](char* pBuffer, std::size_t nBuffSize) -> std::size_t {
+        [state, data, request_id](char* pBuffer, std::size_t nBuffSize) -> std::size_t {
       if (state->inference_status == PENDING) {
         state->inference_status = RUNNING;
       } else if (state->inference_status == FINISHED) {
         return 0;
       }

       if (!pBuffer) {
-        LOG_INFO << "Connection closed or buffer is null. Reset context";
+        LOG_WARN_REQUEST(request_id) << "Connection closed or buffer is null. Reset context";
         state->inference_status = FINISHED;
         return 0;
       }

       if (state->inference_status == EOS) {
-        LOG_INFO << "End of result";
+        LOG_INFO_REQUEST(request_id) << "End of result";
         const std::string str =
             "data: " +
             create_return_json(nitro_utils::generate_random_string(20), "_", "",
                                "stop") +
             "\n\n" + "data: [DONE]" + "\n\n";

-        LOG_VERBOSE("data stream", {{"to_send", str}});
+        LOG_VERBOSE("data stream", {{"request_id", request_id}, {"to_send", str}});
         std::size_t nRead = std::min(str.size(), nBuffSize);
         memcpy(pBuffer, str.data(), nRead);
         state->inference_status = FINISHED;
@@ -370,7 +373,7 @@ void llamaCPP::InferenceImpl(
         memcpy(pBuffer, str.data(), nRead);

         if (result.stop) {
-          LOG_INFO << "reached result stop";
+          LOG_INFO_REQUEST(request_id) << "Reached result stop";
           state->inference_status = EOS;
           return nRead;
         }
@@ -383,14 +386,14 @@ void llamaCPP::InferenceImpl(

         return nRead;
       } else {
-        LOG_INFO << "Error during inference";
+        LOG_ERROR_REQUEST(request_id) << "Error during inference";
       }
       state->inference_status = FINISHED;
       return 0;
     };
     // Queued task
     state->instance->queue->runTaskInQueue(
-        [callback, state, data, chunked_content_provider]() {
+        [callback, state, data, chunked_content_provider, request_id]() {
          state->task_id =
              state->instance->llama.request_completion(data, false, false, -1);
@@ -410,22 +413,22 @@ void llamaCPP::InferenceImpl(
            retries += 1;
          }
          if (state->inference_status != RUNNING)
-            LOG_INFO << "Wait for task to be released:" << state->task_id;
+            LOG_INFO_REQUEST(request_id) << "Wait for task to be released:" << state->task_id;
          std::this_thread::sleep_for(std::chrono::milliseconds(100));
        }
-        LOG_INFO << "Task completed, release it";
+        LOG_INFO_REQUEST(request_id) << "Task completed, release it";
        // Request completed, release it
        state->instance->llama.request_cancel(state->task_id);
+        LOG_INFO_REQUEST(request_id) << "Inference completed";
      });
  } else {
    Json::Value respData;
    auto resp = nitro_utils::nitroHttpResponse();
    int task_id = llama.request_completion(data, false, false, -1);
-    LOG_INFO << "sent the non stream, waiting for respone";
+    LOG_INFO_REQUEST(request_id) << "Non stream, waiting for response";
    if (!json_value(data, "stream", false)) {
      std::string completion_text;
      task_result result = llama.next_result(task_id);
-      LOG_INFO << "Here is the result:" << result.error;
      if (!result.error && result.stop) {
        int prompt_tokens = result.result_json["tokens_evaluated"];
        int predicted_tokens = result.result_json["tokens_predicted"];
@@ -435,9 +438,12 @@ void llamaCPP::InferenceImpl(
            prompt_tokens, predicted_tokens);
        resp->setBody(full_return);
      } else {
-        resp->setBody("Internal error during inference");
+        respData["message"] = "Internal error during inference";
+        resp = nitro_utils::nitroHttpJsonResponse(respData);
+        LOG_ERROR_REQUEST(request_id) << "Error during inference";
      }
      callback(resp);
+      LOG_INFO_REQUEST(request_id) << "Inference completed";
    }
  }
}
@@ -458,10 +464,12 @@
 void llamaCPP::EmbeddingImpl(
     std::shared_ptr<Json::Value> jsonBody,
     std::function<void(const HttpResponsePtr&)>& callback) {
+  int request_id = ++no_of_inference_requests;
+  LOG_INFO_REQUEST(request_id) << "Generating response for embedding request";
   // Queue embedding task
   auto state = create_inference_state(this);

-  state->instance->queue->runTaskInQueue([this, state, jsonBody, callback]() {
+  state->instance->queue->runTaskInQueue([this, state, jsonBody, callback, request_id]() {
     Json::Value responseData(Json::arrayValue);

     if (jsonBody->isMember("input")) {
@@ -502,50 +510,47 @@ void llamaCPP::EmbeddingImpl(
     resp->setBody(Json::writeString(Json::StreamWriterBuilder(), root));
     resp->setContentTypeString("application/json");
     callback(resp);
+    LOG_INFO_REQUEST(request_id) << "Embedding completed";
   });
 }

 void llamaCPP::UnloadModel(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
   Json::Value jsonResp;
-  jsonResp["message"] = "No model loaded";
-  if (llama.model_loaded_external) {
+  if (CheckModelLoaded(callback)) {
     StopBackgroundTask();

     llama_free(llama.ctx);
     llama_free_model(llama.model);
     llama.ctx = nullptr;
     llama.model = nullptr;
     jsonResp["message"] = "Model unloaded successfully";
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    callback(resp);
+    LOG_INFO << "Model unloaded successfully";
   }
-  auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-  callback(resp);
-  return;
 }

 void llamaCPP::ModelStatus(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
   Json::Value jsonResp;
   bool is_model_loaded = llama.model_loaded_external;
-  if (is_model_loaded) {
+  if (CheckModelLoaded(callback)) {
     jsonResp["model_loaded"] = is_model_loaded;
     jsonResp["model_data"] = llama.get_model_props().dump();
-  } else {
-    jsonResp["model_loaded"] = is_model_loaded;
-  }
-
-  auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-  callback(resp);
-  return;
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    callback(resp);
+    LOG_INFO << "Model status responded";
+  }
 }

 void llamaCPP::LoadModel(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
   if (llama.model_loaded_external) {
-    LOG_INFO << "model loaded";
+    LOG_INFO << "Model already loaded";
     Json::Value jsonResp;
     jsonResp["message"] = "Model already loaded";
     auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
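UnloadModel and ModelStatus now funnel through CheckModelLoaded instead of branching on llama.model_loaded_external themselves. The sketch below shows that guard pattern in isolation, using hypothetical simplified types (Response/Callback) in place of Drogon's request/response machinery; it assumes CheckModelLoaded both sends the "Model has not been loaded" reply and returns false, so callers only write the success path.

// Sketch of a "check and reply" guard; hypothetical stand-in types, not the nitro code.
#include <functional>
#include <iostream>
#include <string>

struct Response { std::string body; };
using Callback = std::function<void(const Response&)>;

bool model_loaded = false;  // stands in for llama.model_loaded_external

// Replies with the error JSON itself and returns false when no model is loaded.
bool CheckModelLoaded(const Callback& callback) {
  if (!model_loaded) {
    std::cerr << "Model has not been loaded\n";
    callback(Response{R"({"message":"Model has not been loaded, please load model into nitro"})"});
    return false;
  }
  return true;
}

void ModelStatus(const Callback& callback) {
  if (CheckModelLoaded(callback)) {
    callback(Response{R"({"model_loaded":true})"});  // success path only
  }
}

int main() {
  ModelStatus([](const Response& r) { std::cout << r.body << '\n'; });  // error reply
  model_loaded = true;
  ModelStatus([](const Response& r) { std::cout << r.body << '\n'; });  // status reply
  return 0;
}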
@@ -568,6 +573,7 @@ void llamaCPP::LoadModel(
     jsonResp["message"] = "Model loaded successfully";
     auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
     callback(resp);
+    LOG_INFO << "Model loaded successfully";
   }
 }

@@ -602,7 +608,17 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
     }
   };

-  params.model = jsonBody->operator[]("llama_model_path").asString();
+  Json::Value model_path = jsonBody->operator[]("llama_model_path");
+  if (model_path.isNull()) {
+    LOG_ERROR << "Missing model path in request";
+  } else {
+    if (std::filesystem::exists(std::filesystem::path(model_path.asString()))) {
+      params.model = model_path.asString();
+    } else {
+      LOG_ERROR << "Could not find model in path " << model_path.asString();
+    }
+  }
+
   params.n_gpu_layers = jsonBody->get("ngl", 100).asInt();
   params.n_ctx = jsonBody->get("ctx_len", 2048).asInt();
   params.embedding = jsonBody->get("embedding", true).asBool();
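LoadModelImpl now validates llama_model_path with std::filesystem before assigning it to params.model. Below is a standalone sketch of the same check (C++17, requires <filesystem>); ValidateModelPath is a hypothetical helper, not part of nitro. Note that in the hunk above a failed check only logs and leaves params.model unset, whereas the sketch surfaces the failure through its return value.

// Standalone sketch of the path check used above; names are illustrative only.
#include <filesystem>
#include <iostream>
#include <optional>
#include <string>

std::optional<std::string> ValidateModelPath(const std::string& path) {
  if (path.empty()) {
    std::cerr << "Missing model path in request\n";
    return std::nullopt;
  }
  if (!std::filesystem::exists(std::filesystem::path(path))) {
    std::cerr << "Could not find model in path " << path << '\n';
    return std::nullopt;
  }
  return path;  // safe to hand to params.model
}

int main() {
  if (auto p = ValidateModelPath("/models/example.gguf")) {  // hypothetical path
    std::cout << "Loading " << *p << '\n';
  }
  return 0;
}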
@@ -681,7 +697,7 @@ void llamaCPP::StopBackgroundTask() {
   if (llama.model_loaded_external) {
     llama.model_loaded_external = false;
     llama.condition_tasks.notify_one();
-    LOG_INFO << "changed to false";
+    LOG_INFO << "Background task stopped!";
     if (backgroundThread.joinable()) {
       backgroundThread.join();
     }

controllers/llamaCPP.h

Lines changed: 5 additions & 0 deletions
@@ -27,6 +27,10 @@
 #define SERVER_VERBOSE 1
 #endif

+#define LOG_INFO_REQUEST(RID) LOG_INFO << "Request " << RID << ": "
+#define LOG_WARN_REQUEST(RID) LOG_WARN << "Request " << RID << ": "
+#define LOG_ERROR_REQUEST(RID) LOG_ERROR << "Request " << RID << ": "
+
 using json = nlohmann::json;

 using namespace drogon;
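These request-scoped macros simply prefix the log stream with the request ID. A quick way to see the resulting format in isolation is the sketch below, which uses a plain std::ostream stand-in for LOG_INFO/LOG_WARN/LOG_ERROR; the real macros come from Drogon/trantor and also add a timestamp, severity, and the trailing newline.

// Standalone sketch: LOG_INFO here is a std::cout stand-in, not Drogon's logger.
#include <iostream>

#define LOG_INFO  std::cout << "[INFO] "   // assumption: simplified stand-ins
#define LOG_WARN  std::cout << "[WARN] "
#define LOG_ERROR std::cerr << "[ERROR] "

#define LOG_INFO_REQUEST(RID) LOG_INFO << "Request " << RID << ": "
#define LOG_WARN_REQUEST(RID) LOG_WARN << "Request " << RID << ": "
#define LOG_ERROR_REQUEST(RID) LOG_ERROR << "Request " << RID << ": "

int main() {
  int request_id = 7;
  LOG_INFO_REQUEST(request_id) << "Generating response for inference request" << '\n';
  // [INFO] Request 7: Generating response for inference request
  LOG_ERROR_REQUEST(request_id) << "Error during inference" << '\n';
  // [ERROR] Request 7: Error during inference
  return 0;
}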
@@ -85,6 +89,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP>,
   std::string pre_prompt;
   int repeat_last_n;
   bool caching_enabled;
+  std::atomic<int> no_of_inference_requests = 0;
   std::atomic<int> no_of_chats = 0;
   int clean_cache_threshold;
   std::string grammar_file_content;
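Because no_of_inference_requests is a std::atomic<int>, the `int request_id = ++no_of_inference_requests;` lines in llamaCPP.cc hand every concurrent request a distinct ID without extra locking. A small sketch of that property, with illustrative names only:

// Sketch: a pre-increment on std::atomic<int> is one atomic read-modify-write,
// so IDs taken from it are unique even when taken from multiple threads.
#include <atomic>
#include <cassert>
#include <set>
#include <thread>
#include <vector>

int main() {
  std::atomic<int> no_of_inference_requests{0};
  std::vector<int> ids(16);
  std::vector<std::thread> workers;
  for (int t = 0; t < 16; ++t) {
    workers.emplace_back([&ids, &no_of_inference_requests, t] {
      ids[t] = ++no_of_inference_requests;  // mirrors InferenceImpl / EmbeddingImpl
    });
  }
  for (auto& w : workers) w.join();
  assert(std::set<int>(ids.begin(), ids.end()).size() == ids.size());  // all distinct
  return 0;
}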
