This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit 03a5a4c

Merge pull request #485 from janhq/378-update-logging
feat: Add more logs for `llamaCPP` and `llama_server`
2 parents 33d0af9 + 543be27

5 files changed: +117 −48 lines

context/llama_server_context.h
Lines changed: 13 additions & 3 deletions

@@ -532,7 +532,7 @@ struct llama_server_context {
 
   std::tie(model, ctx) = llama_init_from_gpt_params(params);
   if (model == nullptr) {
-    LOG_ERROR_LLAMA("unable to load model", {{"model", params.model}});
+    LOG_ERROR_LLAMA("llama.cpp unable to load model", {{"model", params.model}});
     return false;
   }
 
@@ -551,6 +551,10 @@ struct llama_server_context {
      }
    }
 
+  if (ctx == nullptr) {
+    LOG_ERROR_LLAMA("Unable to get llama.cpp context", {});
+    return false;
+  }
   n_ctx = llama_n_ctx(ctx);
 
   add_bos_token = llama_should_add_bos_token(model);
@@ -578,7 +582,11 @@ struct llama_server_context {
    slots.push_back(slot);
   }
 
-  batch = llama_batch_init(n_ctx, 0, params.n_parallel);
+  try {
+    batch = llama_batch_init(n_ctx, 0, params.n_parallel);
+  } catch (const std::exception& e) {
+    LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata", {{"exception", e.what()}, {"n_tokens_alloc", n_ctx}, {"embd", 0}, {"n_seq_max", params.n_parallel}});
+  }
 
   // empty system prompt
   system_prompt = "";
@@ -1295,7 +1303,9 @@ struct llama_server_context {
    }
 
    if (queue_results[i].id == task_id) {
-      assert(queue_results[i].multitask_id == -1);
+      if (queue_results[i].multitask_id != -1) {
+        LOG_ERROR_LLAMA("Incorrect multitask ID", {{"task_id", task_id}});
+      }
      task_result res = queue_results[i];
      queue_results.erase(queue_results.begin() + i);
      return res;
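Two of the hunks above trade hard failure paths (an assert, unchecked pointers) for logged checks that return early. A minimal standalone illustration of why that matters in a release server build, using a plain stream log instead of LOG_ERROR_LLAMA (names here are illustrative only): with NDEBUG defined, the assert compiles away entirely, while the explicit check still runs and reports.

// Sketch only: contrasts assert with an explicit logged check.
// Built with -DNDEBUG (typical release), the assert is a no-op and only the
// if-branch protects the server; without NDEBUG, the assert aborts the process.
#include <cassert>
#include <iostream>

bool take_result(int multitask_id) {
  assert(multitask_id == -1);   // disappears under NDEBUG
  if (multitask_id != -1) {     // always compiled in, keeps the server alive
    std::cerr << "Incorrect multitask ID: " << multitask_id << "\n";
    return false;
  }
  return true;
}

int main() {
  std::cout << (take_result(7) ? "ok" : "rejected") << "\n";
}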

controllers/llamaCPP.cc
Lines changed: 71 additions & 45 deletions

@@ -3,14 +3,13 @@
 #include <fstream>
 #include <iostream>
 #include "log.h"
+#include "utils/nitro_utils.h"
+#include "utils/logging_utils.h"
 
 // External
 #include "common.h"
 #include "llama.h"
 
-#include "log.h"
-#include "utils/nitro_utils.h"
-
 using namespace inferences;
 using json = nlohmann::json;
 
@@ -50,6 +49,7 @@ std::shared_ptr<inferenceState> create_inference_state(llamaCPP* instance) {
 bool llamaCPP::CheckModelLoaded(
     std::function<void(const HttpResponsePtr&)>& callback) {
   if (!llama.model_loaded_external) {
+    LOG_ERROR << "Model has not been loaded";
    Json::Value jsonResp;
    jsonResp["message"] =
        "Model has not been loaded, please load model into nitro";
@@ -159,6 +159,7 @@ llamaCPP::~llamaCPP() {
 void llamaCPP::WarmupModel() {
   json pseudo;
 
+  LOG_INFO << "Warm-up model";
   pseudo["prompt"] = "Hello";
   pseudo["n_predict"] = 2;
   pseudo["stream"] = false;
@@ -187,6 +188,8 @@ void llamaCPP::InferenceImpl(
     inferences::ChatCompletionRequest&& completion,
     std::function<void(const HttpResponsePtr&)>& callback) {
   std::string formatted_output = pre_prompt;
+  int request_id = ++no_of_requests;
+  LOG_INFO_REQUEST(request_id) << "Generating response for inference request";
 
   json data;
   json stopWords;
@@ -196,9 +199,9 @@ void llamaCPP::InferenceImpl(
   // Increase number of chats received and clean the prompt
   no_of_chats++;
   if (no_of_chats % clean_cache_threshold == 0) {
-    LOG_INFO << "Clean cache threshold reached!";
+    LOG_INFO_REQUEST(request_id) << "Clean cache threshold reached!";
    llama.kv_cache_clear();
-    LOG_INFO << "Cache cleaned";
+    LOG_INFO_REQUEST(request_id) << "Cache cleaned";
   }
 
   // Default values to enable auto caching
@@ -207,9 +210,7 @@ void llamaCPP::InferenceImpl(
 
   // Passing load value
   data["repeat_last_n"] = this->repeat_last_n;
-
-  LOG_INFO << "Messages:" << completion.messages.toStyledString();
-  LOG_INFO << "Stop:" << completion.stop.toStyledString();
+  LOG_INFO_REQUEST(request_id) << "Stop words:" << completion.stop.toStyledString();
 
   data["stream"] = completion.stream;
   data["n_predict"] = completion.max_tokens;
@@ -268,18 +269,18 @@ void llamaCPP::InferenceImpl(
        auto image_url = content_piece["image_url"]["url"].asString();
        std::string base64_image_data;
        if (image_url.find("http") != std::string::npos) {
-          LOG_INFO << "Remote image detected but not supported yet";
+          LOG_INFO_REQUEST(request_id) << "Remote image detected but not supported yet";
        } else if (image_url.find("data:image") != std::string::npos) {
-          LOG_INFO << "Base64 image detected";
+          LOG_INFO_REQUEST(request_id) << "Base64 image detected";
          base64_image_data = nitro_utils::extractBase64(image_url);
-          LOG_INFO << base64_image_data;
+          LOG_INFO_REQUEST(request_id) << base64_image_data;
        } else {
-          LOG_INFO << "Local image detected";
+          LOG_INFO_REQUEST(request_id) << "Local image detected";
          nitro_utils::processLocalImage(
              image_url, [&](const std::string& base64Image) {
                base64_image_data = base64Image;
              });
-          LOG_INFO << base64_image_data;
+          LOG_INFO_REQUEST(request_id) << base64_image_data;
        }
        content_piece_image_data["data"] = base64_image_data;
 
@@ -306,7 +307,7 @@ void llamaCPP::InferenceImpl(
      }
    }
    formatted_output += ai_prompt;
-    LOG_INFO << formatted_output;
+    LOG_INFO_REQUEST(request_id) << formatted_output;
   }
 
   data["prompt"] = formatted_output;
@@ -322,35 +323,36 @@ void llamaCPP::InferenceImpl(
   bool is_streamed = data["stream"];
   // Enable full message debugging
 #ifdef DEBUG
-  LOG_INFO << "Current completion text";
-  LOG_INFO << formatted_output;
+  LOG_INFO_REQUEST(request_id) << "Current completion text";
+  LOG_INFO_REQUEST(request_id) << formatted_output;
 #endif
 
   if (is_streamed) {
+    LOG_INFO_REQUEST(request_id) << "Streamed, waiting for response";
    auto state = create_inference_state(this);
    auto chunked_content_provider =
-        [state, data](char* pBuffer, std::size_t nBuffSize) -> std::size_t {
+        [state, data, request_id](char* pBuffer, std::size_t nBuffSize) -> std::size_t {
      if (state->inference_status == PENDING) {
        state->inference_status = RUNNING;
      } else if (state->inference_status == FINISHED) {
        return 0;
      }
 
      if (!pBuffer) {
-        LOG_INFO << "Connection closed or buffer is null. Reset context";
+        LOG_WARN_REQUEST(request_id) << "Connection closed or buffer is null. Reset context";
        state->inference_status = FINISHED;
        return 0;
      }
 
      if (state->inference_status == EOS) {
-        LOG_INFO << "End of result";
+        LOG_INFO_REQUEST(request_id) << "End of result";
        const std::string str =
            "data: " +
            create_return_json(nitro_utils::generate_random_string(20), "_", "",
                               "stop") +
            "\n\n" + "data: [DONE]" + "\n\n";
 
-        LOG_VERBOSE("data stream", {{"to_send", str}});
+        LOG_VERBOSE("data stream", {{"request_id", request_id}, {"to_send", str}});
        std::size_t nRead = std::min(str.size(), nBuffSize);
        memcpy(pBuffer, str.data(), nRead);
        state->inference_status = FINISHED;
@@ -370,7 +372,7 @@ void llamaCPP::InferenceImpl(
      memcpy(pBuffer, str.data(), nRead);
 
      if (result.stop) {
-        LOG_INFO << "reached result stop";
+        LOG_INFO_REQUEST(request_id) << "Reached result stop";
        state->inference_status = EOS;
        return nRead;
      }
@@ -383,14 +385,14 @@ void llamaCPP::InferenceImpl(
 
      return nRead;
    } else {
-      LOG_INFO << "Error during inference";
+      LOG_ERROR_REQUEST(request_id) << "Error during inference";
    }
    state->inference_status = FINISHED;
    return 0;
   };
   // Queued task
   state->instance->queue->runTaskInQueue(
-      [callback, state, data, chunked_content_provider]() {
+      [callback, state, data, chunked_content_provider, request_id]() {
        state->task_id =
            state->instance->llama.request_completion(data, false, false, -1);
 
@@ -410,22 +412,22 @@ void llamaCPP::InferenceImpl(
            retries += 1;
          }
          if (state->inference_status != RUNNING)
-            LOG_INFO << "Wait for task to be released:" << state->task_id;
+            LOG_INFO_REQUEST(request_id) << "Wait for task to be released:" << state->task_id;
          std::this_thread::sleep_for(std::chrono::milliseconds(100));
        }
-        LOG_INFO << "Task completed, release it";
+        LOG_INFO_REQUEST(request_id) << "Task completed, release it";
        // Request completed, release it
        state->instance->llama.request_cancel(state->task_id);
+        LOG_INFO_REQUEST(request_id) << "Inference completed";
      });
   } else {
    Json::Value respData;
    auto resp = nitro_utils::nitroHttpResponse();
    int task_id = llama.request_completion(data, false, false, -1);
-    LOG_INFO << "sent the non stream, waiting for respone";
+    LOG_INFO_REQUEST(request_id) << "Non-stream, waiting for response";
    if (!json_value(data, "stream", false)) {
      std::string completion_text;
      task_result result = llama.next_result(task_id);
-      LOG_INFO << "Here is the result:" << result.error;
      if (!result.error && result.stop) {
        int prompt_tokens = result.result_json["tokens_evaluated"];
        int predicted_tokens = result.result_json["tokens_predicted"];
@@ -435,9 +437,12 @@ void llamaCPP::InferenceImpl(
                                            prompt_tokens, predicted_tokens);
        resp->setBody(full_return);
      } else {
-        resp->setBody("Internal error during inference");
+        respData["message"] = "Internal error during inference";
+        resp = nitro_utils::nitroHttpJsonResponse(respData);
+        LOG_ERROR_REQUEST(request_id) << "Error during inference";
      }
      callback(resp);
+      LOG_INFO_REQUEST(request_id) << "Inference completed";
    }
   }
 }
@@ -458,10 +463,12 @@ void llamaCPP::Embedding(
 void llamaCPP::EmbeddingImpl(
     std::shared_ptr<Json::Value> jsonBody,
     std::function<void(const HttpResponsePtr&)>& callback) {
+  int request_id = ++no_of_requests;
+  LOG_INFO_REQUEST(request_id) << "Generating response for embedding request";
   // Queue embedding task
   auto state = create_inference_state(this);
 
-  state->instance->queue->runTaskInQueue([this, state, jsonBody, callback]() {
+  state->instance->queue->runTaskInQueue([this, state, jsonBody, callback, request_id]() {
    Json::Value responseData(Json::arrayValue);
 
    if (jsonBody->isMember("input")) {
@@ -502,50 +509,58 @@ void llamaCPP::EmbeddingImpl(
    resp->setBody(Json::writeString(Json::StreamWriterBuilder(), root));
    resp->setContentTypeString("application/json");
    callback(resp);
+    LOG_INFO_REQUEST(request_id) << "Embedding completed";
   });
 }
 
 void llamaCPP::UnloadModel(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
   Json::Value jsonResp;
-  jsonResp["message"] = "No model loaded";
-  if (llama.model_loaded_external) {
+  if (CheckModelLoaded(callback)) {
    StopBackgroundTask();
 
    llama_free(llama.ctx);
    llama_free_model(llama.model);
    llama.ctx = nullptr;
    llama.model = nullptr;
    jsonResp["message"] = "Model unloaded successfully";
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    callback(resp);
+    LOG_INFO << "Model unloaded successfully";
   }
-  auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-  callback(resp);
-  return;
 }
 
 void llamaCPP::ModelStatus(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
   Json::Value jsonResp;
   bool is_model_loaded = llama.model_loaded_external;
-  if (is_model_loaded) {
+  if (CheckModelLoaded(callback)) {
    jsonResp["model_loaded"] = is_model_loaded;
    jsonResp["model_data"] = llama.get_model_props().dump();
-  } else {
-    jsonResp["model_loaded"] = is_model_loaded;
-  }
-
-  auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-  callback(resp);
-  return;
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    callback(resp);
+    LOG_INFO << "Model status responded";
+  }
 }
 
 void llamaCPP::LoadModel(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
+
+  if (!nitro_utils::isAVX2Supported() && ggml_cpu_has_avx2()) {
+    LOG_ERROR << "AVX2 is not supported by your processor";
+    Json::Value jsonResp;
+    jsonResp["message"] = "AVX2 is not supported by your processor, please download and replace the correct Nitro asset version";
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    resp->setStatusCode(drogon::k500InternalServerError);
+    callback(resp);
+    return;
+  }
+
   if (llama.model_loaded_external) {
-    LOG_INFO << "model loaded";
+    LOG_INFO << "Model already loaded";
    Json::Value jsonResp;
    jsonResp["message"] = "Model already loaded";
    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
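UnloadModel and ModelStatus above now lean on CheckModelLoaded as a guard clause instead of duplicating the not-loaded branch: the guard logs, answers the client, and returns false, so each handler only writes the happy path. A simplified sketch of that pattern, with a toy callback type standing in for Drogon's HttpResponsePtr (names and JSON bodies here are illustrative, not the actual nitro code):

#include <functional>
#include <iostream>
#include <string>

struct Response { std::string body; };
using Callback = std::function<void(const Response&)>;

struct Server {
  bool model_loaded = false;

  // Returns true when the model is loaded; otherwise replies with an error
  // and returns false so the handler can bail out with a single `if`.
  bool CheckModelLoaded(const Callback& callback) {
    if (!model_loaded) {
      std::cerr << "Model has not been loaded\n";
      callback({"{\"message\":\"Model has not been loaded, please load model into nitro\"}"});
      return false;
    }
    return true;
  }

  void ModelStatus(const Callback& callback) {
    if (CheckModelLoaded(callback)) {
      callback({"{\"model_loaded\":true}"});  // happy path only
    }
  }
};

int main() {
  Server s;
  s.ModelStatus([](const Response& r) { std::cout << r.body << "\n"; });  // error reply
  s.model_loaded = true;
  s.ModelStatus([](const Response& r) { std::cout << r.body << "\n"; });  // status reply
}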
@@ -568,6 +583,7 @@ void llamaCPP::LoadModel(
    jsonResp["message"] = "Model loaded successfully";
    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
    callback(resp);
+    LOG_INFO << "Model loaded successfully";
   }
 }

@@ -602,7 +618,17 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
    }
   };
 
-  params.model = jsonBody->operator[]("llama_model_path").asString();
+  Json::Value model_path = jsonBody->operator[]("llama_model_path");
+  if (model_path.isNull()) {
+    LOG_ERROR << "Missing model path in request";
+  } else {
+    if (std::filesystem::exists(std::filesystem::path(model_path.asString()))) {
+      params.model = model_path.asString();
+    } else {
+      LOG_ERROR << "Could not find model in path " << model_path.asString();
+    }
+  }
+
   params.n_gpu_layers = jsonBody->get("ngl", 100).asInt();
   params.n_ctx = jsonBody->get("ctx_len", 2048).asInt();
   params.embedding = jsonBody->get("embedding", true).asBool();
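The llama_model_path handling above now rejects a missing or non-existent path before it reaches llama.cpp. A small standalone sketch of the same std::filesystem check (C++17), with the JsonCpp access simplified away and a made-up example path:

#include <filesystem>
#include <iostream>
#include <optional>
#include <string>

// Mirrors the null-check plus std::filesystem::exists logic of the hunk above,
// minus the JsonCpp plumbing. Returns the path only when it is usable.
std::optional<std::string> ValidateModelPath(const std::optional<std::string>& model_path) {
  if (!model_path) {
    std::cerr << "Missing model path in request\n";
    return std::nullopt;
  }
  if (!std::filesystem::exists(std::filesystem::path(*model_path))) {
    std::cerr << "Could not find model in path " << *model_path << "\n";
    return std::nullopt;
  }
  return model_path;
}

int main() {
  // "/models/llama-2-7b.Q4_K_M.gguf" is a hypothetical path for illustration.
  if (auto p = ValidateModelPath(std::string{"/models/llama-2-7b.Q4_K_M.gguf"})) {
    std::cout << "Loading " << *p << "\n";
  } else {
    std::cout << "Load aborted\n";
  }
}

As in the hunk itself, a bad path is only logged here; whether the caller then aborts the load is left to the surrounding error handling.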
@@ -681,7 +707,7 @@ void llamaCPP::StopBackgroundTask() {
   if (llama.model_loaded_external) {
    llama.model_loaded_external = false;
    llama.condition_tasks.notify_one();
-    LOG_INFO << "changed to false";
+    LOG_INFO << "Background task stopped!";
    if (backgroundThread.joinable()) {
      backgroundThread.join();
    }

controllers/llamaCPP.h
Lines changed: 1 addition & 0 deletions

@@ -85,6 +85,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP>,
   std::string pre_prompt;
   int repeat_last_n;
   bool caching_enabled;
+  std::atomic<int> no_of_requests = 0;
   std::atomic<int> no_of_chats = 0;
   int clean_cache_threshold;
   std::string grammar_file_content;

utils/logging_utils.h
Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+#pragma once
+
+#define LOG_INFO_REQUEST(RID) LOG_INFO << "Request " << RID << ": "
+#define LOG_WARN_REQUEST(RID) LOG_WARN << "Request " << RID << ": "
+#define LOG_ERROR_REQUEST(RID) LOG_ERROR << "Request " << RID << ": "
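The new macros simply prepend a "Request <id>: " tag to the existing LOG_INFO/LOG_WARN/LOG_ERROR stream macros from log.h, with the id coming from the atomic no_of_requests counter added to llamaCPP.h. A minimal usage sketch that compiles on its own, with stand-in stream macros in place of nitro's log.h (the stand-ins are illustrative, not the real definitions):

#include <atomic>
#include <iostream>

// Stand-ins for the stream-style macros that log.h provides inside nitro.
#define LOG_INFO std::cout << "[INFO] "
#define LOG_WARN std::cout << "[WARN] "
#define LOG_ERROR std::cerr << "[ERROR] "

// Same definitions as utils/logging_utils.h above.
#define LOG_INFO_REQUEST(RID) LOG_INFO << "Request " << RID << ": "
#define LOG_WARN_REQUEST(RID) LOG_WARN << "Request " << RID << ": "
#define LOG_ERROR_REQUEST(RID) LOG_ERROR << "Request " << RID << ": "

std::atomic<int> no_of_requests = 0;  // mirrors the counter in llamaCPP.h (C++17)

int main() {
  int request_id = ++no_of_requests;  // one tag per incoming request, safe across threads
  LOG_INFO_REQUEST(request_id) << "Generating response for inference request\n";
  LOG_WARN_REQUEST(request_id) << "Connection closed or buffer is null. Reset context\n";
}

Because each macro ends in a trailing string literal, a call site that accidentally drops the << before its message still concatenates at compile time; writing the << anyway keeps all call sites uniform.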
