@@ -3,14 +3,13 @@
 #include <fstream>
 #include <iostream>
 #include "log.h"
+#include "utils/nitro_utils.h"
+#include "utils/logging_utils.h"
 
 // External
 #include "common.h"
 #include "llama.h"
 
-#include "log.h"
-#include "utils/nitro_utils.h"
-
 using namespace inferences;
 using json = nlohmann::json;
 
@@ -50,6 +49,7 @@ std::shared_ptr<inferenceState> create_inference_state(llamaCPP* instance) {
 bool llamaCPP::CheckModelLoaded(
     std::function<void(const HttpResponsePtr&)>& callback) {
   if (!llama.model_loaded_external) {
+    LOG_ERROR << "Model has not been loaded";
     Json::Value jsonResp;
     jsonResp["message"] =
         "Model has not been loaded, please load model into nitro";
@@ -159,6 +159,7 @@ llamaCPP::~llamaCPP() {
 void llamaCPP::WarmupModel() {
   json pseudo;
 
+  LOG_INFO << "Warm-up model";
   pseudo["prompt"] = "Hello";
   pseudo["n_predict"] = 2;
   pseudo["stream"] = false;
@@ -187,6 +188,8 @@ void llamaCPP::InferenceImpl(
     inferences::ChatCompletionRequest&& completion,
     std::function<void(const HttpResponsePtr&)>& callback) {
   std::string formatted_output = pre_prompt;
+  int request_id = ++no_of_requests;
+  LOG_INFO_REQUEST(request_id) << "Generating response for inference request";
 
   json data;
   json stopWords;
@@ -196,9 +199,9 @@ void llamaCPP::InferenceImpl(
   // Increase number of chats received and clean the prompt
   no_of_chats++;
   if (no_of_chats % clean_cache_threshold == 0) {
-    LOG_INFO << "Clean cache threshold reached!";
+    LOG_INFO_REQUEST(request_id) << "Clean cache threshold reached!";
     llama.kv_cache_clear();
-    LOG_INFO << "Cache cleaned";
+    LOG_INFO_REQUEST(request_id) << "Cache cleaned";
   }
 
   // Default values to enable auto caching
@@ -207,9 +210,7 @@ void llamaCPP::InferenceImpl(
 
   // Passing load value
   data["repeat_last_n"] = this->repeat_last_n;
-
-  LOG_INFO << "Messages:" << completion.messages.toStyledString();
-  LOG_INFO << "Stop:" << completion.stop.toStyledString();
+  LOG_INFO_REQUEST(request_id) << "Stop words:" << completion.stop.toStyledString();
 
   data["stream"] = completion.stream;
   data["n_predict"] = completion.max_tokens;
@@ -268,18 +269,18 @@ void llamaCPP::InferenceImpl(
             auto image_url = content_piece["image_url"]["url"].asString();
             std::string base64_image_data;
             if (image_url.find("http") != std::string::npos) {
-              LOG_INFO << "Remote image detected but not supported yet";
+              LOG_INFO_REQUEST(request_id) << "Remote image detected but not supported yet";
             } else if (image_url.find("data:image") != std::string::npos) {
-              LOG_INFO << "Base64 image detected";
+              LOG_INFO_REQUEST(request_id) << "Base64 image detected";
               base64_image_data = nitro_utils::extractBase64(image_url);
-              LOG_INFO << base64_image_data;
+              LOG_INFO_REQUEST(request_id) << base64_image_data;
             } else {
-              LOG_INFO << "Local image detected";
+              LOG_INFO_REQUEST(request_id) << "Local image detected";
               nitro_utils::processLocalImage(
                   image_url, [&](const std::string& base64Image) {
                     base64_image_data = base64Image;
                   });
-              LOG_INFO << base64_image_data;
+              LOG_INFO_REQUEST(request_id) << base64_image_data;
             }
             content_piece_image_data["data"] = base64_image_data;
 
@@ -306,7 +307,7 @@ void llamaCPP::InferenceImpl(
       }
     }
     formatted_output += ai_prompt;
-    LOG_INFO << formatted_output;
+    LOG_INFO_REQUEST(request_id) << formatted_output;
   }
 
   data["prompt"] = formatted_output;
@@ -322,35 +323,36 @@ void llamaCPP::InferenceImpl(
   bool is_streamed = data["stream"];
 // Enable full message debugging
 #ifdef DEBUG
-  LOG_INFO << "Current completion text";
-  LOG_INFO << formatted_output;
+  LOG_INFO_REQUEST(request_id) << "Current completion text";
+  LOG_INFO_REQUEST(request_id) << formatted_output;
 #endif
 
   if (is_streamed) {
+    LOG_INFO_REQUEST(request_id) << "Streamed, waiting for response";
     auto state = create_inference_state(this);
     auto chunked_content_provider =
-        [state, data](char* pBuffer, std::size_t nBuffSize) -> std::size_t {
+        [state, data, request_id](char* pBuffer, std::size_t nBuffSize) -> std::size_t {
       if (state->inference_status == PENDING) {
         state->inference_status = RUNNING;
       } else if (state->inference_status == FINISHED) {
         return 0;
       }
 
       if (!pBuffer) {
-        LOG_INFO << "Connection closed or buffer is null. Reset context";
+        LOG_WARN_REQUEST(request_id) << "Connection closed or buffer is null. Reset context";
         state->inference_status = FINISHED;
         return 0;
       }
 
       if (state->inference_status == EOS) {
-        LOG_INFO << "End of result";
+        LOG_INFO_REQUEST(request_id) << "End of result";
         const std::string str =
             "data: " +
             create_return_json(nitro_utils::generate_random_string(20), "_", "",
                                "stop") +
             "\n\n" + "data: [DONE]" + "\n\n";
 
-        LOG_VERBOSE("data stream", {{"to_send", str}});
+        LOG_VERBOSE("data stream", {{"request_id", request_id}, {"to_send", str}});
         std::size_t nRead = std::min(str.size(), nBuffSize);
         memcpy(pBuffer, str.data(), nRead);
         state->inference_status = FINISHED;
@@ -370,7 +372,7 @@ void llamaCPP::InferenceImpl(
         memcpy(pBuffer, str.data(), nRead);
 
         if (result.stop) {
-          LOG_INFO << "reached result stop";
+          LOG_INFO_REQUEST(request_id) << "Reached result stop";
           state->inference_status = EOS;
           return nRead;
         }
@@ -383,14 +385,14 @@ void llamaCPP::InferenceImpl(
 
         return nRead;
       } else {
-        LOG_INFO << "Error during inference";
+        LOG_ERROR_REQUEST(request_id) << "Error during inference";
       }
       state->inference_status = FINISHED;
       return 0;
     };
     // Queued task
     state->instance->queue->runTaskInQueue(
-        [callback, state, data, chunked_content_provider]() {
+        [callback, state, data, chunked_content_provider, request_id]() {
           state->task_id =
               state->instance->llama.request_completion(data, false, false, -1);
 
@@ -410,22 +412,22 @@ void llamaCPP::InferenceImpl(
               retries += 1;
             }
             if (state->inference_status != RUNNING)
-              LOG_INFO << "Wait for task to be released:" << state->task_id;
+              LOG_INFO_REQUEST(request_id) << "Wait for task to be released:" << state->task_id;
             std::this_thread::sleep_for(std::chrono::milliseconds(100));
           }
-          LOG_INFO << "Task completed, release it";
+          LOG_INFO_REQUEST(request_id) << "Task completed, release it";
           // Request completed, release it
           state->instance->llama.request_cancel(state->task_id);
+          LOG_INFO_REQUEST(request_id) << "Inference completed";
         });
   } else {
     Json::Value respData;
     auto resp = nitro_utils::nitroHttpResponse();
     int task_id = llama.request_completion(data, false, false, -1);
-    LOG_INFO << "sent the non stream, waiting for respone";
+    LOG_INFO_REQUEST(request_id) << "Non stream, waiting for response";
     if (!json_value(data, "stream", false)) {
       std::string completion_text;
       task_result result = llama.next_result(task_id);
-      LOG_INFO << "Here is the result:" << result.error;
       if (!result.error && result.stop) {
         int prompt_tokens = result.result_json["tokens_evaluated"];
         int predicted_tokens = result.result_json["tokens_predicted"];
@@ -435,9 +437,12 @@ void llamaCPP::InferenceImpl(
             prompt_tokens, predicted_tokens);
         resp->setBody(full_return);
       } else {
-        resp->setBody("Internal error during inference");
+        respData["message"] = "Internal error during inference";
+        resp = nitro_utils::nitroHttpJsonResponse(respData);
+        LOG_ERROR_REQUEST(request_id) << "Error during inference";
       }
       callback(resp);
+      LOG_INFO_REQUEST(request_id) << "Inference completed";
     }
   }
 }
@@ -458,10 +463,12 @@ void llamaCPP::Embedding(
 void llamaCPP::EmbeddingImpl(
     std::shared_ptr<Json::Value> jsonBody,
     std::function<void(const HttpResponsePtr&)>& callback) {
+  int request_id = ++no_of_requests;
+  LOG_INFO_REQUEST(request_id) << "Generating response for embedding request";
   // Queue embedding task
   auto state = create_inference_state(this);
 
-  state->instance->queue->runTaskInQueue([this, state, jsonBody, callback]() {
+  state->instance->queue->runTaskInQueue([this, state, jsonBody, callback, request_id]() {
     Json::Value responseData(Json::arrayValue);
 
     if (jsonBody->isMember("input")) {
@@ -502,50 +509,58 @@ void llamaCPP::EmbeddingImpl(
     resp->setBody(Json::writeString(Json::StreamWriterBuilder(), root));
     resp->setContentTypeString("application/json");
     callback(resp);
+    LOG_INFO_REQUEST(request_id) << "Embedding completed";
   });
 }
 
 void llamaCPP::UnloadModel(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
   Json::Value jsonResp;
-  jsonResp["message"] = "No model loaded";
-  if (llama.model_loaded_external) {
+  if (CheckModelLoaded(callback)) {
     StopBackgroundTask();
 
     llama_free(llama.ctx);
     llama_free_model(llama.model);
     llama.ctx = nullptr;
     llama.model = nullptr;
     jsonResp["message"] = "Model unloaded successfully";
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    callback(resp);
+    LOG_INFO << "Model unloaded successfully";
   }
-  auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-  callback(resp);
-  return;
 }
 
 void llamaCPP::ModelStatus(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
   Json::Value jsonResp;
   bool is_model_loaded = llama.model_loaded_external;
-  if (is_model_loaded) {
+  if (CheckModelLoaded(callback)) {
     jsonResp["model_loaded"] = is_model_loaded;
     jsonResp["model_data"] = llama.get_model_props().dump();
-  } else {
-    jsonResp["model_loaded"] = is_model_loaded;
-  }
-
-  auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-  callback(resp);
-  return;
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    callback(resp);
+    LOG_INFO << "Model status responded";
+  }
 }
 
 void llamaCPP::LoadModel(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
+
+  if (!nitro_utils::isAVX2Supported() && ggml_cpu_has_avx2()) {
+    LOG_ERROR << "AVX2 is not supported by your processor";
+    Json::Value jsonResp;
+    jsonResp["message"] = "AVX2 is not supported by your processor, please download and replace the correct Nitro asset version";
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    resp->setStatusCode(drogon::k500InternalServerError);
+    callback(resp);
+    return;
+  }
+
   if (llama.model_loaded_external) {
-    LOG_INFO << "model loaded";
+    LOG_INFO << "Model already loaded";
     Json::Value jsonResp;
     jsonResp["message"] = "Model already loaded";
     auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
@@ -568,6 +583,7 @@ void llamaCPP::LoadModel(
     jsonResp["message"] = "Model loaded successfully";
     auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
     callback(resp);
+    LOG_INFO << "Model loaded successfully";
   }
 }
 
@@ -602,7 +618,17 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
     }
   };
 
-  params.model = jsonBody->operator[]("llama_model_path").asString();
+  Json::Value model_path = jsonBody->operator[]("llama_model_path");
+  if (model_path.isNull()) {
+    LOG_ERROR << "Missing model path in request";
+  } else {
+    if (std::filesystem::exists(std::filesystem::path(model_path.asString()))) {
+      params.model = model_path.asString();
+    } else {
+      LOG_ERROR << "Could not find model in path " << model_path.asString();
+    }
+  }
+
   params.n_gpu_layers = jsonBody->get("ngl", 100).asInt();
   params.n_ctx = jsonBody->get("ctx_len", 2048).asInt();
   params.embedding = jsonBody->get("embedding", true).asBool();
@@ -681,7 +707,7 @@ void llamaCPP::StopBackgroundTask() {
   if (llama.model_loaded_external) {
     llama.model_loaded_external = false;
     llama.condition_tasks.notify_one();
-    LOG_INFO << "changed to false";
+    LOG_INFO << "Background task stopped!";
     if (backgroundThread.joinable()) {
       backgroundThread.join();
     }
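
Note on the new logging macros: LOG_INFO_REQUEST, LOG_WARN_REQUEST and LOG_ERROR_REQUEST come from the newly included utils/logging_utils.h, whose contents are not part of this diff. A minimal sketch of what such a header could look like, assuming the macros simply prefix drogon/trantor's stream loggers with the request id (the macro bodies below are an assumption, not the actual Nitro header):

// utils/logging_utils.h -- hypothetical sketch, assumed implementation
#pragma once

#include <trantor/utils/Logger.h>  // provides drogon's LOG_INFO / LOG_WARN / LOG_ERROR

// Tag every log line with the id of the request being served, so that
// concurrent requests can be told apart in the server log.
#define LOG_INFO_REQUEST(RID) LOG_INFO << "[Request #" << (RID) << "] "
#define LOG_WARN_REQUEST(RID) LOG_WARN << "[Request #" << (RID) << "] "
#define LOG_ERROR_REQUEST(RID) LOG_ERROR << "[Request #" << (RID) << "] "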