@@ -50,6 +50,7 @@ std::shared_ptr<inferenceState> create_inference_state(llamaCPP* instance) {
 bool llamaCPP::CheckModelLoaded(
     std::function<void(const HttpResponsePtr&)>& callback) {
   if (!llama.model_loaded_external) {
+    LOG_ERROR << "Model has not been loaded";
     Json::Value jsonResp;
     jsonResp["message"] =
         "Model has not been loaded, please load model into nitro";
@@ -159,6 +160,7 @@ llamaCPP::~llamaCPP() {
 void llamaCPP::WarmupModel() {
   json pseudo;
 
+  LOG_INFO << "Warm-up model";
   pseudo["prompt"] = "Hello";
   pseudo["n_predict"] = 2;
   pseudo["stream"] = false;
@@ -187,6 +189,8 @@ void llamaCPP::InferenceImpl(
     inferences::ChatCompletionRequest&& completion,
     std::function<void(const HttpResponsePtr&)>& callback) {
   std::string formatted_output = pre_prompt;
+  int request_id = ++no_of_inference_requests;
+  LOG_INFO_REQUEST(request_id) << "Generating response for inference request";
 
   json data;
   json stopWords;
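The LOG_INFO_REQUEST / LOG_WARN_REQUEST / LOG_ERROR_REQUEST macros and the no_of_inference_requests counter used throughout this patch are defined outside the diff (presumably in llamaCPP.h). A minimal sketch of what such definitions could look like, offered purely as an assumption rather than the project's actual code:

```cpp
// Hypothetical request-scoped logging helpers: prefix every line with the id.
#define LOG_INFO_REQUEST(RID)  LOG_INFO  << "Request " << RID << ": "
#define LOG_WARN_REQUEST(RID)  LOG_WARN  << "Request " << RID << ": "
#define LOG_ERROR_REQUEST(RID) LOG_ERROR << "Request " << RID << ": "

// A counter handed out with ++no_of_inference_requests; making it atomic
// avoids races if requests arrive on multiple threads (assumption).
std::atomic<int> no_of_inference_requests{0};
```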
@@ -196,9 +200,9 @@ void llamaCPP::InferenceImpl(
   // Increase number of chats received and clean the prompt
   no_of_chats++;
   if (no_of_chats % clean_cache_threshold == 0) {
-    LOG_INFO << "Clean cache threshold reached!";
+    LOG_INFO_REQUEST(request_id) << "Clean cache threshold reached!";
     llama.kv_cache_clear();
-    LOG_INFO << "Cache cleaned";
+    LOG_INFO_REQUEST(request_id) << "Cache cleaned";
   }
 
   // Default values to enable auto caching
@@ -207,9 +211,7 @@ void llamaCPP::InferenceImpl(
 
   // Passing load value
   data["repeat_last_n"] = this->repeat_last_n;
-
-  LOG_INFO << "Messages:" << completion.messages.toStyledString();
-  LOG_INFO << "Stop:" << completion.stop.toStyledString();
+  LOG_INFO_REQUEST(request_id) << "Stop words:" << completion.stop.toStyledString();
 
   data["stream"] = completion.stream;
   data["n_predict"] = completion.max_tokens;
@@ -268,18 +270,18 @@ void llamaCPP::InferenceImpl(
           auto image_url = content_piece["image_url"]["url"].asString();
           std::string base64_image_data;
           if (image_url.find("http") != std::string::npos) {
-            LOG_INFO << "Remote image detected but not supported yet";
+            LOG_INFO_REQUEST(request_id) << "Remote image detected but not supported yet";
           } else if (image_url.find("data:image") != std::string::npos) {
-            LOG_INFO << "Base64 image detected";
+            LOG_INFO_REQUEST(request_id) << "Base64 image detected";
             base64_image_data = nitro_utils::extractBase64(image_url);
-            LOG_INFO << base64_image_data;
+            LOG_INFO_REQUEST(request_id) << base64_image_data;
           } else {
-            LOG_INFO << "Local image detected";
+            LOG_INFO_REQUEST(request_id) << "Local image detected";
             nitro_utils::processLocalImage(
                 image_url, [&](const std::string& base64Image) {
                   base64_image_data = base64Image;
                 });
-            LOG_INFO << base64_image_data;
+            LOG_INFO_REQUEST(request_id) << base64_image_data;
           }
           content_piece_image_data["data"] = base64_image_data;
 
@@ -306,7 +308,7 @@ void llamaCPP::InferenceImpl(
       }
     }
     formatted_output += ai_prompt;
-    LOG_INFO << formatted_output;
+    LOG_INFO_REQUEST(request_id) << formatted_output;
   }
 
   data["prompt"] = formatted_output;
@@ -322,35 +324,36 @@ void llamaCPP::InferenceImpl(
   bool is_streamed = data["stream"];
 // Enable full message debugging
 #ifdef DEBUG
-  LOG_INFO << "Current completion text";
-  LOG_INFO << formatted_output;
+  LOG_INFO_REQUEST(request_id) << "Current completion text";
+  LOG_INFO_REQUEST(request_id) << formatted_output;
 #endif
 
   if (is_streamed) {
+    LOG_INFO_REQUEST(request_id) << "Streamed, waiting for response";
     auto state = create_inference_state(this);
     auto chunked_content_provider =
-        [state, data](char* pBuffer, std::size_t nBuffSize) -> std::size_t {
+        [state, data, request_id](char* pBuffer, std::size_t nBuffSize) -> std::size_t {
       if (state->inference_status == PENDING) {
         state->inference_status = RUNNING;
       } else if (state->inference_status == FINISHED) {
         return 0;
       }
 
       if (!pBuffer) {
-        LOG_INFO << "Connection closed or buffer is null. Reset context";
+        LOG_WARN_REQUEST(request_id) << "Connection closed or buffer is null. Reset context";
         state->inference_status = FINISHED;
         return 0;
       }
 
       if (state->inference_status == EOS) {
-        LOG_INFO << "End of result";
+        LOG_INFO_REQUEST(request_id) << "End of result";
         const std::string str =
             "data: " +
             create_return_json(nitro_utils::generate_random_string(20), "_", "",
                                "stop") +
             "\n\n" + "data: [DONE]" + "\n\n";
 
-        LOG_VERBOSE("data stream", {{"to_send", str}});
+        LOG_VERBOSE("data stream", {{"request_id", request_id}, {"to_send", str}});
         std::size_t nRead = std::min(str.size(), nBuffSize);
         memcpy(pBuffer, str.data(), nRead);
         state->inference_status = FINISHED;
@@ -370,7 +373,7 @@ void llamaCPP::InferenceImpl(
         memcpy(pBuffer, str.data(), nRead);
 
         if (result.stop) {
-          LOG_INFO << "reached result stop";
+          LOG_INFO_REQUEST(request_id) << "Reached result stop";
           state->inference_status = EOS;
           return nRead;
         }
@@ -383,14 +386,14 @@ void llamaCPP::InferenceImpl(
 
         return nRead;
       } else {
-        LOG_INFO << "Error during inference";
+        LOG_ERROR_REQUEST(request_id) << "Error during inference";
       }
       state->inference_status = FINISHED;
       return 0;
     };
     // Queued task
     state->instance->queue->runTaskInQueue(
-        [callback, state, data, chunked_content_provider]() {
+        [callback, state, data, chunked_content_provider, request_id]() {
           state->task_id =
               state->instance->llama.request_completion(data, false, false, -1);
 
@@ -410,22 +413,22 @@ void llamaCPP::InferenceImpl(
               retries += 1;
             }
             if (state->inference_status != RUNNING)
-              LOG_INFO << "Wait for task to be released:" << state->task_id;
+              LOG_INFO_REQUEST(request_id) << "Wait for task to be released:" << state->task_id;
             std::this_thread::sleep_for(std::chrono::milliseconds(100));
           }
-          LOG_INFO << "Task completed, release it";
+          LOG_INFO_REQUEST(request_id) << "Task completed, release it";
           // Request completed, release it
           state->instance->llama.request_cancel(state->task_id);
+          LOG_INFO_REQUEST(request_id) << "Inference completed";
         });
   } else {
     Json::Value respData;
     auto resp = nitro_utils::nitroHttpResponse();
     int task_id = llama.request_completion(data, false, false, -1);
-    LOG_INFO << "sent the non stream, waiting for respone";
+    LOG_INFO_REQUEST(request_id) << "Non stream, waiting for response";
     if (!json_value(data, "stream", false)) {
       std::string completion_text;
       task_result result = llama.next_result(task_id);
-      LOG_INFO << "Here is the result:" << result.error;
       if (!result.error && result.stop) {
         int prompt_tokens = result.result_json["tokens_evaluated"];
         int predicted_tokens = result.result_json["tokens_predicted"];
@@ -435,9 +438,12 @@ void llamaCPP::InferenceImpl(
                                          prompt_tokens, predicted_tokens);
         resp->setBody(full_return);
       } else {
-        resp->setBody("Internal error during inference");
+        respData["message"] = "Internal error during inference";
+        resp = nitro_utils::nitroHttpJsonResponse(respData);
+        LOG_ERROR_REQUEST(request_id) << "Error during inference";
       }
       callback(resp);
+      LOG_INFO_REQUEST(request_id) << "Inference completed";
     }
   }
 }
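One practical effect of this hunk: a failed non-streamed completion now answers with a JSON document instead of a bare string. Assuming nitroHttpJsonResponse simply serializes the Json::Value (its definition is not part of this diff), the body the client receives changes roughly as sketched below:

```cpp
// Illustration only, not part of the patch.
// Before: resp->setBody("Internal error during inference");
//   -> body: Internal error during inference
// After:
Json::Value respData;
respData["message"] = "Internal error during inference";
//   -> body: {"message":"Internal error during inference"}
// Serialized the same way as elsewhere in this file:
std::string body = Json::writeString(Json::StreamWriterBuilder(), respData);
```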
@@ -458,10 +464,12 @@ void llamaCPP::Embedding(
 void llamaCPP::EmbeddingImpl(
     std::shared_ptr<Json::Value> jsonBody,
     std::function<void(const HttpResponsePtr&)>& callback) {
+  int request_id = ++no_of_inference_requests;
+  LOG_INFO_REQUEST(request_id) << "Generating response for embedding request";
   // Queue embedding task
   auto state = create_inference_state(this);
 
-  state->instance->queue->runTaskInQueue([this, state, jsonBody, callback]() {
+  state->instance->queue->runTaskInQueue([this, state, jsonBody, callback, request_id]() {
     Json::Value responseData(Json::arrayValue);
 
     if (jsonBody->isMember("input")) {
@@ -502,50 +510,47 @@ void llamaCPP::EmbeddingImpl(
     resp->setBody(Json::writeString(Json::StreamWriterBuilder(), root));
     resp->setContentTypeString("application/json");
     callback(resp);
+    LOG_INFO_REQUEST(request_id) << "Embedding completed";
   });
 }
 
 void llamaCPP::UnloadModel(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
   Json::Value jsonResp;
-  jsonResp["message"] = "No model loaded";
-  if (llama.model_loaded_external) {
+  if (CheckModelLoaded(callback)) {
     StopBackgroundTask();
 
     llama_free(llama.ctx);
     llama_free_model(llama.model);
     llama.ctx = nullptr;
     llama.model = nullptr;
     jsonResp["message"] = "Model unloaded successfully";
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    callback(resp);
+    LOG_INFO << "Model unloaded successfully";
   }
-  auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-  callback(resp);
-  return;
 }
 
 void llamaCPP::ModelStatus(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
   Json::Value jsonResp;
   bool is_model_loaded = llama.model_loaded_external;
-  if (is_model_loaded) {
+  if (CheckModelLoaded(callback)) {
     jsonResp["model_loaded"] = is_model_loaded;
     jsonResp["model_data"] = llama.get_model_props().dump();
-  } else {
-    jsonResp["model_loaded"] = is_model_loaded;
-  }
-
-  auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-  callback(resp);
-  return;
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    callback(resp);
+    LOG_INFO << "Model status responded";
+  }
 }
 
 void llamaCPP::LoadModel(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
   if (llama.model_loaded_external) {
-    LOG_INFO << "model loaded";
+    LOG_INFO << "Model already loaded";
     Json::Value jsonResp;
     jsonResp["message"] = "Model already loaded";
     auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
@@ -568,6 +573,7 @@ void llamaCPP::LoadModel(
     jsonResp["message"] = "Model loaded successfully";
     auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
     callback(resp);
+    LOG_INFO << "Model loaded successfully";
   }
 }
 
@@ -602,7 +608,17 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
     }
   };
 
-  params.model = jsonBody->operator[]("llama_model_path").asString();
+  Json::Value model_path = jsonBody->operator[]("llama_model_path");
+  if (model_path.isNull()) {
+    LOG_ERROR << "Missing model path in request";
+  } else {
+    if (std::filesystem::exists(std::filesystem::path(model_path.asString()))) {
+      params.model = model_path.asString();
+    } else {
+      LOG_ERROR << "Could not find model in path " << model_path.asString();
+    }
+  }
+
   params.n_gpu_layers = jsonBody->get("ngl", 100).asInt();
   params.n_ctx = jsonBody->get("ctx_len", 2048).asInt();
   params.embedding = jsonBody->get("embedding", true).asBool();
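The new existence check leans on std::filesystem, which needs C++17 and its header. The include list is not visible in this diff, so as an assumption, the translation unit (or a common header) would carry something like:

```cpp
#include <filesystem>  // std::filesystem::exists / std::filesystem::path (C++17)

// Usage mirroring the patch: only accept the configured path when the file exists.
// model_path is the Json::Value read from "llama_model_path" above.
if (std::filesystem::exists(std::filesystem::path(model_path.asString()))) {
  params.model = model_path.asString();
}
```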
@@ -681,7 +697,7 @@ void llamaCPP::StopBackgroundTask() {
   if (llama.model_loaded_external) {
     llama.model_loaded_external = false;
     llama.condition_tasks.notify_one();
-    LOG_INFO << "changed to false";
+    LOG_INFO << "Background task stopped!";
     if (backgroundThread.joinable()) {
       backgroundThread.join();
     }