@@ -170,6 +170,9 @@ void llamaCPP::chatCompletion(
   data["cache_prompt"] = true;
   data["n_keep"] = -1;
 
+  // Passing load value
+  data["repeat_last_n"] = this->repeat_last_n;
+
   data["stream"] = (*jsonBody).get("stream", false).asBool();
   data["n_predict"] = (*jsonBody).get("max_tokens", 500).asInt();
   data["top_p"] = (*jsonBody).get("top_p", 0.95).asFloat();
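Each of these `get` calls resolves a request field against a server-side default, and the new `repeat_last_n` pass-through sits alongside those per-request knobs. A minimal standalone sketch of that jsoncpp pattern (illustrative values, not code from this commit):

```cpp
#include <json/json.h>
#include <iostream>

// Minimal demo of the Json::Value::get(key, default) pattern used above:
// absent keys fall back to the supplied default, so clients that omit a
// field keep the server-side behavior unchanged.
int main() {
  Json::Value body;          // stands in for the parsed request body
  body["max_tokens"] = 128;  // client overrides only this field

  Json::Value data;
  data["n_predict"] = body.get("max_tokens", 500).asInt();  // -> 128
  data["top_p"] = body.get("top_p", 0.95).asFloat();        // -> 0.95 (default)
  std::cout << data.toStyledString();
  return 0;
}
```

Note that `repeat_last_n` is deliberately not read from the request here; it comes from the value stored on the controller at model load time (see the `loadModel` hunks below).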
@@ -200,6 +203,8 @@ void llamaCPP::chatCompletion(
       stopWords.push_back(stop_word.asString());
     }
     // specify default stop words
+    // Ensure success case for chatML
+    stopWords.push_back("<|im_end|>");
     stopWords.push_back(nitro_utils::rtrim(user_prompt));
     data["stop"] = stopWords;
   }
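Appending `<|im_end|>` covers models trained on the chatML template, whose turns end with that token rather than with the plain user-prompt prefix. A sketch of the template (an assumption about the chatML convention, not code from this commit) shows why it works as a stop word:

```cpp
// chatML marks each turn with <|im_start|>/<|im_end|>. A model tuned on
// this template emits "<|im_end|>" when its reply is complete, so adding
// it as a stop word terminates generation at the turn boundary.
const char* kChatMLExample =
    "<|im_start|>system\n"
    "You are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\n"
    "Hello!<|im_end|>\n"
    "<|im_start|>assistant\n";  // completion begins here, stops at <|im_end|>
```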
@@ -374,7 +379,7 @@ void llamaCPP::loadModel(
   params.n_ctx = (*jsonBody).get("ctx_len", 2048).asInt();
   params.embedding = (*jsonBody).get("embedding", true).asBool();
   // Check if n_parallel exists in jsonBody, if not, set to drogon_thread
-
+  params.n_batch = (*jsonBody).get("n_batch", 512).asInt();
   params.n_parallel = (*jsonBody).get("n_parallel", drogon_thread).asInt();
   params.n_threads =
       (*jsonBody)
@@ -386,14 +391,8 @@ void llamaCPP::loadModel(
   this->ai_prompt = (*jsonBody).get("ai_prompt", "ASSISTANT: ").asString();
   this->system_prompt =
       (*jsonBody).get("system_prompt", "ASSISTANT's RULE: ").asString();
-  this->pre_prompt =
-      (*jsonBody)
-          .get("pre_prompt",
-               "A chat between a curious user and an artificial "
-               "intelligence "
-               "assistant. The assistant follows the given rules no matter "
-               "what.\n")
-          .asString();
+  this->pre_prompt = (*jsonBody).get("pre_prompt", "").asString();
+  this->repeat_last_n = (*jsonBody).get("repeat_last_n", 32).asInt();
 }
 #ifdef GGML_USE_CUBLAS
   LOG_INFO << "Setting up GGML CUBLAS PARAMS";
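Taken together, the `loadModel` changes make the batch size, repetition window, and system preamble configurable at load time. A hypothetical request body exercising the new keys (the field names match the keys read above; the values are illustrative, and unrelated fields such as the model path are omitted):

```cpp
#include <json/json.h>

// Hypothetical helper building a load-model body with the new fields.
Json::Value makeLoadRequest() {
  Json::Value body;
  body["ctx_len"] = 2048;      // existing: context window
  body["n_batch"] = 512;       // new: prompt-processing batch size
  body["repeat_last_n"] = 32;  // new: how many recent tokens the
                               // repetition penalty looks back over
  body["pre_prompt"] = "";     // new default: no baked-in preamble
  return body;
}
```

Because `repeat_last_n` is stored on the controller here, it applies uniformly to every subsequent `chatCompletion` call instead of being negotiated per request.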