@@ -197,15 +197,16 @@ void llamaCPP::InferenceImpl(
   // To set default value

   // Increase number of chats received and clean the prompt
-  no_of_chats++;
-  if (no_of_chats % clean_cache_threshold == 0) {
-    LOG_INFO_REQUEST(request_id) << "Clean cache threshold reached!";
-    llama.kv_cache_clear();
-    LOG_INFO_REQUEST(request_id) << "Cache cleaned";
-  }
+  // no_of_chats++;
+  // if (no_of_chats % clean_cache_threshold == 0) {
+  //   LOG_INFO_REQUEST(request_id) << "Clean cache threshold reached!";
+  //   llama.kv_cache_clear();
+  //   LOG_INFO_REQUEST(request_id) << "Cache cleaned";
+  // }

   // Default values to enable auto caching
-  data["cache_prompt"] = caching_enabled;
+  // data["cache_prompt"] = caching_enabled;
+  data["cache_prompt"] = false;
   data["n_keep"] = -1;

   // Passing load value
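Note on the hunk above: the periodic KV-cache clear in InferenceImpl is commented out rather than removed, and cache_prompt is now pinned to false for every request instead of following caching_enabled. As a minimal sketch only (not part of this commit), re-enabling the periodic clear could be guarded by caching_enabled so the two settings cannot disagree; every identifier below already appears in the surrounding file:

    // Sketch, not committed code: clear the KV cache every
    // clean_cache_threshold chats, but only while caching is on.
    if (caching_enabled && ++no_of_chats % clean_cache_threshold == 0) {
      LOG_INFO_REQUEST(request_id) << "Clean cache threshold reached!";
      llama.kv_cache_clear();  // drops cached prompt state before the next chat
      LOG_INFO_REQUEST(request_id) << "Cache cleaned";
    }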
@@ -655,7 +656,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
   params.cont_batching = jsonBody->get("cont_batching", false).asBool();
   this->clean_cache_threshold =
       jsonBody->get("clean_cache_threshold", 5).asInt();
-  this->caching_enabled = jsonBody->get("caching_enabled", true).asBool();
+  this->caching_enabled = jsonBody->get("caching_enabled", false).asBool();
   this->user_prompt = jsonBody->get("user_prompt", "USER: ").asString();
   this->ai_prompt = jsonBody->get("ai_prompt", "ASSISTANT: ").asString();
   this->system_prompt =
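With the second hunk, prompt caching becomes opt-in: when "caching_enabled" is absent from the load-model JSON body, LoadModelImpl now defaults it to false. A minimal sketch of an opt-in body built with jsoncpp, restricted to the keys LoadModelImpl is shown reading above (the model path and any other required fields are intentionally omitted here):

    #include <json/json.h>

    Json::Value body;
    body["caching_enabled"] = true;     // opt back in; the default is now false
    body["clean_cache_threshold"] = 5;  // still parsed, though the periodic clear above is commented out
    body["cont_batching"] = false;
    body["user_prompt"] = "USER: ";
    body["ai_prompt"] = "ASSISTANT: ";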