This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit f777d79

Merge pull request #229 from tikikun/main
multiple important fixes for CPU specific optimization
2 parents 37944d5 + 16c46a7 · commit f777d79

File tree

- README.md
- controllers/llamaCPP.cc
- controllers/llamaCPP.h

3 files changed: 10 additions & 9 deletions

README.md

Lines changed: 1 addition & 0 deletions

````diff
@@ -107,6 +107,7 @@ Table of parameters
 | `system_prompt` | String | The prompt to use for system rules. |
 | `pre_prompt` | String | The prompt to use for internal configuration. |
 | `cpu_threads` | Integer | The number of threads to use for inferencing (CPU MODE ONLY) |
+| `n_batch` | Integer | The batch size for prompt eval step |
 
 ***OPTIONAL***: You can run Nitro on a different port like 5000 instead of 3928 by running it manually in terminal
 ```zsh
````
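
For orientation, here is a minimal, hypothetical sketch (not part of the commit) of a load-model request body that carries the new `n_batch` field next to parameters visible elsewhere in this diff; it uses jsoncpp to match the controller code below, and the HTTP call itself is omitted:

```cpp
// Hypothetical sketch: assembling a load-model request body that includes
// the new n_batch field. Field names are taken from this diff and the README
// parameter table; the values are illustrative only.
#include <json/json.h>
#include <iostream>

int main() {
  Json::Value body;
  body["ctx_len"] = 2048;    // context length read by loadModel
  body["cpu_threads"] = 4;   // CPU mode only, per the README table
  body["n_parallel"] = 1;    // falls back to the drogon thread count if omitted
  body["n_batch"] = 512;     // new: batch size for the prompt eval step

  Json::StreamWriterBuilder writer;
  std::cout << Json::writeString(writer, body) << std::endl;
  return 0;
}
```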

controllers/llamaCPP.cc

Lines changed: 8 additions & 9 deletions

```diff
@@ -170,6 +170,9 @@ void llamaCPP::chatCompletion(
   data["cache_prompt"] = true;
   data["n_keep"] = -1;
 
+  // Passing load value
+  data["repeat_last_n"] = this->repeat_last_n;
+
   data["stream"] = (*jsonBody).get("stream", false).asBool();
   data["n_predict"] = (*jsonBody).get("max_tokens", 500).asInt();
   data["top_p"] = (*jsonBody).get("top_p", 0.95).asFloat();
@@ -200,6 +203,8 @@ void llamaCPP::chatCompletion(
       stopWords.push_back(stop_word.asString());
     }
     // specify default stop words
+    // Ensure success case for chatML
+    stopWords.push_back("<|im_end|>");
     stopWords.push_back(nitro_utils::rtrim(user_prompt));
     data["stop"] = stopWords;
   }
@@ -374,7 +379,7 @@ void llamaCPP::loadModel(
   params.n_ctx = (*jsonBody).get("ctx_len", 2048).asInt();
   params.embedding = (*jsonBody).get("embedding", true).asBool();
   // Check if n_parallel exists in jsonBody, if not, set to drogon_thread
-
+  params.n_batch = (*jsonBody).get("n_batch", 512).asInt();
   params.n_parallel = (*jsonBody).get("n_parallel", drogon_thread).asInt();
   params.n_threads =
       (*jsonBody)
@@ -386,14 +391,8 @@ void llamaCPP::loadModel(
   this->ai_prompt = (*jsonBody).get("ai_prompt", "ASSISTANT: ").asString();
   this->system_prompt =
       (*jsonBody).get("system_prompt", "ASSISTANT's RULE: ").asString();
-  this->pre_prompt =
-      (*jsonBody)
-          .get("pre_prompt",
-               "A chat between a curious user and an artificial "
-               "intelligence "
-               "assistant. The assistant follows the given rules no matter "
-               "what.\\n")
-          .asString();
+  this->pre_prompt = (*jsonBody).get("pre_prompt", "").asString();
+  this->repeat_last_n = (*jsonBody).get("repeat_last_n", 32).asInt();
   }
 #ifdef GGML_USE_CUBLAS
   LOG_INFO << "Setting up GGML CUBLAS PARAMS";
```
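
The new fields follow the same jsoncpp get-with-default pattern as the rest of `loadModel`: the second argument to `get` is used only when the client omits the key. A standalone sketch of that behavior, assuming nothing beyond jsoncpp:

```cpp
// Standalone sketch of the Json::Value::get(key, default) pattern used above.
// The body below simulates a client that sets n_batch but omits repeat_last_n.
#include <json/json.h>
#include <iostream>

int main() {
  Json::Value jsonBody;
  jsonBody["n_batch"] = 256;  // provided explicitly by the client

  // Same calls as in loadModel: the defaults (512 and 32) apply only when
  // the key is missing from the request body.
  int n_batch = jsonBody.get("n_batch", 512).asInt();             // -> 256
  int repeat_last_n = jsonBody.get("repeat_last_n", 32).asInt();  // -> 32

  std::cout << "n_batch=" << n_batch
            << ", repeat_last_n=" << repeat_last_n << std::endl;
  return 0;
}
```

The stored `repeat_last_n` is then forwarded into the per-request `data` object in `chatCompletion`, which is why the header below gains the new `int repeat_last_n` member.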

controllers/llamaCPP.h

Lines changed: 1 addition & 0 deletions

```diff
@@ -2161,5 +2161,6 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
   std::string ai_prompt;
   std::string system_prompt;
   std::string pre_prompt;
+  int repeat_last_n;
 };
 }; // namespace inferences
```
