This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit f777d79

Merge pull request #229 from tikikun/main
multiple important fixes for CPU specific optimization
2 parents 37944d5 + 16c46a7 · commit f777d79

File tree

- README.md
- controllers/llamaCPP.cc
- controllers/llamaCPP.h

3 files changed: 10 additions & 9 deletions

README.md

Lines changed: 1 addition & 0 deletions

````diff
@@ -107,6 +107,7 @@ Table of parameters
 | `system_prompt` | String | The prompt to use for system rules. |
 | `pre_prompt` | String | The prompt to use for internal configuration. |
 | `cpu_threads` | Integer | The number of threads to use for inferencing (CPU MODE ONLY) |
+| `n_batch` | Integer | The batch size for prompt eval step |
 
 ***OPTIONAL***: You can run Nitro on a different port like 5000 instead of 3928 by running it manually in terminal
 ```zsh
````
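
For orientation, here is a minimal, hypothetical sketch (not part of the commit) of a load-model request body that carries the new `n_batch` field next to parameters visible elsewhere in this diff; it uses jsoncpp to match the controller code below, and the HTTP call itself is omitted:

```cpp
// Hypothetical sketch: assembling a load-model request body that includes
// the new n_batch field. Field names are taken from this diff and the README
// parameter table; the values are illustrative only.
#include <json/json.h>
#include <iostream>

int main() {
  Json::Value body;
  body["ctx_len"] = 2048;    // context length read by loadModel
  body["cpu_threads"] = 4;   // CPU mode only, per the README table
  body["n_parallel"] = 1;    // falls back to the drogon thread count if omitted
  body["n_batch"] = 512;     // new: batch size for the prompt eval step

  Json::StreamWriterBuilder writer;
  std::cout << Json::writeString(writer, body) << std::endl;
  return 0;
}
```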

controllers/llamaCPP.cc

Lines changed: 8 additions & 9 deletions

```diff
@@ -170,6 +170,9 @@ void llamaCPP::chatCompletion(
   data["cache_prompt"] = true;
   data["n_keep"] = -1;
 
+  // Passing load value
+  data["repeat_last_n"] = this->repeat_last_n;
+
   data["stream"] = (*jsonBody).get("stream", false).asBool();
   data["n_predict"] = (*jsonBody).get("max_tokens", 500).asInt();
   data["top_p"] = (*jsonBody).get("top_p", 0.95).asFloat();
@@ -200,6 +203,8 @@ void llamaCPP::chatCompletion(
       stopWords.push_back(stop_word.asString());
     }
     // specify default stop words
+    // Ensure success case for chatML
+    stopWords.push_back("<|im_end|>");
     stopWords.push_back(nitro_utils::rtrim(user_prompt));
     data["stop"] = stopWords;
   }
@@ -374,7 +379,7 @@ void llamaCPP::loadModel(
   params.n_ctx = (*jsonBody).get("ctx_len", 2048).asInt();
   params.embedding = (*jsonBody).get("embedding", true).asBool();
   // Check if n_parallel exists in jsonBody, if not, set to drogon_thread
-
+  params.n_batch = (*jsonBody).get("n_batch", 512).asInt();
   params.n_parallel = (*jsonBody).get("n_parallel", drogon_thread).asInt();
   params.n_threads =
       (*jsonBody)
@@ -386,14 +391,8 @@ void llamaCPP::loadModel(
   this->ai_prompt = (*jsonBody).get("ai_prompt", "ASSISTANT: ").asString();
   this->system_prompt =
       (*jsonBody).get("system_prompt", "ASSISTANT's RULE: ").asString();
-  this->pre_prompt =
-      (*jsonBody)
-          .get("pre_prompt",
-               "A chat between a curious user and an artificial "
-               "intelligence "
-               "assistant. The assistant follows the given rules no matter "
-               "what.\\n")
-          .asString();
+  this->pre_prompt = (*jsonBody).get("pre_prompt", "").asString();
+  this->repeat_last_n = (*jsonBody).get("repeat_last_n", 32).asInt();
   }
 #ifdef GGML_USE_CUBLAS
   LOG_INFO << "Setting up GGML CUBLAS PARAMS";
```
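
The new fields follow the same jsoncpp get-with-default pattern as the rest of `loadModel`: the second argument to `get` is used only when the client omits the key. A standalone sketch of that behavior, assuming nothing beyond jsoncpp:

```cpp
// Standalone sketch of the Json::Value::get(key, default) pattern used above.
// The body below simulates a client that sets n_batch but omits repeat_last_n.
#include <json/json.h>
#include <iostream>

int main() {
  Json::Value jsonBody;
  jsonBody["n_batch"] = 256;  // provided explicitly by the client

  // Same calls as in loadModel: the defaults (512 and 32) apply only when
  // the key is missing from the request body.
  int n_batch = jsonBody.get("n_batch", 512).asInt();             // -> 256
  int repeat_last_n = jsonBody.get("repeat_last_n", 32).asInt();  // -> 32

  std::cout << "n_batch=" << n_batch
            << ", repeat_last_n=" << repeat_last_n << std::endl;
  return 0;
}
```

The stored `repeat_last_n` is then forwarded into the per-request `data` object in `chatCompletion`, which is why the header below gains the new `int repeat_last_n` member.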

controllers/llamaCPP.h

Lines changed: 1 addition & 0 deletions

```diff
@@ -2161,5 +2161,6 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
   std::string ai_prompt;
   std::string system_prompt;
   std::string pre_prompt;
+  int repeat_last_n;
 };
 }; // namespace inferences
```
