
Commit eb403d5

feat: temporary fix to add artificial queue into nitro
1 parent 18575c3 commit eb403d5
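
The "artificial queue" here is a busy-wait gate: when the server runs with a single slot (n_parallel == 1), an incoming chat completion spins on an atomic flag, polling every 100 ms, until the previous request releases it. A minimal standalone sketch of that idiom, using illustrative stand-ins rather than Nitro APIs (the sketch claims the slot with exchange() so the test-and-set is one atomic step, whereas the diff below uses a plain read followed by a store):

    #include <atomic>
    #include <chrono>
    #include <cstdio>
    #include <thread>
    #include <vector>

    // Single-slot gate, mirroring the single_queue_is_busy member added in llamaCPP.h.
    std::atomic<bool> single_queue_is_busy{false};

    // Illustrative handler: claim the slot, pretend to run a completion, release it.
    void handle_request(int id) {
      while (single_queue_is_busy.exchange(true)) {
        // Slot is taken by another request; poll again in 100 ms, as the commit does.
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
      }
      std::printf("request %d holds the slot\n", id);
      std::this_thread::sleep_for(std::chrono::milliseconds(300)); // stand-in for token generation
      single_queue_is_busy = false; // release the slot on every exit path
    }

    int main() {
      std::vector<std::thread> clients;
      for (int i = 0; i < 3; ++i)
        clients.emplace_back(handle_request, i);
      for (auto &t : clients)
        t.join();
      return 0;
    }

With several concurrent callers the handlers run strictly one at a time; each waiting thread only wakes once every 100 ms to re-check the flag, which is the trade-off a temporary fix like this accepts.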

File tree: controllers/llamaCPP.cc, controllers/llamaCPP.h

2 files changed: +26 −3 lines changed

controllers/llamaCPP.cc

Lines changed: 23 additions & 2 deletions
@@ -2,6 +2,7 @@
 #include "llama.h"
 #include "log.h"
 #include "utils/nitro_utils.h"
+#include <thread>

 using namespace inferences;
 using json = nlohmann::json;
@@ -293,20 +294,38 @@ void llamaCPP::chatCompletion(
   LOG_INFO << "Current completion text";
   LOG_INFO << formatted_output;
 #endif
-  const int task_id = llama.request_completion(data, false, false, -1);
+  int task_id;
+
+  if (llama.params.n_parallel == 1) {
+    while (true) {
+      if (!single_queue_is_busy) {
+        task_id = llama.request_completion(data, false, false, -1);
+        single_queue_is_busy = true;
+        break;
+      } else {
+        std::this_thread::sleep_for(
+            std::chrono::milliseconds(100)); // Sleep for 100 milliseconds
+      }
+    }
+  } else {
+    task_id = llama.request_completion(data, false, false, -1);
+  }
+
   LOG_INFO << "Resolved request for task_id:" << task_id;

   if (is_streamed) {
     auto state = createState(task_id, this);

     auto chunked_content_provider =
-        [state](char *pBuffer, std::size_t nBuffSize) -> std::size_t {
+        [this, state](char *pBuffer, std::size_t nBuffSize) -> std::size_t {
       if (!pBuffer) {
         LOG_INFO << "Connection closed or buffer is null. Reset context";
         state->instance->llama.request_cancel(state->task_id);
+        single_queue_is_busy = false;
         return 0;
       }
       if (state->isStopped) {
+        single_queue_is_busy = false;
         return 0;
       }

@@ -339,8 +358,10 @@ void llamaCPP::chatCompletion(
        }
        return nRead;
      } else {
+        single_queue_is_busy = false;
        return 0;
      }
+      single_queue_is_busy = false;
      return 0;
    };
    auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider,

controllers/llamaCPP.h

Lines changed: 3 additions & 1 deletion
@@ -2560,7 +2560,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {

 private:
   llama_server_context llama;
-  //std::atomic<bool> model_loaded = false;
+  // std::atomic<bool> model_loaded = false;
   size_t sent_count = 0;
   size_t sent_token_probs_index = 0;
   std::thread backgroundThread;
@@ -2572,5 +2572,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
   bool caching_enabled;
   std::atomic<int> no_of_chats = 0;
   int clean_cache_threshold;
+  std::atomic<bool> single_queue_is_busy; // This value is only used when
+                                          // n_parallel is 1
 };
}; // namespace inferences
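
Before C++20, a default-constructed std::atomic<bool> holds an indeterminate value, so a gate like this needs an explicit initializer to be guaranteed to start out clear. A one-line hedged sketch of such a declaration; the {false} initializer is an illustration added here, not something taken from the commit:

    std::atomic<bool> single_queue_is_busy{false}; // start idle; consulted only when n_parallel == 1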
