-
Notifications
You must be signed in to change notification settings - Fork 102
MINIFICPP-2719 - Add multimodal capability to llama.cpp processor #2107
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
31ddfff
6e5d52c
d2ce276
9dc1f90
09c3416
efb65a9
f96ef0b
e111c62
841edc6
97c02ac
049df46
09fe599
b90ec86
c0bf227
263fa3e
15ca989
1a85882
d059ed1
3bfaffb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,8 +16,12 @@ | |
| */ | ||
|
|
||
| #include "DefaultLlamaContext.h" | ||
|
|
||
| #include <range/v3/all.hpp> | ||
|
|
||
| #include "minifi-cpp/Exception.h" | ||
| #include "fmt/format.h" | ||
| #include "mtmd/mtmd-helper.h" | ||
|
|
||
| namespace org::apache::nifi::minifi::extensions::llamacpp::processors { | ||
|
|
||
|
|
@@ -36,25 +40,26 @@ std::vector<llama_token> tokenizeInput(const llama_vocab* vocab, const std::stri | |
| return tokenized_input; | ||
| } | ||
|
|
||
| constexpr size_t DEFAULT_BUFFER_SIZE = 4096; | ||
|
|
||
| } // namespace | ||
|
|
||
|
|
||
| DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path, const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params) { | ||
| DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path, const std::optional<std::filesystem::path>& multimodal_model_path, | ||
| const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params, const std::shared_ptr<core::logging::Logger>& logger) { | ||
| llama_model_ = llama_model_load_from_file(model_path.string().c_str(), llama_model_default_params()); // NOLINT(cppcoreguidelines-prefer-member-initializer) | ||
| if (!llama_model_) { | ||
| throw Exception(ExceptionType::PROCESS_SCHEDULE_EXCEPTION, fmt::format("Failed to load model from '{}'", model_path.string())); | ||
| } | ||
|
|
||
| chat_template_ = common_chat_templates_init(llama_model_, ""); | ||
|
|
||
| llama_context_params ctx_params = llama_context_default_params(); | ||
| ctx_params.n_ctx = llama_ctx_params.n_ctx; | ||
| ctx_params.n_batch = llama_ctx_params.n_batch; | ||
| ctx_params.n_ubatch = llama_ctx_params.n_ubatch; | ||
| ctx_params.n_seq_max = llama_ctx_params.n_seq_max; | ||
| ctx_params.n_threads = llama_ctx_params.n_threads; | ||
| ctx_params.n_threads_batch = llama_ctx_params.n_threads_batch; | ||
| ctx_params.flash_attn = false; | ||
| ctx_params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED; | ||
| llama_ctx_ = llama_init_from_model(llama_model_, ctx_params); | ||
|
|
||
| auto sparams = llama_sampler_chain_default_params(); | ||
|
|
@@ -73,9 +78,27 @@ DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path | |
| llama_sampler_chain_add(llama_sampler_, llama_sampler_init_temp(*llama_sampler_params.temperature)); | ||
| } | ||
| llama_sampler_chain_add(llama_sampler_, llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); | ||
|
|
||
| if (!multimodal_model_path) { | ||
| logger->log_info("No multimodal model path provided"); | ||
| return; | ||
| } | ||
|
|
||
| mtmd_context_params mparams = mtmd_context_params_default(); | ||
| mparams.use_gpu = false; | ||
| mparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED; | ||
|
|
||
| multimodal_ctx_ = mtmd_init_from_file(multimodal_model_path->string().c_str(), llama_model_, mparams); | ||
| if (!multimodal_ctx_) { | ||
| throw Exception(ExceptionType::PROCESS_SCHEDULE_EXCEPTION, fmt::format("Failed to load multimodal model from '{}'", multimodal_model_path->string())); | ||
| } | ||
|
|
||
| logger->log_info("Successfully loaded multimodal model from '{}'", multimodal_model_path->string()); | ||
|
Comment on lines
+82
to
+96
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would extract this to a separate function and have something like |
||
| } | ||
|
|
||
| DefaultLlamaContext::~DefaultLlamaContext() { | ||
| mtmd_free(multimodal_ctx_); | ||
| multimodal_ctx_ = nullptr; | ||
| llama_sampler_free(llama_sampler_); | ||
| llama_sampler_ = nullptr; | ||
| llama_free(llama_ctx_); | ||
|
|
@@ -85,47 +108,96 @@ DefaultLlamaContext::~DefaultLlamaContext() { | |
| } | ||
|
|
||
| std::optional<std::string> DefaultLlamaContext::applyTemplate(const std::vector<LlamaChatMessage>& messages) { | ||
| std::vector<llama_chat_message> llama_messages; | ||
| llama_messages.reserve(messages.size()); | ||
| std::transform(messages.begin(), messages.end(), std::back_inserter(llama_messages), | ||
| [](const LlamaChatMessage& msg) { return llama_chat_message{.role = msg.role.c_str(), .content = msg.content.c_str()}; }); | ||
| std::string text; | ||
| text.resize(DEFAULT_BUFFER_SIZE); | ||
| const char * chat_template = llama_model_chat_template(llama_model_, nullptr); | ||
| int32_t res_size = llama_chat_apply_template(chat_template, llama_messages.data(), llama_messages.size(), true, text.data(), gsl::narrow<int32_t>(text.size())); | ||
| if (res_size < 0) { | ||
| if (!chat_template_) { | ||
| return std::nullopt; | ||
| } | ||
| if (res_size > gsl::narrow<int32_t>(text.size())) { | ||
| text.resize(res_size); | ||
| res_size = llama_chat_apply_template(chat_template, llama_messages.data(), llama_messages.size(), true, text.data(), gsl::narrow<int32_t>(text.size())); | ||
| if (res_size < 0) { | ||
| return std::nullopt; | ||
| } | ||
| common_chat_templates_inputs inputs; | ||
| for (auto& msg : messages) { | ||
| common_chat_msg chat_msg; | ||
| chat_msg.role = msg.role; | ||
| chat_msg.content = msg.content; | ||
| inputs.messages.push_back(std::move(chat_msg)); | ||
| } | ||
| text.resize(res_size); | ||
| inputs.enable_thinking = false; // TODO(adebreceni): MINIFICPP-2800 common_chat_templates_support_enable_thinking(chat_template_.get()); | ||
|
|
||
| return text; | ||
| return common_chat_templates_apply(chat_template_.get(), inputs).prompt; | ||
| } | ||
|
|
||
| std::expected<GenerationResult, std::string> DefaultLlamaContext::generate(const std::string& input, std::function<void(std::string_view/*token*/)> token_handler) { | ||
| namespace { | ||
|
|
||
| struct mtmd_bitmap_deleter { | ||
| void operator()(mtmd_bitmap* val) { mtmd_bitmap_free(val); } | ||
| }; | ||
| using unique_bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>; | ||
|
|
||
| struct mtmd_input_chunks_deleter { | ||
| void operator()(mtmd_input_chunks* val) { mtmd_input_chunks_free(val); } | ||
| }; | ||
| using unique_mtmd_input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>; | ||
|
|
||
| } // namespace | ||
|
|
||
| std::expected<GenerationResult, std::string> DefaultLlamaContext::generate(const std::string& prompt, const std::vector<std::vector<std::byte>>& files, | ||
| std::function<void(std::string_view/*token*/)> token_handler) { | ||
| GenerationResult result{}; | ||
| auto start_time = std::chrono::steady_clock::now(); | ||
| llama_memory_seq_rm(llama_get_memory(llama_ctx_), 0, -1, -1); | ||
| const llama_vocab * vocab = llama_model_get_vocab(llama_model_); | ||
| std::vector<llama_token> tokenized_input = tokenizeInput(vocab, input); | ||
| result.num_tokens_in = gsl::narrow<uint64_t>(tokenized_input.size()); | ||
| llama_pos n_past = 0; | ||
| std::vector<llama_token> tokenized_input; | ||
| llama_batch batch = llama_batch_init(1, 0, 1); | ||
| auto batch_deleter = gsl::finally([&] {llama_batch_free(batch);}); | ||
| batch.n_tokens = 1; | ||
| batch.n_seq_id[0] = 1; | ||
| batch.seq_id[0][0] = 0; | ||
| batch.logits[0] = true; | ||
|
Comment on lines
+149
to
+153
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This can be moved before the |
||
| int32_t decode_status = 0; | ||
| if (multimodal_ctx_) { | ||
| if (files.empty()) { | ||
| return std::unexpected{"Multimodal input requires at least one file"}; | ||
| } | ||
| std::vector<unique_bitmap_ptr> bitmaps; | ||
| for (auto& file : files) { | ||
| unique_bitmap_ptr bitmap{mtmd_helper_bitmap_init_from_buf(multimodal_ctx_, reinterpret_cast<const unsigned char*>(file.data()), file.size())}; | ||
| if (!bitmap) { | ||
| throw Exception(PROCESSOR_EXCEPTION, "Failed to create multimodal bitmap from buffer"); | ||
| } | ||
| bitmaps.push_back(std::move(bitmap)); | ||
| } | ||
| mtmd_input_text inp_txt = { | ||
| .text = prompt.c_str(), | ||
| .add_special = true, | ||
| .parse_special = true, | ||
| }; | ||
| unique_mtmd_input_chunks_ptr chunks{mtmd_input_chunks_init()}; | ||
| auto bitmap_c_ptrs = bitmaps | ranges::views::transform([] (auto& ptr) {return static_cast<const mtmd_bitmap*>(ptr.get());}) | ranges::to<std::vector>(); | ||
| auto tokenized = mtmd_tokenize(multimodal_ctx_, chunks.get(), &inp_txt, bitmap_c_ptrs.data(), bitmap_c_ptrs.size()); | ||
| if (tokenized != 0) { | ||
| throw Exception(PROCESSOR_EXCEPTION, fmt::format("Failed to tokenize multimodal prompt, error: {}", tokenized)); | ||
| } | ||
| auto status = mtmd_helper_eval_chunks(multimodal_ctx_, llama_ctx_, chunks.get(), 0, 0, 1, true, &n_past); | ||
| if (status != 0) { | ||
| throw Exception(PROCESSOR_EXCEPTION, fmt::format("Failed to eval multimodal chunks, error: {}", status)); | ||
| } | ||
|
adamdebreceni marked this conversation as resolved.
Comment on lines
+156
to
+181
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would extract this to a separate function. Additionally, why is llama_decode run in the case of string tokenization, but not in the multimodal use case? |
||
| } else { | ||
| if (!files.empty()) { | ||
| return std::unexpected{"Model is not configured for multimodal input"}; | ||
| } | ||
| try { | ||
| tokenized_input = tokenizeInput(vocab, prompt); | ||
| } catch (std::exception& e) { | ||
| return std::unexpected{fmt::format("Error during tokenization: {}", e.what())}; | ||
| } catch (...) { | ||
| return std::unexpected{"Unknown error during tokenization"}; | ||
| } | ||
| n_past = gsl::narrow<llama_pos>(tokenized_input.size()); | ||
| decode_status = llama_decode(llama_ctx_, llama_batch_get_one(tokenized_input.data(), n_past)); | ||
| } | ||
| result.num_tokens_in = gsl::narrow<uint64_t>(n_past); | ||
|
|
||
| llama_batch batch = llama_batch_get_one(tokenized_input.data(), gsl::narrow<int32_t>(tokenized_input.size())); | ||
| llama_token new_token_id = 0; | ||
| bool first_token_generated = false; | ||
| while (true) { | ||
| int32_t res = llama_decode(llama_ctx_, batch); | ||
| if (res == 1) { | ||
| return std::unexpected{"Could not find a KV slot for the batch (try reducing the size of the batch or increase the context)"}; | ||
| } else if (res < 0) { | ||
| return std::unexpected{"Error occurred while executing llama decode"}; | ||
| } | ||
|
|
||
| while (decode_status == 0) { | ||
| new_token_id = llama_sampler_sample(llama_sampler_, llama_ctx_, -1); | ||
| if (!first_token_generated) { | ||
| result.time_to_first_token = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start_time); | ||
|
|
@@ -147,8 +219,22 @@ std::expected<GenerationResult, std::string> DefaultLlamaContext::generate(const | |
| gsl_Assert(len < 128); | ||
|
|
||
| std::string_view token_str{buf.data(), gsl::narrow<std::string_view::size_type>(len)}; | ||
| batch = llama_batch_get_one(&new_token_id, 1); | ||
| batch.token[0] = new_token_id; | ||
| batch.pos[0] = n_past; | ||
| ++n_past; | ||
| token_handler(token_str); | ||
|
|
||
| decode_status = llama_decode(llama_ctx_, batch); | ||
| } | ||
|
|
||
| if (decode_status == 1) { | ||
| return std::unexpected("Could not find a KV slot for the batch (try reducing the size of the batch or increase the context)"); | ||
| } | ||
| if (decode_status == 2) { | ||
| return std::unexpected("Llama decode aborted"); | ||
| } | ||
| if (decode_status < 0) { | ||
| return std::unexpected("Error occurred while executing llama decode"); | ||
| } | ||
|
|
||
| result.tokens_per_second = | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we should keep function names unique: