prepare_llm_models.sh (12 changes: 12 additions & 0 deletions)
@@ -38,6 +38,7 @@ PHI4_MODEL="microsoft/Phi-4-mini-instruct"
MISTRAL_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
GPT_OSS_MODEL="openai/gpt-oss-20b"
DEVSTRAL_MODEL="unsloth/Devstral-Small-2507"
+GEMMA4_MODEL="google/gemma-4-26B-A4B-it"
Collaborator comment on the new GEMMA4_MODEL line: it will probably require a newer transformers (5.5). Before it is added to optimum-intel, you might just pull the tokenizer from https://huggingface.co/OpenVINO/gemma-4-E4B-it-int4-ov/tree/main

if [ "$(python3 -c 'import sys; print(sys.version_info[1])')" -le "8" ]; then echo "Prepare models with python > 3.8."; exit 1 ; fi

@@ -217,3 +218,14 @@ if [ ! -f "$1/$DEVSTRAL_MODEL/$TOKENIZER_FILE" ]; then
echo "[ERROR] Models file $1/$DEVSTRAL_MODEL/$TOKENIZER_FILE does not exist."
exit 1
fi
+
+if [ -f "$1/$GEMMA4_MODEL/$TOKENIZER_FILE" ]; then
+echo "Models file $1/$GEMMA4_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
+else
+mkdir -p $1/$GEMMA4_MODEL
+convert_tokenizer $GEMMA4_MODEL --with_detokenizer -o $1/$GEMMA4_MODEL
+fi
+if [ ! -f "$1/$GEMMA4_MODEL/$TOKENIZER_FILE" ]; then
+echo "[ERROR] Models file $1/$GEMMA4_MODEL/$TOKENIZER_FILE does not exist."
+exit 1
+fi
spelling-whitelist.txt (3 changes: 2 additions & 1 deletion)
@@ -27,4 +27,5 @@ release_files/thirdparty-licenses/libgt2.LICENSE.txt:1083: publically ==> public
src/test/llm/output_parsers/qwen3coder_output_parser_test.cpp
demos/vlm_npu/README.md:157: mane ==> main, many, maine
demos/vlm_npu/README.md:218: mane ==> main, many, maine
-demos/integration_with_OpenWebUI/README.md:423: Buildin ==> Building, Build in
+src/test/llm/output_parsers/gemma4_output_parser_test.cpp
+demos/integration_with_OpenWebUI/README.md:423: Buildin ==> Building, Build in
src/llm/BUILD (24 changes: 24 additions & 0 deletions)
@@ -143,6 +143,12 @@ ovms_cc_library(
name = "io_processing_utils",
hdrs = ["io_processing/utils.hpp"],
srcs = ["io_processing/utils.cpp"],
+deps = [
+"@com_github_tencent_rapidjson//:rapidjson",
+"//src/port:rapidjson_stringbuffer",
+"//src/port:rapidjson_writer",
+"//src/port:rapidjson_document",
+],
visibility = ["//visibility:public"],
)

@@ -175,6 +181,23 @@ ovms_cc_library(
],
visibility = ["//visibility:public"],
)
+
+ovms_cc_library(
+name = "io_processing_gemma4_tool_parser",
+hdrs = ["io_processing/gemma4/tool_parser.hpp"],
+srcs = ["io_processing/gemma4/tool_parser.cpp"],
+deps = [
+"@com_github_tencent_rapidjson//:rapidjson",
+"//src/port:rapidjson_document",
+"//src:libovmslogging",
+"//src:libovmsstring_utils",
+":io_processing_utils",
+":io_processing_base_output_parser",
+"//third_party:genai",
+],
+visibility = ["//visibility:public"],
+)
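The deps above hint at the shape of the new parser: a class alongside the other tool parsers, deriving from the shared base output parser and using RapidJSON. A purely hypothetical sketch of what io_processing/gemma4/tool_parser.hpp might declare, assuming it mirrors the existing parsers; every name and signature below is an assumption, not taken from this PR:

// Hypothetical sketch only: class name, base-class interface, and include
// path are inferred from the BUILD deps, not copied from this PR.
#pragma once

#include <vector>

#include <rapidjson/document.h>

#include "src/llm/io_processing/base_output_parser.hpp"  // path assumed

namespace ovms {

// Assumed role: extract Gemma-style tool-call markup from generated tokens
// and fill the ParsedOutput structure shared by all output parsers.
class Gemma4ToolParser : public BaseOutputParser {
public:
    // Signature assumed for illustration; the real interface is whatever
    // base_output_parser.hpp defines.
    void parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens);
};

}  // namespace ovms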

ovms_cc_library( # TODO split further so we don't have to recompile everything when changing one parser ...
name = "output_parsers",
hdrs = [
@@ -210,6 +233,7 @@ ovms_cc_library( # TODO split further so we don't have to recompile everything w
":partial_json_builder",
":io_processing_base_output_parser",
":io_processing_qwen3coder_tool_parser",
+":io_processing_gemma4_tool_parser",
":io_processing_utils",
":apis_tool_schema_wrapper",
],
src/llm/apis/openai_api_handler.hpp (2 changes: 1 addition & 1 deletion)
@@ -164,7 +164,7 @@ class OpenAIApiHandler {
// Serialization - pure virtual, each handler produces its own response format
virtual std::string serializeUnaryResponse(const std::vector<ov::genai::GenerationOutput>& generationOutputs) = 0;
virtual std::string serializeUnaryResponse(ov::genai::EncodedResults& results) = 0;
-virtual std::string serializeUnaryResponse(ov::genai::VLMDecodedResults& results) = 0;
+virtual std::string serializeUnaryResponse(ov::genai::VLMDecodedResults& results, const std::string& textResponse) = 0;
virtual std::string serializeStreamingChunk(const std::string& chunkResponse, ov::genai::GenerationFinishReason finishReason) = 0;
virtual std::string serializeStreamingUsageChunk() = 0;
virtual std::string serializeStreamingHandshakeChunk() = 0;
src/llm/apis/openai_completions.cpp (9 changes: 4 additions & 5 deletions)
@@ -458,7 +458,7 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco
return jsonResponse.ToString();
}

-std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMDecodedResults& results) {
+std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMDecodedResults& results, const std::string& textResponse) {
OVMS_PROFILE_FUNCTION();
usage.promptTokens = results.perf_metrics.get_num_input_tokens();
usage.completionTokens = results.perf_metrics.get_num_generated_tokens();
@@ -470,13 +470,12 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD
jsonResponse.StartArray("choices");
int index = 0;

-for (int i = 0; i < results.texts.size(); i++) {
-const std::string& text = results.texts[i];
-SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated text: {}", text);
+if (!textResponse.empty()) {
+SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated text: {}", textResponse);

// Workaround to use OVMS unary parsers: get tokens from string
// This way we have detokenized text from GenAI and calculate tokens, to further convert back to text again, in parseOutputIfNeeded...
-auto generatedTokens = encodeTextToTokens(text);
+auto generatedTokens = encodeTextToTokens(textResponse);

SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated tokens: {}", generatedTokens);
ParsedOutput parsedOutput = parseOutputIfNeeded(generatedTokens);
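Taken together: the serializer no longer iterates over results.texts itself; the caller selects a single decoded text and passes it in, and an empty string now means there is no choice to emit. A minimal caller-side sketch of the new contract, assuming the calculator forwards the first candidate (the helper name and include path are illustrative, not from this PR):

#include <string>

#include "openvino/genai/visual_language/pipeline.hpp"  // ov::genai::VLMDecodedResults (path assumed)

// Illustrative sketch: derive the new textResponse argument from the VLM
// results and hand both to the handler's two-argument overload.
std::string serializeFirstCandidate(OpenAIApiHandler& handler,
                                    ov::genai::VLMDecodedResults& results) {
    const std::string textResponse =
        results.texts.empty() ? std::string{} : results.texts.front();
    return handler.serializeUnaryResponse(results, textResponse);
}

One behavioral nuance visible in the diff: the old loop emitted one choice per entry in results.texts, while the new code serializes at most one text per call.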
src/llm/apis/openai_completions.hpp (2 changes: 1 addition & 1 deletion)
@@ -39,7 +39,7 @@ class OpenAIChatCompletionsHandler : public OpenAIApiHandler {

std::string serializeUnaryResponse(const std::vector<ov::genai::GenerationOutput>& generationOutputs) override;
std::string serializeUnaryResponse(ov::genai::EncodedResults& results) override;
-std::string serializeUnaryResponse(ov::genai::VLMDecodedResults& results) override;
+std::string serializeUnaryResponse(ov::genai::VLMDecodedResults& results, const std::string& textResponse) override;
std::string serializeStreamingChunk(const std::string& chunkResponse, ov::genai::GenerationFinishReason finishReason) override;
std::string serializeStreamingUsageChunk() override;
std::string serializeStreamingHandshakeChunk() override;
src/llm/apis/openai_responses.cpp (8 changes: 4 additions & 4 deletions)
@@ -655,21 +655,21 @@ std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::EncodedRes
return serializeUnaryResponseImpl(parsedOutputs);
}

-std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::VLMDecodedResults& results) {
+std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::VLMDecodedResults& results, const std::string& textResponse) {
OVMS_PROFILE_FUNCTION();
usage.promptTokens = results.perf_metrics.get_num_input_tokens();
usage.completionTokens = results.perf_metrics.get_num_generated_tokens();
// Usage is already correctly set from perf_metrics above — no need for updateUsage.
std::vector<ParsedOutput> parsedOutputs;
-for (const std::string& text : results.texts) {
+if (!textResponse.empty()) {
if (outputParser != nullptr) {
// Same workaround as in chat completions
-auto generatedTokens = encodeTextToTokens(text);
+auto generatedTokens = encodeTextToTokens(textResponse);
parsedOutputs.push_back(parseOutputIfNeeded(generatedTokens));
} else {
// Fast path: no output parser, use decoded text directly.
ParsedOutput output;
-output.content = text;
+output.content = textResponse;
parsedOutputs.push_back(std::move(output));
}
}
src/llm/apis/openai_responses.hpp (2 changes: 1 addition & 1 deletion)
@@ -97,7 +97,7 @@ class OpenAIResponsesHandler : public OpenAIApiHandler {

std::string serializeUnaryResponse(const std::vector<ov::genai::GenerationOutput>& generationOutputs) override;
std::string serializeUnaryResponse(ov::genai::EncodedResults& results) override;
-std::string serializeUnaryResponse(ov::genai::VLMDecodedResults& results) override;
+std::string serializeUnaryResponse(ov::genai::VLMDecodedResults& results, const std::string& textResponse) override;
std::string serializeStreamingChunk(const std::string& chunkResponse, ov::genai::GenerationFinishReason finishReason) override;
std::string serializeStreamingUsageChunk() override;
std::string serializeStreamingHandshakeChunk() override;