From c6d0f8fb121413d440e4c09d9305f6bad43f0415 Mon Sep 17 00:00:00 2001
From: PARTH SHAH
Date: Wed, 11 Jun 2025 12:11:38 +0530
Subject: [PATCH 1/2] Add support for generating SHARED library with
 -DBUILD_SHARED_LIBS=ON flag

---
 CMakeLists.txt        |  9 +++++-
 src/static_exports.cc | 65 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 1 deletion(-)
 create mode 100644 src/static_exports.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ab3d6ca..28892e1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,8 @@
 cmake_minimum_required(VERSION 3.19...3.30)
 project(tokenizers_cpp C CXX)
 
+option(BUILD_SHARED_LIBS "Build tokenizers_cpp as a shared library" OFF)
+
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
@@ -157,7 +159,12 @@ set(
   src/huggingface_tokenizer.cc
   src/rwkv_world_tokenizer.cc
 )
-add_library(tokenizers_cpp STATIC ${TOKENIZER_CPP_SRCS})
+
+if(BUILD_SHARED_LIBS)
+  add_library(tokenizers_cpp SHARED ${TOKENIZER_CPP_SRCS} src/static_exports.cc)
+else()
+  add_library(tokenizers_cpp STATIC ${TOKENIZER_CPP_SRCS})
+endif()
 target_include_directories(tokenizers_cpp PRIVATE sentencepiece/src)
 target_include_directories(tokenizers_cpp PRIVATE msgpack/include)
 target_include_directories(tokenizers_cpp PUBLIC ${TOKENIZERS_CPP_INCLUDE})
diff --git a/src/static_exports.cc b/src/static_exports.cc
new file mode 100644
index 0000000..cdbdc0e
--- /dev/null
+++ b/src/static_exports.cc
@@ -0,0 +1,65 @@
+#include "tokenizers_cpp.h"
+
+// export LoadBlobJsonAndEncode(const std::string &, const std::string &, std::vector<int32_t> &)
+extern "C" __declspec(dllexport) void LoadBlobJsonAndEncode(const std::string& json_blob, const std::string& text, std::vector<int32_t>& token_ids)
+{
+    auto tokenizer = tokenizers::Tokenizer::FromBlobJSON(json_blob);
+    token_ids = tokenizer->Encode(text);
+    return;
+}
+
+// export LoadBlobJsonAndEncodeBatch(const std::string &, const std::vector<std::string> &, std::vector<std::vector<int32_t>> &)
+extern "C" __declspec(dllexport) void LoadBlobJsonAndEncodeBatch(const std::string& json_blob, const std::vector<std::string>& texts, std::vector<std::vector<int32_t>>& token_ids_batch)
+{
+    auto tokenizer = tokenizers::Tokenizer::FromBlobJSON(json_blob);
+    token_ids_batch = tokenizer->EncodeBatch(texts);
+    return;
+}
+
+// export LoadBlobSentencePieceAndEncode(const std::string &, const std::string &, std::vector<int32_t> &)
+extern "C" __declspec(dllexport) void LoadBlobSentencePieceAndEncode(const std::string& model_blob, const std::string& text, std::vector<int32_t>& token_ids)
+{
+    auto tokenizer = tokenizers::Tokenizer::FromBlobSentencePiece(model_blob);
+    token_ids = tokenizer->Encode(text);
+    return;
+}
+
+// export LoadBlobSentencePieceAndEncodeBatch(const std::string &, const std::vector<std::string> &, std::vector<std::vector<int32_t>> &)
+extern "C" __declspec(dllexport) void LoadBlobSentencePieceAndEncodeBatch(const std::string& model_blob, const std::vector<std::string>& texts, std::vector<std::vector<int32_t>>& token_ids_batch)
+{
+    auto tokenizer = tokenizers::Tokenizer::FromBlobSentencePiece(model_blob);
+    token_ids_batch = tokenizer->EncodeBatch(texts);
+    return;
+}
+
+// export LoadBlobRWKVWorldAndEncode(const std::string &, const std::string &, std::vector<int32_t> &)
+extern "C" __declspec(dllexport) void LoadBlobRWKVWorldAndEncode(const std::string& model_blob, const std::string& text, std::vector<int32_t>& token_ids)
+{
+    auto tokenizer = tokenizers::Tokenizer::FromBlobRWKVWorld(model_blob);
+    token_ids = tokenizer->Encode(text);
+    return;
+}
+
+// export LoadBlobRWKVWorldAndEncodeBatch(const std::string &, const std::vector<std::string> &, std::vector<std::vector<int32_t>> &)
+extern "C" __declspec(dllexport) void LoadBlobRWKVWorldAndEncodeBatch(const std::string& model_blob, const std::vector<std::string>& texts, std::vector<std::vector<int32_t>>& token_ids_batch)
+{
+    auto tokenizer = tokenizers::Tokenizer::FromBlobRWKVWorld(model_blob);
+    token_ids_batch = tokenizer->EncodeBatch(texts);
+    return;
+}
+
+// export LoadBlobByteLevelBPEAndEncode(const std::string &, const std::string &, const std::string &, const std::string &, std::vector<int32_t> &)
+extern "C" __declspec(dllexport) void LoadBlobByteLevelBPEAndEncode(const std::string& vocab_blob, const std::string& merges_blob, const std::string& added_tokens, const std::string& text, std::vector<int32_t>& token_ids)
+{
+    auto tokenizer = tokenizers::Tokenizer::FromBlobByteLevelBPE(vocab_blob, merges_blob, added_tokens);
+    token_ids = tokenizer->Encode(text);
+    return;
+}
+
+// export LoadBlobByteLevelBPEAndEncodeBatch(const std::string &, const std::string &, const std::string &, const std::vector<std::string> &, std::vector<std::vector<int32_t>> &)
+extern "C" __declspec(dllexport) void LoadBlobByteLevelBPEAndEncodeBatch(const std::string& vocab_blob, const std::string& merges_blob, const std::string& added_tokens, const std::vector<std::string>& texts, std::vector<std::vector<int32_t>>& token_ids_batch)
+{
+    auto tokenizer = tokenizers::Tokenizer::FromBlobByteLevelBPE(vocab_blob, merges_blob, added_tokens);
+    token_ids_batch = tokenizer->EncodeBatch(texts);
+    return;
+}
\ No newline at end of file

From c424a483effa15b58dcc1a4f823ca6d1b9666b1d Mon Sep 17 00:00:00 2001
From: PARTH SHAH
Date: Wed, 11 Jun 2025 12:17:06 +0530
Subject: [PATCH 2/2] Update Readme

---
 README.md | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/README.md b/README.md
index ef682d3..6e93404 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,29 @@ You also need to turn on `c++17` support.
 
 See [example](example) folder for an example CMake project.
 
+### Build it as a shared library
+You can also build this library as a standalone shared library.
+
+1. Initialize all submodules:
+   ```sh
+   git submodule update --init --recursive
+   ```
+2. Create and enter the build directory:
+   ```sh
+   mkdir -p build
+   cd build
+   ```
+3. Generate the CMake configuration:
+   ```sh
+   cmake .. -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON
+   ```
+4. Build the project:
+   ```sh
+   cmake --build . --config Release
+   ```
+
+When using the shared library, you can access the exported methods defined in `src/static_exports.cc`.
+
 ### Example Code
 
 ```c++