Skip to content

Commit eec72a6

Browse files
authored
add rwkv world tokenizer (mlc-ai#14)
* support rwkv world tokenizer * refine * rename
1 parent 4f42c9f commit eec72a6

6 files changed

Lines changed: 168 additions & 0 deletions

File tree

CMakeLists.txt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
cmake_minimum_required(VERSION 3.18)
project(tokenizers_cpp C CXX)

# Build everything as C++17 with no compiler extensions.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Needed below to pull in third-party dependencies (msgpack).
include(FetchContent)

# update to contain more rust flags
set(TOKENIZERS_CPP_RUST_FLAGS "")
set(TOKENIZERS_CPP_CARGO_TARGET "")
@@ -71,6 +77,13 @@ endif ()
7177
get_filename_component(TOKENIZERS_CPP_ROOT ${CMAKE_CURRENT_LIST_FILE} DIRECTORY)
set(TOKENIZERS_CPP_CARGO_SOURCE_PATH ${TOKENIZERS_CPP_ROOT}/rust)

# Fetch msgpack-c (C++ flavour) — used by the RWKV world tokenizer to read
# its msgpack-serialized vocabulary file.  The option must be set before the
# dependency is made available; we do not want the Boost-based API.
option(MSGPACK_USE_BOOST "" OFF)
FetchContent_Declare(
  msgpack
  GIT_REPOSITORY https://github.com/msgpack/msgpack-c
  GIT_TAG cpp-6.1.0
)
FetchContent_MakeAvailable(msgpack)
7487

7588
if(MSVC)
7689
set(TOKENIZERS_RUST_LIB "${TOKENIZERS_CPP_CARGO_BINARY_DIR}/tokenizers_c.lib")
set(
  TOKENIZER_CPP_SRCS
  src/sentencepiece_tokenizer.cc
  src/huggingface_tokenizer.cc
  src/rwkv_world_tokenizer.cc
)
add_library(tokenizer_cpp_objs OBJECT ${TOKENIZER_CPP_SRCS})
target_include_directories(tokenizer_cpp_objs PRIVATE sentencepiece/src)
target_include_directories(tokenizer_cpp_objs PUBLIC ${TOKENIZERS_CPP_INCLUDE})
# msgpack is only referenced from src/rwkv_world_tokenizer.cc, so PRIVATE
# linkage is enough — it is not part of the public include surface.
target_link_libraries(tokenizer_cpp_objs PRIVATE msgpack-cxx)

# sentencepiece config
option(SPM_ENABLE_SHARED "override sentence piece config" OFF)

example/build_and_run.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ fi
1616
if [ ! -f "tokenizer.json" ]; then
1717
wget https://huggingface.co/togethercomputer/RedPajama-INCITE-Chat-3B-v1/resolve/main/tokenizer.json
1818
fi
19+
# Download and unpack the RWKV world tokenizer vocab on first run.
if ! test -f "tokenizer_model"; then
    wget https://github.com/BBuf/rwkv-world-tokenizer/releases/download/v1.0.0/tokenizer_model.zip
    unzip tokenizer_model.zip
fi
1923
cd ..
2024

2125
# run

example/example.cc

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,24 @@ void HuggingFaceTokenizerExample() {
7070
std::cout << "decode=\"" << decoded_prompt << "\"" << std::endl;
7171
}
7272

73+
// RWKV world tokenizer
74+
// - dist/tokenizer_model
75+
void RWKVWorldTokenizerExample() {
76+
auto tok = Tokenizer::FromBlobRWKVWorld("dist/tokenizer_model");
77+
std::string prompt = "What is the capital of Canada?";
78+
// call Encode to turn prompt into token ids
79+
std::vector<int> ids = tok->Encode(prompt);
80+
// call Decode to turn ids into string
81+
std::string decoded_prompt = tok->Decode(ids);
82+
83+
// print encoded result
84+
std::cout << "RWKV World tokenizer: " << std::endl;
85+
PrintEncodeResult(ids);
86+
std::cout << "decode=\"" << decoded_prompt << "\"" << std::endl;
87+
}
88+
7389
int main(int argc, char* argv[]) {
7490
SentencePieceTokenizerExample();
7591
HuggingFaceTokenizerExample();
92+
RWKVWorldTokenizerExample();
7693
}

include/rwkv_world_tokenizer.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/*!
 * Copyright (c) 2023 by Contributors daquexian
 * \file rwkv_world_tokenizer.h
 * \brief Declaration of the RWKV world tokenizer.
 */
// Fix: the header had no include guard, and used std::string_view without
// including <string_view> (relying on transitive includes is fragile).
#pragma once

#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>

namespace tokenizers {
/*!
 * \brief Tokenizer for RWKV "world" models.
 *
 * Loads a msgpack-serialized {id -> token bytes} vocabulary from disk and
 * encodes text by greedy prefix matching against that vocabulary.
 */
class RWKVWorldToolTokenizer {
 public:
  // \param path Filesystem path to the msgpack vocabulary file.
  // NOTE(review): consider marking this constructor `explicit`; left
  // implicit here to stay source-compatible with existing callers.
  RWKVWorldToolTokenizer(const std::string &path);
  // Encode text into token ids via greedy prefix matching.
  std::vector<int> encode(std::string_view str) const;
  // Decode a sequence of token ids back into a byte string.
  std::string decode(const std::vector<int> &ids) const;
  // Decode a single token id; unknown ids decode to "<unk>".
  std::string decode(int id) const;

 private:
  std::unordered_map<std::string, int> _word2idx;  // token bytes -> id
  std::unordered_map<int, std::string> _idx2word;  // id -> token bytes
};
}  // namespace tokenizers
24+

include/tokenizers_cpp.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,13 @@ class Tokenizer {
6666
* \return The created tokenizer.
6767
*/
6868
static std::unique_ptr<Tokenizer> FromBlobSentencePiece(const std::string& model_blob);
69+
/*!
70+
* \brief Create RWKVWorldTokenizer.
71+
*
72+
* \param model_blob The blob that contains vocabs.
73+
* \return The created tokenizer.
74+
*/
75+
static std::unique_ptr<Tokenizer> FromBlobRWKVWorld(const std::string& model_blob);
6976
};
7077

7178
} // namespace tokenizers

src/rwkv_world_tokenizer.cc

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
/*!
 * Copyright (c) 2023 by Contributors
 * \file rwkv_world_tokenizer.cc
 * \brief Implementation of the RWKV world tokenizer.
 */
6+
#include "rwkv_world_tokenizer.h"

#include <tokenizers_cpp.h>

#include <fstream>
#include <iostream>
#include <msgpack.hpp>
#include <stdexcept>
#include <string_view>
13+
14+
namespace tokenizers {
15+
16+
// Load the msgpack-serialized {id -> token bytes} vocabulary from `path`
// and build the reverse (bytes -> id) map used by encode().
//
// Fixes over the original: the file buffer was allocated with `new char[]`
// and never freed (leak), and open/read failures were silently ignored,
// which would have fed garbage to the msgpack unpacker.
RWKVWorldToolTokenizer::RWKVWorldToolTokenizer(const std::string &path) {
  std::ifstream infile(path, std::ios::binary | std::ios::in);
  if (!infile) {
    throw std::runtime_error("Failed to open RWKV tokenizer model: " + path);
  }
  // Read the whole file into an RAII-managed buffer.
  infile.seekg(0, std::ios::end);
  std::string data(static_cast<size_t>(infile.tellg()), '\0');
  infile.seekg(0, std::ios::beg);
  if (!infile.read(data.data(), static_cast<std::streamsize>(data.size()))) {
    throw std::runtime_error("Failed to read RWKV tokenizer model: " + path);
  }
  infile.close();

  // The model is a msgpack map of {token_id: token_bytes}.
  auto unpacker = msgpack::unpack(data.data(), data.size());
  auto obj = unpacker.get();
  _idx2word = obj.as<std::unordered_map<int, std::string>>();
  for (const auto &pair : _idx2word) {
    _word2idx[pair.second] = pair.first;
  }
}
33+
34+
std::vector<int> RWKVWorldToolTokenizer::encode(std::string_view str) const {
35+
std::vector<int> ids;
36+
int str_idx = 0;
37+
int word_len = 1;
38+
int id = 0;
39+
while (str_idx < str.size()) {
40+
if (str_idx + word_len > str.size()) {
41+
ids.push_back(id);
42+
break;
43+
}
44+
auto substr = str.substr(str_idx, word_len);
45+
auto it = _word2idx.find(std::string(substr));
46+
if (it == _word2idx.end()) {
47+
ids.push_back(id);
48+
str_idx += (word_len - 1);
49+
word_len = 1;
50+
} else {
51+
id = it->second;
52+
word_len++;
53+
}
54+
}
55+
return ids;
56+
}
57+
58+
std::string RWKVWorldToolTokenizer::decode(int id) const {
59+
auto it = _idx2word.find(id);
60+
if (it == _idx2word.end()) {
61+
return "<unk>";
62+
} else {
63+
return it->second;
64+
}
65+
}
66+
67+
std::string RWKVWorldToolTokenizer::decode(const std::vector<int> &ids) const {
68+
std::string str;
69+
for (auto id : ids) {
70+
str += decode(id);
71+
}
72+
return str;
73+
}
74+
75+
// Free-function factory for the low-level tokenizer.
RWKVWorldToolTokenizer createRWKVWorldToolTokenizer(const std::string &path) {
  return RWKVWorldToolTokenizer{path};
}
78+
79+
class RWKVWorldTokenizer : public Tokenizer {
80+
public:
81+
explicit RWKVWorldTokenizer(const std::string& model_blob) : rwkv_world_tokenizer_(model_blob) {
82+
}
83+
84+
std::vector<int32_t> Encode(const std::string& text) final {
85+
return rwkv_world_tokenizer_.encode(text);
86+
}
87+
88+
std::string Decode(const std::vector<int32_t>& ids) final {
89+
return rwkv_world_tokenizer_.decode(ids);
90+
}
91+
92+
private:
93+
// the tokenizer
94+
RWKVWorldToolTokenizer rwkv_world_tokenizer_;
95+
};
96+
97+
std::unique_ptr<Tokenizer> Tokenizer::FromBlobRWKVWorld(const std::string& model_blob) {
98+
return std::make_unique<RWKVWorldTokenizer>(model_blob);
99+
}
100+
101+
} // namespace tokenizers

0 commit comments

Comments
 (0)