
Commit 35bad0e

[HFTokenizer] Allow use of "add_special_tokens" for HFTokenizer.Encode and "skip_special_tokens" for HFTokenizer.Decode methods. (mlc-ai#27)
1 parent 7466de5 commit 35bad0e

File tree

1 file changed: +13 / -6 lines changed

src/huggingface_tokenizer.cc

Lines changed: 13 additions & 6 deletions
@@ -27,9 +27,8 @@ class HFTokenizer : public Tokenizer {
   }
 
   // use i32 to be consistent with sentencepiece
-  std::vector<int32_t> Encode(const std::string& text) final {
-    bool add_special_token = false;
-    tokenizers_encode(handle_, text.data(), text.length(), static_cast<int>(add_special_token));
+  std::vector<int32_t> Encode(const std::string& text, bool add_special_tokens) {
+    tokenizers_encode(handle_, text.data(), text.length(), static_cast<int>(add_special_tokens));
     const uint32_t* data;
     size_t len;
     tokenizers_get_encode_ids(handle_, &data, &len);
@@ -39,16 +38,24 @@ class HFTokenizer : public Tokenizer {
   }
 
   // use i32 to be consistent with sentencepiece
-  std::string Decode(const std::vector<int32_t>& ids) final {
-    bool skip_special_token = false;
+  std::vector<int32_t> Encode(const std::string& text) final {
+    return Encode(text, false);
+  }
+
+  // use i32 to be consistent with sentencepiece
+  std::string Decode(const std::vector<int32_t>& ids, bool skip_special_tokens) {
     tokenizers_decode(handle_, reinterpret_cast<const uint32_t*>(ids.data()), ids.size(),
-                      static_cast<int>(skip_special_token));
+                      static_cast<int>(skip_special_tokens));
     const char* data;
     size_t len;
     tokenizers_get_decode_str(handle_, &data, &len);
     return std::string(data, len);
   }
 
+  std::string Decode(const std::vector<int32_t>& ids) final {
+    return Decode(ids, false);
+  }
+
   size_t GetVocabSize() final {
     size_t size;
     tokenizers_get_vocab_size(handle_, &size);
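The change keeps the single-argument Encode/Decode overrides (marked final) and routes them through new flag-taking overloads that default both flags to false, so existing callers keep the pre-commit behavior. Below is a minimal standalone sketch of that delegation pattern; the ToyTokenizer class and its byte-per-character token mapping are invented for illustration and are not the library's actual implementation:

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Base interface: one-argument Encode/Decode, as in the Tokenizer class above.
class Tokenizer {
 public:
  virtual ~Tokenizer() = default;
  virtual std::vector<int32_t> Encode(const std::string& text) = 0;
  virtual std::string Decode(const std::vector<int32_t>& ids) = 0;
};

// Derived class adds flag-taking overloads; the virtual overrides delegate
// with the flags set to false, mirroring the pattern in the diff.
class ToyTokenizer : public Tokenizer {
 public:
  std::vector<int32_t> Encode(const std::string& text, bool add_special_tokens) {
    std::vector<int32_t> ids;
    if (add_special_tokens) ids.push_back(kBOS);                   // prepend a BOS marker
    for (char c : text) ids.push_back(static_cast<int32_t>(c));    // toy: one id per byte
    if (add_special_tokens) ids.push_back(kEOS);                   // append an EOS marker
    return ids;
  }

  std::vector<int32_t> Encode(const std::string& text) final { return Encode(text, false); }

  std::string Decode(const std::vector<int32_t>& ids, bool skip_special_tokens) {
    std::string out;
    for (int32_t id : ids) {
      if (skip_special_tokens && (id == kBOS || id == kEOS)) continue;  // drop markers on request
      out.push_back(static_cast<char>(id));
    }
    return out;
  }

  std::string Decode(const std::vector<int32_t>& ids) final { return Decode(ids, false); }

 private:
  static constexpr int32_t kBOS = 1;
  static constexpr int32_t kEOS = 2;
};

int main() {
  ToyTokenizer tok;
  std::vector<int32_t> ids = tok.Encode("hi", /*add_special_tokens=*/true);
  std::cout << tok.Decode(ids, /*skip_special_tokens=*/true) << "\n";  // prints "hi"
  return 0;
}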
