@@ -27,9 +27,8 @@ class HFTokenizer : public Tokenizer {
2727 }
2828
2929 // use i32 to be consistent with sentencepiece
30- std::vector<int32_t > Encode (const std::string& text) final {
31- bool add_special_token = false ;
32- tokenizers_encode (handle_, text.data (), text.length (), static_cast <int >(add_special_token));
30+ std::vector<int32_t > Encode (const std::string& text, bool add_special_tokens) {
31+ tokenizers_encode (handle_, text.data (), text.length (), static_cast <int >(add_special_tokens));
3332 const uint32_t * data;
3433 size_t len;
3534 tokenizers_get_encode_ids (handle_, &data, &len);
@@ -39,16 +38,24 @@ class HFTokenizer : public Tokenizer {
3938 }
4039
4140 // use i32 to be consistent with sentencepiece
42- std::string Decode (const std::vector<int32_t >& ids) final {
43- bool skip_special_token = false ;
41+ std::vector<int32_t > Encode (const std::string& text) final {
42+ return Encode (text, false );
43+ }
44+
45+ // use i32 to be consistent with sentencepiece
46+ std::string Decode (const std::vector<int32_t >& ids, bool skip_special_tokens) {
4447 tokenizers_decode (handle_, reinterpret_cast <const uint32_t *>(ids.data ()), ids.size (),
45- static_cast <int >(skip_special_token ));
48+ static_cast <int >(skip_special_tokens ));
4649 const char * data;
4750 size_t len;
4851 tokenizers_get_decode_str (handle_, &data, &len);
4952 return std::string (data, len);
5053 }
5154
55+ std::string Decode (const std::vector<int32_t >& ids) final {
56+ return Decode (ids, false );
57+ }
58+
5259 size_t GetVocabSize () final {
5360 size_t size;
5461 tokenizers_get_vocab_size (handle_, &size);
0 commit comments