Skip to content

Commit c290994

Browse files
authored
Fix for loading tokenizers using non-utf8 strings (mlc-ai#55)
* Replaced std::str::from_utf8() with String::from_utf8_lossy() to fix an error on non-UTF-8 data when BPE is used.
* Fixed an invalid type.
1 parent 0621f84 commit c290994

File tree

1 file changed

+6
-7
lines changed

1 file changed

+6
-7
lines changed

rust/src/lib.rs

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ impl TokenizerWrapper {
107107
#[no_mangle]
108108
extern "C" fn tokenizers_new_from_str(input_cstr: *const u8, len: usize) -> *mut TokenizerWrapper {
109109
unsafe {
110-
let json = std::str::from_utf8(std::slice::from_raw_parts(input_cstr, len)).unwrap();
110+
let json = &String::from_utf8_lossy(std::slice::from_raw_parts(input_cstr, len));
111111
return Box::into_raw(Box::new(TokenizerWrapper::from_str(json)));
112112
}
113113
}
@@ -123,14 +123,13 @@ extern "C" fn byte_level_bpe_tokenizers_new_from_str(
123123
) -> *mut TokenizerWrapper {
124124
unsafe {
125125
let vocab =
126-
std::str::from_utf8(std::slice::from_raw_parts(input_vocab_str, len_vocab)).unwrap();
126+
&String::from_utf8_lossy(std::slice::from_raw_parts(input_vocab_str, len_vocab));
127127
let merges =
128-
std::str::from_utf8(std::slice::from_raw_parts(input_merges_str, len_merges)).unwrap();
129-
let added_tokens = std::str::from_utf8(std::slice::from_raw_parts(
128+
&String::from_utf8_lossy(std::slice::from_raw_parts(input_merges_str, len_merges));
129+
let added_tokens = &String::from_utf8_lossy(std::slice::from_raw_parts(
130130
input_added_tokens_str,
131131
len_added_tokens,
132-
))
133-
.unwrap();
132+
));
134133
return Box::into_raw(Box::new(TokenizerWrapper::byte_level_bpe_from_str(
135134
vocab,
136135
merges,
@@ -261,7 +260,7 @@ extern "C" fn tokenizers_token_to_id(
261260
out_id: *mut i32,
262261
) {
263262
unsafe {
264-
let token: &str = std::str::from_utf8(std::slice::from_raw_parts(token, len)).unwrap();
263+
let token: &str = &String::from_utf8_lossy(std::slice::from_raw_parts(token, len));
265264
let id = (*handle).tokenizer.token_to_id(token);
266265
*out_id = match id {
267266
Some(id) => id as i32,

0 commit comments

Comments
 (0)