@@ -107,7 +107,7 @@ impl TokenizerWrapper {
 #[no_mangle]
 extern "C" fn tokenizers_new_from_str(input_cstr: *const u8, len: usize) -> *mut TokenizerWrapper {
     unsafe {
-        let json = std::str::from_utf8(std::slice::from_raw_parts(input_cstr, len)).unwrap();
+        let json = &String::from_utf8_lossy(std::slice::from_raw_parts(input_cstr, len));
         return Box::into_raw(Box::new(TokenizerWrapper::from_str(json)));
     }
 }
@@ -123,14 +123,13 @@ extern "C" fn byte_level_bpe_tokenizers_new_from_str(
 ) -> *mut TokenizerWrapper {
     unsafe {
         let vocab =
-            std::str::from_utf8(std::slice::from_raw_parts(input_vocab_str, len_vocab)).unwrap();
+            &String::from_utf8_lossy(std::slice::from_raw_parts(input_vocab_str, len_vocab));
         let merges =
-            std::str::from_utf8(std::slice::from_raw_parts(input_merges_str, len_merges)).unwrap();
-        let added_tokens = std::str::from_utf8(std::slice::from_raw_parts(
+            &String::from_utf8_lossy(std::slice::from_raw_parts(input_merges_str, len_merges));
+        let added_tokens = &String::from_utf8_lossy(std::slice::from_raw_parts(
             input_added_tokens_str,
             len_added_tokens,
-        ))
-        .unwrap();
+        ));
         return Box::into_raw(Box::new(TokenizerWrapper::byte_level_bpe_from_str(
             vocab,
             merges,
@@ -261,7 +260,7 @@ extern "C" fn tokenizers_token_to_id(
     out_id: *mut i32,
 ) {
     unsafe {
-        let token: &str = std::str::from_utf8(std::slice::from_raw_parts(token, len)).unwrap();
+        let token: &str = &String::from_utf8_lossy(std::slice::from_raw_parts(token, len));
         let id = (*handle).tokenizer.token_to_id(token);
         *out_id = match id {
             Some(id) => id as i32,
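
For context on what the replacement changes (a standalone sketch, not part of this commit): `std::str::from_utf8(..).unwrap()` panics if the bytes handed across the C boundary are not valid UTF-8, whereas `String::from_utf8_lossy` substitutes U+FFFD for invalid sequences and continues, so malformed input can no longer abort the process inside the `extern "C"` call.

```rust
// Hypothetical illustration of the difference between the two conversions used above.
fn main() {
    // Valid UTF-8 followed by 0xFF, a byte that can never appear in UTF-8.
    let bytes: &[u8] = &[b'h', b'i', 0xFF];

    // Old approach: from_utf8 returns Err here, so `.unwrap()` would panic
    // inside the extern "C" function.
    assert!(std::str::from_utf8(bytes).is_err());

    // New approach: invalid sequences become U+FFFD and the conversion succeeds.
    let lossy = String::from_utf8_lossy(bytes);
    assert_eq!(lossy.as_ref(), "hi\u{FFFD}");
}
```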