diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..fe2db73b --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,20 @@ +{ + "git.ignoreLimitWarning": true, + "files.eol": "\n", + "editor.formatOnSave": true, + "files.exclude": { + "tmp/**": true + }, + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter" + }, + "[markdown]": { + "editor.formatOnSave": true, + "editor.formatOnPaste": true + }, + "markdownlint.config": { + "MD013": false, + "MD024": false + }, + "cSpell.diagnosticLevel": "Hint", +} diff --git a/Cargo.toml b/Cargo.toml index f862dcef..3ef3041a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,18 +1,30 @@ [package] -name = "encoding_rs" +name = "encoding_rs2" description = "A Gecko-oriented implementation of the Encoding Standard" -version = "0.8.35" -edition = '2018' +version = "0.8.36" +edition = '2024' authors = ["Henri Sivonen "] license = "(Apache-2.0 OR MIT) AND BSD-3-Clause" -include = ["src/**/*.rs", "/data", "Cargo.toml", "COPYRIGHT", "LICENSE*", "README.md"] +include = [ + "src/**/*.rs", + "/data", + "Cargo.toml", + "COPYRIGHT", + "LICENSE*", + "README.md", +] readme = "README.md" documentation = "https://docs.rs/encoding_rs/" homepage = "https://docs.rs/encoding_rs/" -repository = "https://github.com/hsivonen/encoding_rs" +repository = "https://github.com/brmmm3/encoding_rs" keywords = ["encoding", "web", "unicode", "charset"] -categories = ["text-processing", "encoding", "web-programming", "internationalization"] -rust-version = "1.40" +categories = [ + "text-processing", + "encoding", + "web-programming", + "internationalization", +] +rust-version = "1.86" [features] default = ["alloc"] @@ -26,20 +38,22 @@ fast-hanja-encode = [] fast-kanji-encode = [] fast-gb-hanzi-encode = [] fast-big5-hanzi-encode = [] -fast-legacy-encode = ["fast-hangul-encode", - "fast-hanja-encode", - "fast-kanji-encode", - "fast-gb-hanzi-encode", - "fast-big5-hanzi-encode"] +fast-legacy-encode = [ + "fast-hangul-encode", + "fast-hanja-encode", + "fast-kanji-encode", + "fast-gb-hanzi-encode", + "fast-big5-hanzi-encode", +] [dependencies] cfg-if = "1.0" serde = { version = "1.0", optional = true } -any_all_workaround = { version = "0.1.0" , optional = true } +any_all_workaround = { version = "0.1.0", optional = true } [dev-dependencies] serde_derive = "1.0" -bincode = "1.0" +bincode = "2.0" serde_json = "1.0" [profile.release] diff --git a/README.md b/README.md index 16c0f060..4b54d8da 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ [![crates.io](https://img.shields.io/crates/v/encoding_rs.svg)](https://crates.io/crates/encoding_rs) [![docs.rs](https://docs.rs/encoding_rs/badge.svg)](https://docs.rs/encoding_rs/) +This is a fork from hsivonen/encoding_rs with some updates. + encoding_rs an implementation of the (non-JavaScript parts of) the [Encoding Standard](https://encoding.spec.whatwg.org/) written in Rust. @@ -53,7 +55,7 @@ Specifically, encoding_rs does the following: workloads than the standard library; hopefully will get upstreamed some day) and ASCII. -Additionally, `encoding_rs::mem` does the following: +Additionally, `encoding_rs2::mem` does the following: * Checks if a byte buffer contains only ASCII. * Checks if a potentially-invalid UTF-16 buffer contains only Basic Latin (ASCII). diff --git a/fuzz/fuzzers/fuzz_encodings.rs b/fuzz/fuzzers/fuzz_encodings.rs index 925547e6..e90e65fe 100644 --- a/fuzz/fuzzers/fuzz_encodings.rs +++ b/fuzz/fuzzers/fuzz_encodings.rs @@ -12,48 +12,50 @@ extern crate libfuzzer_sys; extern crate encoding_rs; -use encoding_rs::*; +use encoding_rs2::*; // Doesn't included ISO-8859-8-I. -static ENCODINGS: [&'static Encoding; 39] = [&UTF_8_INIT, - &REPLACEMENT_INIT, - &GBK_INIT, - &BIG5_INIT, - &EUC_JP_INIT, - &GB18030_INIT, - &UTF_16BE_INIT, - &UTF_16LE_INIT, - &SHIFT_JIS_INIT, - &EUC_KR_INIT, - &ISO_2022_JP_INIT, - &X_USER_DEFINED_INIT, - &WINDOWS_1250_INIT, - &WINDOWS_1251_INIT, - &WINDOWS_1252_INIT, - &WINDOWS_1253_INIT, - &WINDOWS_1254_INIT, - &WINDOWS_1255_INIT, - &WINDOWS_1256_INIT, - &WINDOWS_1257_INIT, - &WINDOWS_1258_INIT, - &KOI8_U_INIT, - &MACINTOSH_INIT, - &IBM866_INIT, - &KOI8_R_INIT, - &ISO_8859_2_INIT, - &ISO_8859_3_INIT, - &ISO_8859_4_INIT, - &ISO_8859_5_INIT, - &ISO_8859_6_INIT, - &ISO_8859_7_INIT, - &ISO_8859_10_INIT, - &ISO_8859_13_INIT, - &ISO_8859_14_INIT, - &WINDOWS_874_INIT, - &ISO_8859_15_INIT, - &ISO_8859_16_INIT, - &ISO_8859_8_I_INIT, - &X_MAC_CYRILLIC_INIT]; +static ENCODINGS: [&'static Encoding; 39] = [ + &UTF_8_INIT, + &REPLACEMENT_INIT, + &GBK_INIT, + &BIG5_INIT, + &EUC_JP_INIT, + &GB18030_INIT, + &UTF_16BE_INIT, + &UTF_16LE_INIT, + &SHIFT_JIS_INIT, + &EUC_KR_INIT, + &ISO_2022_JP_INIT, + &X_USER_DEFINED_INIT, + &WINDOWS_1250_INIT, + &WINDOWS_1251_INIT, + &WINDOWS_1252_INIT, + &WINDOWS_1253_INIT, + &WINDOWS_1254_INIT, + &WINDOWS_1255_INIT, + &WINDOWS_1256_INIT, + &WINDOWS_1257_INIT, + &WINDOWS_1258_INIT, + &KOI8_U_INIT, + &MACINTOSH_INIT, + &IBM866_INIT, + &KOI8_R_INIT, + &ISO_8859_2_INIT, + &ISO_8859_3_INIT, + &ISO_8859_4_INIT, + &ISO_8859_5_INIT, + &ISO_8859_6_INIT, + &ISO_8859_7_INIT, + &ISO_8859_10_INIT, + &ISO_8859_13_INIT, + &ISO_8859_14_INIT, + &WINDOWS_874_INIT, + &ISO_8859_15_INIT, + &ISO_8859_16_INIT, + &ISO_8859_8_I_INIT, + &X_MAC_CYRILLIC_INIT, +]; fn check_utf8(data: &[u8]) { if let Err(_) = ::std::str::from_utf8(data) { @@ -146,8 +148,7 @@ fn encode_from_utf8(encoding: &'static Encoding, data: &[u8]) { } else { let mut total_read = 0; loop { - if let Some(needed) = encoder - .max_buffer_length_from_utf8_if_no_unmappables( + if let Some(needed) = encoder.max_buffer_length_from_utf8_if_no_unmappables( string.len() - total_read, ) { dst.resize(needed, 0); @@ -164,9 +165,9 @@ fn encode_from_utf8(encoding: &'static Encoding, data: &[u8]) { } let mut total_read = 0; loop { - if let Some(needed) = encoder.max_buffer_length_from_utf8_if_no_unmappables( - string.len() - total_read, - ) { + if let Some(needed) = + encoder.max_buffer_length_from_utf8_if_no_unmappables(string.len() - total_read) + { dst.resize(needed, 0); let (result, read, _, _) = encoder.encode_from_utf8(&string[total_read..], &mut dst, false); @@ -206,7 +207,8 @@ fn encode_from_utf8_without_replacement(encoding: &'static Encoding, data: &[u8] string.push(c); } else { if let Some(needed) = - encoder.max_buffer_length_from_utf8_without_replacement(string.len()) { + encoder.max_buffer_length_from_utf8_without_replacement(string.len()) + { dst.resize(needed, 0); let (result, _, _) = encoder.encode_from_utf8_without_replacement(&string, &mut dst, true); @@ -216,7 +218,8 @@ fn encode_from_utf8_without_replacement(encoding: &'static Encoding, data: &[u8] } } if let Some(needed) = - encoder.max_buffer_length_from_utf8_without_replacement(string.len()) { + encoder.max_buffer_length_from_utf8_without_replacement(string.len()) + { dst.resize(needed, 0); let (result, _, _) = encoder.encode_from_utf8_without_replacement(&string, &mut dst, false); @@ -260,7 +263,8 @@ fn encode_from_utf16(encoding: &'static Encoding, data: &[u8]) { let mut total_read = 0; loop { if let Some(needed) = - encoder.max_buffer_length_from_utf16_if_no_unmappables(chunk.len() - total_read) { + encoder.max_buffer_length_from_utf16_if_no_unmappables(chunk.len() - total_read) + { dst.resize(needed, 0); let (result, read, _, _) = encoder.encode_from_utf16(&chunk[total_read..], &mut dst, last); @@ -301,11 +305,11 @@ fn encode_from_utf16_without_replacement(encoding: &'static Encoding, data: &[u8 let new_offset = offset + chunk_size; let chunk = &s[offset..new_offset]; offset = new_offset; - if let Some(needed) = encoder - .max_buffer_length_from_utf16_without_replacement(chunk.len()) { + if let Some(needed) = encoder.max_buffer_length_from_utf16_without_replacement(chunk.len()) + { dst.resize(needed, 0); - let (result, _, _) = encoder - .encode_from_utf16_without_replacement(&chunk, &mut dst, last); + let (result, _, _) = + encoder.encode_from_utf16_without_replacement(&chunk, &mut dst, last); match result { EncoderResult::InputEmpty => { if last { @@ -524,16 +528,14 @@ fn dispatch_test(encoding: &'static Encoding, data: &[u8]) { } } -fuzz_target!( - |data: &[u8]| { - if let Some(first) = data.first() { - let index = *first as usize; - if index >= ENCODINGS.len() { - return; - } - let encoding = ENCODINGS[index]; - dispatch_test(encoding, &data[1..]); +fuzz_target!(|data: &[u8]| { + if let Some(first) = data.first() { + let index = *first as usize; + if index >= ENCODINGS.len() { + return; } - // Comment to make rustfmt not introduce a compilation error + let encoding = ENCODINGS[index]; + dispatch_test(encoding, &data[1..]); } -); + // Comment to make rustfmt not introduce a compilation error +}); diff --git a/fuzz/fuzzers/fuzz_labels.rs b/fuzz/fuzzers/fuzz_labels.rs index 2353225d..ed35d1b8 100644 --- a/fuzz/fuzzers/fuzz_labels.rs +++ b/fuzz/fuzzers/fuzz_labels.rs @@ -1,7 +1,8 @@ #![no_main] -#[macro_use] extern crate libfuzzer_sys; +#[macro_use] +extern crate libfuzzer_sys; extern crate encoding_rs; -use encoding_rs::*; +use encoding_rs2::*; fuzz_target!(|data: &[u8]| { Encoding::for_label(data); diff --git a/fuzz/fuzzers/fuzz_mem.rs b/fuzz/fuzzers/fuzz_mem.rs index 8d45d596..09665771 100644 --- a/fuzz/fuzzers/fuzz_mem.rs +++ b/fuzz/fuzzers/fuzz_mem.rs @@ -74,32 +74,47 @@ fn string_with_len(len: usize) -> String { } fn fuzz_is_ascii(data: &[u8]) { - assert_eq!(encoding_rs::mem::is_ascii(data), safe_encoding_rs_mem::is_ascii(data)); + assert_eq!( + encoding_rs2::mem::is_ascii(data), + safe_encoding_rs_mem::is_ascii(data) + ); } fn fuzz_is_basic_latin(data: &[u16]) { - assert_eq!(encoding_rs::mem::is_basic_latin(data), safe_encoding_rs_mem::is_basic_latin(data)); + assert_eq!( + encoding_rs2::mem::is_basic_latin(data), + safe_encoding_rs_mem::is_basic_latin(data) + ); } fn fuzz_is_utf8_latin1(data: &[u8]) { - assert_eq!(encoding_rs::mem::is_utf8_latin1(data), safe_encoding_rs_mem::is_utf8_latin1(data)); + assert_eq!( + encoding_rs2::mem::is_utf8_latin1(data), + safe_encoding_rs_mem::is_utf8_latin1(data) + ); } fn fuzz_is_str_latin1(data: &[u8]) { if let Ok(s) = std::str::from_utf8(data) { - assert_eq!(encoding_rs::mem::is_str_latin1(s), safe_encoding_rs_mem::is_str_latin1(s)); + assert_eq!( + encoding_rs2::mem::is_str_latin1(s), + safe_encoding_rs_mem::is_str_latin1(s) + ); } } fn fuzz_is_utf16_latin1(data: &[u16]) { - assert_eq!(encoding_rs::mem::is_utf16_latin1(data), safe_encoding_rs_mem::is_utf16_latin1(data)); + assert_eq!( + encoding_rs2::mem::is_utf16_latin1(data), + safe_encoding_rs_mem::is_utf16_latin1(data) + ); } fn fuzz_convert_utf8_to_utf16(data: &[u8]) { let needed = data.len() + 1; let mut dst = vec_with_len::(needed); let mut safe_dst = vec_with_len::(needed); - let len = encoding_rs::mem::convert_utf8_to_utf16(data, &mut dst[..]); + let len = encoding_rs2::mem::convert_utf8_to_utf16(data, &mut dst[..]); let safe_len = safe_encoding_rs_mem::convert_utf8_to_utf16(data, &mut safe_dst[..]); dst.truncate(len); safe_dst.truncate(safe_len); @@ -113,7 +128,7 @@ fn fuzz_convert_str_to_utf16(data: &[u8]) { let needed = s.len(); let mut dst = vec_with_len::(needed); let mut safe_dst = vec_with_len::(needed); - let len = encoding_rs::mem::convert_str_to_utf16(s, &mut dst[..]); + let len = encoding_rs2::mem::convert_str_to_utf16(s, &mut dst[..]); let safe_len = safe_encoding_rs_mem::convert_str_to_utf16(s, &mut safe_dst[..]); dst.truncate(len); safe_dst.truncate(safe_len); @@ -127,7 +142,7 @@ fn fuzz_convert_utf16_to_utf8(data: &[u16]) { let needed = data.len() * 3; let mut dst = vec_with_len::(needed); let mut safe_dst = vec_with_len::(needed); - let len = encoding_rs::mem::convert_utf16_to_utf8(data, &mut dst[..]); + let len = encoding_rs2::mem::convert_utf16_to_utf8(data, &mut dst[..]); let safe_len = safe_encoding_rs_mem::convert_utf16_to_utf8(data, &mut safe_dst[..]); dst.truncate(len); safe_dst.truncate(safe_len); @@ -140,7 +155,7 @@ fn fuzz_convert_utf16_to_str(data: &[u16]) { let needed = data.len() * 3; let mut dst = string_with_len(needed); let mut safe_dst = string_with_len(needed); - let len = encoding_rs::mem::convert_utf16_to_str(data, &mut dst[..]); + let len = encoding_rs2::mem::convert_utf16_to_str(data, &mut dst[..]); let safe_len = safe_encoding_rs_mem::convert_utf16_to_str(data, &mut safe_dst[..]); check_utf8(dst.as_bytes()); check_utf8(safe_dst.as_bytes()); @@ -154,7 +169,7 @@ fn fuzz_convert_latin1_to_utf16(data: &[u8]) { let needed = data.len(); let mut dst = vec_with_len::(needed); let mut safe_dst = vec_with_len::(needed); - encoding_rs::mem::convert_latin1_to_utf16(data, &mut dst[..]); + encoding_rs2::mem::convert_latin1_to_utf16(data, &mut dst[..]); safe_encoding_rs_mem::convert_latin1_to_utf16(data, &mut safe_dst[..]); assert_eq!(dst, safe_dst); check_utf16(&dst[..]); @@ -164,7 +179,7 @@ fn fuzz_convert_latin1_to_utf8(data: &[u8]) { let needed = data.len() * 2; let mut dst = vec_with_len::(needed); let mut safe_dst = vec_with_len::(needed); - let len = encoding_rs::mem::convert_latin1_to_utf8(data, &mut dst[..]); + let len = encoding_rs2::mem::convert_latin1_to_utf8(data, &mut dst[..]); let safe_len = safe_encoding_rs_mem::convert_latin1_to_utf8(data, &mut safe_dst[..]); dst.truncate(len); safe_dst.truncate(safe_len); @@ -177,7 +192,7 @@ fn fuzz_convert_latin1_to_str(data: &[u8]) { let needed = data.len() * 2; let mut dst = string_with_len(needed); let mut safe_dst = string_with_len(needed); - let len = encoding_rs::mem::convert_latin1_to_str(data, &mut dst[..]); + let len = encoding_rs2::mem::convert_latin1_to_str(data, &mut dst[..]); let safe_len = safe_encoding_rs_mem::convert_latin1_to_str(data, &mut safe_dst[..]); check_utf8(dst.as_bytes()); check_utf8(safe_dst.as_bytes()); @@ -191,7 +206,7 @@ fn fuzz_convert_utf8_to_latin1_lossy(data: &[u8]) { let needed = data.len(); let mut dst = vec_with_len::(needed); let mut safe_dst = vec_with_len::(needed); - let len = encoding_rs::mem::convert_utf8_to_latin1_lossy(data, &mut dst[..]); + let len = encoding_rs2::mem::convert_utf8_to_latin1_lossy(data, &mut dst[..]); let safe_len = safe_encoding_rs_mem::convert_utf8_to_latin1_lossy(data, &mut safe_dst[..]); if safe_encoding_rs_mem::is_utf8_latin1(data) { dst.truncate(len); @@ -205,7 +220,7 @@ fn fuzz_convert_utf16_to_latin1_lossy(data: &[u16]) { let needed = data.len(); let mut dst = vec_with_len::(needed); let mut safe_dst = vec_with_len::(needed); - encoding_rs::mem::convert_utf16_to_latin1_lossy(data, &mut dst[..]); + encoding_rs2::mem::convert_utf16_to_latin1_lossy(data, &mut dst[..]); safe_encoding_rs_mem::convert_utf16_to_latin1_lossy(data, &mut safe_dst[..]); if safe_encoding_rs_mem::is_utf16_latin1(data) { assert_eq!(dst, safe_dst); @@ -213,7 +228,7 @@ fn fuzz_convert_utf16_to_latin1_lossy(data: &[u16]) { } fn fuzz_utf16_valid_up_to(data: &[u16]) { - let up_to = encoding_rs::mem::utf16_valid_up_to(data); + let up_to = encoding_rs2::mem::utf16_valid_up_to(data); let safe_up_to = safe_encoding_rs_mem::utf16_valid_up_to(data); assert_eq!(up_to, safe_up_to); } @@ -224,7 +239,7 @@ fn fuzz_ensure_utf16_validity(data: &[u16]) { let mut safe_dst = vec_with_len::(needed); dst.copy_from_slice(data); safe_dst.copy_from_slice(data); - encoding_rs::mem::ensure_utf16_validity(&mut dst[..]); + encoding_rs2::mem::ensure_utf16_validity(&mut dst[..]); safe_encoding_rs_mem::ensure_utf16_validity(&mut safe_dst[..]); assert_eq!(dst, safe_dst); check_utf16(&dst[..]); @@ -234,7 +249,7 @@ fn fuzz_copy_ascii_to_ascii(data: &[u8]) { let needed = data.len(); let mut dst = vec_with_len::(needed); let mut safe_dst = vec_with_len::(needed); - let len = encoding_rs::mem::copy_ascii_to_ascii(data, &mut dst[..]); + let len = encoding_rs2::mem::copy_ascii_to_ascii(data, &mut dst[..]); let safe_len = safe_encoding_rs_mem::copy_ascii_to_ascii(data, &mut safe_dst[..]); dst.truncate(len); safe_dst.truncate(safe_len); @@ -246,7 +261,7 @@ fn fuzz_copy_ascii_to_basic_latin(data: &[u8]) { let needed = data.len(); let mut dst = vec_with_len::(needed); let mut safe_dst = vec_with_len::(needed); - let len = encoding_rs::mem::copy_ascii_to_basic_latin(data, &mut dst[..]); + let len = encoding_rs2::mem::copy_ascii_to_basic_latin(data, &mut dst[..]); let safe_len = safe_encoding_rs_mem::copy_ascii_to_basic_latin(data, &mut safe_dst[..]); dst.truncate(len); safe_dst.truncate(safe_len); @@ -258,7 +273,7 @@ fn fuzz_copy_basic_latin_to_ascii(data: &[u16]) { let needed = data.len(); let mut dst = vec_with_len::(needed); let mut safe_dst = vec_with_len::(needed); - let len = encoding_rs::mem::copy_basic_latin_to_ascii(data, &mut dst[..]); + let len = encoding_rs2::mem::copy_basic_latin_to_ascii(data, &mut dst[..]); let safe_len = safe_encoding_rs_mem::copy_basic_latin_to_ascii(data, &mut safe_dst[..]); dst.truncate(len); safe_dst.truncate(safe_len); @@ -267,67 +282,83 @@ fn fuzz_copy_basic_latin_to_ascii(data: &[u16]) { } fn fuzz_is_utf8_bidi(data: &[u8]) { - assert_eq!(encoding_rs::mem::is_utf8_bidi(data), safe_encoding_rs_mem::is_utf8_bidi(data)); + assert_eq!( + encoding_rs2::mem::is_utf8_bidi(data), + safe_encoding_rs_mem::is_utf8_bidi(data) + ); } fn fuzz_is_str_bidi(data: &[u8]) { if let Ok(s) = std::str::from_utf8(data) { - assert_eq!(encoding_rs::mem::is_str_bidi(s), safe_encoding_rs_mem::is_str_bidi(s)); + assert_eq!( + encoding_rs2::mem::is_str_bidi(s), + safe_encoding_rs_mem::is_str_bidi(s) + ); } } fn fuzz_is_utf16_bidi(data: &[u16]) { - assert_eq!(encoding_rs::mem::is_utf16_bidi(data), safe_encoding_rs_mem::is_utf16_bidi(data)); + assert_eq!( + encoding_rs2::mem::is_utf16_bidi(data), + safe_encoding_rs_mem::is_utf16_bidi(data) + ); } // is_char_bidi() and is_utf16_code_unit_bidi() are tested exhaustively, so no need to fuzz them.as_u16_slice fn fuzz_check_utf8_for_latin1_and_bidi(data: &[u8]) { - assert_eq!(encoding_rs::mem::check_utf8_for_latin1_and_bidi(data), safe_encoding_rs_mem::check_utf8_for_latin1_and_bidi(data)); + assert_eq!( + encoding_rs2::mem::check_utf8_for_latin1_and_bidi(data), + safe_encoding_rs_mem::check_utf8_for_latin1_and_bidi(data) + ); } fn fuzz_check_str_for_latin1_and_bidi(data: &[u8]) { if let Ok(s) = std::str::from_utf8(data) { - assert_eq!(encoding_rs::mem::check_str_for_latin1_and_bidi(s), safe_encoding_rs_mem::check_str_for_latin1_and_bidi(s)); + assert_eq!( + encoding_rs2::mem::check_str_for_latin1_and_bidi(s), + safe_encoding_rs_mem::check_str_for_latin1_and_bidi(s) + ); } } fn fuzz_check_utf16_for_latin1_and_bidi(data: &[u16]) { - assert_eq!(encoding_rs::mem::check_utf16_for_latin1_and_bidi(data), safe_encoding_rs_mem::check_utf16_for_latin1_and_bidi(data)); -} - -fuzz_target!( - |data: &[u8]| { - if let Some(first) = data.first() { - match *first { - 0 => fuzz_is_ascii(&data[1..]), - 1 => fuzz_is_basic_latin(as_u16_slice(&data[1..])), - 2 => fuzz_is_utf8_latin1(&data[1..]), - 3 => fuzz_is_str_latin1(&data[1..]), - 4 => fuzz_is_utf16_latin1(as_u16_slice(&data[1..])), - 5 => fuzz_convert_utf8_to_utf16(&data[1..]), - 6 => fuzz_convert_str_to_utf16(&data[1..]), - 7 => fuzz_convert_utf16_to_utf8(as_u16_slice(&data[1..])), - 8 => fuzz_convert_utf16_to_str(as_u16_slice(&data[1..])), - 9 => fuzz_convert_latin1_to_utf16(&data[1..]), - 10 => fuzz_convert_latin1_to_utf8(&data[1..]), - 11 => fuzz_convert_latin1_to_str(&data[1..]), - 12 => fuzz_convert_utf8_to_latin1_lossy(&data[1..]), - 13 => fuzz_convert_utf16_to_latin1_lossy(as_u16_slice(&data[1..])), - 14 => fuzz_utf16_valid_up_to(as_u16_slice(&data[1..])), - 15 => fuzz_ensure_utf16_validity(as_u16_slice(&data[1..])), - 16 => fuzz_copy_ascii_to_ascii(&data[1..]), - 17 => fuzz_copy_ascii_to_basic_latin(&data[1..]), - 18 => fuzz_copy_basic_latin_to_ascii(as_u16_slice(&data[1..])), - 19 => fuzz_is_utf8_bidi(&data[1..]), - 20 => fuzz_is_str_bidi(&data[1..]), - 21 => fuzz_is_utf16_bidi(as_u16_slice(&data[1..])), - 22 => fuzz_check_utf8_for_latin1_and_bidi(&data[1..]), - 23 => fuzz_check_str_for_latin1_and_bidi(&data[1..]), - 24 => fuzz_check_utf16_for_latin1_and_bidi(as_u16_slice(&data[1..])), - _ => return, - } + assert_eq!( + encoding_rs2::mem::check_utf16_for_latin1_and_bidi(data), + safe_encoding_rs_mem::check_utf16_for_latin1_and_bidi(data) + ); +} + +fuzz_target!(|data: &[u8]| { + if let Some(first) = data.first() { + match *first { + 0 => fuzz_is_ascii(&data[1..]), + 1 => fuzz_is_basic_latin(as_u16_slice(&data[1..])), + 2 => fuzz_is_utf8_latin1(&data[1..]), + 3 => fuzz_is_str_latin1(&data[1..]), + 4 => fuzz_is_utf16_latin1(as_u16_slice(&data[1..])), + 5 => fuzz_convert_utf8_to_utf16(&data[1..]), + 6 => fuzz_convert_str_to_utf16(&data[1..]), + 7 => fuzz_convert_utf16_to_utf8(as_u16_slice(&data[1..])), + 8 => fuzz_convert_utf16_to_str(as_u16_slice(&data[1..])), + 9 => fuzz_convert_latin1_to_utf16(&data[1..]), + 10 => fuzz_convert_latin1_to_utf8(&data[1..]), + 11 => fuzz_convert_latin1_to_str(&data[1..]), + 12 => fuzz_convert_utf8_to_latin1_lossy(&data[1..]), + 13 => fuzz_convert_utf16_to_latin1_lossy(as_u16_slice(&data[1..])), + 14 => fuzz_utf16_valid_up_to(as_u16_slice(&data[1..])), + 15 => fuzz_ensure_utf16_validity(as_u16_slice(&data[1..])), + 16 => fuzz_copy_ascii_to_ascii(&data[1..]), + 17 => fuzz_copy_ascii_to_basic_latin(&data[1..]), + 18 => fuzz_copy_basic_latin_to_ascii(as_u16_slice(&data[1..])), + 19 => fuzz_is_utf8_bidi(&data[1..]), + 20 => fuzz_is_str_bidi(&data[1..]), + 21 => fuzz_is_utf16_bidi(as_u16_slice(&data[1..])), + 22 => fuzz_check_utf8_for_latin1_and_bidi(&data[1..]), + 23 => fuzz_check_str_for_latin1_and_bidi(&data[1..]), + 24 => fuzz_check_utf16_for_latin1_and_bidi(as_u16_slice(&data[1..])), + _ => return, } - // Comment to make rustfmt not introduce a compilation error } -); + // Comment to make rustfmt not introduce a compilation error +}); diff --git a/src/ascii.rs b/src/ascii.rs index 2f543608..a2574610 100644 --- a/src/ascii.rs +++ b/src/ascii.rs @@ -153,31 +153,33 @@ macro_rules! ascii_alu { // // Safety: This is the naïve code once again, for `until_alignment` bytes while until_alignment != 0 { - let code_unit = *(src.add(offset)); + let code_unit = unsafe { *(src.add(offset)) }; if code_unit > 127 { // Safety: Upholds safety-usable invariant here return Some((code_unit, offset)); } - *(dst.add(offset)) = code_unit as $dst_unit; + unsafe { *(dst.add(offset)) = code_unit as $dst_unit }; // Safety: offset is the number of bytes copied so far offset += 1; until_alignment -= 1; } let len_minus_stride = len - ALU_STRIDE_SIZE; loop { - // Safety: num_ascii is known to be a byte index of a non-ascii byte due to stride_fn's invariant - if let Some(num_ascii) = $stride_fn( - // Safety: These are known to be valid and aligned since we have at - // least ALU_STRIDE_SIZE data in these buffers, and offset is the - // number of elements copied so far, which according to the - // until_alignment calculation above will cause both src and dst to be - // aligned to usize after this add - src.add(offset) as *const usize, - dst.add(offset) as *mut usize, - ) { - offset += num_ascii; - // Safety: Upholds safety-usable invariant here by indexing into non-ascii byte - return Some((*(src.add(offset)), offset)); + unsafe { + // Safety: num_ascii is known to be a byte index of a non-ascii byte due to stride_fn's invariant + if let Some(num_ascii) = $stride_fn( + // Safety: These are known to be valid and aligned since we have at + // least ALU_STRIDE_SIZE data in these buffers, and offset is the + // number of elements copied so far, which according to the + // until_alignment calculation above will cause both src and dst to be + // aligned to usize after this add + src.add(offset) as *const usize, + dst.add(offset) as *mut usize, + ) { + offset += num_ascii; + // Safety: Upholds safety-usable invariant here by indexing into non-ascii byte + return Some((*(src.add(offset)), offset)); + } } // Safety: offset continues to be the number of bytes copied so far, and // maintains usize alignment for the next loop iteration @@ -196,13 +198,13 @@ macro_rules! ascii_alu { // other than src/dst being valid for the the right lens while offset < len { // Safety: len invariant used here - let code_unit = *(src.add(offset)); + let code_unit = unsafe { *(src.add(offset)) }; if code_unit > 127 { // Safety: Upholds safety-usable invariant here return Some((code_unit, offset)); } // Safety: len invariant used here - *(dst.add(offset)) = code_unit as $dst_unit; + unsafe { *(dst.add(offset)) = code_unit as $dst_unit }; offset += 1; } None @@ -283,28 +285,30 @@ macro_rules! basic_latin_alu { // // Safety: This is the naïve code once again, for `until_alignment` bytes while until_alignment != 0 { - let code_unit = *(src.add(offset)); + let code_unit = unsafe { *(src.add(offset)) }; if code_unit > 127 { // Safety: Upholds safety-usable invariant here return Some((code_unit, offset)); } - *(dst.add(offset)) = code_unit as $dst_unit; + unsafe { *(dst.add(offset)) = code_unit as $dst_unit }; // Safety: offset is the number of bytes copied so far offset += 1; until_alignment -= 1; } let len_minus_stride = len - ALU_STRIDE_SIZE; loop { - if !$stride_fn( - // Safety: These are known to be valid and aligned since we have at - // least ALU_STRIDE_SIZE data in these buffers, and offset is the - // number of elements copied so far, which according to the - // until_alignment calculation above will cause both src and dst to be - // aligned to usize after this add - src.add(offset) as *const usize, - dst.add(offset) as *mut usize, - ) { - break; + unsafe { + if !$stride_fn( + // Safety: These are known to be valid and aligned since we have at + // least ALU_STRIDE_SIZE data in these buffers, and offset is the + // number of elements copied so far, which according to the + // until_alignment calculation above will cause both src and dst to be + // aligned to usize after this add + src.add(offset) as *const usize, + dst.add(offset) as *mut usize, + ) { + break; + } } // Safety: offset continues to be the number of bytes copied so far, and // maintains usize alignment for the next loop iteration @@ -321,13 +325,13 @@ macro_rules! basic_latin_alu { // Safety: This is the naïve code once again, for leftover bytes while offset < len { // Safety: len invariant used here - let code_unit = *(src.add(offset)); + let code_unit = unsafe { *(src.add(offset)) }; if code_unit > 127 { // Safety: Upholds safety-usable invariant here return Some((code_unit, offset)); } // Safety: len invariant used here - *(dst.add(offset)) = code_unit as $dst_unit; + unsafe { *(dst.add(offset)) = code_unit as $dst_unit }; offset += 1; } None @@ -378,23 +382,25 @@ macro_rules! latin1_alu { if until_alignment + ALU_STRIDE_SIZE <= len { // Safety: This is the naïve code once again, for `until_alignment` bytes while until_alignment != 0 { - let code_unit = *(src.add(offset)); - *(dst.add(offset)) = code_unit as $dst_unit; + let code_unit = unsafe { *(src.add(offset)) }; + unsafe { *(dst.add(offset)) = code_unit as $dst_unit }; // Safety: offset is the number of bytes copied so far offset += 1; until_alignment -= 1; } let len_minus_stride = len - ALU_STRIDE_SIZE; loop { - $stride_fn( - // Safety: These are known to be valid and aligned since we have at - // least ALU_STRIDE_SIZE data in these buffers, and offset is the - // number of elements copied so far, which according to the - // until_alignment calculation above will cause both src and dst to be - // aligned to usize after this add - src.add(offset) as *const usize, - dst.add(offset) as *mut usize, - ); + unsafe { + $stride_fn( + // Safety: These are known to be valid and aligned since we have at + // least ALU_STRIDE_SIZE data in these buffers, and offset is the + // number of elements copied so far, which according to the + // until_alignment calculation above will cause both src and dst to be + // aligned to usize after this add + src.add(offset) as *const usize, + dst.add(offset) as *mut usize, + ); + } // Safety: offset continues to be the number of bytes copied so far, and // maintains usize alignment for the next loop iteration offset += ALU_STRIDE_SIZE; @@ -410,8 +416,8 @@ macro_rules! latin1_alu { // Safety: This is the naïve code once again, for leftover bytes while offset < len { // Safety: len invariant used here - let code_unit = *(src.add(offset)); - *(dst.add(offset)) = code_unit as $dst_unit; + let code_unit = unsafe { *(src.add(offset)) }; + unsafe { *(dst.add(offset)) = code_unit as $dst_unit }; offset += 1; } } @@ -1278,10 +1284,12 @@ cfg_if! { ((0x0000_FF00_0000_0000usize & second_word) >> 24) | ((0x0000_00FF_0000_0000usize & second_word) >> 32); // Safety: fn invariant used here - *dst = first; - *(dst.add(1)) = second; - *(dst.add(2)) = third; - *(dst.add(3)) = fourth; + unsafe { + *dst = first; + *(dst.add(1)) = second; + *(dst.add(2)) = third; + *(dst.add(3)) = fourth; + } } /// Safety: dst must point to valid space for writing two `usize`s @@ -1304,8 +1312,10 @@ cfg_if! { ((0x0000_0000_00FF_0000usize & third) >> 8) | (0x0000_0000_0000_00FFusize & third); // Safety: fn invariant used here - *dst = word; - *(dst.add(1)) = second_word; + unsafe { + *dst = word; + *(dst.add(1)) = second_word; + } } } else if #[cfg(all(target_endian = "little", target_pointer_width = "32"))] { // Aligned ALU word, little-endian, 32-bit @@ -1712,9 +1722,11 @@ cfg_if! { /// Safety-usable invariant: will return byte index of first non-ascii byte #[inline(always)] unsafe fn validate_ascii_stride(src: *const usize) -> Option { - let word = *src; - let second_word = *(src.add(1)); - find_non_ascii(word, second_word) + unsafe { + let word = *src; + let second_word = *(src.add(1)); + find_non_ascii(word, second_word) + } } /// Safety-usable invariant: will return Some() when it encounters non-ASCII, with the first element in the Some being @@ -1785,69 +1797,79 @@ cfg_if! { } else { // Safety: src points to two valid `usize`s, dst points to four valid `usize`s #[inline(always)] - unsafe fn unpack_latin1_stride_alu(src: *const usize, dst: *mut usize) { + fn unpack_latin1_stride_alu(src: *const usize, dst: *mut usize) { // Safety: src safety invariant used here - let word = *src; - let second_word = *(src.add(1)); - // Safety: dst safety invariant passed down - unpack_alu(word, second_word, dst); + unsafe { + let word = *src; + let second_word = *(src.add(1)); + // Safety: dst safety invariant passed down + unpack_alu(word, second_word, dst); + } } // Safety: src points to four valid `usize`s, dst points to two valid `usize`s #[inline(always)] - unsafe fn pack_latin1_stride_alu(src: *const usize, dst: *mut usize) { + fn pack_latin1_stride_alu(src: *const usize, dst: *mut usize) { // Safety: src safety invariant used here - let first = *src; - let second = *(src.add(1)); - let third = *(src.add(2)); - let fourth = *(src.add(3)); - // Safety: dst safety invariant passed down - pack_alu(first, second, third, fourth, dst); + unsafe { + let first = *src; + let second = *(src.add(1)); + let third = *(src.add(2)); + let fourth = *(src.add(3)); + // Safety: dst safety invariant passed down + pack_alu(first, second, third, fourth, dst); + } } // Safety: src points to two valid `usize`s, dst points to four valid `usize`s #[inline(always)] - unsafe fn ascii_to_basic_latin_stride_alu(src: *const usize, dst: *mut usize) -> bool { + fn ascii_to_basic_latin_stride_alu(src: *const usize, dst: *mut usize) -> bool { // Safety: src safety invariant used here - let word = *src; - let second_word = *(src.add(1)); - // Check if the words contains non-ASCII - if (word & ASCII_MASK) | (second_word & ASCII_MASK) != 0 { - return false; + unsafe { + let word = *src; + let second_word = *(src.add(1)); + // Check if the words contains non-ASCII + if (word & ASCII_MASK) | (second_word & ASCII_MASK) != 0 { + return false; + } + // Safety: dst safety invariant passed down + unpack_alu(word, second_word, dst); } - // Safety: dst safety invariant passed down - unpack_alu(word, second_word, dst); true } // Safety: src points four valid `usize`s, dst points to two valid `usize`s #[inline(always)] - unsafe fn basic_latin_to_ascii_stride_alu(src: *const usize, dst: *mut usize) -> bool { + fn basic_latin_to_ascii_stride_alu(src: *const usize, dst: *mut usize) -> bool { // Safety: src safety invariant used here - let first = *src; - let second = *(src.add(1)); - let third = *(src.add(2)); - let fourth = *(src.add(3)); - if (first & BASIC_LATIN_MASK) | (second & BASIC_LATIN_MASK) | (third & BASIC_LATIN_MASK) | (fourth & BASIC_LATIN_MASK) != 0 { - return false; + unsafe { + let first = *src; + let second = *(src.add(1)); + let third = *(src.add(2)); + let fourth = *(src.add(3)); + if (first & BASIC_LATIN_MASK) | (second & BASIC_LATIN_MASK) | (third & BASIC_LATIN_MASK) | (fourth & BASIC_LATIN_MASK) != 0 { + return false; + } + // Safety: dst safety invariant passed down + pack_alu(first, second, third, fourth, dst); } - // Safety: dst safety invariant passed down - pack_alu(first, second, third, fourth, dst); true } // Safety: src, dst both point to two valid `usize`s each // Safety-usable invariant: Will return byte index of first non-ascii byte. #[inline(always)] - unsafe fn ascii_to_ascii_stride(src: *const usize, dst: *mut usize) -> Option { + fn ascii_to_ascii_stride(src: *const usize, dst: *mut usize) -> Option { // Safety: src safety invariant used here - let word = *src; - let second_word = *(src.add(1)); - // Safety: src safety invariant used here - *dst = word; - *(dst.add(1)) = second_word; - // Relies on safety-usable invariant here - find_non_ascii(word, second_word) + unsafe { + let word = *src; + let second_word = *(src.add(1)); + // Safety: src safety invariant used here + *dst = word; + *(dst.add(1)) = second_word; + // Relies on safety-usable invariant here + find_non_ascii(word, second_word) + } } basic_latin_alu!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_alu); diff --git a/src/big5.rs b/src/big5.rs index 3d161b56..2ae9d672 100644 --- a/src/big5.rs +++ b/src/big5.rs @@ -279,82 +279,82 @@ mod tests { #[test] fn test_big5_decode() { // Empty - decode_big5(b"", &""); + decode_big5(b"", ""); // ASCII - decode_big5(&[0x61u8, 0x62u8], &"\u{0061}\u{0062}"); + decode_big5(&[0x61u8, 0x62u8], "\u{0061}\u{0062}"); // Edge cases - decode_big5(&[0x87u8, 0x40u8], &"\u{43F0}"); - decode_big5(&[0xFEu8, 0xFEu8], &"\u{79D4}"); - decode_big5(&[0xFEu8, 0xFDu8], &"\u{2910D}"); - decode_big5(&[0x88u8, 0x62u8], &"\u{00CA}\u{0304}"); - decode_big5(&[0x88u8, 0x64u8], &"\u{00CA}\u{030C}"); - decode_big5(&[0x88u8, 0x66u8], &"\u{00CA}"); - decode_big5(&[0x88u8, 0xA3u8], &"\u{00EA}\u{0304}"); - decode_big5(&[0x88u8, 0xA5u8], &"\u{00EA}\u{030C}"); - decode_big5(&[0x88u8, 0xA7u8], &"\u{00EA}"); - decode_big5(&[0x99u8, 0xD4u8], &"\u{8991}"); - decode_big5(&[0x99u8, 0xD5u8], &"\u{27967}"); - decode_big5(&[0x99u8, 0xD6u8], &"\u{8A29}"); + decode_big5(&[0x87u8, 0x40u8], "\u{43F0}"); + decode_big5(&[0xFEu8, 0xFEu8], "\u{79D4}"); + decode_big5(&[0xFEu8, 0xFDu8], "\u{2910D}"); + decode_big5(&[0x88u8, 0x62u8], "\u{00CA}\u{0304}"); + decode_big5(&[0x88u8, 0x64u8], "\u{00CA}\u{030C}"); + decode_big5(&[0x88u8, 0x66u8], "\u{00CA}"); + decode_big5(&[0x88u8, 0xA3u8], "\u{00EA}\u{0304}"); + decode_big5(&[0x88u8, 0xA5u8], "\u{00EA}\u{030C}"); + decode_big5(&[0x88u8, 0xA7u8], "\u{00EA}"); + decode_big5(&[0x99u8, 0xD4u8], "\u{8991}"); + decode_big5(&[0x99u8, 0xD5u8], "\u{27967}"); + decode_big5(&[0x99u8, 0xD6u8], "\u{8A29}"); // Edge cases surrounded with ASCII decode_big5( &[0x61u8, 0x87u8, 0x40u8, 0x62u8], - &"\u{0061}\u{43F0}\u{0062}", + "\u{0061}\u{43F0}\u{0062}", ); decode_big5( &[0x61u8, 0xFEu8, 0xFEu8, 0x62u8], - &"\u{0061}\u{79D4}\u{0062}", + "\u{0061}\u{79D4}\u{0062}", ); decode_big5( &[0x61u8, 0xFEu8, 0xFDu8, 0x62u8], - &"\u{0061}\u{2910D}\u{0062}", + "\u{0061}\u{2910D}\u{0062}", ); decode_big5( &[0x61u8, 0x88u8, 0x62u8, 0x62u8], - &"\u{0061}\u{00CA}\u{0304}\u{0062}", + "\u{0061}\u{00CA}\u{0304}\u{0062}", ); decode_big5( &[0x61u8, 0x88u8, 0x64u8, 0x62u8], - &"\u{0061}\u{00CA}\u{030C}\u{0062}", + "\u{0061}\u{00CA}\u{030C}\u{0062}", ); decode_big5( &[0x61u8, 0x88u8, 0x66u8, 0x62u8], - &"\u{0061}\u{00CA}\u{0062}", + "\u{0061}\u{00CA}\u{0062}", ); decode_big5( &[0x61u8, 0x88u8, 0xA3u8, 0x62u8], - &"\u{0061}\u{00EA}\u{0304}\u{0062}", + "\u{0061}\u{00EA}\u{0304}\u{0062}", ); decode_big5( &[0x61u8, 0x88u8, 0xA5u8, 0x62u8], - &"\u{0061}\u{00EA}\u{030C}\u{0062}", + "\u{0061}\u{00EA}\u{030C}\u{0062}", ); decode_big5( &[0x61u8, 0x88u8, 0xA7u8, 0x62u8], - &"\u{0061}\u{00EA}\u{0062}", + "\u{0061}\u{00EA}\u{0062}", ); decode_big5( &[0x61u8, 0x99u8, 0xD4u8, 0x62u8], - &"\u{0061}\u{8991}\u{0062}", + "\u{0061}\u{8991}\u{0062}", ); decode_big5( &[0x61u8, 0x99u8, 0xD5u8, 0x62u8], - &"\u{0061}\u{27967}\u{0062}", + "\u{0061}\u{27967}\u{0062}", ); decode_big5( &[0x61u8, 0x99u8, 0xD6u8, 0x62u8], - &"\u{0061}\u{8A29}\u{0062}", + "\u{0061}\u{8A29}\u{0062}", ); // Bad sequences - decode_big5(&[0x80u8, 0x61u8], &"\u{FFFD}\u{0061}"); - decode_big5(&[0xFFu8, 0x61u8], &"\u{FFFD}\u{0061}"); - decode_big5(&[0xFEu8, 0x39u8], &"\u{FFFD}\u{0039}"); - decode_big5(&[0x87u8, 0x66u8], &"\u{FFFD}\u{0066}"); - decode_big5(&[0x81u8, 0x40u8], &"\u{FFFD}\u{0040}"); - decode_big5(&[0x61u8, 0x81u8], &"\u{0061}\u{FFFD}"); + decode_big5(&[0x80u8, 0x61u8], "\u{FFFD}\u{0061}"); + decode_big5(&[0xFFu8, 0x61u8], "\u{FFFD}\u{0061}"); + decode_big5(&[0xFEu8, 0x39u8], "\u{FFFD}\u{0039}"); + decode_big5(&[0x87u8, 0x66u8], "\u{FFFD}\u{0066}"); + decode_big5(&[0x81u8, 0x40u8], "\u{FFFD}\u{0040}"); + decode_big5(&[0x61u8, 0x81u8], "\u{0061}\u{FFFD}"); } #[test] diff --git a/src/euc_jp.rs b/src/euc_jp.rs index bf95a1ff..6eb946a8 100644 --- a/src/euc_jp.rs +++ b/src/euc_jp.rs @@ -24,10 +24,7 @@ enum EucJpPending { impl EucJpPending { fn is_none(&self) -> bool { - match *self { - EucJpPending::None => true, - _ => false, - } + matches!(*self, EucJpPending::None) } fn count(&self) -> usize { @@ -362,7 +359,7 @@ mod tests { #[test] fn test_euc_jp_decode() { // Empty - decode_euc_jp(b"", &""); + decode_euc_jp(b"", ""); // ASCII decode_euc_jp(b"\x61\x62", "\u{0061}\u{0062}"); diff --git a/src/euc_kr.rs b/src/euc_kr.rs index ab92b0fa..23d32519 100644 --- a/src/euc_kr.rs +++ b/src/euc_kr.rs @@ -255,7 +255,7 @@ fn ksx1001_encode_hangul(bmp: u16, _: u16) -> (u8, u8) { } else { 0x41 }; - (lead as u8, (cp949_trail + offset) as u8) + (lead, cp949_trail + offset) } } } @@ -378,7 +378,7 @@ mod tests { #[test] fn test_euc_kr_decode() { // Empty - decode_euc_kr(b"", &""); + decode_euc_kr(b"", ""); // ASCII decode_euc_kr(b"\x61\x62", "\u{0061}\u{0062}"); diff --git a/src/gb18030.rs b/src/gb18030.rs index 5cfd97e4..e9e1cba4 100644 --- a/src/gb18030.rs +++ b/src/gb18030.rs @@ -25,10 +25,7 @@ enum Gb18030Pending { impl Gb18030Pending { fn is_none(&self) -> bool { - match *self { - Gb18030Pending::None => true, - _ => false, - } + matches!(*self, Gb18030Pending::None) } fn count(&self) -> usize { @@ -270,7 +267,7 @@ impl Gb18030Decoder { } else { handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16)) } - } else if pointer >= 189_000 && pointer <= 1_237_575 { + } else if (189_000..=1_237_575).contains(&pointer) { // Astral handle.write_astral((pointer - (189_000usize - 0x1_0000usize)) as u32) } else { @@ -602,7 +599,7 @@ mod tests { #[test] fn test_gb18030_decode() { // Empty - decode_gb18030(b"", &""); + decode_gb18030(b"", ""); // ASCII decode_gb18030(b"\x61\x62", "\u{0061}\u{0062}"); diff --git a/src/handles.rs b/src/handles.rs index 94cecbd2..3de228e1 100644 --- a/src/handles.rs +++ b/src/handles.rs @@ -387,7 +387,7 @@ pub struct ByteSource<'a> { impl<'a> ByteSource<'a> { #[inline(always)] - pub fn new(src: &[u8]) -> ByteSource { + pub fn new(src: &'a [u8]) -> ByteSource<'a> { ByteSource { slice: src, pos: 0 } } #[inline(always)] @@ -594,7 +594,7 @@ pub struct Utf16Destination<'a> { impl<'a> Utf16Destination<'a> { #[inline(always)] - pub fn new(dst: &mut [u16]) -> Utf16Destination { + pub fn new(dst: &'a mut [u16]) -> Utf16Destination<'a> { Utf16Destination { slice: dst, pos: 0 } } #[inline(always)] @@ -695,7 +695,7 @@ impl<'a> Utf16Destination<'a> { source.pos += consumed; self.pos += consumed; source.pos += 1; // +1 for non_ascii - // Safety: non-ascii bubbled out here + // Safety: non-ascii bubbled out here non_ascii } } @@ -734,7 +734,7 @@ impl<'a> Utf16Destination<'a> { self.pos += consumed; if self.pos + 1 < dst_len { source.pos += 1; // +1 for non_ascii - // Safety: non-ascii bubbled out here + // Safety: non-ascii bubbled out here non_ascii } else { return CopyAsciiResult::Stop(( @@ -939,7 +939,7 @@ pub struct Utf8Destination<'a> { impl<'a> Utf8Destination<'a> { #[inline(always)] - pub fn new(dst: &mut [u8]) -> Utf8Destination { + pub fn new(dst: &mut [u8]) -> Utf8Destination<'_> { Utf8Destination { slice: dst, pos: 0 } } #[inline(always)] @@ -1116,7 +1116,7 @@ impl<'a> Utf8Destination<'a> { // Validate first, then memcpy to let memcpy do its thing even for // non-ASCII. (And potentially do something better than SSE2 for ASCII.) let valid_len = utf8_valid_up_to(&src_remaining[..min_len]); - (&mut dst_remaining[..valid_len]).copy_from_slice(&src_remaining[..valid_len]); + dst_remaining[..valid_len].copy_from_slice(&src_remaining[..valid_len]); source.pos += valid_len; self.pos += valid_len; } @@ -1164,7 +1164,7 @@ pub struct Utf16Source<'a> { impl<'a> Utf16Source<'a> { #[inline(always)] - pub fn new(src: &[u16]) -> Utf16Source { + pub fn new(src: &[u16]) -> Utf16Source<'_> { Utf16Source { slice: src, pos: 0 } } #[inline(always)] @@ -1272,6 +1272,7 @@ impl<'a> Utf16Source<'a> { Some((non_ascii, consumed)) => { self.pos += consumed; dest.advance(consumed); + #[allow(clippy::len_zero)] if dest.remaining().len() >= 1 { self.pos += 1; // commit to reading `non_ascii` let unit = non_ascii; @@ -1466,7 +1467,7 @@ pub struct Utf8Source<'a> { impl<'a> Utf8Source<'a> { #[inline(always)] - pub fn new(src: &str) -> Utf8Source { + pub fn new(src: &str) -> Utf8Source<'_> { Utf8Source { slice: src.as_bytes(), pos: 0, @@ -1614,6 +1615,7 @@ impl<'a> Utf8Source<'a> { Some((non_ascii, consumed)) => { self.pos += consumed; dest.advance(consumed); + #[allow(clippy::len_zero)] if dest.remaining().len() >= 1 { if non_ascii < 0xE0 { let point = ((u16::from(non_ascii) & 0x1F) << 6) @@ -1922,7 +1924,7 @@ pub struct ByteDestination<'a> { impl<'a> ByteDestination<'a> { #[inline(always)] - pub fn new(dst: &mut [u8]) -> ByteDestination { + pub fn new(dst: &mut [u8]) -> ByteDestination<'_> { ByteDestination { start: dst.as_ptr(), slice: dst, @@ -1930,10 +1932,11 @@ impl<'a> ByteDestination<'a> { } #[inline(always)] pub fn remaining(&mut self) -> &mut [u8] { - &mut self.slice + self.slice } #[inline(always)] pub fn check_space_one<'b>(&'b mut self) -> Space> { + #[allow(clippy::len_zero)] if self.slice.len() >= 1 { Space::Available(ByteOneHandle::new(self)) } else { diff --git a/src/iso_2022_jp.rs b/src/iso_2022_jp.rs index 39bb38c5..dac1a321 100644 --- a/src/iso_2022_jp.rs +++ b/src/iso_2022_jp.rs @@ -190,7 +190,7 @@ impl Iso2022JpDecoder { continue; } self.output_flag = false; - if b >= 0x21u8 && b <= 0x5Fu8 { + if (0x21u8..=0x5Fu8).contains(&b) { destination_handle.write_upper_bmp(u16::from(b) - 0x21u16 + 0xFF61u16); continue; } @@ -206,7 +206,7 @@ impl Iso2022JpDecoder { continue; } self.output_flag = false; - if b >= 0x21u8 && b <= 0x7Eu8 { + if (0x21u8..=0x7Eu8).contains(&b) { self.lead = b; self.decoder_state = Iso2022JpDecoderState::TrailByte; continue; @@ -376,6 +376,7 @@ fn is_kanji_mapped(bmp: u16) -> bool { #[allow(clippy::redundant_pattern_matching, clippy::if_same_then_else)] #[inline(always)] fn is_kanji_mapped(bmp: u16) -> bool { + #[allow(clippy::match_like_matches_macro)] if 0x4EDD == bmp { true } else if let Some(_) = jis0208_level1_kanji_shift_jis_encode(bmp) { @@ -407,6 +408,7 @@ fn is_mapped_for_two_byte_encode(bmp: u16) -> bool { true } else { let bmp_minus_space = bmp.wrapping_sub(0x3000); + #[allow(clippy::match_like_matches_macro)] if bmp_minus_space < 3 { // fast-track common punctuation true @@ -477,10 +479,7 @@ impl Iso2022JpEncoder { } pub fn has_pending_state(&self) -> bool { - match self.state { - Iso2022JpEncoderState::Ascii => false, - _ => true, - } + !matches!(self.state, Iso2022JpEncoderState::Ascii) } pub fn max_buffer_length_from_utf16_without_replacement( @@ -764,7 +763,7 @@ mod tests { #[test] fn test_iso_2022_jp_decode() { // Empty - decode_iso_2022_jp(b"", &""); + decode_iso_2022_jp(b"", ""); // ASCII decode_iso_2022_jp(b"\x61\x62", "\u{0061}\u{0062}"); diff --git a/src/lib.rs b/src/lib.rs index 3239db54..81d0e9fc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -93,7 +93,7 @@ //! //! ``` //! #[cfg(feature = "alloc")] { -//! use encoding_rs::*; +//! use encoding_rs2::*; //! //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}"; //! let bytes = b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h"; @@ -108,7 +108,7 @@ //! Decode using the streaming API with minimal `unsafe`: //! //! ``` -//! use encoding_rs::*; +//! use encoding_rs2::*; //! //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}"; //! @@ -584,13 +584,13 @@ //! rust-encodingencoding_rs //! //! -//! encoding::EncodingRef&'static encoding_rs::Encoding -//! encoding::all::WINDOWS_31J (not based on the WHATWG name for some encodings)encoding_rs::SHIFT_JIS (always the WHATWG name uppercased and hyphens replaced with underscores) +//! encoding::EncodingRef&'static encoding_rs2::Encoding +//! encoding::all::WINDOWS_31J (not based on the WHATWG name for some encodings)encoding_rs2::SHIFT_JIS (always the WHATWG name uppercased and hyphens replaced with underscores) //! encoding::all::ERRORNot available because not in the Encoding Standard //! encoding::all::ASCIINot available because not in the Encoding Standard //! encoding::all::ISO_8859_1Not available because not in the Encoding Standard //! encoding::all::HZNot available because not in the Encoding Standard -//! encoding::label::encoding_from_whatwg_label(string)encoding_rs::Encoding::for_label(string) +//! encoding::label::encoding_from_whatwg_label(string)encoding_rs2::Encoding::for_label(string) //! enc.whatwg_name() (always lower case)enc.name() (potentially mixed case) //! enc.name()Not available because not in the Encoding Standard //! encoding::decode(bytes, encoding::DecoderTrap::Replace, enc)enc.decode(bytes) @@ -598,8 +598,8 @@ //! enc.encode(string, encoding::EncoderTrap::NcrEscape)enc.encode(string) //! enc.raw_decoder()enc.new_decoder_without_bom_handling() //! enc.raw_encoder()enc.new_encoder() -//! encoding::RawDecoderencoding_rs::Decoder -//! encoding::RawEncoderencoding_rs::Encoder +//! encoding::RawDecoderencoding_rs2::Decoder +//! encoding::RawEncoderencoding_rs2::Encoder //! raw_decoder.raw_feed(src, dst_string)dst_string.reserve(decoder.max_utf8_buffer_length_without_replacement(src.len()));
decoder.decode_to_string_without_replacement(src, dst_string, false)
//! raw_encoder.raw_feed(src, dst_vec)dst_vec.reserve(encoder.max_buffer_length_from_utf8_without_replacement(src.len()));
encoder.encode_from_utf8_to_vec_without_replacement(src, dst_vec, false)
//! raw_decoder.raw_finish(dst)dst_string.reserve(decoder.max_utf8_buffer_length_without_replacement(0));
decoder.decode_to_string_without_replacement(b"", dst, true)
@@ -2732,21 +2732,21 @@ impl Encoding { /// /// # Example /// ``` - /// use encoding_rs::Encoding; + /// use encoding_rs2::Encoding; /// - /// assert_eq!(Some(encoding_rs::UTF_8), Encoding::for_label(b"utf-8")); - /// assert_eq!(Some(encoding_rs::UTF_8), Encoding::for_label(b"unicode11utf8")); + /// assert_eq!(Some(encoding_rs2::UTF_8), Encoding::for_label(b"utf-8")); + /// assert_eq!(Some(encoding_rs2::UTF_8), Encoding::for_label(b"unicode11utf8")); /// - /// assert_eq!(Some(encoding_rs::ISO_8859_2), Encoding::for_label(b"latin2")); + /// assert_eq!(Some(encoding_rs2::ISO_8859_2), Encoding::for_label(b"latin2")); /// - /// assert_eq!(Some(encoding_rs::UTF_16BE), Encoding::for_label(b"utf-16be")); + /// assert_eq!(Some(encoding_rs2::UTF_16BE), Encoding::for_label(b"utf-16be")); /// /// assert_eq!(None, Encoding::for_label(b"unrecognized label")); /// ``` pub fn for_label(label: &[u8]) -> Option<&'static Encoding> { let mut trimmed = [0u8; LONGEST_LABEL_LENGTH]; let mut trimmed_pos = 0usize; - let mut iter = label.into_iter(); + let mut iter = label.iter(); // before loop { match iter.next() { @@ -3322,6 +3322,7 @@ impl Encoding { .unwrap() .next_power_of_two(), ); + #[allow(clippy::uninit_vec)] unsafe { vec.set_len(valid_up_to); core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to); @@ -3452,7 +3453,7 @@ impl Encoding { impl PartialEq for Encoding { #[inline] fn eq(&self, other: &Encoding) -> bool { - (self as *const Encoding) == (other as *const Encoding) + core::ptr::eq(self, other) } } @@ -3461,7 +3462,7 @@ impl Eq for Encoding {} #[cfg(test)] impl PartialOrd for Encoding { fn partial_cmp(&self, other: &Self) -> Option { - (self as *const Encoding as usize).partial_cmp(&(other as *const Encoding as usize)) + Some(self.cmp(other)) } } @@ -4352,9 +4353,7 @@ impl Decoder { /// Available via the C wrapper. pub fn latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option { match self.life_cycle { - DecoderLifeCycle::Converting => { - return self.variant.latin1_byte_compatible_up_to(bytes); - } + DecoderLifeCycle::Converting => self.variant.latin1_byte_compatible_up_to(bytes), DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."), _ => None, } @@ -5468,11 +5467,13 @@ mod tests { #[test] fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement() { - assert!(UTF_8 - .decode_without_bom_handling_and_without_replacement( - b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4" - ) - .is_none()); + assert!( + UTF_8 + .decode_without_bom_handling_and_without_replacement( + b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4" + ) + .is_none() + ); } #[test] @@ -5490,9 +5491,11 @@ mod tests { #[test] fn test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement() { - assert!(WINDOWS_1257 - .decode_without_bom_handling_and_without_replacement(b"abc\x80\xA1\xE4") - .is_none()); + assert!( + WINDOWS_1257 + .decode_without_bom_handling_and_without_replacement(b"abc\x80\xA1\xE4") + .is_none() + ); } #[test] @@ -5903,10 +5906,12 @@ mod tests { .unwrap(), 1 ); - assert!(REPLACEMENT - .new_decoder_without_bom_handling() - .latin1_byte_compatible_up_to(buffer) - .is_none()); + assert!( + REPLACEMENT + .new_decoder_without_bom_handling() + .latin1_byte_compatible_up_to(buffer) + .is_none() + ); assert_eq!( SHIFT_JIS .new_decoder_without_bom_handling() @@ -5921,14 +5926,18 @@ mod tests { .unwrap(), 1 ); - assert!(UTF_16BE - .new_decoder_without_bom_handling() - .latin1_byte_compatible_up_to(buffer) - .is_none()); - assert!(UTF_16LE - .new_decoder_without_bom_handling() - .latin1_byte_compatible_up_to(buffer) - .is_none()); + assert!( + UTF_16BE + .new_decoder_without_bom_handling() + .latin1_byte_compatible_up_to(buffer) + .is_none() + ); + assert!( + UTF_16LE + .new_decoder_without_bom_handling() + .latin1_byte_compatible_up_to(buffer) + .is_none() + ); assert_eq!( ISO_2022_JP .new_decoder_without_bom_handling() @@ -6141,10 +6150,12 @@ mod tests { 1 ); - assert!(UTF_8 - .new_decoder() - .latin1_byte_compatible_up_to(buffer) - .is_none()); + assert!( + UTF_8 + .new_decoder() + .latin1_byte_compatible_up_to(buffer) + .is_none() + ); let mut decoder = UTF_8.new_decoder(); let mut output = [0u16; 4]; diff --git a/src/mem.rs b/src/mem.rs index 92941bb9..aa45e15a 100644 --- a/src/mem.rs +++ b/src/mem.rs @@ -31,12 +31,12 @@ use alloc::string::String; #[cfg(feature = "alloc")] use alloc::vec::Vec; +use super::DecoderResult; +use super::in_inclusive_range8; use super::in_inclusive_range16; use super::in_inclusive_range32; -use super::in_inclusive_range8; use super::in_range16; use super::in_range32; -use super::DecoderResult; use crate::ascii::*; use crate::utf_8::*; @@ -229,7 +229,7 @@ macro_rules! by_unit_check_simd { let mut simd_accu = $splat; while offset <= len_minus_stride { // Safety: the above check lets us perform one $simd_ty read. - simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) }; + simd_accu |= unsafe { *(src.add(offset) as *const $simd_ty) }; offset += SIMD_STRIDE_SIZE / unit_size; } if !$func(simd_accu) { @@ -566,7 +566,7 @@ cfg_if! { } } } - let mut iter = (&buffer[offset..]).iter(); + let mut iter = buffer[offset..].iter(); loop { if let Some(&u) = iter.next() { if u > 0xFF { @@ -623,7 +623,7 @@ cfg_if! { } } } - let mut iter = (&buffer[offset..]).iter(); + let mut iter = buffer[offset..].iter(); loop { if let Some(&u) = iter.next() { if u > 0xFF { @@ -2017,10 +2017,11 @@ pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> { let (head, tail) = bytes.split_at(up_to); let capacity = head.len() + tail.len() * 2; let mut vec = Vec::with_capacity(capacity); + #[allow(clippy::uninit_vec)] unsafe { vec.set_len(capacity); } - (&mut vec[..up_to]).copy_from_slice(head); + vec[..up_to].copy_from_slice(head); let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]); vec.truncate(up_to + written); Cow::Owned(unsafe { String::from_utf8_unchecked(vec) }) @@ -2054,10 +2055,11 @@ pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> { let (head, tail) = bytes.split_at(up_to); let capacity = bytes.len(); let mut vec = Vec::with_capacity(capacity); + #[allow(clippy::uninit_vec)] unsafe { vec.set_len(capacity); } - (&mut vec[..up_to]).copy_from_slice(head); + vec[..up_to].copy_from_slice(head); let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]); vec.truncate(up_to + written); Cow::Owned(vec) @@ -2079,7 +2081,7 @@ pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize { /// Returns the index of first byte that starts a non-Latin1 byte /// sequence, or the length of the string if there are none. pub fn str_latin1_up_to(buffer: &str) -> usize { - is_str_latin1_impl(buffer).unwrap_or_else(|| buffer.len()) + is_str_latin1_impl(buffer).unwrap_or(buffer.len()) } /// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER. @@ -2182,8 +2184,7 @@ mod tests { #[test] fn test_is_ascii_success() { - let mut src: Vec = Vec::with_capacity(128); - src.resize(128, 0); + let mut src: Vec = vec![0; 128]; for i in 0..src.len() { src[i] = i as u8; } @@ -2194,8 +2195,7 @@ mod tests { #[test] fn test_is_ascii_fail() { - let mut src: Vec = Vec::with_capacity(128); - src.resize(128, 0); + let mut src: Vec = vec![0; 128]; for i in 0..src.len() { src[i] = i as u8; } @@ -2210,8 +2210,7 @@ mod tests { #[test] fn test_is_basic_latin_success() { - let mut src: Vec = Vec::with_capacity(128); - src.resize(128, 0); + let mut src: Vec = vec![0; 128]; for i in 0..src.len() { src[i] = i as u16; } @@ -2222,8 +2221,7 @@ mod tests { #[test] fn test_is_basic_latin_fail() { - let mut src: Vec = Vec::with_capacity(128); - src.resize(128, 0); + let mut src: Vec = vec![0; 128]; for i in 0..src.len() { src[i] = i as u16; } @@ -2238,8 +2236,7 @@ mod tests { #[test] fn test_is_utf16_latin1_success() { - let mut src: Vec = Vec::with_capacity(256); - src.resize(256, 0); + let mut src: Vec = vec![0; 256]; for i in 0..src.len() { src[i] = i as u16; } @@ -2255,8 +2252,7 @@ mod tests { #[test] fn test_is_utf16_latin1_fail() { let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow - let mut src: Vec = Vec::with_capacity(len); - src.resize(len, 0); + let mut src: Vec = vec![0; len]; for i in 0..src.len() { src[i] = i as u16; } @@ -2273,8 +2269,7 @@ mod tests { #[test] fn test_is_str_latin1_success() { let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow - let mut src: Vec = Vec::with_capacity(len); - src.resize(len, 0); + let mut src: Vec = vec![0; len]; for i in 0..src.len() { src[i] = i as u16; } @@ -2288,8 +2283,7 @@ mod tests { #[test] fn test_is_str_latin1_fail() { let len = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow - let mut src: Vec = Vec::with_capacity(len); - src.resize(len, 0); + let mut src: Vec = vec![0; len]; for i in 0..src.len() { src[i] = i as u16; } @@ -2307,8 +2301,7 @@ mod tests { #[test] fn test_is_utf8_latin1_success() { let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow - let mut src: Vec = Vec::with_capacity(len); - src.resize(len, 0); + let mut src: Vec = vec![0; len]; for i in 0..src.len() { src[i] = i as u16; } @@ -2325,8 +2318,7 @@ mod tests { #[test] fn test_is_utf8_latin1_fail() { let len = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow - let mut src: Vec = Vec::with_capacity(len); - src.resize(len, 0); + let mut src: Vec = vec![0; len]; for i in 0..src.len() { src[i] = i as u16; } @@ -2357,8 +2349,7 @@ mod tests { #[test] fn test_convert_utf8_to_utf16() { let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz"; - let mut dst: Vec = Vec::with_capacity(src.len() + 1); - dst.resize(src.len() + 1, 0); + let mut dst: Vec = vec![0; src.len() + 1]; let len = convert_utf8_to_utf16(src.as_bytes(), &mut dst[..]); dst.truncate(len); let reference: Vec = src.encode_utf16().collect(); @@ -2368,8 +2359,7 @@ mod tests { #[test] fn test_convert_str_to_utf16() { let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz"; - let mut dst: Vec = Vec::with_capacity(src.len()); - dst.resize(src.len(), 0); + let mut dst: Vec = vec![0; src.len()]; let len = convert_str_to_utf16(src, &mut dst[..]); dst.truncate(len); let reference: Vec = src.encode_utf16().collect(); @@ -2380,8 +2370,7 @@ mod tests { fn test_convert_utf16_to_utf8_partial() { let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz"; let src: Vec = reference.encode_utf16().collect(); - let mut dst: Vec = Vec::with_capacity(src.len() * 3 + 1); - dst.resize(src.len() * 3 + 1, 0); + let mut dst: Vec = vec![0; src.len() * 3 + 1]; let (read, written) = convert_utf16_to_utf8_partial(&src[..], &mut dst[..24]); let len = written + convert_utf16_to_utf8(&src[read..], &mut dst[written..]); dst.truncate(len); @@ -2392,8 +2381,7 @@ mod tests { fn test_convert_utf16_to_utf8() { let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz"; let src: Vec = reference.encode_utf16().collect(); - let mut dst: Vec = Vec::with_capacity(src.len() * 3 + 1); - dst.resize(src.len() * 3 + 1, 0); + let mut dst: Vec = vec![0; src.len() * 3 + 1]; let len = convert_utf16_to_utf8(&src[..], &mut dst[..]); dst.truncate(len); assert_eq!(dst, reference.as_bytes()); @@ -2401,16 +2389,13 @@ mod tests { #[test] fn test_convert_latin1_to_utf16() { - let mut src: Vec = Vec::with_capacity(256); - src.resize(256, 0); - let mut reference: Vec = Vec::with_capacity(256); - reference.resize(256, 0); + let mut src: Vec = vec![0; 256]; + let mut reference: Vec = vec![0; 256]; for i in 0..256 { src[i] = i as u8; reference[i] = i as u16; } - let mut dst: Vec = Vec::with_capacity(src.len()); - dst.resize(src.len(), 0); + let mut dst: Vec = vec![0; src.len()]; convert_latin1_to_utf16(&src[..], &mut dst[..]); assert_eq!(dst, reference); } @@ -2425,17 +2410,14 @@ mod tests { #[test] fn test_convert_latin1_to_utf8() { - let mut src: Vec = Vec::with_capacity(256); - src.resize(256, 0); - let mut reference: Vec = Vec::with_capacity(256); - reference.resize(256, 0); + let mut src: Vec = vec![0; 256]; + let mut reference: Vec = vec![0; 256]; for i in 0..256 { src[i] = i as u8; reference[i] = i as u16; } let s = String::from_utf16(&reference[..]).unwrap(); - let mut dst: Vec = Vec::with_capacity(src.len() * 2); - dst.resize(src.len() * 2, 0); + let mut dst: Vec = vec![0; src.len() * 2]; let len = convert_latin1_to_utf8(&src[..], &mut dst[..]); dst.truncate(len); assert_eq!(&dst[..], s.as_bytes()); @@ -2443,17 +2425,14 @@ mod tests { #[test] fn test_convert_utf8_to_latin1_lossy() { - let mut reference: Vec = Vec::with_capacity(256); - reference.resize(256, 0); - let mut src16: Vec = Vec::with_capacity(256); - src16.resize(256, 0); + let mut reference: Vec = vec![0; 256]; + let mut src16: Vec = vec![0; 256]; for i in 0..256 { src16[i] = i as u16; reference[i] = i as u8; } let src = String::from_utf16(&src16[..]).unwrap(); - let mut dst: Vec = Vec::with_capacity(src.len()); - dst.resize(src.len(), 0); + let mut dst: Vec = vec![0; src.len()]; let len = convert_utf8_to_latin1_lossy(src.as_bytes(), &mut dst[..]); dst.truncate(len); assert_eq!(dst, reference); @@ -2469,16 +2448,13 @@ mod tests { #[test] fn test_convert_utf16_to_latin1_lossy() { - let mut src: Vec = Vec::with_capacity(256); - src.resize(256, 0); - let mut reference: Vec = Vec::with_capacity(256); - reference.resize(256, 0); + let mut src: Vec = vec![0; 256]; + let mut reference: Vec = vec![0; 256]; for i in 0..256 { src[i] = i as u16; reference[i] = i as u8; } - let mut dst: Vec = Vec::with_capacity(src.len()); - dst.resize(src.len(), 0); + let mut dst: Vec = vec![0; src.len()]; convert_utf16_to_latin1_lossy(&src[..], &mut dst[..]); assert_eq!(dst, reference); } @@ -2487,7 +2463,7 @@ mod tests { // #[should_panic] fn test_convert_utf16_to_latin1_lossy_panics() { let mut dst = [0u8; 16]; - let _ = convert_utf16_to_latin1_lossy(&[0x0100u16], &mut dst[..]); + convert_utf16_to_latin1_lossy(&[0x0100u16], &mut dst[..]); } #[test] diff --git a/src/shift_jis.rs b/src/shift_jis.rs index b201ae4d..cb481a88 100644 --- a/src/shift_jis.rs +++ b/src/shift_jis.rs @@ -321,7 +321,7 @@ mod tests { #[test] fn test_shift_jis_decode() { // Empty - decode_shift_jis(b"", &""); + decode_shift_jis(b"", ""); // ASCII decode_shift_jis(b"\x61\x62", "\u{0061}\u{0062}"); diff --git a/src/simd_funcs.rs b/src/simd_funcs.rs index d0824188..aa19d361 100644 --- a/src/simd_funcs.rs +++ b/src/simd_funcs.rs @@ -7,18 +7,18 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use any_all_workaround::all_mask16x8; use any_all_workaround::all_mask8x16; -use any_all_workaround::any_mask16x8; +use any_all_workaround::all_mask16x8; use any_all_workaround::any_mask8x16; +use any_all_workaround::any_mask16x8; +use core::simd::ToBytes; use core::simd::cmp::SimdPartialEq; use core::simd::cmp::SimdPartialOrd; -use core::simd::mask16x8; use core::simd::mask8x16; +use core::simd::mask16x8; use core::simd::simd_swizzle; -use core::simd::u16x8; use core::simd::u8x16; -use core::simd::ToBytes; +use core::simd::u16x8; // TODO: Migrate unaligned access to stdlib code if/when the RFC // https://github.com/rust-lang/rfcs/pull/1725 is implemented. @@ -388,8 +388,7 @@ mod tests { ]; let first = unsafe { load8_unaligned(basic_latin.as_ptr()) }; let second = unsafe { load8_unaligned(basic_latin.as_ptr().add(8)) }; - let mut vec = Vec::with_capacity(16); - vec.resize(16, 0u8); + let mut vec = vec![0; 16]; let ptr = vec.as_mut_ptr(); assert!(simd_is_basic_latin(first | second)); unsafe { diff --git a/src/single_byte.rs b/src/single_byte.rs index 49f099e5..303f7951 100644 --- a/src/single_byte.rs +++ b/src/single_byte.rs @@ -646,18 +646,11 @@ mod tests { fn decode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) { let mut with_replacement = [0u16; 128]; let mut it = data.iter().enumerate(); - loop { - match it.next() { - Some((i, code_point)) => { - if *code_point == 0 { - with_replacement[i] = 0xFFFD; - } else { - with_replacement[i] = *code_point; - } - } - None => { - break; - } + while let Some((i, code_point)) = it.next() { + if *code_point == 0 { + with_replacement[i] = 0xFFFD; + } else { + with_replacement[i] = *code_point; } } @@ -667,18 +660,11 @@ mod tests { fn encode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) { let mut with_zeros = [0u8; 128]; let mut it = data.iter().enumerate(); - loop { - match it.next() { - Some((i, code_point)) => { - if *code_point == 0 { - with_zeros[i] = 0; - } else { - with_zeros[i] = HIGH_BYTES[i]; - } - } - None => { - break; - } + while let Some((i, code_point)) = it.next() { + if *code_point == 0 { + with_zeros[i] = 0; + } else { + with_zeros[i] = HIGH_BYTES[i]; } } diff --git a/src/utf_16.rs b/src/utf_16.rs index f24806d6..de402cfc 100644 --- a/src/utf_16.rs +++ b/src/utf_16.rs @@ -145,7 +145,7 @@ impl Utf16Decoder { // The previous high surrogate was in // error and this one becomes the new // pending one. - self.lead_surrogate = code_unit as u16; + self.lead_surrogate = code_unit; return ( DecoderResult::Malformed(2, 2), unread_handle.consumed(), diff --git a/src/utf_8.rs b/src/utf_8.rs index 8d836c2d..93cfb0f3 100644 --- a/src/utf_8.rs +++ b/src/utf_8.rs @@ -868,7 +868,7 @@ impl Utf8Encoder { let bytes = src.as_bytes(); let mut to_write = bytes.len(); if to_write <= dst.len() { - (&mut dst[..to_write]).copy_from_slice(bytes); + dst[..to_write].copy_from_slice(bytes); return (EncoderResult::InputEmpty, to_write, to_write); } to_write = dst.len(); @@ -876,7 +876,7 @@ impl Utf8Encoder { while (bytes[to_write] & 0xC0) == 0x80 { to_write -= 1; } - (&mut dst[..to_write]).copy_from_slice(&bytes[..to_write]); + dst[..to_write].copy_from_slice(&bytes[..to_write]); (EncoderResult::OutputFull, to_write, to_write) } } diff --git a/src/variant.rs b/src/variant.rs index dffaf053..bdcbc9ec 100644 --- a/src/variant.rs +++ b/src/variant.rs @@ -28,8 +28,8 @@ use iso_2022_jp::*; use replacement::*; use shift_jis::*; use single_byte::*; -use utf_16::*; use utf_8::*; +use utf_16::*; use x_user_defined::*; pub enum VariantDecoder { @@ -392,9 +392,9 @@ impl VariantEncoding { } pub fn is_single_byte(&self) -> bool { - match *self { - VariantEncoding::SingleByte(_, _, _, _) | VariantEncoding::UserDefined => true, - _ => false, - } + matches!( + *self, + VariantEncoding::SingleByte(_, _, _, _) | VariantEncoding::UserDefined + ) } } diff --git a/src/x_user_defined.rs b/src/x_user_defined.rs index cd87b9a2..15534840 100644 --- a/src/x_user_defined.rs +++ b/src/x_user_defined.rs @@ -16,6 +16,7 @@ cfg_if! { use simd_funcs::*; use core::simd::u16x8; use core::simd::cmp::SimdPartialOrd; + use core::simd::Select; #[inline(always)] fn shift_upper(unpacked: u16x8) -> u16x8 { @@ -180,7 +181,7 @@ impl UserDefinedEncoder { destination_handle.write_one(c as u8); continue; } - if c < '\u{F780}' || c > '\u{F7FF}' { + if !('\u{F780}'..='\u{F7FF}').contains(&c) { return ( EncoderResult::Unmappable(c), unread_handle.consumed(), @@ -226,7 +227,10 @@ mod tests { decode_x_user_defined(b"\x61\x62", "\u{0061}\u{0062}"); decode_x_user_defined(b"\x80\xFF", "\u{F780}\u{F7FF}"); - decode_x_user_defined(b"\x80\xFF\x61\x62\x80\xFF\x61\x62\x80\xFF\x61\x62\x80\xFF\x61\x62\x80\xFF\x61\x62", "\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}"); + decode_x_user_defined( + b"\x80\xFF\x61\x62\x80\xFF\x61\x62\x80\xFF\x61\x62\x80\xFF\x61\x62\x80\xFF\x61\x62", + "\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}", + ); } #[test] diff --git a/tests/test_data/big5.rs b/tests/test_data/big5.rs index fc45e26a..5db529bf 100644 --- a/tests/test_data/big5.rs +++ b/tests/test_data/big5.rs @@ -1,4 +1,4 @@ -use encoding_rs::*; +use encoding_rs2::*; #[test] #[cfg_attr(miri, ignore)] // Miri is too slow diff --git a/tests/test_data/euc_jp.rs b/tests/test_data/euc_jp.rs index 038b7a13..60e70ad5 100644 --- a/tests/test_data/euc_jp.rs +++ b/tests/test_data/euc_jp.rs @@ -1,4 +1,4 @@ -use encoding_rs::*; +use encoding_rs2::*; #[test] #[cfg_attr(miri, ignore)] // Miri is too slow diff --git a/tests/test_data/euc_kr.rs b/tests/test_data/euc_kr.rs index 117bbd49..5062ed48 100644 --- a/tests/test_data/euc_kr.rs +++ b/tests/test_data/euc_kr.rs @@ -1,4 +1,4 @@ -use encoding_rs::*; +use encoding_rs2::*; #[test] #[cfg_attr(miri, ignore)] // Miri is too slow diff --git a/tests/test_data/gb18030.rs b/tests/test_data/gb18030.rs index ef45c03e..cec33adb 100644 --- a/tests/test_data/gb18030.rs +++ b/tests/test_data/gb18030.rs @@ -1,4 +1,4 @@ -use encoding_rs::*; +use encoding_rs2::*; #[test] #[cfg_attr(miri, ignore)] // Miri is too slow diff --git a/tests/test_data/iso_2022_jp.rs b/tests/test_data/iso_2022_jp.rs index 288e65a6..0a30f2ed 100644 --- a/tests/test_data/iso_2022_jp.rs +++ b/tests/test_data/iso_2022_jp.rs @@ -1,4 +1,4 @@ -use encoding_rs::*; +use encoding_rs2::*; #[test] #[cfg_attr(miri, ignore)] // Miri is too slow diff --git a/tests/test_data/shift_jis.rs b/tests/test_data/shift_jis.rs index 24d6215d..f6421326 100644 --- a/tests/test_data/shift_jis.rs +++ b/tests/test_data/shift_jis.rs @@ -1,4 +1,4 @@ -use encoding_rs::*; +use encoding_rs2::*; #[test] #[cfg_attr(miri, ignore)] // Miri is too slow