Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 48 additions & 36 deletions src/uu/dd/src/conversion_tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,43 +9,55 @@

/// A 256-entry byte translation table: the entry at index `b` is the byte that replaces input byte `b`.
pub type ConversionTable = [u8; 256];

/// Maps every byte to its ASCII lowercase form.
///
/// Only `A`..=`Z` (0x41..=0x5a) change, becoming `a`..=`z` (0x61..=0x7a); every
/// other byte, including the whole 0x80..=0xff range, maps to itself.
pub const ASCII_UCASE_TO_LCASE: ConversionTable = {
    let mut table = [0u8; 256];
    let mut byte = 0usize;
    while byte < 256 {
        // `byte` is below 256 here, so the narrowing cast is lossless.
        table[byte] = (byte as u8).to_ascii_lowercase();
        byte += 1;
    }
    table
};
/// Builds a lowercase conversion table using locale-aware libc::tolower

Check failure on line 12 in src/uu/dd/src/conversion_tables.rs

View workflow job for this annotation

GitHub Actions / Style/spelling (ubuntu-latest, feat_os_unix)

ERROR: `cspell`: Unknown word 'tolower' (file:'src/uu/dd/src/conversion_tables.rs', line:12)
/// This function builds the table dynamically based on the current locale.
/// The nix crate does not provide safe wrappers for locale functions, so we use libc directly.
///
/// Entry `i` of the returned table holds `libc::tolower(i)` narrowed to `u8`, for each `i` in `0..256`.
/// As a side effect, every call runs `libc::setlocale` for `LC_CTYPE`.
pub fn build_lcase_table() -> ConversionTable {
// spell-checker:ignore tolower CTYPE
// (cspell in-document directive, added because the spelling check rejects these libc names)
// Initialize the LC_CTYPE locale from the environment; the call is made unconditionally
// and its return value is not inspected.
// SAFETY: setlocale is called with a valid C string and is used to initialize
// the locale for character conversion functions
unsafe { libc::setlocale(libc::LC_CTYPE, c"".as_ptr()) };

Check failure on line 19 in src/uu/dd/src/conversion_tables.rs

View workflow job for this annotation

GitHub Actions / Style/spelling (ubuntu-latest, feat_os_unix)

ERROR: `cspell`: Unknown word 'CTYPE' (file:'src/uu/dd/src/conversion_tables.rs', line:19)

// NOTE(review): this literal appears in the middle of the function body in this view.
// It looks like the fixed ASCII table that the locale-aware builder replaces - confirm
// whether it is meant to remain, and if so that it should not be nested here.
// Fixed mapping: bytes 0x61..=0x7a become 0x41..=0x5a, every other byte maps to itself.
pub const ASCII_LCASE_TO_UCASE: ConversionTable = [
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
];
// Start from an all-zero table; the loop below overwrites every one of the 256 entries.
let mut table = [0u8; 256];
for (i, item) in table.iter_mut().enumerate() {
// SAFETY: tolower is called with a valid byte value and returns a valid byte
// (`i` comes from enumerating a 256-element array, so it is always in 0..=255)

Check failure on line 23 in src/uu/dd/src/conversion_tables.rs

View workflow job for this annotation

GitHub Actions / Style/spelling (ubuntu-latest, feat_os_unix)

ERROR: `cspell`: Unknown word 'tolower' (file:'src/uu/dd/src/conversion_tables.rs', line:23)
// The c_int result is narrowed with `as u8`, which keeps only the low 8 bits.
*item = unsafe { libc::tolower(i as libc::c_int) } as u8;

Check failure on line 24 in src/uu/dd/src/conversion_tables.rs

View workflow job for this annotation

GitHub Actions / Style/spelling (ubuntu-latest, feat_os_unix)

ERROR: `cspell`: Unknown word 'tolower' (file:'src/uu/dd/src/conversion_tables.rs', line:24)
}
table
}

/// Builds an uppercase conversion table using locale-aware libc::toupper

Check failure on line 29 in src/uu/dd/src/conversion_tables.rs

View workflow job for this annotation

GitHub Actions / Style/spelling (ubuntu-latest, feat_os_unix)

ERROR: `cspell`: Unknown word 'toupper' (file:'src/uu/dd/src/conversion_tables.rs', line:29)
/// This function builds the table dynamically based on the current locale.
/// The nix crate does not provide safe wrappers for locale functions, so we use libc directly.
///
/// Entry `i` of the returned table holds `libc::toupper(i)` narrowed to `u8`, for each `i` in `0..256`.
/// As a side effect, every call runs `libc::setlocale` for `LC_CTYPE`.
pub fn build_ucase_table() -> ConversionTable {
// spell-checker:ignore toupper CTYPE
// (cspell in-document directive, added because the spelling check rejects these libc names)
// Initialize the LC_CTYPE locale from the environment; the call is made unconditionally
// and its return value is not inspected.
// SAFETY: setlocale is called with a valid C string and is used to initialize
// the locale for character conversion functions
unsafe { libc::setlocale(libc::LC_CTYPE, c"".as_ptr()) };

Check failure on line 36 in src/uu/dd/src/conversion_tables.rs

View workflow job for this annotation

GitHub Actions / Style/spelling (ubuntu-latest, feat_os_unix)

ERROR: `cspell`: Unknown word 'CTYPE' (file:'src/uu/dd/src/conversion_tables.rs', line:36)

// Start from an all-zero table; the loop below overwrites every one of the 256 entries.
let mut table = [0u8; 256];
for (i, item) in table.iter_mut().enumerate() {
// SAFETY: toupper is called with a valid byte value and returns a valid byte
// (`i` comes from enumerating a 256-element array, so it is always in 0..=255)

Check failure on line 40 in src/uu/dd/src/conversion_tables.rs

View workflow job for this annotation

GitHub Actions / Style/spelling (ubuntu-latest, feat_os_unix)

ERROR: `cspell`: Unknown word 'toupper' (file:'src/uu/dd/src/conversion_tables.rs', line:40)
// The c_int result is narrowed with `as u8`, which keeps only the low 8 bits.
*item = unsafe { libc::toupper(i as libc::c_int) } as u8;

Check failure on line 41 in src/uu/dd/src/conversion_tables.rs

View workflow job for this annotation

GitHub Actions / Style/spelling (ubuntu-latest, feat_os_unix)

ERROR: `cspell`: Unknown word 'toupper' (file:'src/uu/dd/src/conversion_tables.rs', line:41)
}
table
}

/// Gets the uppercase to lowercase conversion table using current locale
pub fn get_ucase_to_lcase_table() -> &'static ConversionTable {
// For now, simply build a fresh table each time and leak it
// This is simpler and safer than trying to manage mutable static state
let table = build_lcase_table();
Box::leak(Box::new(table))
}

/// Gets the lowercase to uppercase conversion table using current locale
pub fn get_lcase_to_ucase_table() -> &'static ConversionTable {
// For now, simply build a fresh table each time and leak it
// This is simpler and safer than trying to manage mutable static state
let table = build_ucase_table();
Box::leak(Box::new(table))
}

pub const ASCII_TO_EBCDIC: ConversionTable = [
0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x25, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
Expand Down
4 changes: 2 additions & 2 deletions src/uu/dd/src/parseargs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -552,8 +552,8 @@ fn get_ctable(
Conversion::Ibm => &ASCII_TO_IBM,
},
(None, Some(case)) => match case {
Case::Lower => &ASCII_UCASE_TO_LCASE,
Case::Upper => &ASCII_LCASE_TO_UCASE,
Case::Lower => get_ucase_to_lcase_table(),
Case::Upper => get_lcase_to_ucase_table(),
},
(Some(conv), Some(case)) => match (conv, case) {
(Conversion::Ascii, Case::Upper) => &EBCDIC_TO_ASCII_LCASE_TO_UCASE,
Expand Down
146 changes: 146 additions & 0 deletions tests/by-util/test_dd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1945,3 +1945,149 @@ fn test_nocache_eof_fadvise_zero_length() {
"Expected len=0 at EOF: {strace}"
);
}

#[test]
fn test_iso8859_1_case_conversion() {
    // Test ISO-8859-1 case conversion for accented characters.
    //
    // The bytes used below only stand for É/é under a single-byte Latin-1 locale, so
    // the test needs a French locale whose codeset is ISO-8859-1. Accepting any name
    // that merely contains "fr_FR" would also accept a UTF-8 locale, and the test
    // would then run under a locale it was not written for. Skip when no suitable
    // locale is installed (common in CI environments).
    let listing = Command::new("locale")
        .arg("-a")
        .output()
        .map(|output| output.stdout)
        .unwrap_or_default();
    // Decode lossily so that one non-UTF-8 name in the listing does not hide the rest.
    let listing = String::from_utf8_lossy(&listing);
    let locale = listing.lines().map(str::trim).find(|name| {
        // Accept spellings such as `fr_FR.iso88591` or `fr_FR.ISO8859-1`.
        name.strip_prefix("fr_FR.")
            .is_some_and(|codeset| codeset.replace('-', "").eq_ignore_ascii_case("iso88591"))
    });

    let Some(locale) = locale else {
        eprintln!("Skipping ISO-8859-1 test: French locale not available");
        return;
    };

    // É (0xC9) should convert to é (0xE9) with lcase
    let input: Vec<u8> = vec![0xC9, 0x0A]; // É\n in ISO-8859-1
    let expected: Vec<u8> = vec![0xE9, 0x0A]; // é\n in ISO-8859-1
    let result = new_ucmd!()
        .args(&["conv=lcase", "status=none"])
        .env("LC_ALL", locale)
        .pipe_in(input)
        .succeeds();
    assert_eq!(result.stdout(), expected);

    // é (0xE9) should convert to É (0xC9) with ucase
    let input: Vec<u8> = vec![0xE9, 0x0A]; // é\n in ISO-8859-1
    let expected: Vec<u8> = vec![0xC9, 0x0A]; // É\n in ISO-8859-1
    let result = new_ucmd!()
        .args(&["conv=ucase", "status=none"])
        .env("LC_ALL", locale)
        .pipe_in(input)
        .succeeds();
    assert_eq!(result.stdout(), expected);
}

#[test]
fn test_locale_aware_case_conversion() {
    // Test that case conversion respects a single-byte locale other than Latin-1.
    //
    // In ISO-8859-9 (Turkish), Ğ is 0xD0 and ğ is 0xF0. The same codeset also holds
    // the dotted/dotless I letters (İ is 0xDD, ı is 0xFD); those are not exercised here.
    let input: Vec<u8> = vec![0xD0, 0x0A]; // Ğ\n in ISO-8859-9
    let expected: Vec<u8> = vec![0xF0, 0x0A]; // ğ\n in ISO-8859-9
    let result = new_ucmd!()
        .args(&["conv=lcase", "status=none"])
        .env("LC_ALL", "tr_TR.iso8859-9")
        .pipe_in(input.clone())
        .succeeds();

    // The Turkish locale may not be installed. In that case the conversion is expected
    // to fall back to C locale behavior and pass the non-ASCII byte through untouched.
    // Any other output (including a wrongly converted byte) is a failure; merely
    // checking for non-empty output would let such a regression through.
    let output = result.stdout();
    assert!(
        output == expected || output == input,
        "expected {expected:?} (Turkish locale) or unchanged {input:?} (fallback), got {output:?}"
    );
}

#[test]
fn test_french_locale_case_conversion() {
    // Test French (ISO-8859-1) case conversion for French accented characters.
    //
    // Each case is (conv argument, input byte, byte expected under fr_FR.iso8859-1).
    let cases: [(&str, u8, u8); 3] = [
        ("conv=lcase", 0xC0, 0xE0), // À -> à
        ("conv=ucase", 0xE0, 0xC0), // à -> À
        ("conv=lcase", 0xC7, 0xE7), // Ç -> ç
    ];

    for (conv, from, to) in cases {
        let input: Vec<u8> = vec![from, 0x0A];
        let expected: Vec<u8> = vec![to, 0x0A];
        let result = new_ucmd!()
            .args(&[conv, "status=none"])
            .env("LC_ALL", "fr_FR.iso8859-1")
            .pipe_in(input.clone())
            .succeeds();

        // The French locale may not be installed. In that case the conversion is
        // expected to fall back to C locale behavior and leave the non-ASCII byte
        // untouched. Anything else is a failure; merely checking for non-empty
        // output would accept a wrongly converted byte.
        let output = result.stdout();
        assert!(
            output == expected || output == input,
            "{conv}: expected {expected:?} (French locale) or unchanged {input:?} (fallback), got {output:?}"
        );
    }
}

#[test]
fn test_ascii_case_conversion_fallback() {
    // ASCII letters must round-trip through lcase/ucase under the plain C locale.
    let upper = b"ABC\n".to_vec();
    let lower = b"abc\n".to_vec();

    // Upper case in, lower case out.
    let lowered = new_ucmd!()
        .args(&["conv=lcase", "status=none"])
        .env("LC_ALL", "C")
        .pipe_in(upper.clone())
        .succeeds();
    assert_eq!(lowered.stdout(), lower);

    // Lower case in, upper case out: the reverse of the conversion above.
    let raised = new_ucmd!()
        .args(&["conv=ucase", "status=none"])
        .env("LC_ALL", "C")
        .pipe_in(lower)
        .succeeds();
    assert_eq!(raised.stdout(), upper);
}
Loading