From c458437d7795c803df8f1fbefca5848804bbea1e Mon Sep 17 00:00:00 2001 From: Tony <1094086026@qq.com> Date: Wed, 27 May 2026 08:56:27 +0800 Subject: [PATCH] fix: disambiguate archive entries when multiple inputs share the same basename When compressing multiple directories that share the same basename (e.g., /a/parent1/Scripts and /a/parent2/Scripts), ouch previously stripped paths to just the basename using file_name(). This caused: - Zip: 'Invalid zip archive - Duplicate filename: Scripts/' - Tar: both directories merge into one, data loss on decompress Fix: detect when input paths have duplicate basenames and, in that case, compute the common ancestor directory of all inputs. Archive entries are then stored relative to this common ancestor, including enough parent path components to disambiguate (e.g., parent1/Scripts vs parent2/Scripts). When basenames don't collide, behavior is unchanged (backward compatible). Applied consistently to all three archive builders: tar, zip, and 7z. Fixes #978 --- src/archive/sevenz.rs | 30 +++++++++++++--- src/archive/tar.rs | 27 ++++++++++++-- src/archive/zip.rs | 33 +++++++++++++---- src/utils/fs.rs | 50 ++++++++++++++++++++++++++ tests/integration.rs | 84 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 211 insertions(+), 13 deletions(-) diff --git a/src/archive/sevenz.rs b/src/archive/sevenz.rs index a45248e61..9d608a784 100644 --- a/src/archive/sevenz.rs +++ b/src/archive/sevenz.rs @@ -18,7 +18,8 @@ use crate::{ info, list::{FileInArchive, ListFileType}, utils::{ - BytesFmt, FileVisibilityPolicy, PathFmt, cd_into_same_dir_as, ensure_parent_dir_exists, is_same_file_as_output, + BytesFmt, FileVisibilityPolicy, PathFmt, cd_into_same_dir_as, common_ancestor, ensure_parent_dir_exists, + has_duplicate_basenames, is_same_file_as_output, }, warning, }; @@ -134,14 +135,35 @@ where let mut writer = sevenz_rust2::ArchiveWriter::new(writer)?; let output_handle = Handle::from_path(output_path); + // When multiple inputs share the same basename, archive entry names would collide. + // In that case, use the common ancestor of all paths and store entries relative to it, + // including enough parent path components to disambiguate. + let use_common_ancestor = files.len() > 1 && has_duplicate_basenames(files); + let common_ancestor = if use_common_ancestor { + Some(common_ancestor(files)) + } else { + None + }; + for filename in files { - let previous_location = cd_into_same_dir_as(filename)?; + let previous_location = if let Some(ref ancestor) = common_ancestor { + let previous = env::current_dir()?; + env::set_current_dir(ancestor)?; + previous + } else { + cd_into_same_dir_as(filename)? + }; // Unwrap safety: // paths should be canonicalized by now, and the root directory rejected. - let filename = filename.file_name().unwrap(); + // When using common ancestor, include enough parent path to avoid collisions. + let walker_root = if let Some(ref ancestor) = common_ancestor { + filename.strip_prefix(ancestor).unwrap().to_path_buf() + } else { + PathBuf::from(filename.file_name().unwrap()) + }; - for entry in file_visibility_policy.build_walker(filename) { + for entry in file_visibility_policy.build_walker(&walker_root) { let entry = entry?; let path = entry.path(); diff --git a/src/archive/tar.rs b/src/archive/tar.rs index 843cd3cf7..128e496b8 100644 --- a/src/archive/tar.rs +++ b/src/archive/tar.rs @@ -144,14 +144,35 @@ where let output_handle = Handle::from_path(output_path); let mut seen_inode: HashMap<(u64, u64), PathBuf> = HashMap::new(); + // When multiple inputs share the same basename, archive entry names would collide. + // In that case, use the common ancestor of all paths and store entries relative to it, + // including enough parent path components to disambiguate. + let use_common_ancestor = explicit_paths.len() > 1 && utils::has_duplicate_basenames(explicit_paths); + let common_ancestor = if use_common_ancestor { + Some(utils::common_ancestor(explicit_paths)) + } else { + None + }; + for explicit_path in explicit_paths { - let previous_location = utils::cd_into_same_dir_as(explicit_path)?; + let previous_location = if let Some(ref ancestor) = common_ancestor { + let previous = env::current_dir()?; + env::set_current_dir(ancestor)?; + previous + } else { + utils::cd_into_same_dir_as(explicit_path)? + }; // Unwrap expectation: // paths should be canonicalized by now, and the root directory rejected. - let filename = explicit_path.file_name().unwrap(); + // When using common ancestor, include enough parent path to avoid collisions. + let walker_root = if let Some(ref ancestor) = common_ancestor { + explicit_path.strip_prefix(ancestor).unwrap().as_os_str() + } else { + explicit_path.file_name().unwrap() + }; - let iter = file_visibility_policy.workaround_build_walker_or_broken_link_path(explicit_path, filename); + let iter = file_visibility_policy.workaround_build_walker_or_broken_link_path(explicit_path, walker_root); for entry in iter { let path = entry?; diff --git a/src/archive/zip.rs b/src/archive/zip.rs index eae9d327a..feb47186c 100644 --- a/src/archive/zip.rs +++ b/src/archive/zip.rs @@ -21,9 +21,9 @@ use crate::{ info, info_accessible, list::{FileInArchive, ListFileType}, utils::{ - BytesFmt, FileType, FileVisibilityPolicy, PathFmt, canonicalize, cd_into_same_dir_as, create_symlink, - ensure_parent_dir_exists, get_invalid_utf8_paths, is_same_file_as_output, pretty_format_list_of_paths, - read_file_type, strip_cur_dir, + BytesFmt, FileType, FileVisibilityPolicy, PathFmt, canonicalize, cd_into_same_dir_as, common_ancestor, + create_symlink, ensure_parent_dir_exists, get_invalid_utf8_paths, has_duplicate_basenames, + is_same_file_as_output, pretty_format_list_of_paths, read_file_type, strip_cur_dir, }, warning, }; @@ -180,14 +180,35 @@ where return Err(error.into()); } + // When multiple inputs share the same basename, archive entry names would collide. + // In that case, use the common ancestor of all paths and store entries relative to it, + // including enough parent path components to disambiguate. + let use_common_ancestor = input_filenames.len() > 1 && has_duplicate_basenames(input_filenames); + let common_ancestor = if use_common_ancestor { + Some(common_ancestor(input_filenames)) + } else { + None + }; + for explicit_path in input_filenames { - let previous_location = cd_into_same_dir_as(explicit_path)?; + let previous_location = if let Some(ref ancestor) = common_ancestor { + let previous = env::current_dir()?; + env::set_current_dir(ancestor)?; + previous + } else { + cd_into_same_dir_as(explicit_path)? + }; // Unwrap safety: // paths should be canonicalized by now, and the root directory rejected. - let filename = explicit_path.file_name().unwrap(); + // When using common ancestor, include enough parent path to avoid collisions. + let walker_root = if let Some(ref ancestor) = common_ancestor { + explicit_path.strip_prefix(ancestor).unwrap().as_os_str() + } else { + explicit_path.file_name().unwrap() + }; - let iter = file_visibility_policy.workaround_build_walker_or_broken_link_path(explicit_path, filename); + let iter = file_visibility_policy.workaround_build_walker_or_broken_link_path(explicit_path, walker_root); for entry in iter { let path = entry?; diff --git a/src/utils/fs.rs b/src/utils/fs.rs index b9df5e219..eefcef6f9 100644 --- a/src/utils/fs.rs +++ b/src/utils/fs.rs @@ -138,6 +138,56 @@ pub fn cd_into_same_dir_as(filename: &Path) -> Result { Ok(previous_location) } +/// Check if any of the given paths share the same basename (file_name). +/// +/// Returns `true` if there are duplicate basenames among the paths. +/// Used to detect when archive entry names would collide. +pub fn has_duplicate_basenames(paths: &[PathBuf]) -> bool { + use std::{collections::HashSet, ffi::OsStr}; + + let basenames: Vec<&OsStr> = paths.iter().filter_map(|p| p.file_name()).collect(); + + let mut seen = HashSet::new(); + for name in &basenames { + if !seen.insert(name) { + return true; + } + } + false +} + +/// Compute the longest common ancestor directory shared by all given paths. +/// +/// All paths should be absolute (canonicalized). Returns the longest path +/// prefix that is a common ancestor of all input paths. +/// +/// For example: +/// - `/a/b/Scripts` and `/c/d/Scripts` → `/` +/// - `/home/user/a/Scripts` and `/home/user/b/Scripts` → `/home/user` +pub fn common_ancestor(paths: &[PathBuf]) -> PathBuf { + use std::path::Component; + + if paths.is_empty() { + return PathBuf::new(); + } + + let components: Vec> = paths.iter().map(|p| p.components().collect()).collect(); + + let min_len = components.iter().map(|c| c.len()).min().unwrap_or(0); + + let mut common = PathBuf::new(); + for i in 0..min_len { + let component = components[0][i]; + if components.iter().all(|c| c[i] == component) { + common.push(component.as_os_str()); + } else { + break; + } + } + + common +} + /// Check if a path refers to the same file as the output handle. pub fn is_same_file_as_output(path: &Path, output_handle: &Handle) -> bool { if matches!(Handle::from_path(path), Ok(x) if &x == output_handle) { diff --git a/tests/integration.rs b/tests/integration.rs index 4c30533b7..900b16dd3 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -1569,3 +1569,87 @@ fn decompress_concatenated_lz4_frames() { encoder.finish().unwrap() }); } + +/// Test that compressing multiple directories with the same basename works correctly. +/// This is the fix for issue #978: when two input directories share the same name +/// (e.g., `/a/parent1/Scripts` and `/a/parent2/Scripts`), ouch should include enough +/// parent path to disambiguate them in the archive, rather than collapsing both into +/// a single "Scripts" entry. +#[test] +fn compress_dirs_with_same_basename_tar() { + let (_tempdir, dir) = testdir().unwrap(); + + // Create two directories with the same basename but different parents + let dir_a = dir.join("parent1").join("Scripts"); + let dir_b = dir.join("parent2").join("Scripts"); + fs::create_dir_all(&dir_a).unwrap(); + fs::create_dir_all(&dir_b).unwrap(); + + // Create distinct files in each + fs::write(dir_a.join("a.txt"), "content from A").unwrap(); + fs::write(dir_b.join("b.txt"), "content from B").unwrap(); + + let archive = dir.join("archive.tar"); + ouch!("-A", "c", &dir_a, &dir_b, &archive); + + let after = dir.join("after"); + ouch!("-A", "d", &archive, "-d", &after); + + // Both directories should be present with their parent disambiguators + assert!( + after.join("parent1").join("Scripts").join("a.txt").exists(), + "parent1/Scripts/a.txt missing from archive" + ); + assert!( + after.join("parent2").join("Scripts").join("b.txt").exists(), + "parent2/Scripts/b.txt missing from archive" + ); + + // Verify file contents + assert_eq!( + "content from A", + fs::read_to_string(after.join("parent1").join("Scripts").join("a.txt")).unwrap() + ); + assert_eq!( + "content from B", + fs::read_to_string(after.join("parent2").join("Scripts").join("b.txt")).unwrap() + ); +} + +/// Same as above but for zip format +#[test] +fn compress_dirs_with_same_basename_zip() { + let (_tempdir, dir) = testdir().unwrap(); + + let dir_a = dir.join("parent1").join("Scripts"); + let dir_b = dir.join("parent2").join("Scripts"); + fs::create_dir_all(&dir_a).unwrap(); + fs::create_dir_all(&dir_b).unwrap(); + + fs::write(dir_a.join("a.txt"), "content from A").unwrap(); + fs::write(dir_b.join("b.txt"), "content from B").unwrap(); + + let archive = dir.join("archive.zip"); + ouch!("-A", "c", &dir_a, &dir_b, &archive); + + let after = dir.join("after"); + ouch!("-A", "d", &archive, "-d", &after); + + assert!( + after.join("parent1").join("Scripts").join("a.txt").exists(), + "parent1/Scripts/a.txt missing from zip archive" + ); + assert!( + after.join("parent2").join("Scripts").join("b.txt").exists(), + "parent2/Scripts/b.txt missing from zip archive" + ); + + assert_eq!( + "content from A", + fs::read_to_string(after.join("parent1").join("Scripts").join("a.txt")).unwrap() + ); + assert_eq!( + "content from B", + fs::read_to_string(after.join("parent2").join("Scripts").join("b.txt")).unwrap() + ); +}