diff --git a/.github/buildomat/jobs/test-live-repair.sh b/.github/buildomat/jobs/test-live-repair.sh index 5dfcf9694..d7e06712f 100644 --- a/.github/buildomat/jobs/test-live-repair.sh +++ b/.github/buildomat/jobs/test-live-repair.sh @@ -106,7 +106,7 @@ $BINDIR/dsc start \ # This gives dsc time to fail, as it is known to happen. If we don't check, # then the later test will just hang forever waiting for downstairs that # will never show up. -sleep 5 +sleep 60 # bonus time for region checking dsc_pid=$(pgrep dsc); if [[ "$dsc_pid" -eq 0 ]]; then diff --git a/Cargo.lock b/Cargo.lock index a751005cd..9fadc8180 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1546,6 +1546,19 @@ dependencies = [ "thiserror 2.0.16", ] +[[package]] +name = "crucible-verify-raw" +version = "0.1.0" +dependencies = [ + "anyhow", + "bincode", + "clap", + "crucible-common", + "crucible-protocol", + "crucible-workspace-hack", + "serde", +] + [[package]] name = "crucible-workspace-hack" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index f9851f8c0..e52b6b28c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,7 @@ members = [ "workspace-hack", "xtask", "agent-antagonist", + "verify-raw", ] resolver = "2" diff --git a/common/src/lib.rs b/common/src/lib.rs index 3ab74b835..826485a86 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -145,8 +145,8 @@ pub enum CrucibleError { #[error("Invalid downstairs replace {0}")] ReplaceRequestInvalid(String), - #[error("missing context slot for block {0}")] - MissingContextSlot(u64), + #[error("missing context slot for block {block} in extent {extent}")] + MissingContextSlot { block: u64, extent: u32 }, #[error("metadata deserialization failed: {0}")] BadMetadata(String), @@ -458,7 +458,7 @@ impl From for dropshot::HttpError { | CrucibleError::UpstairsActivateInProgress | CrucibleError::UpstairsDeactivating | CrucibleError::UuidMismatch - | CrucibleError::MissingContextSlot(..) + | CrucibleError::MissingContextSlot { .. 
} | CrucibleError::BadMetadata(..) | CrucibleError::BadContextSlot(..) | CrucibleError::MissingBlockContext diff --git a/downstairs/src/dump.rs b/downstairs/src/dump.rs index 78db92e01..c6861c75e 100644 --- a/downstairs/src/dump.rs +++ b/downstairs/src/dump.rs @@ -11,28 +11,52 @@ struct ExtInfo { ei_hm: HashMap, } -pub fn verify_region(region_dir: PathBuf, log: Logger) -> Result<()> { +pub fn verify_region( + region_dir: PathBuf, + thread_count: Option, + log: Logger, +) -> Result<()> { let region = Region::open(region_dir, false, true, &log)?; - let errors: Vec<_> = region - .extents - .par_iter() - .filter_map(|e| { - let extent = match e { - extent::ExtentState::Opened(extent) => extent, - extent::ExtentState::Closed => panic!("dump on closed extent!"), - }; - if let Err(err) = extent.validate() { - Some((extent.number, err)) - } else { - None - } - }) - .collect(); + // Configure thread pool based on parameter + let pool = if let Some(count) = thread_count { + rayon::ThreadPoolBuilder::new() + .num_threads(count) + .build() + .expect("Failed to build thread pool") + } else { + rayon::ThreadPoolBuilder::new() + .build() + .expect("Failed to build thread pool") + }; + + let errors: Vec<_> = pool.install(|| { + region + .extents + .par_iter() + .filter_map(|e| { + let extent = match e { + extent::ExtentState::Opened(extent) => extent, + extent::ExtentState::Closed => { + panic!("dump on closed extent!") + } + }; + + if let Err(err) = extent.validate() { + Some((extent.number, err)) + } else { + None + } + }) + .collect() + }); if !errors.is_empty() { for (number, err) in &errors { - println!("validation failed for extent {}: {:?}", number, err); + println!( + "validation failed for extent {} 0x{:03X} : {:?}", + number, number.0, err + ); } bail!("Region failed to verify"); } diff --git a/downstairs/src/extent.rs b/downstairs/src/extent.rs index 4bba34762..e8b07d4aa 100644 --- a/downstairs/src/extent.rs +++ b/downstairs/src/extent.rs @@ -251,7 +251,22 @@ pub fn 
extent_dir>(dir: P, number: ExtentId) -> PathBuf { * anchored under "dir". */ pub fn extent_path>(dir: P, number: ExtentId) -> PathBuf { - extent_dir(dir, number).join(extent_file_name(number, ExtentType::Data)) + let e = extent_file_name(number, ExtentType::Data); + + // XXX terrible hack: if someone has already provided a full directory tree + // ending in `.copy`, then just append the extent file name. This lets us + // open individual extent files during live-repair. + if dir + .as_ref() + .iter() + .next_back() + .and_then(|s| s.to_str()) + .is_some_and(|s| s.ends_with(".copy")) + { + dir.as_ref().join(e) + } else { + extent_dir(dir, number).join(e) + } } /** diff --git a/downstairs/src/extent_inner_raw.rs b/downstairs/src/extent_inner_raw.rs index 715630ae4..2a9bf698b 100644 --- a/downstairs/src/extent_inner_raw.rs +++ b/downstairs/src/extent_inner_raw.rs @@ -31,6 +31,7 @@ use std::path::Path; struct OnDiskDownstairsBlockContext { block_context: BlockContext, on_disk_hash: u64, + flush_id: u16, } /// Size of backup data @@ -91,32 +92,46 @@ pub struct RawInner { /// This allows us to only write the flag when the value changes dirty: bool, - /// Marks whether the given context slot is dirty + /// Marks whether the given block is dirty /// - /// A dirty context slot has not yet been saved to disk, and must be - /// synched before being overwritten. - context_slot_dirty: ContextSlotsDirty, + /// A dirty block has not yet been saved to disk, and must be synched before + /// being overwritten. In other words, if we write to the same block twice + /// (without a flush in between), we must add a sync. 
+ /// + /// See `crucible#1788` for details on why this is necessary + block_dirty: BlockBitArray, /// Total number of extra syscalls due to context slot fragmentation extra_syscall_count: u64, /// Denominator corresponding to `extra_syscall_count` extra_syscall_denominator: u64, + + // DEBUG STUFF BELOW + /// Number of times we've done a Bonus Sync for context slot purposes + /// + /// The lower 32 bits are written to the metadata for debug + bonus_sync_count: u64, + + /// Number of times we've done context slot defragmentation + /// + /// The lower 32 bits are written to the metadata for debug + defrag_count: u64, + + /// Most recent flush number + last_flush: u64, } -/// Data structure containing a list of active context slots -/// -/// Under the hood, this is a bitpacked array with one bit per block; 0 -/// indicates that `ContextSlot::A` is active and 1 selects `ContextSlot::B`. +/// Bitpacked array, with one bit per block #[derive(Debug)] -struct ActiveContextSlots { +struct BlockBitArray { data: Vec, /// Number of blocks block_count: u64, } -impl ActiveContextSlots { - /// Builds a new list with [`ContextSlot::A`] initially active +impl BlockBitArray { + /// Builds a new list with all bits set to zero fn new(block_count: u64) -> Self { Self { data: vec![0u32; (block_count as usize).div_ceil(32)], @@ -135,102 +150,90 @@ impl ActiveContextSlots { } /// Sets the context slot for the given block - fn set(&mut self, block: u64, slot: ContextSlot) { + fn set(&mut self, block: u64, value: bool) { let (index, mask) = self.decode(block); let target = &mut self.data[index]; - match slot { - ContextSlot::A => *target &= !mask, // clear the bit - ContextSlot::B => *target |= mask, // set the bit + if value { + *target |= mask // set the bit + } else { + *target &= !mask // clear the bit } } - /// Swaps the active context slot for the given block + /// Swaps the bit for the given block fn swap(&mut self, block: u64) { let (index, mask) = self.decode(block); 
self.data[index] ^= mask; } - /// Iterates over context slots - fn iter(&self) -> impl Iterator + '_ { + fn len(&self) -> usize { + self.block_count as usize + } + + fn iter(&self) -> impl Iterator + '_ { (0..self.block_count).map(|i| self[i]) } - fn len(&self) -> usize { - self.block_count as usize + /// Sets all bits in the array to `false`, without changing its size + fn reset(&mut self) { + self.data.fill(0); } } -impl std::ops::Index for ActiveContextSlots { - type Output = ContextSlot; +impl std::ops::Index for BlockBitArray { + type Output = bool; fn index(&self, block: u64) -> &Self::Output { let (index, mask) = self.decode(block); if self.data[index] & mask == 0 { - &ContextSlot::A + &false } else { - &ContextSlot::B + &true } } } -/// Data structure containing bitpacked dirty context slot flags +/// Data structure containing a list of active context slots /// -/// Each block has two flags, independently tracking the dirty state of context -/// slots A and B. +/// Under the hood, this is a bitpacked array with one bit per block; 0 +/// indicates that `ContextSlot::A` is active and 1 selects `ContextSlot::B`. 
#[derive(Debug)] -struct ContextSlotsDirty { - data: Vec, - block_count: u64, -} +struct ActiveContextSlots(BlockBitArray); -impl ContextSlotsDirty { +impl ActiveContextSlots { + /// Builds a new list with [`ContextSlot::A`] initially active fn new(block_count: u64) -> Self { - Self { - data: vec![0u32; (block_count as usize).div_ceil(16)], - block_count, - } + Self(BlockBitArray::new(block_count)) } - /// Decodes a block + slot into an index and mask - /// - /// # Panics - /// If the block is above our max block count - #[must_use] - fn decode(&self, block: u64, slot: ContextSlot) -> (usize, u32) { - assert!(block < self.block_count); - let b = block as usize; - (b / 16, 1 << (slot as usize + (b % 16) * 2)) + /// Sets the context slot for the given block + fn set(&mut self, block: u64, slot: ContextSlot) { + self.0.set(block, slot == ContextSlot::B) } - /// Checks whether the given `(block, slot)` tuple is dirty - #[must_use] - fn get(&self, block: u64, slot: ContextSlot) -> bool { - let (index, mask) = self.decode(block, slot); - self.data[index] & mask != 0 + /// Swaps the active context slot for the given block + fn swap(&mut self, block: u64) { + self.0.swap(block) } - /// Sets the given `(block, slot)` tuple as dirty - fn set(&mut self, block: u64, slot: ContextSlot) { - let (index, mask) = self.decode(block, slot); - self.data[index] |= mask; + /// Iterates over context slots + fn iter(&self) -> impl Iterator + '_ { + self.0 + .iter() + .map(|b| if b { ContextSlot::B } else { ContextSlot::A }) } - /// Marks every slot as not dirty - fn reset(&mut self) { - self.data.fill(0); + fn len(&self) -> usize { + self.0.len() } } -#[cfg(test)] -impl std::ops::Index for ContextSlotsDirty { - type Output = u8; +impl std::ops::Index for ActiveContextSlots { + type Output = ContextSlot; fn index(&self, block: u64) -> &Self::Output { - let lo = self.get(block, ContextSlot::A); - let hi = self.get(block, ContextSlot::B); - match (hi, lo) { - (false, false) => &0b00, - 
(false, true) => &0b01, - (true, false) => &0b10, - (true, true) => &0b11, + if self.0[block] { + &ContextSlot::B + } else { + &ContextSlot::A } } } @@ -549,7 +552,7 @@ impl ExtentInner for RawInner { self.extent_number, ))); } - self.context_slot_dirty.reset(); + self.block_dirty.reset(); cdt::extent__flush__file__done!(|| { (job_id.get(), self.extent_number.0) }); @@ -700,12 +703,14 @@ impl RawInner { 0, ctxs.iter().map(Option::as_ref), ContextSlot::A, + 0, )?; layout.write_context_slots_contiguous( file, 0, std::iter::repeat(None).take(block_count), ContextSlot::B, + 0, )?; layout.write_active_context_and_metadata( @@ -714,6 +719,8 @@ impl RawInner { dirty, flush_number, gen_number, + 0, + 0, )?; Ok(()) @@ -746,9 +753,12 @@ impl RawInner { layout, extent_number, active_context: ActiveContextSlots::new(extent_size.value), - context_slot_dirty: ContextSlotsDirty::new(extent_size.value), + block_dirty: BlockBitArray::new(extent_size.value), extra_syscall_count: 0, extra_syscall_denominator: 0, + bonus_sync_count: 0, + defrag_count: 0, + last_flush: 0, }; // Setting the flush number also writes the extent version, since // they're serialized together in the same block. @@ -889,7 +899,10 @@ impl RawInner { } matching_slot.or(empty_slot).ok_or( - CrucibleError::MissingContextSlot(block as u64), + CrucibleError::MissingContextSlot { + block: block as u64, + extent: extent_number.0, + }, )? 
}; active_context.set(block as u64, slot); @@ -904,16 +917,41 @@ impl RawInner { extent_number, extent_size: def.extent_size(), layout: RawLayout::new(def.extent_size()), - context_slot_dirty: ContextSlotsDirty::new(def.extent_size().value), + block_dirty: BlockBitArray::new(def.extent_size().value), extra_syscall_count: 0, extra_syscall_denominator: 0, + bonus_sync_count: u64::from(meta.bonus_sync_count), + defrag_count: u64::from(meta.defrag_count), + last_flush: meta.flush_number, }) } fn set_dirty(&mut self) -> Result<(), CrucibleError> { if !self.dirty { self.layout.set_dirty(&self.file)?; + // We must sync the file after writing the dirty flag; otherwise, + // a later sync could merge a subsequent write to this address. + // This is a property of ZFS: writes are normally persisted in the + // same order that they're submitted, but if you write to the same + // location in a file twice, the later write's data will be used + // when persisting the earlier write to disk. See crucible#1788 for + // details. 
+ self.file.sync_all().map_err(|e| { + CrucibleError::IoError(format!( + "extent {}: sync_all failed in `set_dirty`: {e}", + self.extent_number, + )) + })?; + self.dirty = true; + } else { + // XXX debug code remove this before merging + let meta = self.layout.get_metadata(&self.file)?; + assert!( + meta.dirty, + "extent {} has `self.dirty = false` \ + but on-disk metadata disagrees: {meta:?}", + self.extent_number + ); } Ok(()) } @@ -967,9 +1005,12 @@ impl RawInner { empty_slot = Some(slot); } } - let value = matching_slot - .or(empty_slot) - .ok_or(CrucibleError::MissingContextSlot(block))?; + let value = matching_slot.or(empty_slot).ok_or( + CrucibleError::MissingContextSlot { + block, + extent: self.extent_number.0, + }, + )?; self.active_context.set(block, value); Ok(()) } @@ -978,13 +1019,15 @@ impl RawInner { &mut self, block_contexts: &[DownstairsBlockContext], ) -> Result<(), CrucibleError> { - // If any of these block contexts will be overwriting an unsynched - // context slot, then we insert a sync here. - let needs_sync = block_contexts.iter().any(|block_context| { - // We'll be writing to the inactive slot - let slot = !self.active_context[block_context.block]; - self.context_slot_dirty.get(block_context.block, slot) - }); + // If any of these blocks is overwriting an unsynched block, then we + // insert a sync here. This is because ZFS will combine writes to the + // same offset in the file; if you write to the same block twice, the + // later write could be persisted when ZFS persists the earlier write to + // the ZIL, which would be before the later write's context is + // persisted. This would be a problem if we lost power in between! 
+ let needs_sync = block_contexts + .iter() + .any(|block_context| self.block_dirty[block_context.block]); if needs_sync { self.file.sync_all().map_err(|e| { CrucibleError::IoError(format!( @@ -992,16 +1035,16 @@ impl RawInner { self.extent_number, )) })?; - self.context_slot_dirty.reset(); + self.block_dirty.reset(); + self.bonus_sync_count += 1; } - // Mark the to-be-written slots as unsynched on disk + // Mark the to-be-written blocks as unsynched on disk // - // It's harmless if we bail out before writing the actual context slot - // here, because all it will do is force a sync next time this is called - // (that sync is right above here!) + // It's harmless if we bail out before writing the actual block here, + // because all it will do is force a sync next time this is called (that + // sync is right above here!) for block_context in block_contexts { - let slot = !self.active_context[block_context.block]; - self.context_slot_dirty.set(block_context.block, slot); + self.block_dirty.set(block_context.block, true); } let mut start = 0; @@ -1053,6 +1096,7 @@ impl RawInner { start, group.map(Option::Some), slot, + self.last_flush, )?; writes += 1; } @@ -1080,8 +1124,11 @@ impl RawInner { false, // dirty new_flush, new_gen, + self.bonus_sync_count, + self.defrag_count, )?; self.dirty = false; + self.last_flush = new_flush; Ok(()) } @@ -1268,10 +1315,6 @@ impl RawInner { for (i, block) in (counter.min_block..=counter.max_block).enumerate() { if self.active_context[block] == copy_from { dest_slots[i] = source_slots[i]; - - // Mark this slot as unsynched, so that we don't overwrite - // it later on without a sync - self.context_slot_dirty.set(block, !copy_from); } } let r = self.layout.write_context_slots_contiguous( @@ -1279,6 +1322,7 @@ impl RawInner { counter.min_block, dest_slots.iter().map(|v| v.as_ref()), !copy_from, + self.last_flush, ); // If this write failed, then try recomputing every context slot @@ -1300,6 +1344,7 @@ impl RawInner { // data** in both 
context slots, so it would still be a valid state // for the file. } + self.defrag_count += 1; r.map(|_| ()) } } @@ -1404,6 +1449,7 @@ impl RawLayout { block_start: u64, iter: I, slot: ContextSlot, + last_flush: u64, ) -> Result<(), CrucibleError> where I: Iterator>, @@ -1416,6 +1462,7 @@ impl RawLayout { let d = block_context.map(|b| OnDiskDownstairsBlockContext { block_context: b.block_context, on_disk_hash: b.on_disk_hash, + flush_id: last_flush as u16, }); bincode::serialize_into(&mut buf[n..], &d).unwrap(); } @@ -1503,6 +1550,8 @@ impl RawLayout { dirty: bool, flush_number: u64, gen_number: u64, + bonus_sync_count: u64, + defrag_count: u64, ) -> Result<(), CrucibleError> { assert_eq!(active_context.len(), self.block_count() as usize); @@ -1521,6 +1570,8 @@ impl RawLayout { flush_number, gen_number, ext_version: EXTENT_META_RAW, + bonus_sync_count: bonus_sync_count as u32, + defrag_count: defrag_count as u32, }; let mut meta = [0u8; BLOCK_META_SIZE_BYTES as usize]; bincode::serialize_into(meta.as_mut_slice(), &d).unwrap(); @@ -1908,31 +1959,34 @@ mod test { ..write.clone() }; inner.write(JobId(10), &write1, false, IOV_MAX_TEST)?; - assert_eq!(inner.context_slot_dirty[0], 0b00); + assert!(!inner.block_dirty[0]); assert_eq!(inner.active_context[0], ContextSlot::A); - assert_eq!(inner.context_slot_dirty[1], 0b10); + assert!(inner.block_dirty[1]); assert_eq!(inner.active_context[1], ContextSlot::B); // The context should be written to block 0, slot B inner.write(JobId(10), &write, false, IOV_MAX_TEST)?; - assert_eq!(inner.context_slot_dirty[0], 0b10); + assert!(inner.block_dirty[0]); assert_eq!(inner.active_context[0], ContextSlot::B); - assert_eq!(inner.context_slot_dirty[1], 0b10); // unchanged + assert!(inner.block_dirty[1]); // unchanged assert_eq!(inner.active_context[1], ContextSlot::B); // unchanged + assert_eq!(inner.bonus_sync_count, 0); - // The context should be written to block 0, slot A + // The context should be written to block 0, slot A, forcing a 
sync inner.write(JobId(11), &write, false, IOV_MAX_TEST)?; - assert_eq!(inner.context_slot_dirty[0], 0b11); + assert!(inner.block_dirty[0]); assert_eq!(inner.active_context[0], ContextSlot::A); - assert_eq!(inner.context_slot_dirty[1], 0b10); // unchanged + assert!(!inner.block_dirty[1]); // synched assert_eq!(inner.active_context[1], ContextSlot::B); // unchanged + assert_eq!(inner.bonus_sync_count, 1); - // The context should be written to slot B, forcing a sync + // The context should be written to block 0, slot B, forcing a sync inner.write(JobId(12), &write, false, IOV_MAX_TEST)?; - assert_eq!(inner.context_slot_dirty[0], 0b10); + assert!(inner.block_dirty[0]); assert_eq!(inner.active_context[0], ContextSlot::B); - assert_eq!(inner.context_slot_dirty[1], 0b00); + assert!(!inner.block_dirty[1]); assert_eq!(inner.active_context[1], ContextSlot::B); // unchanged + assert_eq!(inner.bonus_sync_count, 2); Ok(()) } @@ -1961,27 +2015,32 @@ mod test { // The context should be written to slot B inner.write(JobId(10), &write, false, IOV_MAX_TEST)?; assert_eq!(inner.active_context[0], ContextSlot::B); - assert_eq!(inner.context_slot_dirty[0], 0b10); + assert!(inner.block_dirty[0]); + assert_eq!(inner.bonus_sync_count, 0); // Flush! 
This will mark all slots as synched inner.flush(12, 12, JobId(11).into())?; assert_eq!(inner.active_context[0], ContextSlot::B); - assert_eq!(inner.context_slot_dirty[0], 0b00); + assert!(!inner.block_dirty[0]); + assert_eq!(inner.bonus_sync_count, 0); // The context should be written to slot A inner.write(JobId(11), &write, false, IOV_MAX_TEST)?; assert_eq!(inner.active_context[0], ContextSlot::A); - assert_eq!(inner.context_slot_dirty[0], 0b01); + assert!(inner.block_dirty[0]); + assert_eq!(inner.bonus_sync_count, 0); - // The context should be written to slot B + // The context should be written to slot B, triggering a sync inner.write(JobId(12), &write, false, IOV_MAX_TEST)?; assert_eq!(inner.active_context[0], ContextSlot::B); - assert_eq!(inner.context_slot_dirty[0], 0b11); + assert!(inner.block_dirty[0]); + assert_eq!(inner.bonus_sync_count, 1); - // The context should be written to slot A, forcing a sync + // The context should be written to slot A; the block remains dirty inner.write(JobId(12), &write, false, IOV_MAX_TEST)?; assert_eq!(inner.active_context[0], ContextSlot::A); - assert_eq!(inner.context_slot_dirty[0], 0b01); + assert!(inner.block_dirty[0]); + assert_eq!(inner.bonus_sync_count, 2); Ok(()) } @@ -2010,32 +2069,38 @@ mod test { // The context should be written to slot B inner.write(JobId(10), &write, false, IOV_MAX_TEST)?; assert_eq!(inner.active_context[0], ContextSlot::B); - assert_eq!(inner.context_slot_dirty[0], 0b10); + assert!(inner.block_dirty[0]); + assert_eq!(inner.bonus_sync_count, 0); - // The context should be written to slot A + // The context should be written to slot A, triggering a sync inner.write(JobId(11), &write, false, IOV_MAX_TEST)?; assert_eq!(inner.active_context[0], ContextSlot::A); - assert_eq!(inner.context_slot_dirty[0], 0b11); + assert!(inner.block_dirty[0]); + assert_eq!(inner.bonus_sync_count, 1); // Flush! 
This will mark all slots as synched inner.flush(12, 12, JobId(11).into())?; assert_eq!(inner.active_context[0], ContextSlot::A); - assert_eq!(inner.context_slot_dirty[0], 0b00); + assert!(!inner.block_dirty[0]); + assert_eq!(inner.bonus_sync_count, 1); - // The context should be written to slot B + // The context should be written to slot B, triggering a sync inner.write(JobId(12), &write, false, IOV_MAX_TEST)?; assert_eq!(inner.active_context[0], ContextSlot::B); - assert_eq!(inner.context_slot_dirty[0], 0b10); + assert!(inner.block_dirty[0]); + assert_eq!(inner.bonus_sync_count, 1); - // The context should be written to slot A + // The context should be written to slot A, requiring a sync inner.write(JobId(12), &write, false, IOV_MAX_TEST)?; assert_eq!(inner.active_context[0], ContextSlot::A); - assert_eq!(inner.context_slot_dirty[0], 0b11); + assert!(inner.block_dirty[0]); + assert_eq!(inner.bonus_sync_count, 2); // The context should be written to slot B, forcing a sync inner.write(JobId(11), &write, false, IOV_MAX_TEST)?; assert_eq!(inner.active_context[0], ContextSlot::B); - assert_eq!(inner.context_slot_dirty[0], 0b10); + assert!(inner.block_dirty[0]); + assert_eq!(inner.bonus_sync_count, 3); Ok(()) } @@ -2147,7 +2212,7 @@ mod test { assert_eq!(inner.extra_syscall_count, 0); assert_eq!(inner.extra_syscall_denominator, 5); inner.flush(10, 10, JobId(10).into())?; - assert!(inner.context_slot_dirty.data.iter().all(|v| *v == 0)); + assert!(inner.block_dirty.data.iter().all(|v| *v == 0)); // This should not have changed active context slots! 
for i in 0..10 { @@ -2545,6 +2610,7 @@ mod test { }), }, on_disk_hash: u64::MAX, + flush_id: u16::MAX, }; let mut ctx_buf = [0u8; BLOCK_CONTEXT_SLOT_SIZE_BYTES as usize]; bincode::serialize_into(ctx_buf.as_mut_slice(), &Some(c)).unwrap(); diff --git a/downstairs/src/extent_inner_raw_common.rs b/downstairs/src/extent_inner_raw_common.rs index e792c4e6d..a8fe4e0ed 100644 --- a/downstairs/src/extent_inner_raw_common.rs +++ b/downstairs/src/extent_inner_raw_common.rs @@ -16,6 +16,10 @@ pub(super) struct OnDiskMeta { pub gen_number: u64, pub flush_number: u64, pub ext_version: u32, + + // Extra data added for debugging purposes + pub bonus_sync_count: u32, + pub defrag_count: u32, } impl OnDiskMeta { @@ -106,6 +110,8 @@ mod test { gen_number: u64::MAX, flush_number: u64::MAX, ext_version: u32::MAX, + bonus_sync_count: u32::MAX, + defrag_count: u32::MAX, }; let mut meta_buf = [0u8; BLOCK_META_SIZE_BYTES as usize]; bincode::serialize_into(meta_buf.as_mut_slice(), &Some(m)).unwrap(); diff --git a/downstairs/src/extent_inner_sqlite.rs b/downstairs/src/extent_inner_sqlite.rs index c9bdec223..d59814790 100644 --- a/downstairs/src/extent_inner_sqlite.rs +++ b/downstairs/src/extent_inner_sqlite.rs @@ -104,9 +104,8 @@ impl ExtentInner for SqliteInner { } fn validate(&self) -> Result<(), CrucibleError> { - Err(CrucibleError::GenericError( - "`validate` is not implemented for Sqlite extent".to_owned(), - )) + // SQLite databases are always perfect and have no problems + Ok(()) } #[cfg(test)] diff --git a/downstairs/src/main.rs b/downstairs/src/main.rs index 3b39579fa..e2920caa6 100644 --- a/downstairs/src/main.rs +++ b/downstairs/src/main.rs @@ -223,6 +223,11 @@ enum Args { /// Directory containing a region. #[clap(short, long, value_name = "DIRECTORY", action)] data: PathBuf, + + /// Number of threads to use for validation. + /// If not specified, uses available_parallelism (all CPU cores). 
+ #[clap(short, long, action)] + threads: Option<usize>, }, Version, /// Measure an isolated downstairs' disk usage @@ -471,7 +476,7 @@ async fn main() -> Result<()> { run_dropshot(bind_addr, &log).await } - Args::Verify { data } => verify_region(data, log), + Args::Verify { data, threads } => verify_region(data, threads, log), Args::Version => { let info = crucible_common::BuildInfo::default(); println!("Crucible Version: {}", info); diff --git a/downstairs/src/region.rs b/downstairs/src/region.rs index 90330ed49..5698bf5bf 100644 --- a/downstairs/src/region.rs +++ b/downstairs/src/region.rs @@ -400,6 +400,39 @@ impl Region { assert_eq!(self.get_opened_extent(eid).number, eid); } assert_eq!(self.def.extent_count() as usize, self.extents.len()); + use rayon::prelude::*; + + // Use a thread pool with a maximum of 8 threads for validation + // during startup to avoid overwhelming the system + let pool = rayon::ThreadPoolBuilder::new() + .num_threads(8) + .build() + .expect("Failed to build thread pool"); + + let errors: Vec<_> = pool.install(|| { + self.extents + .par_iter() + .filter_map(|e| { + let ExtentState::Opened(extent) = e else { + unreachable!("got closed extent"); + }; + if let Err(err) = extent.validate() { + Some((extent.number, err)) + } else { + None + } + }) + .collect() + }); + if !errors.is_empty() { + for (number, err) in &errors { + warn!( + self.log, + "validation failed for extent {number}: {err:?}" + ); + } + panic!("validation failed"); + } } /// Walk the list of extents and close each one. 
@@ -682,6 +715,28 @@ impl Region { ); } + // Validate the extent that we just received, before copying it over + { + let new_extent = match Extent::open( + &copy_dir, + &self.def(), + eid, + true, // read-only + &self.log.clone(), + ) { + Ok(e) => e, + Err(e) => { + panic!( + "Failed to open live-repair extent {eid} in \ + {copy_dir:?}: {e:?}" + ); + } + }; + if let Err(e) = new_extent.validate() { + panic!("Failed to validate live-repair extent {eid}: {e:?}"); + } + } + // After we have all files: move the repair dir. info!( self.log, diff --git a/verify-raw/Cargo.toml b/verify-raw/Cargo.toml new file mode 100644 index 000000000..6304d5a88 --- /dev/null +++ b/verify-raw/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "crucible-verify-raw" +version = "0.1.0" +edition = "2024" + +[dependencies] +anyhow.workspace = true +bincode.workspace = true +clap.workspace = true +crucible-protocol.workspace = true +crucible-common.workspace = true +serde.workspace = true +crucible-workspace-hack.workspace = true diff --git a/verify-raw/src/main.rs b/verify-raw/src/main.rs new file mode 100644 index 000000000..01b1473ec --- /dev/null +++ b/verify-raw/src/main.rs @@ -0,0 +1,235 @@ +use anyhow::{Result, anyhow, bail}; +use clap::Parser; +use crucible_common::integrity_hash; +use crucible_protocol::BlockContext; +use serde::{Deserialize, Serialize}; + +/// Verifier for raw extent files +#[derive(Debug, Parser)] +#[clap(name = "crucible-verify-raw", term_width = 80)] +#[clap(about = "Verifier for raw extent files", long_about = None)] +struct Args { + /// Raw extent file to check + file: std::path::PathBuf, + + /// Block size in the extent (usually autodetected) + #[clap(long)] + block_size: Option<usize>, +} + +fn main() -> Result<()> { + let args = Args::parse(); + check_one(&args.file, args.block_size) +} + +fn check_one(p: &std::path::Path, block_size: Option<usize>) -> Result<()> { + let data = std::fs::read(p)?; + let data_len = data.len() - BLOCK_META_SIZE_BYTES; + let meta: OnDiskMeta = 
bincode::deserialize(&data[data_len..])?; + if meta.ext_version != 2 { + bail!("expected extent version 2, got {}", meta.ext_version); + } + + let (block_size, block_count) = block_size + .map(|bs| { + get_block_count(data_len, bs) + .map(|bc| (bs, bc)) + .ok_or_else(|| anyhow!("could not find block count")) + }) + .unwrap_or_else(|| { + // Check a range of block sizes to find which one has a valid block + // count. This should usually work! + use crucible_common::{MAX_SHIFT, MIN_SHIFT}; + let mut valid = vec![]; + for shift in MIN_SHIFT..=MAX_SHIFT { + let block_size = 1 << shift; + if let Some(bc) = get_block_count(data_len, block_size) { + valid.push((block_size, bc)); + } + } + if valid.len() == 1 { + Ok(valid[0]) + } else { + bail!( + "Found multiple valid block size / count combinations: \ + {valid:?}; specify --block-size to disambiguate" + ); + } + })?; + print!("bs:{block_size} bytes bc:{block_count:>6}"); + println!( + " dirty:{:>5} gen:{} flush_number:{:>6} ext_ver:{} bonus_sync:{} defrag:{}", + meta.dirty, + meta.gen_number, + meta.flush_number, + meta.ext_version, + meta.bonus_sync_count, + meta.defrag_count, + ); + + let slot_selected = if !meta.dirty { + let mut selected = vec![]; + for d in &data[data_len - block_count.div_ceil(8)..data_len] { + for i in 0..8 { + selected.push((d & (1 << i)) == 0); + } + } + Some(selected) + } else { + None + }; + + let context_slots = (0..block_count * 2) + .map(|i| { + let offset = + block_size * block_count + i * BLOCK_CONTEXT_SLOT_SIZE_BYTES; + let chunk = &data[offset..][..BLOCK_CONTEXT_SLOT_SIZE_BYTES]; + bincode::deserialize(chunk).unwrap() + }) + .collect::>>(); + let (ctx_a, ctx_b) = context_slots.split_at(block_count); + + let mut failed = false; + for (i, chunk) in data[..block_size * block_count] + .chunks_exact(block_size) + .enumerate() + { + let hash = integrity_hash(&[chunk]); + if let Some(slot_selected) = &slot_selected { + // If the slot selected array is valid (i.e. 
the extent file is not + // dirty), then it must be correct. + let s = slot_selected[i]; + let ctx = if s { ctx_a[i] } else { ctx_b[i] }; + let r = check_block(chunk, hash, ctx); + if r.is_err() { + failed = true; + print!("Error at block {:>6}:", i); + print!( + " slot {} [selected]: {r:?}{}", + if s { "A" } else { "B" }, + if let Some(ctx) = ctx { + format!(", flush id: {}", ctx.flush_id) + } else { + "".to_owned() + } + ); + let other_ctx = if s { ctx_b[i] } else { ctx_a[i] }; + print!( + " | slot {} [deselected]: {:?}{}", + if s { "B" } else { "A" }, + check_block(chunk, hash, other_ctx), + if let Some(ctx) = other_ctx { + format!(", flush id: {}", ctx.flush_id) + } else { + "".to_owned() + } + ); + if chunk.iter().all(|b| *b == 0u8) { + print!(" Block is all zeros"); + } + println!(); + } + } else if let Err(ea) = check_block(chunk, hash, ctx_a[i]) + && let Err(eb) = check_block(chunk, hash, ctx_b[i]) + { + // Otherwise, check both context slots to see if either is valid + failed = true; + print!("Error at block {i}:"); + print!( + " slot A: {ea:?}{}", + if let Some(ctx) = ctx_a[i] { + format!(", flush id: {}", ctx.flush_id) + } else { + "".to_owned() + } + ); + print!( + " | slot B: {eb:?}{}", + if let Some(ctx) = ctx_b[i] { + format!(", flush id: {}", ctx.flush_id) + } else { + "".to_owned() + } + ); + if chunk.iter().all(|b| *b == 0u8) { + print!(" Block is all zeros"); + } + println!(); + } + } + + if failed { + bail!("verification failed, see logs for details"); + } + Ok(()) +} + +fn check_block( + block: &[u8], + hash: u64, + ctx: Option, +) -> Result { + if let Some(ctx) = ctx { + if ctx.on_disk_hash == hash { + Ok(Success::HashMatch) + } else { + Err(Failure::SlotHashMismatch) + } + } else if block.iter().all(|v| *v == 0u8) { + Ok(Success::BlockEmpty) + } else { + Err(Failure::EmptySlotWithNonzeroData) + } +} + +#[derive(Copy, Clone, Debug)] +enum Success { + HashMatch, + BlockEmpty, +} + +#[derive(Copy, Clone, Debug)] +enum Failure { + 
SlotHashMismatch, + EmptySlotWithNonzeroData, +} + +/// Brute force strategy to get block count +fn get_block_count(data_len: usize, block_size: usize) -> Option { + let estimated_block_count = + data_len / (block_size + BLOCK_CONTEXT_SLOT_SIZE_BYTES * 2); + for i in 0..estimated_block_count { + let block_count = estimated_block_count - i; + let actual = block_count + * (block_size + BLOCK_CONTEXT_SLOT_SIZE_BYTES * 2) + + block_count.div_ceil(8); + if actual == data_len { + return Some(block_count); + } else if actual < data_len { + return None; + } + } + None +} + +//////////////////////////////////////////////////////////////////////////////// +// Types copied from `extent_inner_raw.rs` +#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq)] +struct OnDiskDownstairsBlockContext { + block_context: BlockContext, + on_disk_hash: u64, + flush_id: u16, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +struct OnDiskMeta { + pub dirty: bool, + pub gen_number: u64, + pub flush_number: u64, + pub ext_version: u32, + pub bonus_sync_count: u32, + pub defrag_count: u32, +} + +const BLOCK_CONTEXT_SLOT_SIZE_BYTES: usize = 48; +const BLOCK_META_SIZE_BYTES: usize = 32;