diff --git a/.claude/skills/add-background-task/SKILL.md b/.claude/skills/add-background-task/SKILL.md index 34b1d569561..39f3b4fa1a1 100644 --- a/.claude/skills/add-background-task/SKILL.md +++ b/.claude/skills/add-background-task/SKILL.md @@ -31,6 +31,8 @@ If the task takes config values that need conversion or validation (e.g., conver Include a unit test in the same file using `TestDatabase::new_with_datastore` that calls `actually_activate` directly. If the task has a datastore method, a single test exercising the task end-to-end (including the limit/batching behavior) is sufficient — don't add a redundant test for the datastore method separately unless it has complex logic worth testing in isolation. +For tasks that mutate or delete a bounded set of rows, the test should prove which items were affected, not just how many. Seed a mix of rows that should be affected, rows that should be skipped because they're on the safe side of the cutoff or predicate, and rows that should be skipped because they're structurally ineligible. Assert exact identities of the affected and surviving rows after each activation. If the task uses ordering plus a per-activation limit, make the first activation hit the limit and verify that the highest-priority eligible items were chosen, then run a second activation to verify the remainder, and a final activation to verify the no-op case. When eligibility depends on time, set timestamps explicitly in the test rather than relying on sleeps. When selection depends on multiple sort keys, include a case that exercises the tiebreaker. Prefer assertions on row IDs or full records over aggregate counts and status fields. + ### 3. Register the module (`nexus/src/app/background/tasks/mod.rs`) Add `pub mod <name>;` in alphabetical order. 
diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index ba9eed29d2c..2c4427a813b 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -52,6 +52,7 @@ use nexus_types::deployment::OximeterReadPolicy; use nexus_types::fm; use nexus_types::internal_api::background::AbandonedVmmReaperStatus; use nexus_types::internal_api::background::AttachedSubnetManagerStatus; +use nexus_types::internal_api::background::AuditLogCleanupStatus; use nexus_types::internal_api::background::AuditLogTimeoutIncompleteStatus; use nexus_types::internal_api::background::BlueprintPlannerStatus; use nexus_types::internal_api::background::BlueprintRendezvousStats; @@ -1223,6 +1224,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { "attached_subnet_manager" => { print_task_attached_subnet_manager_status(details); } + "audit_log_cleanup" => { + print_task_audit_log_cleanup(details); + } "audit_log_timeout_incomplete" => { print_task_audit_log_timeout_incomplete(details); } @@ -2688,6 +2692,36 @@ fn print_task_saga_recovery(details: &serde_json::Value) { } } +fn print_task_audit_log_cleanup(details: &serde_json::Value) { + match serde_json::from_value::(details.clone()) { + Err(error) => eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ), + Ok(status) => { + const DELETED: &str = "rows deleted:"; + const CUTOFF: &str = "cutoff:"; + const MAX_DELETE: &str = "max deleted per activation:"; + const ERROR: &str = "error:"; + const WIDTH: usize = + const_max_len(&[DELETED, CUTOFF, MAX_DELETE, ERROR]) + 1; + + println!(" {DELETED:( details.clone(), diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 02ac948f4a2..ee3e3bd841a 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -38,6 +38,10 @@ task: "attached_subnet_manager" distributes attached subnets to sleds and switch +task: "audit_log_cleanup" + 
hard-deletes completed audit log entries older than the retention period + + task: "audit_log_timeout_incomplete" transitions stale incomplete audit log entries to timeout status so they become visible in the audit log @@ -293,6 +297,10 @@ task: "attached_subnet_manager" distributes attached subnets to sleds and switch +task: "audit_log_cleanup" + hard-deletes completed audit log entries older than the retention period + + task: "audit_log_timeout_incomplete" transitions stale incomplete audit log entries to timeout status so they become visible in the audit log @@ -535,6 +543,10 @@ task: "attached_subnet_manager" distributes attached subnets to sleds and switch +task: "audit_log_cleanup" + hard-deletes completed audit log entries older than the retention period + + task: "audit_log_timeout_incomplete" transitions stale incomplete audit log entries to timeout status so they become visible in the audit log diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 98140113cdc..b534b5b72c1 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -273,6 +273,10 @@ task: "attached_subnet_manager" distributes attached subnets to sleds and switch +task: "audit_log_cleanup" + hard-deletes completed audit log entries older than the retention period + + task: "audit_log_timeout_incomplete" transitions stale incomplete audit log entries to timeout status so they become visible in the audit log @@ -598,6 +602,14 @@ task: "attached_subnet_manager" no dendrite instances found no sleds found +task: "audit_log_cleanup" + configured period: every m + last completed activation: , triggered by + started at (s ago) and ran for ms + rows deleted: 0 + cutoff: + max deleted per activation: 10000 + task: "audit_log_timeout_incomplete" configured period: every m last completed activation: , triggered by @@ -1207,6 +1219,14 @@ task: "attached_subnet_manager" no dendrite instances found no sleds found +task: 
"audit_log_cleanup" + configured period: every m + last completed activation: , triggered by + started at (s ago) and ran for ms + rows deleted: 0 + cutoff: + max deleted per activation: 10000 + task: "audit_log_timeout_incomplete" configured period: every m last completed activation: , triggered by diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 8fe8f417b61..08b8f0b001d 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -27,6 +27,7 @@ use std::collections::HashMap; use std::fmt; use std::net::IpAddr; use std::net::SocketAddr; +use std::num::NonZeroU32; use std::time::Duration; use uuid::Uuid; @@ -439,6 +440,8 @@ pub struct BackgroundTaskConfig { pub session_cleanup: SessionCleanupConfig, /// configuration for audit log incomplete timeout task pub audit_log_timeout_incomplete: AuditLogTimeoutIncompleteConfig, + /// configuration for audit log cleanup (retention) task + pub audit_log_cleanup: AuditLogCleanupConfig, } #[serde_as] @@ -467,6 +470,24 @@ pub struct AuditLogTimeoutIncompleteConfig { pub max_timed_out_per_activation: u32, } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct AuditLogCleanupConfig { + /// period (in seconds) for periodic activations of this task + #[serde_as(as = "DurationSeconds<u64>")] + pub period_secs: Duration, + + /// retention period in days; must be at least 1 + /// + /// This should be much longer than the `audit_log_timeout_incomplete` + /// timeout so orphaned entries are completed before they become eligible + /// for cleanup. 
+ pub retention_days: NonZeroU32, + + /// maximum rows hard-deleted per activation + pub max_deleted_per_activation: u32, +} + #[serde_as] #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] pub struct DnsTasksConfig { @@ -1296,6 +1317,9 @@ mod test { audit_log_timeout_incomplete.period_secs = 600 audit_log_timeout_incomplete.timeout_secs = 14400 audit_log_timeout_incomplete.max_timed_out_per_activation = 1000 + audit_log_cleanup.period_secs = 600 + audit_log_cleanup.retention_days = 90 + audit_log_cleanup.max_deleted_per_activation = 10000 [default_region_allocation_strategy] type = "random" seed = 0 @@ -1570,6 +1594,11 @@ mod test { timeout_secs: Duration::from_secs(14400), max_timed_out_per_activation: 1000, }, + audit_log_cleanup: AuditLogCleanupConfig { + period_secs: Duration::from_secs(600), + retention_days: NonZeroU32::new(90).unwrap(), + max_deleted_per_activation: 10_000, + }, }, multicast: MulticastConfig { enabled: false }, default_region_allocation_strategy: @@ -1681,6 +1710,9 @@ mod test { audit_log_timeout_incomplete.period_secs = 600 audit_log_timeout_incomplete.timeout_secs = 14400 audit_log_timeout_incomplete.max_timed_out_per_activation = 1000 + audit_log_cleanup.period_secs = 600 + audit_log_cleanup.retention_days = 90 + audit_log_cleanup.max_deleted_per_activation = 10000 [default_region_allocation_strategy] type = "random" @@ -1804,6 +1836,19 @@ mod test { } } + #[test] + fn test_invalid_audit_log_cleanup_retention_days() { + let error = toml::from_str::<AuditLogCleanupConfig>( + r##" + period_secs = 600 + retention_days = 0 + max_deleted_per_activation = 10000 + "##, + ) + .expect_err("retention_days = 0 should be rejected"); + assert!(error.message().contains("nonzero"), "error = {}", error); + } + #[test] fn test_repo_configs_are_valid() { // The example config file should be valid. 
diff --git a/nexus/background-task-interface/src/init.rs b/nexus/background-task-interface/src/init.rs index 1bacb6f259f..91902e95c93 100644 --- a/nexus/background-task-interface/src/init.rs +++ b/nexus/background-task-interface/src/init.rs @@ -37,6 +37,7 @@ pub struct BackgroundTasks { pub task_instance_reincarnation: Activator, pub task_service_firewall_propagation: Activator, pub task_abandoned_vmm_reaper: Activator, + pub task_audit_log_cleanup: Activator, pub task_audit_log_timeout_incomplete: Activator, pub task_vpc_route_manager: Activator, pub task_saga_recovery: Activator, diff --git a/nexus/db-queries/src/db/datastore/audit_log.rs b/nexus/db-queries/src/db/datastore/audit_log.rs index b244f93b16f..e428f4ec45f 100644 --- a/nexus/db-queries/src/db/datastore/audit_log.rs +++ b/nexus/db-queries/src/db/datastore/audit_log.rs @@ -140,6 +140,46 @@ impl DataStore { Ok(()) } + /// Hard-delete completed audit log entries older than `cutoff`. + /// + /// Deletes up to `limit` rows where `time_completed IS NOT NULL` and + /// `time_completed < cutoff`, and returns the number of rows deleted. + /// + /// We must never delete an entry before it has been completed, because + /// incomplete entries are not yet visible in the audit log. Deleting + /// one would mean the operation never appears in the log at all. The + /// `audit_log_timeout_incomplete` task ensures that even crash-orphaned + /// entries get completed (with result_kind = timeout) well before the + /// retention cutoff, but the `time_completed IS NOT NULL` filter is the + /// structural guarantee that we cannot delete something that hasn't + /// appeared in the log yet. + pub async fn audit_log_cleanup( + &self, + opctx: &OpContext, + cutoff: DateTime<Utc>, + limit: u32, + ) -> Result<usize, Error> { + opctx.authorize(authz::Action::CreateChild, &authz::AUDIT_LOG).await?; + + // Diesel's DeleteStatement doesn't support ORDER BY or LIMIT, so we use + // raw SQL. 
We could use a subquery instead and use the DSL, but this + // feels like a more direct and literal representation of what we're + // doing. The semantics of the query are checked by the integration test + // for the background task. + diesel::sql_query( + "DELETE FROM omicron.public.audit_log \ + WHERE time_completed IS NOT NULL \ + AND time_completed < $1 \ + ORDER BY time_completed, id \ + LIMIT $2", + ) + .bind::<diesel::sql_types::Timestamptz, _>(cutoff) + .bind::<diesel::sql_types::BigInt, _>(i64::from(limit)) + .execute_async(&*self.pool_connection_authorized(opctx).await?) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + /// Transition stale incomplete audit log rows to `result_kind = timeout`. /// /// Finds up to `limit` rows where `time_completed IS NULL` and diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index 305015055f2..72ed64e0222 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -198,6 +198,9 @@ session_cleanup.max_delete_per_activation = 10000 audit_log_timeout_incomplete.period_secs = 600 audit_log_timeout_incomplete.timeout_secs = 14400 audit_log_timeout_incomplete.max_timed_out_per_activation = 1000 +audit_log_cleanup.period_secs = 600 +audit_log_cleanup.retention_days = 90 +audit_log_cleanup.max_deleted_per_activation = 10000 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. 
diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index d950e39e873..d5cf0b1a918 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -182,6 +182,9 @@ session_cleanup.max_delete_per_activation = 10000 audit_log_timeout_incomplete.period_secs = 600 audit_log_timeout_incomplete.timeout_secs = 14400 audit_log_timeout_incomplete.max_timed_out_per_activation = 1000 +audit_log_cleanup.period_secs = 600 +audit_log_cleanup.retention_days = 90 +audit_log_cleanup.max_deleted_per_activation = 10000 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 53067faba15..cb3a8fcb33e 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -92,6 +92,7 @@ use super::driver::TaskDefinition; use super::tasks::abandoned_vmm_reaper; use super::tasks::alert_dispatcher::AlertDispatcher; use super::tasks::attached_subnets; +use super::tasks::audit_log_cleanup; use super::tasks::audit_log_timeout_incomplete; use super::tasks::bfd; use super::tasks::blueprint_execution; @@ -248,6 +249,7 @@ impl BackgroundTasksInitializer { task_instance_reincarnation: Activator::new(), task_service_firewall_propagation: Activator::new(), task_abandoned_vmm_reaper: Activator::new(), + task_audit_log_cleanup: Activator::new(), task_audit_log_timeout_incomplete: Activator::new(), task_vpc_route_manager: Activator::new(), task_saga_recovery: Activator::new(), @@ -365,6 +367,7 @@ impl BackgroundTasksInitializer { task_attached_subnet_manager, task_session_cleanup, task_audit_log_timeout_incomplete, + task_audit_log_cleanup, // Add new background tasks here. Be sure to use this binding in a // call to `Driver::register()` below. That's what actually wires // up the Activator to the corresponding background task. 
@@ -1211,7 +1214,7 @@ impl BackgroundTasksInitializer { period: config.audit_log_timeout_incomplete.period_secs, task_impl: Box::new( audit_log_timeout_incomplete::AuditLogTimeoutIncomplete::new( - datastore, + datastore.clone(), config.audit_log_timeout_incomplete.timeout_secs, config .audit_log_timeout_incomplete @@ -1223,6 +1226,21 @@ impl BackgroundTasksInitializer { activator: task_audit_log_timeout_incomplete, }); + driver.register(TaskDefinition { + name: "audit_log_cleanup", + description: "hard-deletes completed audit log entries older \ + than the retention period", + period: config.audit_log_cleanup.period_secs, + task_impl: Box::new(audit_log_cleanup::AuditLogCleanup::new( + datastore, + config.audit_log_cleanup.retention_days, + config.audit_log_cleanup.max_deleted_per_activation, + )), + opctx: opctx.child(BTreeMap::new()), + watchers: vec![], + activator: task_audit_log_cleanup, + }); + driver } } diff --git a/nexus/src/app/background/tasks/audit_log_cleanup.rs b/nexus/src/app/background/tasks/audit_log_cleanup.rs new file mode 100644 index 00000000000..a4589892746 --- /dev/null +++ b/nexus/src/app/background/tasks/audit_log_cleanup.rs @@ -0,0 +1,331 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Background task that hard-deletes completed audit log entries older than +//! the retention period. 
+ +use crate::app::background::BackgroundTask; +use chrono::TimeDelta; +use chrono::Utc; +use futures::future::BoxFuture; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::internal_api::background::AuditLogCleanupStatus; +use serde_json::json; +use slog_error_chain::InlineErrorChain; +use std::num::NonZeroU32; +use std::sync::Arc; + +pub struct AuditLogCleanup { + datastore: Arc<DataStore>, + retention: TimeDelta, + max_deleted_per_activation: u32, +} + +impl AuditLogCleanup { + pub fn new( + datastore: Arc<DataStore>, + retention_days: NonZeroU32, + max_deleted_per_activation: u32, + ) -> Self { + // i64::from on a u32 can never produce a negative value, so the + // only failure mode here is a value too large for TimeDelta. + let Some(retention) = + TimeDelta::try_days(i64::from(retention_days.get())) + else { + panic!( + "invalid retention_days {retention_days} \ + (must be representable as a TimeDelta)" + ); + }; + Self { datastore, retention, max_deleted_per_activation } + } + + pub(crate) async fn actually_activate( + &mut self, + opctx: &OpContext, + ) -> AuditLogCleanupStatus { + // retention was validated at construction time; only the + // subtraction from "now" can fail (effectively impossible). 
+ let cutoff = match Utc::now().checked_sub_signed(self.retention) { + Some(c) => c, + None => { + let msg = format!( + "retention {:?} overflows date arithmetic", + self.retention, + ); + slog::error!(&opctx.log, "{msg}"); + return AuditLogCleanupStatus { + rows_deleted: 0, + cutoff: Utc::now(), + max_deleted_per_activation: self.max_deleted_per_activation, + error: Some(msg), + }; + } + }; + + let rows_deleted = match self + .datastore + .audit_log_cleanup(opctx, cutoff, self.max_deleted_per_activation) + .await + { + Ok(count) => count, + Err(err) => { + slog::error!(&opctx.log, "audit log cleanup failed"; &err); + return AuditLogCleanupStatus { + rows_deleted: 0, + cutoff, + max_deleted_per_activation: self.max_deleted_per_activation, + error: Some(InlineErrorChain::new(&err).to_string()), + }; + } + }; + + if rows_deleted > 0 { + slog::info!( + &opctx.log, + "audit log cleanup: deleted {rows_deleted} old entries"; + "cutoff" => %cutoff, + "limit" => self.max_deleted_per_activation, + ); + } else { + slog::debug!( + &opctx.log, + "audit log cleanup: no old entries to delete"; + "cutoff" => %cutoff, + ); + } + + AuditLogCleanupStatus { + rows_deleted, + cutoff, + max_deleted_per_activation: self.max_deleted_per_activation, + error: None, + } + } +} + +impl BackgroundTask for AuditLogCleanup { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + Box::pin(async { + let status = self.actually_activate(opctx).await; + match serde_json::to_value(status) { + Ok(val) => val, + Err(err) => { + json!({ "error": format!("failed to serialize status: {err}") }) + } + } + }) + } +} + +#[cfg(test)] +mod tests { + use std::collections::BTreeSet; + + use super::*; + use chrono::TimeDelta; + use chrono::Utc; + use nexus_db_model::{ + AuditLogActor, AuditLogCompletion, AuditLogEntryInitParams, + }; + use nexus_db_queries::db::pub_test_utils::TestDatabase; + use omicron_test_utils::dev; + use uuid::Uuid; + + async fn 
get_audit_log_ids(datastore: &DataStore) -> BTreeSet<Uuid> { use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use nexus_db_schema::schema::audit_log; audit_log::table // fake filter required to get around the table scan rule .filter(audit_log::id.ne(uuid::Uuid::nil())) .select(audit_log::id) .load_async::<Uuid>( &*datastore.pool_connection_for_tests().await.unwrap(), ) .await .expect("could not load audit log ids") .into_iter() .collect() } + fn make_entry(request_id: &str) -> AuditLogEntryInitParams { + AuditLogEntryInitParams { + request_id: request_id.to_string(), + operation_id: "project_create".to_string(), + request_uri: "/v1/projects".to_string(), + source_ip: "1.1.1.1".parse().unwrap(), + user_agent: None, + actor: AuditLogActor::Unauthenticated, + auth_method: None, + credential_id: None, + } + } + + async fn set_time_completed( + datastore: &DataStore, + id: Uuid, + time_completed: chrono::DateTime<Utc>, + ) { + use async_bb8_diesel::AsyncRunQueryDsl; + use diesel::prelude::*; + use nexus_db_schema::schema::audit_log; + diesel::update(audit_log::table) + .filter(audit_log::id.eq(id)) + .set(audit_log::time_completed.eq(Some(time_completed))) + .execute_async( + &*datastore.pool_connection_for_tests().await.unwrap(), + ) + .await + .expect("could not set time_completed"); + } + + #[tokio::test] + async fn test_audit_log_cleanup_activation() { + let logctx = dev::test_setup_log("test_audit_log_cleanup_activation"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let now = Utc::now(); + // The task uses retention_days=7, so cutoff ≈ now - 7 days. + // Give each old entry a distinct time_completed so we can verify + // oldest-first deletion order. + let completion = AuditLogCompletion::Success { http_status_code: 200 }; + + // 5 old entries with distinct times, all before cutoff. + // Oldest first: 12d, 11d, 10d, 9d, 8d ago. 
+ let mut old_ids = Vec::new(); + for i in 0..5u32 { + let age_days = 12 - i; // 12, 11, 10, 9, 8 + let entry = datastore + .audit_log_entry_init( + opctx, + make_entry(&format!("old-{i}")).into(), + ) + .await + .unwrap(); + datastore + .audit_log_entry_complete( + opctx, + &entry, + completion.clone().into(), + ) + .await + .unwrap(); + let completed_at = + now - TimeDelta::try_days(i64::from(age_days)).unwrap(); + set_time_completed(datastore, entry.id, completed_at).await; + old_ids.push(entry.id); + } + + // 1 completed entry just inside the retention window (6d ago). + // With retention_days=7, cutoff ≈ now - 7d, so 6d ago is safely + // after the cutoff and must not be deleted. + let within_retention = datastore + .audit_log_entry_init(opctx, make_entry("within-retention").into()) + .await + .unwrap(); + datastore + .audit_log_entry_complete( + opctx, + &within_retention, + completion.clone().into(), + ) + .await + .unwrap(); + let six_days_ago = now - TimeDelta::try_days(6).unwrap(); + set_time_completed(datastore, within_retention.id, six_days_ago).await; + + // 1 incomplete entry (should not be deleted) + let incomplete = datastore + .audit_log_entry_init(opctx, make_entry("incomplete-req").into()) + .await + .unwrap(); + + let expected_survivors: BTreeSet<Uuid> = + [within_retention.id, incomplete.id].into(); + + assert_eq!(get_audit_log_ids(&datastore).await.len(), 7); + + // max_deleted_per_activation = 3, so it takes two activations to + // delete all 5 old entries + let mut task = AuditLogCleanup::new( + datastore.clone(), + NonZeroU32::new(7).unwrap(), + 3, + ); + + // First activation deletes the 3 oldest (12d, 11d, 10d ago). + let status = task.actually_activate(opctx).await; + assert_eq!(status.rows_deleted, 3); + assert!(status.error.is_none()); + + // The reported cutoff should be between our old entries and the + // within-retention entry. 
+ let eight_days_ago = now - TimeDelta::try_days(8).unwrap(); + assert!( + status.cutoff > eight_days_ago && status.cutoff < six_days_ago, + "cutoff {} should be between 8d ago ({}) and 6d ago ({})", + status.cutoff, + eight_days_ago, + six_days_ago, + ); + + let remaining = get_audit_log_ids(&datastore).await; + assert_eq!(remaining.len(), 4); + // The 3 oldest should be gone + for &id in &old_ids[..3] { + assert!(!remaining.contains(&id), "old entry should be deleted"); + } + // The 2 newest old entries should still be present + for &id in &old_ids[3..] { + assert!(remaining.contains(&id), "newer old entry should remain"); + } + assert!(expected_survivors.is_subset(&remaining)); + + // Second activation deletes remaining 2 old (9d, 8d ago). + let status = task.actually_activate(opctx).await; + assert_eq!(status.rows_deleted, 2); + assert!(status.error.is_none()); + assert_eq!(get_audit_log_ids(&datastore).await, expected_survivors); + + // Third activation: within-retention entry survives. 
+ let status = task.actually_activate(opctx).await; + assert_eq!(status.rows_deleted, 0); + assert!(status.error.is_none()); + assert_eq!(get_audit_log_ids(&datastore).await, expected_survivors); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_audit_log_cleanup_retention_days_overflow() { + let logctx = dev::test_setup_log( + "test_audit_log_cleanup_retention_days_overflow", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, _datastore) = (db.opctx(), db.datastore()); + + let mut task = AuditLogCleanup::new( + db.datastore().clone(), + NonZeroU32::new(u32::MAX).unwrap(), + 100, + ); + let status = task.actually_activate(opctx).await; + assert_eq!(status.rows_deleted, 0); + assert!( + status.error.as_ref().unwrap().contains("overflows"), + "expected overflow error, got: {:?}", + status.error, + ); + + db.terminate().await; + logctx.cleanup_successful(); + } +} diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index 475c1bc99ae..8fb56d653f2 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -7,6 +7,7 @@ pub mod abandoned_vmm_reaper; pub mod alert_dispatcher; pub mod attached_subnets; +pub mod audit_log_cleanup; pub mod audit_log_timeout_incomplete; pub mod bfd; pub mod blueprint_execution; diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index d7e20e5fa77..f024b3446dd 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -216,6 +216,9 @@ session_cleanup.max_delete_per_activation = 10000 audit_log_timeout_incomplete.period_secs = 600 audit_log_timeout_incomplete.timeout_secs = 14400 audit_log_timeout_incomplete.max_timed_out_per_activation = 1000 +audit_log_cleanup.period_secs = 600 +audit_log_cleanup.retention_days = 90 +audit_log_cleanup.max_deleted_per_activation = 10000 [multicast] # Enable multicast functionality for tests (disabled by default in 
production) diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index a3479ba798e..38089a7f4d7 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -986,6 +986,19 @@ pub struct AuditLogTimeoutIncompleteStatus { pub error: Option<String>, } +/// The status of an `audit_log_cleanup` background task activation. +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] +pub struct AuditLogCleanupStatus { + /// Number of completed audit log entries deleted in this activation. + pub rows_deleted: usize, + /// The cutoff time used: completed entries older than this were eligible. + pub cutoff: DateTime<Utc>, + /// Configured max rows to delete in this activation. + pub max_deleted_per_activation: u32, + /// Error encountered during this activation, if any. + pub error: Option<String>, +} + /// The status of a `session_cleanup` background task activation. #[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct SessionCleanupStatus { diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index bb5af9dfd75..092902ae68e 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -126,6 +126,9 @@ session_cleanup.max_delete_per_activation = 10000 audit_log_timeout_incomplete.period_secs = 600 audit_log_timeout_incomplete.timeout_secs = 14400 audit_log_timeout_incomplete.max_timed_out_per_activation = 1000 +audit_log_cleanup.period_secs = 600 +audit_log_cleanup.retention_days = 90 +audit_log_cleanup.max_deleted_per_activation = 10000 [default_region_allocation_strategy] # by default, allocate across 3 distinct sleds diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index 266cc5787a9..ef229ccd87e 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -126,6 +126,9 @@ 
session_cleanup.max_delete_per_activation = 10000 audit_log_timeout_incomplete.period_secs = 600 audit_log_timeout_incomplete.timeout_secs = 14400 audit_log_timeout_incomplete.max_timed_out_per_activation = 1000 +audit_log_cleanup.period_secs = 600 +audit_log_cleanup.retention_days = 90 +audit_log_cleanup.max_deleted_per_activation = 10000 [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds.