Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .claude/skills/add-background-task/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ Logging conventions: `debug` when there's nothing to do, `info` when routine wor

If the task takes config values that need conversion or validation (e.g., converting a `Duration` to `TimeDelta`, or checking a numeric range), do it once in `new()` and store the validated form. Don't re-validate on every activation — if the config is invalid, panic in `new()` with a message that includes the invalid value.

Include a unit test in the same file using `TestDatabase::new_with_datastore` that calls `actually_activate` directly. If the task has a datastore method, a single test exercising the task end-to-end (including the limit/batching behavior) is sufficient — don't add a redundant test for the datastore method separately unless it has complex logic worth testing in isolation.
Include a unit test in the same file using `TestDatabase::new_with_datastore` that calls `actually_activate` directly. If the task has a datastore method, a single test exercising the task end-to-end is usually sufficient — don't add a redundant test for the datastore method separately unless it has complex logic worth testing in isolation.

For tasks that mutate or delete a bounded set of rows, the test should prove which items were affected, not just how many. Seed a mix of rows that should be affected, rows that should be skipped because they're on the safe side of the cutoff or predicate, and rows that should be skipped because they're structurally ineligible. Assert exact identities of the affected and surviving rows after each activation. If the task uses ordering plus a per-activation limit, make the first activation hit the limit and verify that the highest-priority eligible items were chosen, then run a second activation to verify the remainder, and a final activation to verify the no-op case. When eligibility depends on time, set timestamps explicitly in the test rather than relying on sleeps. When selection depends on multiple sort keys, include a case that exercises the tiebreaker. Prefer assertions on row IDs or full records over aggregate counts and status fields.

### 3. Register the module (`nexus/src/app/background/tasks/mod.rs`)

Expand Down
34 changes: 34 additions & 0 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ use nexus_types::deployment::OximeterReadPolicy;
use nexus_types::fm;
use nexus_types::internal_api::background::AbandonedVmmReaperStatus;
use nexus_types::internal_api::background::AttachedSubnetManagerStatus;
use nexus_types::internal_api::background::AuditLogCleanupStatus;
use nexus_types::internal_api::background::AuditLogTimeoutIncompleteStatus;
use nexus_types::internal_api::background::BlueprintPlannerStatus;
use nexus_types::internal_api::background::BlueprintRendezvousStats;
Expand Down Expand Up @@ -1223,6 +1224,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
"attached_subnet_manager" => {
print_task_attached_subnet_manager_status(details);
}
"audit_log_cleanup" => {
print_task_audit_log_cleanup(details);
}
"audit_log_timeout_incomplete" => {
print_task_audit_log_timeout_incomplete(details);
}
Expand Down Expand Up @@ -2688,6 +2692,36 @@ fn print_task_saga_recovery(details: &serde_json::Value) {
}
}

fn print_task_audit_log_cleanup(details: &serde_json::Value) {
match serde_json::from_value::<AuditLogCleanupStatus>(details.clone()) {
Err(error) => eprintln!(
"warning: failed to interpret task details: {:?}: {:?}",
error, details
),
Ok(status) => {
const DELETED: &str = "rows deleted:";
const CUTOFF: &str = "cutoff:";
const MAX_DELETE: &str = "max deleted per activation:";
const ERROR: &str = "error:";
const WIDTH: usize =
const_max_len(&[DELETED, CUTOFF, MAX_DELETE, ERROR]) + 1;

println!(" {DELETED:<WIDTH$}{}", status.rows_deleted);
println!(
" {CUTOFF:<WIDTH$}{}",
status.cutoff.to_rfc3339_opts(SecondsFormat::AutoSi, true),
);
println!(
" {MAX_DELETE:<WIDTH$}{}",
status.max_deleted_per_activation
);
if let Some(error) = &status.error {
println!(" {ERROR:<WIDTH$}{error}");
}
}
};
}

fn print_task_audit_log_timeout_incomplete(details: &serde_json::Value) {
match serde_json::from_value::<AuditLogTimeoutIncompleteStatus>(
details.clone(),
Expand Down
12 changes: 12 additions & 0 deletions dev-tools/omdb/tests/env.out
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ task: "attached_subnet_manager"
distributes attached subnets to sleds and switch


task: "audit_log_cleanup"
hard-deletes completed audit log entries older than the retention period


task: "audit_log_timeout_incomplete"
transitions stale incomplete audit log entries to timeout status so they
become visible in the audit log
Expand Down Expand Up @@ -293,6 +297,10 @@ task: "attached_subnet_manager"
distributes attached subnets to sleds and switch


task: "audit_log_cleanup"
hard-deletes completed audit log entries older than the retention period


task: "audit_log_timeout_incomplete"
transitions stale incomplete audit log entries to timeout status so they
become visible in the audit log
Expand Down Expand Up @@ -535,6 +543,10 @@ task: "attached_subnet_manager"
distributes attached subnets to sleds and switch


task: "audit_log_cleanup"
hard-deletes completed audit log entries older than the retention period


task: "audit_log_timeout_incomplete"
transitions stale incomplete audit log entries to timeout status so they
become visible in the audit log
Expand Down
20 changes: 20 additions & 0 deletions dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,10 @@ task: "attached_subnet_manager"
distributes attached subnets to sleds and switch


task: "audit_log_cleanup"
hard-deletes completed audit log entries older than the retention period


task: "audit_log_timeout_incomplete"
transitions stale incomplete audit log entries to timeout status so they
become visible in the audit log
Expand Down Expand Up @@ -598,6 +602,14 @@ task: "attached_subnet_manager"
no dendrite instances found
no sleds found

task: "audit_log_cleanup"
configured period: every <REDACTED_DURATION>m
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
rows deleted: 0
cutoff: <REDACTED_TIMESTAMP>
max deleted per activation: 10000

task: "audit_log_timeout_incomplete"
configured period: every <REDACTED_DURATION>m
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
Expand Down Expand Up @@ -1207,6 +1219,14 @@ task: "attached_subnet_manager"
no dendrite instances found
no sleds found

task: "audit_log_cleanup"
configured period: every <REDACTED_DURATION>m
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
rows deleted: 0
cutoff: <REDACTED_TIMESTAMP>
max deleted per activation: 10000

task: "audit_log_timeout_incomplete"
configured period: every <REDACTED_DURATION>m
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
Expand Down
45 changes: 45 additions & 0 deletions nexus-config/src/nexus_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ use std::collections::HashMap;
use std::fmt;
use std::net::IpAddr;
use std::net::SocketAddr;
use std::num::NonZeroU32;
use std::time::Duration;
use uuid::Uuid;

Expand Down Expand Up @@ -439,6 +440,8 @@ pub struct BackgroundTaskConfig {
pub session_cleanup: SessionCleanupConfig,
/// configuration for audit log incomplete timeout task
pub audit_log_timeout_incomplete: AuditLogTimeoutIncompleteConfig,
/// configuration for audit log cleanup (retention) task
pub audit_log_cleanup: AuditLogCleanupConfig,
}

#[serde_as]
Expand Down Expand Up @@ -467,6 +470,24 @@ pub struct AuditLogTimeoutIncompleteConfig {
pub max_timed_out_per_activation: u32,
}

/// Configuration for the `audit_log_cleanup` background task, which
/// hard-deletes completed audit log entries that are older than the
/// configured retention period.
#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct AuditLogCleanupConfig {
    /// period (in seconds) for periodic activations of this task
    #[serde_as(as = "DurationSeconds<u64>")]
    pub period_secs: Duration,

    /// retention period in days; must be at least 1
    ///
    /// Using `NonZeroU32` makes a configured value of 0 a deserialization
    /// error rather than something that must be checked at runtime.
    ///
    /// This should be much longer than the `audit_log_timeout_incomplete`
    /// timeout so orphaned entries are completed before they become eligible
    /// for cleanup.
    pub retention_days: NonZeroU32,

    /// maximum rows hard-deleted per activation
    pub max_deleted_per_activation: u32,
}

#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct DnsTasksConfig {
Expand Down Expand Up @@ -1296,6 +1317,9 @@ mod test {
audit_log_timeout_incomplete.period_secs = 600
audit_log_timeout_incomplete.timeout_secs = 14400
audit_log_timeout_incomplete.max_timed_out_per_activation = 1000
audit_log_cleanup.period_secs = 600
audit_log_cleanup.retention_days = 90
audit_log_cleanup.max_deleted_per_activation = 10000
[default_region_allocation_strategy]
type = "random"
seed = 0
Expand Down Expand Up @@ -1570,6 +1594,11 @@ mod test {
timeout_secs: Duration::from_secs(14400),
max_timed_out_per_activation: 1000,
},
audit_log_cleanup: AuditLogCleanupConfig {
period_secs: Duration::from_secs(600),
retention_days: NonZeroU32::new(90).unwrap(),
max_deleted_per_activation: 10_000,
},
},
multicast: MulticastConfig { enabled: false },
default_region_allocation_strategy:
Expand Down Expand Up @@ -1681,6 +1710,9 @@ mod test {
audit_log_timeout_incomplete.period_secs = 600
audit_log_timeout_incomplete.timeout_secs = 14400
audit_log_timeout_incomplete.max_timed_out_per_activation = 1000
audit_log_cleanup.period_secs = 600
audit_log_cleanup.retention_days = 90
audit_log_cleanup.max_deleted_per_activation = 10000

[default_region_allocation_strategy]
type = "random"
Expand Down Expand Up @@ -1804,6 +1836,19 @@ mod test {
}
}

/// `retention_days` is declared as `NonZeroU32`, so a configured value
/// of zero must be rejected at deserialization time with an error that
/// mentions the nonzero requirement.
#[test]
fn test_invalid_audit_log_cleanup_retention_days() {
    const BAD_CONFIG: &str = r##"
            period_secs = 600
            retention_days = 0
            max_deleted_per_activation = 10000
            "##;
    let result = toml::from_str::<AuditLogCleanupConfig>(BAD_CONFIG);
    let error = result.expect_err("retention_days = 0 should be rejected");
    assert!(error.message().contains("nonzero"), "error = {}", error);
}

#[test]
fn test_repo_configs_are_valid() {
// The example config file should be valid.
Expand Down
1 change: 1 addition & 0 deletions nexus/background-task-interface/src/init.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ pub struct BackgroundTasks {
pub task_instance_reincarnation: Activator,
pub task_service_firewall_propagation: Activator,
pub task_abandoned_vmm_reaper: Activator,
pub task_audit_log_cleanup: Activator,
pub task_audit_log_timeout_incomplete: Activator,
pub task_vpc_route_manager: Activator,
pub task_saga_recovery: Activator,
Expand Down
45 changes: 42 additions & 3 deletions nexus/db-queries/src/db/datastore/audit_log.rs
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,46 @@ impl DataStore {
Ok(())
}

/// Hard-delete completed audit log entries older than `cutoff`.
///
/// Deletes up to `limit` rows where `time_completed IS NOT NULL` and
/// `time_completed < cutoff`, and returns the number of rows deleted.
///
/// We must never delete an entry before it has been completed, because
/// incomplete entries are not yet visible in the audit log. Deleting
/// one would mean the operation never appears in the log at all. The
/// `audit_log_timeout_incomplete` task ensures that even crash-orphaned
/// entries get completed (with result_kind = timeout) well before the
/// retention cutoff, but the `time_completed IS NOT NULL` filter is the
/// structural guarantee that we cannot delete something that hasn't
/// appeared in the log yet.
pub async fn audit_log_cleanup(
    &self,
    opctx: &OpContext,
    cutoff: DateTime<Utc>,
    limit: u32,
) -> Result<usize, omicron_common::api::external::Error> {
    opctx.authorize(authz::Action::CreateChild, &authz::AUDIT_LOG).await?;

    // Diesel's DeleteStatement doesn't support ORDER BY or LIMIT, so we use
    // raw SQL. We could use a subquery instead and use the DSL, but this
    // feels like a more direct and literal representation of what we're
    // doing. The semantics of the query are checked by the integration test
    // for the background task.
    //
    // The ORDER BY means the oldest eligible entries (by completion time,
    // with `id` breaking ties) are deleted first, so repeated activations
    // make monotonic progress through any backlog.
    diesel::sql_query(
        "DELETE FROM omicron.public.audit_log \
            WHERE time_completed IS NOT NULL \
            AND time_completed < $1 \
            ORDER BY time_completed, id \
            LIMIT $2",
    )
    .bind::<diesel::sql_types::Timestamptz, _>(cutoff)
    .bind::<diesel::sql_types::BigInt, _>(i64::from(limit))
    .execute_async(&*self.pool_connection_authorized(opctx).await?)
    .await
    .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}

/// Transition stale incomplete audit log rows to `result_kind = timeout`.
///
/// Finds up to `limit` rows where `time_completed IS NULL` and
Expand Down Expand Up @@ -452,12 +492,11 @@ mod tests {
) {
use diesel::prelude::*;
use nexus_db_schema::schema::audit_log;
let pool = &*datastore.pool_connection_for_tests().await.unwrap();
diesel::update(audit_log::table)
.filter(audit_log::id.eq(id))
.set(audit_log::time_started.eq(time_started))
.execute_async(
&*datastore.pool_connection_for_tests().await.unwrap(),
)
.execute_async(pool)
.await
.expect("could not set time_started");
}
Expand Down
3 changes: 3 additions & 0 deletions nexus/examples/config-second.toml
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,9 @@ session_cleanup.period_secs = 300
session_cleanup.max_delete_per_activation = 10000
audit_log_timeout_incomplete.period_secs = 600
audit_log_timeout_incomplete.timeout_secs = 14400
audit_log_timeout_incomplete.max_timed_out_per_activation = 1000
audit_log_cleanup.period_secs = 600
audit_log_cleanup.retention_days = 90
audit_log_cleanup.max_deleted_per_activation = 10000

[default_region_allocation_strategy]
Expand Down
3 changes: 3 additions & 0 deletions nexus/examples/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,9 @@ session_cleanup.period_secs = 300
session_cleanup.max_delete_per_activation = 10000
audit_log_timeout_incomplete.period_secs = 600
audit_log_timeout_incomplete.timeout_secs = 14400
audit_log_timeout_incomplete.max_timed_out_per_activation = 1000
audit_log_cleanup.period_secs = 600
audit_log_cleanup.retention_days = 90
audit_log_cleanup.max_deleted_per_activation = 10000

[default_region_allocation_strategy]
Expand Down
20 changes: 19 additions & 1 deletion nexus/src/app/background/init.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ use super::driver::TaskDefinition;
use super::tasks::abandoned_vmm_reaper;
use super::tasks::alert_dispatcher::AlertDispatcher;
use super::tasks::attached_subnets;
use super::tasks::audit_log_cleanup;
use super::tasks::audit_log_timeout_incomplete;
use super::tasks::bfd;
use super::tasks::blueprint_execution;
Expand Down Expand Up @@ -248,6 +249,7 @@ impl BackgroundTasksInitializer {
task_instance_reincarnation: Activator::new(),
task_service_firewall_propagation: Activator::new(),
task_abandoned_vmm_reaper: Activator::new(),
task_audit_log_cleanup: Activator::new(),
task_audit_log_timeout_incomplete: Activator::new(),
task_vpc_route_manager: Activator::new(),
task_saga_recovery: Activator::new(),
Expand Down Expand Up @@ -365,6 +367,7 @@ impl BackgroundTasksInitializer {
task_attached_subnet_manager,
task_session_cleanup,
task_audit_log_timeout_incomplete,
task_audit_log_cleanup,
// Add new background tasks here. Be sure to use this binding in a
// call to `Driver::register()` below. That's what actually wires
// up the Activator to the corresponding background task.
Expand Down Expand Up @@ -1211,7 +1214,7 @@ impl BackgroundTasksInitializer {
period: config.audit_log_timeout_incomplete.period_secs,
task_impl: Box::new(
audit_log_timeout_incomplete::AuditLogTimeoutIncomplete::new(
datastore,
datastore.clone(),
config.audit_log_timeout_incomplete.timeout_secs,
config
.audit_log_timeout_incomplete
Expand All @@ -1223,6 +1226,21 @@ impl BackgroundTasksInitializer {
activator: task_audit_log_timeout_incomplete,
});

driver.register(TaskDefinition {
name: "audit_log_cleanup",
description: "hard-deletes completed audit log entries older \
than the retention period",
period: config.audit_log_cleanup.period_secs,
task_impl: Box::new(audit_log_cleanup::AuditLogCleanup::new(
datastore,
config.audit_log_cleanup.retention_days,
config.audit_log_cleanup.max_deleted_per_activation,
)),
opctx: opctx.child(BTreeMap::new()),
watchers: vec![],
activator: task_audit_log_cleanup,
});

driver
}
}
Expand Down
Loading
Loading