Skip to content

Commit 74a6c13

Browse files
authored
[repo depot 3/n] nexus background task to replicate TUF artifacts across sleds (#7129)
1 parent 345e095 commit 74a6c13

54 files changed

Lines changed: 2007 additions & 139 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

dev-tools/omdb/src/bin/omdb/nexus.rs

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@ use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus;
5454
use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus;
5555
use nexus_types::internal_api::background::SupportBundleCleanupReport;
5656
use nexus_types::internal_api::background::SupportBundleCollectionReport;
57+
use nexus_types::internal_api::background::TufArtifactReplicationCounters;
58+
use nexus_types::internal_api::background::TufArtifactReplicationRequest;
59+
use nexus_types::internal_api::background::TufArtifactReplicationStatus;
5760
use nexus_types::inventory::BaseboardId;
5861
use omicron_uuid_kinds::BlueprintUuid;
5962
use omicron_uuid_kinds::CollectionUuid;
@@ -952,6 +955,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
952955
"support_bundle_collector" => {
953956
print_task_support_bundle_collector(details);
954957
}
958+
"tuf_artifact_replication" => {
959+
print_task_tuf_artifact_replication(details);
960+
}
955961
_ => {
956962
println!(
957963
"warning: unknown background task: {:?} \
@@ -2143,6 +2149,72 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) {
21432149
}
21442150
}
21452151

2152+
fn print_task_tuf_artifact_replication(details: &serde_json::Value) {
2153+
fn print_counters(counters: TufArtifactReplicationCounters) {
2154+
const ROWS: &[&str] = &[
2155+
"list ok:",
2156+
"list err:",
2157+
"put ok:",
2158+
"put err:",
2159+
"copy ok:",
2160+
"copy err:",
2161+
"delete ok:",
2162+
"delete err:",
2163+
];
2164+
const WIDTH: usize = const_max_len(ROWS);
2165+
2166+
for (label, value) in ROWS.iter().zip([
2167+
counters.list_ok,
2168+
counters.list_err,
2169+
counters.put_ok,
2170+
counters.put_err,
2171+
counters.copy_ok,
2172+
counters.copy_err,
2173+
counters.delete_ok,
2174+
counters.delete_err,
2175+
]) {
2176+
println!(" {label:<WIDTH$} {value:>3}");
2177+
}
2178+
}
2179+
2180+
match serde_json::from_value::<TufArtifactReplicationStatus>(
2181+
details.clone(),
2182+
) {
2183+
Err(error) => eprintln!(
2184+
"warning: failed to interpret task details: {:?}: {:?}",
2185+
error, details
2186+
),
2187+
Ok(status) => {
2188+
println!(" request ringbuf:");
2189+
if status.request_debug_ringbuf.is_empty() {
2190+
println!(" [no entries]");
2191+
}
2192+
for TufArtifactReplicationRequest {
2193+
time,
2194+
target_sled,
2195+
operation,
2196+
error,
2197+
} in status.request_debug_ringbuf.iter()
2198+
{
2199+
println!(" - target sled: {target_sled}");
2200+
println!(" operation: {operation:?}");
2201+
println!(
2202+
" at: {}",
2203+
time.to_rfc3339_opts(SecondsFormat::Secs, true)
2204+
);
2205+
if let Some(error) = error {
2206+
println!(" error: {error}")
2207+
}
2208+
}
2209+
println!(" last run:");
2210+
print_counters(status.last_run_counters);
2211+
println!(" lifetime:");
2212+
print_counters(status.lifetime_counters);
2213+
println!(" local repos: {}", status.local_repos);
2214+
}
2215+
}
2216+
}
2217+
21462218
/// Summarizes an `ActivationReason`
21472219
fn reason_str(reason: &ActivationReason) -> &'static str {
21482220
match reason {

dev-tools/omdb/tests/env.out

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,10 @@ task: "switch_port_config_manager"
175175
manages switch port settings for rack switches
176176

177177

178+
task: "tuf_artifact_replication"
179+
replicate update repo artifacts across sleds
180+
181+
178182
task: "v2p_manager"
179183
manages opte v2p mappings for vpc networking
180184

@@ -355,6 +359,10 @@ task: "switch_port_config_manager"
355359
manages switch port settings for rack switches
356360

357361

362+
task: "tuf_artifact_replication"
363+
replicate update repo artifacts across sleds
364+
365+
358366
task: "v2p_manager"
359367
manages opte v2p mappings for vpc networking
360368

@@ -522,6 +530,10 @@ task: "switch_port_config_manager"
522530
manages switch port settings for rack switches
523531

524532

533+
task: "tuf_artifact_replication"
534+
replicate update repo artifacts across sleds
535+
536+
525537
task: "v2p_manager"
526538
manages opte v2p mappings for vpc networking
527539

dev-tools/omdb/tests/successes.out

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,10 @@ task: "switch_port_config_manager"
394394
manages switch port settings for rack switches
395395

396396

397+
task: "tuf_artifact_replication"
398+
replicate update repo artifacts across sleds
399+
400+
397401
task: "v2p_manager"
398402
manages opte v2p mappings for vpc networking
399403

@@ -729,6 +733,33 @@ task: "switch_port_config_manager"
729733
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
730734
warning: unknown background task: "switch_port_config_manager" (don't know how to interpret details: Object {})
731735

736+
task: "tuf_artifact_replication"
737+
configured period: every <REDACTED_DURATION>h
738+
currently executing: no
739+
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
740+
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
741+
request ringbuf:
742+
<REDACTED_SECTION>
743+
last run:
744+
list ok: <LIST_OK_REDACTED>
745+
list err: 0
746+
put ok: 0
747+
put err: 0
748+
copy ok: 0
749+
copy err: 0
750+
delete ok: 0
751+
delete err: 0
752+
lifetime:
753+
list ok: <LIST_OK_REDACTED>
754+
list err: 0
755+
put ok: 0
756+
put err: 0
757+
copy ok: 0
758+
copy err: 0
759+
delete ok: 0
760+
delete err: 0
761+
local repos: 0
762+
732763
task: "v2p_manager"
733764
configured period: every <REDACTED_DURATION>s
734765
currently executing: no
@@ -1193,6 +1224,33 @@ task: "switch_port_config_manager"
11931224
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
11941225
warning: unknown background task: "switch_port_config_manager" (don't know how to interpret details: Object {})
11951226

1227+
task: "tuf_artifact_replication"
1228+
configured period: every <REDACTED_DURATION>h
1229+
currently executing: no
1230+
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
1231+
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
1232+
request ringbuf:
1233+
<REDACTED_SECTION>
1234+
last run:
1235+
list ok: <LIST_OK_REDACTED>
1236+
list err: 0
1237+
put ok: 0
1238+
put err: 0
1239+
copy ok: 0
1240+
copy err: 0
1241+
delete ok: 0
1242+
delete err: 0
1243+
lifetime:
1244+
list ok: <LIST_OK_REDACTED>
1245+
list err: 0
1246+
put ok: 0
1247+
put err: 0
1248+
copy ok: 0
1249+
copy err: 0
1250+
delete ok: 0
1251+
delete err: 0
1252+
local repos: 0
1253+
11961254
task: "v2p_manager"
11971255
configured period: every <REDACTED_DURATION>s
11981256
currently executing: no

dev-tools/omdb/tests/test_all_output.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,14 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) {
219219
redactor.extra_variable_length("cockroachdb_version", &crdb_version);
220220
}
221221

222+
// The `tuf_artifact_replication` task's output depends on how
223+
// many sleds happened to register with Nexus before its first
224+
// execution. These redactions work around the issue described in
225+
// https://github.com/oxidecomputer/omicron/issues/7417.
226+
redactor
227+
.field("list ok:", r"\d+")
228+
.section(&["task: \"tuf_artifact_replication\"", "request ringbuf:"]);
229+
222230
for args in invocations {
223231
println!("running commands with args: {:?}", args);
224232
let p = postgres_url.to_string();

dev-tools/omdb/tests/usage_errors.out

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -315,17 +315,19 @@ Options:
315315
Show sleds that match the given filter
316316

317317
Possible values:
318-
- all: All sleds in the system, regardless of policy or state
319-
- commissioned: All sleds that are currently part of the control plane cluster
320-
- decommissioned: All sleds that were previously part of the control plane cluster
321-
but have been decommissioned
322-
- discretionary: Sleds that are eligible for discretionary services
323-
- in-service: Sleds that are in service (even if they might not be eligible
318+
- all: All sleds in the system, regardless of policy or state
319+
- commissioned: All sleds that are currently part of the control plane cluster
320+
- decommissioned: All sleds that were previously part of the control plane
321+
cluster but have been decommissioned
322+
- discretionary: Sleds that are eligible for discretionary services
323+
- in-service: Sleds that are in service (even if they might not be eligible
324324
for discretionary services)
325-
- query-during-inventory: Sleds whose sled agents should be queried for inventory
326-
- reservation-create: Sleds on which reservations can be created
327-
- vpc-routing: Sleds which should be sent OPTE V2P mappings and Routing rules
328-
- vpc-firewall: Sleds which should be sent VPC firewall rules
325+
- query-during-inventory: Sleds whose sled agents should be queried for inventory
326+
- reservation-create: Sleds on which reservations can be created
327+
- vpc-routing: Sleds which should be sent OPTE V2P mappings and Routing rules
328+
- vpc-firewall: Sleds which should be sent VPC firewall rules
329+
- tuf-artifact-replication: Sleds which should have TUF repo artifacts replicated onto
330+
them
329331

330332
--log-level <LOG_LEVEL>
331333
log level filter

nexus-config/src/nexus_config.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,8 @@ pub struct BackgroundTaskConfig {
417417
/// configuration for region snapshot replacement finisher task
418418
pub region_snapshot_replacement_finish:
419419
RegionSnapshotReplacementFinishConfig,
420+
/// configuration for TUF artifact replication task
421+
pub tuf_artifact_replication: TufArtifactReplicationConfig,
420422
}
421423

422424
#[serde_as]
@@ -722,6 +724,17 @@ pub struct RegionSnapshotReplacementFinishConfig {
722724
pub period_secs: Duration,
723725
}
724726

727+
#[serde_as]
728+
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
729+
pub struct TufArtifactReplicationConfig {
730+
/// period (in seconds) for periodic activations of this background task
731+
#[serde_as(as = "DurationSeconds<u64>")]
732+
pub period_secs: Duration,
733+
/// The number of sleds that artifacts must be present on before a local
734+
/// copy of a repo's artifacts is dropped.
735+
pub min_sled_replication: usize,
736+
}
737+
725738
/// Configuration for a nexus server
726739
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
727740
pub struct PackageConfig {
@@ -978,6 +991,8 @@ mod test {
978991
region_snapshot_replacement_garbage_collection.period_secs = 30
979992
region_snapshot_replacement_step.period_secs = 30
980993
region_snapshot_replacement_finish.period_secs = 30
994+
tuf_artifact_replication.period_secs = 300
995+
tuf_artifact_replication.min_sled_replication = 3
981996
[default_region_allocation_strategy]
982997
type = "random"
983998
seed = 0
@@ -1174,6 +1189,11 @@ mod test {
11741189
RegionSnapshotReplacementFinishConfig {
11751190
period_secs: Duration::from_secs(30),
11761191
},
1192+
tuf_artifact_replication:
1193+
TufArtifactReplicationConfig {
1194+
period_secs: Duration::from_secs(300),
1195+
min_sled_replication: 3,
1196+
},
11771197
},
11781198
default_region_allocation_strategy:
11791199
crate::nexus_config::RegionAllocationStrategy::Random {
@@ -1257,6 +1277,8 @@ mod test {
12571277
region_snapshot_replacement_garbage_collection.period_secs = 30
12581278
region_snapshot_replacement_step.period_secs = 30
12591279
region_snapshot_replacement_finish.period_secs = 30
1280+
tuf_artifact_replication.period_secs = 300
1281+
tuf_artifact_replication.min_sled_replication = 3
12601282
[default_region_allocation_strategy]
12611283
type = "random"
12621284
"##,

nexus/db-model/src/schema.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -907,6 +907,7 @@ table! {
907907
sled_policy -> crate::sled_policy::SledPolicyEnum,
908908
sled_state -> crate::SledStateEnum,
909909
sled_agent_gen -> Int8,
910+
repo_depot_port -> Int4,
910911
}
911912
}
912913

nexus/db-model/src/schema_versions.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ use std::collections::BTreeMap;
1717
///
1818
/// This must be updated when you change the database schema. Refer to
1919
/// schema/crdb/README.adoc in the root of this repository for details.
20-
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(121, 0, 0);
20+
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(122, 0, 0);
2121

2222
/// List of all past database schema versions, in *reverse* order
2323
///
@@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
2929
// | leaving the first copy as an example for the next person.
3030
// v
3131
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
32+
KnownVersion::new(122, "tuf-artifact-replication"),
3233
KnownVersion::new(121, "dataset-to-crucible-dataset"),
3334
KnownVersion::new(120, "rendezvous-debug-dataset"),
3435
KnownVersion::new(119, "tuf-artifact-key-uuid"),

nexus/db-model/src/sled.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,9 @@ pub struct Sled {
8181
/// This is specifically distinct from `rcgen`, which is incremented by
8282
/// child resources as part of `DatastoreCollectionConfig`.
8383
pub sled_agent_gen: Generation,
84+
85+
// ServiceAddress (Repo Depot API). Uses `ip`.
86+
pub repo_depot_port: SqlU16,
8487
}
8588

8689
impl Sled {
@@ -169,6 +172,7 @@ impl From<Sled> for params::SledAgentInfo {
169172
};
170173
Self {
171174
sa_address: sled.address(),
175+
repo_depot_port: sled.repo_depot_port.into(),
172176
role,
173177
baseboard: Baseboard {
174178
serial: sled.serial_number.clone(),
@@ -220,6 +224,9 @@ pub struct SledUpdate {
220224
pub ip: ipv6::Ipv6Addr,
221225
pub port: SqlU16,
222226

227+
// ServiceAddress (Repo Depot API). Uses `ip`.
228+
pub repo_depot_port: SqlU16,
229+
223230
// Generation number - owned and incremented by sled-agent.
224231
pub sled_agent_gen: Generation,
225232
}
@@ -228,6 +235,7 @@ impl SledUpdate {
228235
pub fn new(
229236
id: Uuid,
230237
addr: SocketAddrV6,
238+
repo_depot_port: u16,
231239
baseboard: SledBaseboard,
232240
hardware: SledSystemHardware,
233241
rack_id: Uuid,
@@ -247,6 +255,7 @@ impl SledUpdate {
247255
reservoir_size: hardware.reservoir_size,
248256
ip: addr.ip().into(),
249257
port: addr.port().into(),
258+
repo_depot_port: repo_depot_port.into(),
250259
sled_agent_gen,
251260
}
252261
}
@@ -282,6 +291,7 @@ impl SledUpdate {
282291
reservoir_size: self.reservoir_size,
283292
ip: self.ip,
284293
port: self.port,
294+
repo_depot_port: self.repo_depot_port,
285295
last_used_address,
286296
sled_agent_gen: self.sled_agent_gen,
287297
}

nexus/db-queries/src/db/datastore/crucible_dataset.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,7 @@ mod test {
270270
let sled = SledUpdate::new(
271271
*sled_id.as_untyped_uuid(),
272272
"[::1]:0".parse().unwrap(),
273+
0,
273274
SledBaseboard {
274275
serial_number: "test-sn".to_string(),
275276
part_number: "test-pn".to_string(),

0 commit comments

Comments
 (0)