diff --git a/Cargo.lock b/Cargo.lock index ab9c9080392..b6d1a9fe55f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3567,6 +3567,7 @@ dependencies = [ "serde", "serde_json", "slog", + "thiserror 1.0.69", "uuid", ] diff --git a/clients/gateway-client/Cargo.toml b/clients/gateway-client/Cargo.toml index 6e74b903a57..37e015c5d26 100644 --- a/clients/gateway-client/Cargo.toml +++ b/clients/gateway-client/Cargo.toml @@ -21,5 +21,6 @@ serde.workspace = true serde_json.workspace = true schemars.workspace = true slog.workspace = true +thiserror.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true diff --git a/clients/gateway-client/src/lib.rs b/clients/gateway-client/src/lib.rs index df7b88c13b4..a8c6d5a84dd 100644 --- a/clients/gateway-client/src/lib.rs +++ b/clients/gateway-client/src/lib.rs @@ -63,7 +63,7 @@ progenitor::generate_api!( HostPhase2RecoveryImageId = { derives = [PartialEq, Eq, PartialOrd, Ord] }, ImageVersion = { derives = [PartialEq, Eq, PartialOrd, Ord] }, RotImageDetails = { derives = [PartialEq, Eq, PartialOrd, Ord] }, - RotImageError = { derives = [ PartialEq, Eq, PartialOrd, Ord] }, + RotImageError = { derives = [ thiserror::Error, PartialEq, Eq, PartialOrd, Ord] }, RotState = { derives = [PartialEq, Eq, PartialOrd, Ord] }, SpComponentCaboose = { derives = [PartialEq, Eq] }, SpIdentifier = { derives = [Copy, PartialEq, Hash, Eq] }, diff --git a/nexus/mgs-updates/Cargo.toml b/nexus/mgs-updates/Cargo.toml index 33811919c9e..269ba745e63 100644 --- a/nexus/mgs-updates/Cargo.toml +++ b/nexus/mgs-updates/Cargo.toml @@ -11,6 +11,7 @@ chrono.workspace = true futures.workspace = true gateway-client.workspace = true gateway-types.workspace = true +gateway-messages.workspace = true id-map.workspace = true internal-dns-resolver.workspace = true internal-dns-types.workspace = true diff --git a/nexus/mgs-updates/src/common_sp_update.rs b/nexus/mgs-updates/src/common_sp_update.rs index 6afc6fb7367..50a33cfa860 100644 --- a/nexus/mgs-updates/src/common_sp_update.rs +++ b/nexus/mgs-updates/src/common_sp_update.rs @@ -8,6 +8,7 @@ use super::MgsClients; use super::UpdateProgress; use futures::future::BoxFuture; +use gateway_client::types::RotImageError; use gateway_client::types::SpType; use gateway_client::types::SpUpdateStatus; use gateway_types::rot::RotSlot; @@ -267,7 +268,7 @@ pub trait SpComponentUpdateHelper { log: &'a slog::Logger, mgs_clients: &'a mut MgsClients, update: &'a PendingMgsUpdate, - ) -> BoxFuture<'a, Result<(), GatewayClientError>>; + ) -> BoxFuture<'a, Result<(), PostUpdateError>>; } /// Describes the live state of the component before the update begins @@ -275,6 +276,7 @@ pub trait SpComponentUpdateHelper { pub enum PrecheckStatus { UpdateComplete, ReadyForUpdate, + WaitingForOngoingRotBootloaderUpdate, } #[derive(Debug, Error)] @@ -319,6 +321,18 @@ pub enum PrecheckError { WrongInactiveVersion { expected: ExpectedVersion, found: FoundVersion }, } +#[derive(Debug, thiserror::Error)] +pub enum PostUpdateError { + #[error("communicating with MGS")] + GatewayClientError(#[from] GatewayClientError), + + #[error("communicating with RoT: {message:?}")] + RotCommunicationFailed { message: String }, + + #[error("invalid RoT bootloader image: {error:?}")] + RotBootloaderImageError { error: RotImageError }, +} + #[derive(Debug)] pub enum FoundVersion { MissingVersion, diff --git a/nexus/mgs-updates/src/driver_update.rs b/nexus/mgs-updates/src/driver_update.rs index 4f0ebefc48e..4578ddba943 100644 --- a/nexus/mgs-updates/src/driver_update.rs +++ b/nexus/mgs-updates/src/driver_update.rs @@ -4,6 +4,7 @@ //! Concurrent-safe facilities for doing MGS-managed upates +use crate::common_sp_update::PostUpdateError; use crate::common_sp_update::PrecheckError; use crate::common_sp_update::PrecheckStatus; use crate::common_sp_update::STATUS_POLL_INTERVAL; @@ -32,7 +33,7 @@ use uuid::Uuid; /// How long may the status remain unchanged without us treating this as a /// problem? -pub const PROGRESS_TIMEOUT: Duration = Duration::from_secs(120); +pub const PROGRESS_TIMEOUT: Duration = Duration::from_secs(180); /// How long to wait between failed attempts to reset the device const RESET_DELAY_INTERVAL: Duration = Duration::from_secs(10); @@ -46,6 +47,14 @@ pub const DEFAULT_RETRY_TIMEOUT: Duration = Duration::from_secs(60); /// How long to wait after resetting the device before expecting it to come up const RESET_TIMEOUT: Duration = Duration::from_secs(60); +/// How long to wait for an ongoing RoT bootloader update +const WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT: Duration = + Duration::from_secs(180); + +/// How long to wait between poll attempts on RoT bootloader update status +const ROT_BOOLOADER_UPDATE_PROGRESS_INTERVAL: Duration = + Duration::from_secs(10); + /// Parameters describing a request to update one SP-managed component /// /// This is similar in spirit to the `SpComponentUpdater` trait but uses a @@ -216,7 +225,11 @@ pub(crate) async fn apply_update( // - if not, then if our required preconditions are met status.update(UpdateAttemptStatus::Precheck); match update_helper.precheck(log, &mut mgs_clients, update).await { - Ok(PrecheckStatus::ReadyForUpdate) => (), + Ok(PrecheckStatus::ReadyForUpdate) | + // This is the first time a Nexus instance is attempting to + // update the RoT bootloader, we don't need to wait for an + // ongoing update. + Ok(PrecheckStatus::WaitingForOngoingRotBootloaderUpdate) => (), Ok(PrecheckStatus::UpdateComplete) => { return Ok(UpdateCompletedHow::FoundNoChangesNeeded); } @@ -349,16 +362,33 @@ pub(crate) async fn apply_update( if try_reset { // We retry this until we get some error *other* than a communication - // error. There is intentionally no timeout here. If we've staged an - // update but not managed to reset the device, there's no point where - // we'd want to stop trying to do so. + // error or an RoT bootloader image error. There is intentionally no + // timeout here. If we've staged an update but not managed to reset + // the device, there's no point where we'd want to stop trying to do so. while let Err(error) = update_helper.post_update(log, &mut mgs_clients, update).await { - if !matches!(error, gateway_client::Error::CommunicationError(_)) { - let error = InlineErrorChain::new(&error); - error!(log, "post_update failed"; &error); - return Err(ApplyUpdateError::SpResetFailed(error.to_string())); + match error { + PostUpdateError::GatewayClientError(error) => { + if !matches!( + error, + gateway_client::Error::CommunicationError(_) + ) { + let error = InlineErrorChain::new(&error); + error!(log, "post_update failed"; &error); + return Err(ApplyUpdateError::SpResetFailed( + error.to_string(), + )); + } + } + PostUpdateError::RotBootloaderImageError { error } => { + let error = InlineErrorChain::new(&error); + error!(log, "post_update failed"; &error); + return Err(ApplyUpdateError::SpResetFailed( + error.to_string(), + )); + } + PostUpdateError::RotCommunicationFailed { message: _ } => {} } tokio::time::sleep(RESET_DELAY_INTERVAL).await; @@ -598,6 +628,24 @@ async fn wait_for_update_done( // Check if we're done. Ok(PrecheckStatus::UpdateComplete) => return Ok(()), + // We'll loop for 3 minutes to wait for any ongoing RoT bootloader update. + // We need to wait for 2 resets which have a timeout of 60 seconds each, + // and an attempt to retrieve boot info, which has a time out of 30 seconds. + // We give an additional 30 seconds to as a buffer for the other actions. + Ok(PrecheckStatus::WaitingForOngoingRotBootloaderUpdate) => { + if before.elapsed() + >= WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT + { + return Err(UpdateWaitError::Timeout( + WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT, + )); + } + + tokio::time::sleep(ROT_BOOLOADER_UPDATE_PROGRESS_INTERVAL) + .await; + continue; + } + // An incorrect version in the "inactive" slot, incorrect active slot, // or non-empty pending_persistent_boot_preference/transient_boot_preference // are normal during the upgrade. We have no reason to think these won't @@ -730,6 +778,91 @@ mod test { finished.expect_sp_success(expected_result); } + /// Tests several happy-path cases of updating an RoT bootloader + #[tokio::test] + async fn test_rot_bootloader_update_basic() { + let gwtestctx = gateway_test_utils::setup::test_setup( + "test_rot_bootloader_update_basic", + SpPort::One, + ) + .await; + let log = &gwtestctx.logctx.log; + let artifacts = TestArtifacts::new(log).await.unwrap(); + + // Basic case: normal update + run_one_successful_rot_bootloader_update( + &gwtestctx, + &artifacts, + SpType::Sled, + 1, + &artifacts.rot_bootloader_gimlet_artifact_hash, + UpdateCompletedHow::CompletedUpdate, + ) + .await; + + // Basic case: attempted update, found no changes needed + run_one_successful_rot_bootloader_update( + &gwtestctx, + &artifacts, + SpType::Sled, + 1, + &artifacts.rot_bootloader_gimlet_artifact_hash, + UpdateCompletedHow::FoundNoChangesNeeded, + ) + .await; + + // Run the same two tests for a switch RoT bootloader. + run_one_successful_rot_bootloader_update( + &gwtestctx, + &artifacts, + SpType::Switch, + 0, + &artifacts.rot_bootloader_sidecar_artifact_hash, + UpdateCompletedHow::CompletedUpdate, + ) + .await; + run_one_successful_rot_bootloader_update( + &gwtestctx, + &artifacts, + SpType::Switch, + 0, + &artifacts.rot_bootloader_sidecar_artifact_hash, + UpdateCompletedHow::FoundNoChangesNeeded, + ) + .await; + + artifacts.teardown().await; + gwtestctx.teardown().await; + } + + async fn run_one_successful_rot_bootloader_update( + gwtestctx: &GatewayTestContext, + artifacts: &TestArtifacts, + sp_type: SpType, + slot_id: u32, + artifact_hash: &ArtifactHash, + expected_result: UpdateCompletedHow, + ) { + let desc = UpdateDescription { + gwtestctx, + artifacts, + sp_type, + slot_id, + artifact_hash, + override_baseboard_id: None, + override_expected_sp_component: + ExpectedSpComponent::RotBootloader { + override_expected_stage0: None, + override_expected_stage0_next: None, + }, + override_progress_timeout: None, + }; + + let in_progress = desc.setup().await; + let finished = in_progress.finish().await; + finished.expect_rot_bootloader_success(expected_result); + } + /// Tests several happy-path cases of updating an RoT #[tokio::test] async fn test_rot_update_basic() { @@ -1366,4 +1499,174 @@ mod test { gwtestctx.teardown().await; } + + /// Tests a bunch of easy fast-failure RoT bootloader cases. + #[tokio::test] + async fn test_rot_bootloader_basic_failures() { + let gwtestctx = gateway_test_utils::setup::test_setup( + "test_rot_bootloader_basic_failures", + SpPort::One, + ) + .await; + let log = &gwtestctx.logctx.log; + let artifacts = TestArtifacts::new(log).await.unwrap(); + + // Test a case of mistaken identity (reported baseboard does not match + // the one that we expect). + let desc = UpdateDescription { + gwtestctx: &gwtestctx, + artifacts: &artifacts, + sp_type: SpType::Sled, + slot_id: 1, + artifact_hash: &artifacts.sp_gimlet_artifact_hash, + override_baseboard_id: Some(BaseboardId { + part_number: String::from("i86pc"), + serial_number: String::from("SimGimlet0"), + }), + override_expected_sp_component: + ExpectedSpComponent::RotBootloader { + override_expected_stage0: None, + override_expected_stage0_next: None, + }, + override_progress_timeout: None, + }; + + desc.setup().await.finish().await.expect_failure(&|error, sp1, sp2| { + assert_matches!(error, ApplyUpdateError::PreconditionFailed(..)); + let message = InlineErrorChain::new(error).to_string(); + eprintln!("{}", message); + assert!(message.contains( + "in sled slot 1, expected to find part \"i86pc\" serial \ + \"SimGimlet0\", but found part \"i86pc\" serial \ + \"SimGimlet01\"", + )); + + // No changes should have been made in this case. + assert_eq!(sp1, sp2); + }); + + // Test a case where the active version doesn't match what we expect. + let desc = UpdateDescription { + gwtestctx: &gwtestctx, + artifacts: &artifacts, + sp_type: SpType::Sled, + slot_id: 1, + artifact_hash: &artifacts.sp_gimlet_artifact_hash, + override_baseboard_id: None, + override_expected_sp_component: + ExpectedSpComponent::RotBootloader { + override_expected_stage0: Some( + "not-right".parse().unwrap(), + ), + override_expected_stage0_next: None, + }, + override_progress_timeout: None, + }; + + desc.setup().await.finish().await.expect_failure(&|error, sp1, sp2| { + assert_matches!(error, ApplyUpdateError::PreconditionFailed(..)); + let message = InlineErrorChain::new(error).to_string(); + eprintln!("{}", message); + assert!(message.contains( + "expected to find active version \"not-right\", but \ + found \"0.0.200\"" + )); + + // No changes should have been made in this case. + assert_eq!(sp1, sp2); + }); + + // Test a case where the inactive version doesn't match what it should + // (expected invalid, found something else). + let desc = UpdateDescription { + gwtestctx: &gwtestctx, + artifacts: &artifacts, + sp_type: SpType::Sled, + slot_id: 1, + artifact_hash: &artifacts.sp_gimlet_artifact_hash, + override_baseboard_id: None, + override_expected_sp_component: + ExpectedSpComponent::RotBootloader { + override_expected_stage0: None, + override_expected_stage0_next: Some( + ExpectedVersion::NoValidVersion, + ), + }, + override_progress_timeout: None, + }; + + desc.setup().await.finish().await.expect_failure(&|error, sp1, sp2| { + assert_matches!(error, ApplyUpdateError::PreconditionFailed(..)); + let message = InlineErrorChain::new(error).to_string(); + eprintln!("{}", message); + assert!(message.contains( + "expected to find inactive version NoValidVersion, \ + but found Version(\"0.0.200\")" + )); + + // No changes should have been made in this case. + assert_eq!(sp1, sp2); + }); + + // Now test a case where the inactive version doesn't match what it + // should (expected a different valid version). + let desc = UpdateDescription { + gwtestctx: &gwtestctx, + artifacts: &artifacts, + sp_type: SpType::Sled, + slot_id: 1, + artifact_hash: &artifacts.sp_gimlet_artifact_hash, + override_baseboard_id: None, + override_expected_sp_component: + ExpectedSpComponent::RotBootloader { + override_expected_stage0: None, + override_expected_stage0_next: Some( + ExpectedVersion::Version( + "something-else".parse().unwrap(), + ), + ), + }, + override_progress_timeout: None, + }; + + desc.setup().await.finish().await.expect_failure(&|error, sp1, sp2| { + assert_matches!(error, ApplyUpdateError::PreconditionFailed(..)); + let message = InlineErrorChain::new(error).to_string(); + eprintln!("{}", message); + assert!(message.contains( + "expected to find inactive version \ + Version(ArtifactVersion(\"something-else\")), but found \ + Version(\"0.0.200\")" + )); + + // No changes should have been made in this case. + assert_eq!(sp1, sp2); + }); + + // Test a case where we fail to fetch the artifact. We simulate this by + // tearing down our artifact server before the update starts. + let desc = UpdateDescription { + gwtestctx: &gwtestctx, + artifacts: &artifacts, + sp_type: SpType::Sled, + slot_id: 1, + artifact_hash: &artifacts.sp_gimlet_artifact_hash, + override_baseboard_id: None, + override_expected_sp_component: + ExpectedSpComponent::RotBootloader { + override_expected_stage0: None, + override_expected_stage0_next: None, + }, + override_progress_timeout: None, + }; + let in_progress = desc.setup().await; + artifacts.teardown().await; + in_progress.finish().await.expect_failure(&|error, sp1, sp2| { + assert_matches!(error, ApplyUpdateError::FetchArtifact(..)); + // No changes should have been made in this case. + assert_eq!(sp1, sp2); + }); + + gwtestctx.teardown().await; + } } diff --git a/nexus/mgs-updates/src/rot_bootloader_updater.rs b/nexus/mgs-updates/src/rot_bootloader_updater.rs index 04aa569b0df..89356757947 100644 --- a/nexus/mgs-updates/src/rot_bootloader_updater.rs +++ b/nexus/mgs-updates/src/rot_bootloader_updater.rs @@ -6,35 +6,349 @@ use super::MgsClients; use crate::SpComponentUpdateHelper; +use crate::common_sp_update::FoundVersion; +use crate::common_sp_update::PostUpdateError; use crate::common_sp_update::PrecheckError; use crate::common_sp_update::PrecheckStatus; +use crate::common_sp_update::error_means_caboose_is_invalid; +use futures::FutureExt; use futures::future::BoxFuture; +use gateway_client::SpComponent; +use gateway_client::types::GetRotBootInfoParams; +use gateway_client::types::RotImageError; +use gateway_client::types::RotState; +use gateway_client::types::SpComponentFirmwareSlot; +use gateway_client::types::SpType; +use gateway_messages::RotBootInfo; +use nexus_types::deployment::ExpectedVersion; use nexus_types::deployment::PendingMgsUpdate; +use nexus_types::deployment::PendingMgsUpdateDetails; +use slog::Logger; +use slog::{debug, error, info}; +use std::time::Duration; +use std::time::Instant; -type GatewayClientError = gateway_client::Error; +const WAIT_FOR_BOOT_INFO_TIMEOUT: Duration = Duration::from_secs(30); pub struct ReconfiguratorRotBootloaderUpdater; impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { /// Checks if the component is already updated or ready for update fn precheck<'a>( &'a self, - _log: &'a slog::Logger, - _mgs_clients: &'a mut MgsClients, - _update: &'a PendingMgsUpdate, + log: &'a slog::Logger, + mgs_clients: &'a mut MgsClients, + update: &'a PendingMgsUpdate, ) -> BoxFuture<'a, Result> { - // TODO-K: To be completed in a follow up PR - todo!() + async move { + // Verify that the device is the one we think it is. + let state = mgs_clients + .try_all_serially(log, move |mgs_client| async move { + mgs_client.sp_get(update.sp_type, update.slot_id).await + }) + .await? + .into_inner(); + debug!(log, "found SP state"; "state" => ?state); + if state.model != update.baseboard_id.part_number + || state.serial_number != update.baseboard_id.serial_number + { + return Err(PrecheckError::WrongDevice { + sp_type: update.sp_type, + slot_id: update.slot_id, + expected_part: update.baseboard_id.part_number.clone(), + expected_serial: update.baseboard_id.serial_number.clone(), + found_part: state.model, + found_serial: state.serial_number, + }); + } + + // TODO-K: In the RoT bootloader update code in wicket, there is a set of + // known bootloader FWIDs that don't have cabooses. Is this something we + // should care about here? + // https://github.com/oxidecomputer/omicron/blob/89ce370f0a96165c777e90a008257a6085897f2a/wicketd/src/update_tracker.rs#L1817-L1841 + + // TODO-K: There are also older versions of the SP have a bug that prevents + // setting the active slot for the RoT bootloader. Is this something we should + // care about here? + // https://github.com/oxidecomputer/omicron/blob/89ce370f0a96165c777e90a008257a6085897f2a/wicketd/src/update_tracker.rs#L1705-L1710 + + // Fetch the caboose from the currently active slot (stage0). + let caboose = mgs_clients + .try_all_serially(log, move |mgs_client| async move { + mgs_client + .sp_component_caboose_get( + update.sp_type, + update.slot_id, + &SpComponent::STAGE0.to_string(), + 0, + ) + .await + }) + .await? + .into_inner(); + debug!(log, "found active slot caboose"; "caboose" => ?caboose); + + // If the version in the currently active slot matches the one we're + // trying to set, then there's nothing to do. + if caboose.version == update.artifact_version.as_str() { + return Ok(PrecheckStatus::UpdateComplete); + } + + // Otherwise, if the version in the currently active slot does not + // match what we expect to find, bail out. It may be that somebody + // else has come along and completed a subsequent update and we + // don't want to roll that back. (If for some reason we *do* want + // to do this update, the planner will have to notice that what's + // here is wrong and update the blueprint.) + let PendingMgsUpdateDetails::RotBootloader { + expected_stage0_version, + expected_stage0_next_version, + } = &update.details + else { + unreachable!( + "pending MGS update details within ReconfiguratorSpUpdater \ + will always be for the RoT bootloader" + ); + }; + if caboose.version != expected_stage0_version.to_string() { + return Err(PrecheckError::WrongActiveVersion { + expected: expected_stage0_version.clone(), + found: caboose.version, + }); + } + + // For the same reason, check that the version in the inactive slot + // matches what we expect to find. + // TODO It's important for us to detect the condition that a caboose + // is invalid because this can happen when devices are programmed + // with a bad image. Unfortunately, MGS currently reports this as a + // 503. Besides being annoying for us to look for, this causes + // `try_all_serially()` to try the other MGS. That's pointless + // here, but not a big deal. + let found_stage0_next_caboose_result = mgs_clients + .try_all_serially(log, move |mgs_client| async move { + mgs_client + .sp_component_caboose_get( + update.sp_type, + update.slot_id, + // The name for the SP component here is STAGE0 + // it's a little confusing because we're really + // trying to reach STAGE0NEXT, and there is no + // ROT_BOOTLOADER variant. We specify that we + // want STAGE0NEXT by setting the firmware slot + // to 1, which is where it will always be. + &SpComponent::STAGE0.to_string(), + 1, + ) + .await + }) + .await; + let found_stage0_next_version = + match found_stage0_next_caboose_result { + Ok(version) => { + FoundVersion::Version(version.into_inner().version) + } + Err(error) => { + if error_means_caboose_is_invalid(&error) { + FoundVersion::MissingVersion + } else { + return Err(PrecheckError::from(error)); + } + } + }; + match (&expected_stage0_next_version, &found_stage0_next_version) { + // expected garbage, found garbage + ( + ExpectedVersion::NoValidVersion, + FoundVersion::MissingVersion, + ) => (), + // expected a specific version and found it + ( + ExpectedVersion::Version(artifact_version), + FoundVersion::Version(found_stage0_next_version), + ) if artifact_version.to_string() + == *found_stage0_next_version => + { + () + } + // anything else is a mismatch + (ExpectedVersion::NoValidVersion, FoundVersion::Version(_)) + | (ExpectedVersion::Version(_), FoundVersion::MissingVersion) + | (ExpectedVersion::Version(_), FoundVersion::Version(_)) => { + return Err(PrecheckError::WrongInactiveVersion { + expected: expected_stage0_next_version.clone(), + found: found_stage0_next_version, + }); + } + }; + + // TODO-K: In post_update we'll be restarting the RoT twice to do signature + // checks, and to set stage0 to the new version. What happens if the RoT + // itself is being updated (during the reset stage)? Should we check for that + // here before setting the RoT bootloader as ready to update? + + Ok(PrecheckStatus::WaitingForOngoingRotBootloaderUpdate) + } + .boxed() } /// Attempts once to perform any post-update actions (e.g., reset the /// device) fn post_update<'a>( &'a self, - _log: &'a slog::Logger, - _mgs_clients: &'a mut MgsClients, - _update: &'a PendingMgsUpdate, - ) -> BoxFuture<'a, Result<(), GatewayClientError>> { - // TODO-K: To be completed in a follow up PR - todo!() + log: &'a slog::Logger, + mgs_clients: &'a mut MgsClients, + update: &'a PendingMgsUpdate, + ) -> BoxFuture<'a, Result<(), PostUpdateError>> { + // TODO-K: Again, we're resetting the ROT twice here, what happens + // if an RoT update is happening at the same time? + + async move { + // Before setting stage0 to the new version we want to ensure + // the image is good and we're not going to brick the device. + // The RoT will do a signature check when reset. + debug!( + log, + "attempting to reset device to do bootloader signature check" + ); + mgs_clients + .try_all_serially(log, move |mgs_client| async move { + mgs_client + .sp_component_reset( + update.sp_type, + update.slot_id, + &SpComponent::ROT.to_string(), + ) + .await + }) + .await?; + + // We now retrieve boot info from the RoT to verify the reset + // has completed and signature checks done. + debug!( + log, + "attempting to retrieve boot info to verify image validity" + ); + let stage0next_error = wait_for_stage0_next_image_check( + log, + mgs_clients, + update.sp_type, + update.slot_id, + WAIT_FOR_BOOT_INFO_TIMEOUT, + ) + .await?; + // If the image is not valid we bail + if let Some(error) = stage0next_error { + return Err(PostUpdateError::RotBootloaderImageError { + error, + }); + } + + debug!(log, "attempting to set RoT bootloader active slot"); + mgs_clients + .try_all_serially(log, move |mgs_client| async move { + let persist = true; + mgs_client + .sp_component_active_slot_set( + update.sp_type, + update.slot_id, + &SpComponent::STAGE0.to_string(), + persist, + &SpComponentFirmwareSlot { slot: 1 }, + ) + .await?; + Ok(()) + }) + .await?; + + debug!(log, "attempting to reset device to set to new RoT bootloader version"); + mgs_clients + .try_all_serially(log, move |mgs_client| async move { + mgs_client + .sp_component_reset( + update.sp_type, + update.slot_id, + &SpComponent::ROT.to_string(), + ) + .await?; + Ok(()) + }) + .await?; + + Ok(()) + } + .boxed() + } +} + +/// Poll the RoT asking for its boot information. This is used to check +/// state after RoT bootloader updates +async fn wait_for_stage0_next_image_check( + log: &Logger, + mgs_clients: &mut MgsClients, + sp_type: SpType, + sp_slot: u32, + timeout: Duration, +) -> Result, PostUpdateError> { + let mut ticker = tokio::time::interval(Duration::from_secs(1)); + + let start = Instant::now(); + loop { + ticker.tick().await; + + match mgs_clients + .try_all_serially(log, |mgs_client| async move { + mgs_client + .sp_rot_boot_info( + sp_type, + sp_slot, + SpComponent::ROT.const_as_str(), + &GetRotBootInfoParams { + version: RotBootInfo::HIGHEST_KNOWN_VERSION, + }, + ) + .await + }) + .await + { + Ok(state) => match state.into_inner() { + // The minimum we will ever return is 3. + // Additionally, V2 does not report image errors, so we cannot + // know with certainty if a signature check came back with errors + RotState::V2 { .. } => unreachable!(), + RotState::V3 { stage0next_error, .. } => { + return Ok(stage0next_error); + } + // The RoT is probably still booting + RotState::CommunicationFailed { message } => { + if start.elapsed() < timeout { + info!( + log, + "failed getting RoT boot info (will retry)"; + "error" => %message, + ); + } else { + error!( + log, + "failed to get RoT boot info"; + "error" => %message, + ); + return Err(PostUpdateError::RotCommunicationFailed { + message, + }); + } + } + }, + Err(error) => { + if start.elapsed() < timeout { + info!( + log, + "failed getting RoT boot info (will retry)"; + "error" => %error, + ); + } else { + return Err(PostUpdateError::GatewayClientError(error)); + } + } + } } } diff --git a/nexus/mgs-updates/src/rot_updater.rs b/nexus/mgs-updates/src/rot_updater.rs index d7c5e7caf40..c7b0128a095 100644 --- a/nexus/mgs-updates/src/rot_updater.rs +++ b/nexus/mgs-updates/src/rot_updater.rs @@ -11,6 +11,7 @@ use super::common_sp_update::SpComponentUpdater; use super::common_sp_update::deliver_update; use crate::SpComponentUpdateHelper; use crate::common_sp_update::FoundVersion; +use crate::common_sp_update::PostUpdateError; use crate::common_sp_update::PrecheckError; use crate::common_sp_update::PrecheckStatus; use crate::common_sp_update::error_means_caboose_is_invalid; @@ -396,11 +397,12 @@ impl SpComponentUpdateHelper for ReconfiguratorRotUpdater { log: &'a slog::Logger, mgs_clients: &'a mut MgsClients, update: &'a PendingMgsUpdate, - ) -> BoxFuture<'a, Result<(), GatewayClientError>> { + ) -> BoxFuture<'a, Result<(), PostUpdateError>> { + async move { + // We want to set the slot we've just updated as the active one + debug!(log, "attempting to set active slot"); mgs_clients .try_all_serially(log, move |mgs_client| async move { - // We want to set the slot we've just updated as the active one - debug!(log, "attempting to set active slot"); let inactive_slot = match &update.details { PendingMgsUpdateDetails::Rot { expected_active_slot, .. } => { expected_active_slot.slot().toggled().to_u16() @@ -421,8 +423,13 @@ impl SpComponentUpdateHelper for ReconfiguratorRotUpdater { &SpComponentFirmwareSlot { slot: inactive_slot } ) .await?; + Ok(()) + }) + .await?; - debug!(log, "attempting to reset device"); + debug!(log, "attempting to reset device"); + mgs_clients + .try_all_serially(log, move |mgs_client| async move { mgs_client .sp_component_reset( update.sp_type, @@ -431,7 +438,8 @@ impl SpComponentUpdateHelper for ReconfiguratorRotUpdater { ) .await?; Ok(()) - }) - .boxed() + }).await?; + Ok(()) + }.boxed() } } diff --git a/nexus/mgs-updates/src/sp_updater.rs b/nexus/mgs-updates/src/sp_updater.rs index e50b4f3061b..88e45e4556d 100644 --- a/nexus/mgs-updates/src/sp_updater.rs +++ b/nexus/mgs-updates/src/sp_updater.rs @@ -9,6 +9,7 @@ use crate::SpComponentUpdateError; use crate::SpComponentUpdateHelper; use crate::UpdateProgress; use crate::common_sp_update::FoundVersion; +use crate::common_sp_update::PostUpdateError; use crate::common_sp_update::PrecheckError; use crate::common_sp_update::PrecheckStatus; use crate::common_sp_update::SpComponentUpdater; @@ -293,19 +294,23 @@ impl SpComponentUpdateHelper for ReconfiguratorSpUpdater { log: &'a slog::Logger, mgs_clients: &'a mut MgsClients, update: &'a PendingMgsUpdate, - ) -> BoxFuture<'a, Result<(), GatewayClientError>> { - mgs_clients - .try_all_serially(log, move |mgs_client| async move { - debug!(log, "attempting to reset device"); - mgs_client - .sp_component_reset( - update.sp_type, - update.slot_id, - &SpComponent::SP_ITSELF.to_string(), - ) - .await?; - Ok(()) - }) - .boxed() + ) -> BoxFuture<'a, Result<(), PostUpdateError>> { + async move { + mgs_clients + .try_all_serially(log, move |mgs_client| async move { + debug!(log, "attempting to reset device"); + mgs_client + .sp_component_reset( + update.sp_type, + update.slot_id, + &SpComponent::SP_ITSELF.to_string(), + ) + .await?; + Ok(()) + }) + .await?; + Ok(()) + } + .boxed() } } diff --git a/nexus/mgs-updates/src/test_util/sp_test_state.rs b/nexus/mgs-updates/src/test_util/sp_test_state.rs index 9c9198a4d5c..75b02972c83 100644 --- a/nexus/mgs-updates/src/test_util/sp_test_state.rs +++ b/nexus/mgs-updates/src/test_util/sp_test_state.rs @@ -44,6 +44,16 @@ pub struct SpTestState { /// This can be None if the caboose contents were not valid. pub caboose_rot_b: Option, + /// caboose read from stage 0 (active slot for the RoT bootloader) + /// + /// This is not an `Option` because we never expect to fail to read this. + pub caboose_stage0: SpComponentCaboose, + + /// caboose read from stage 0 next (inactive slot for the RoT bootloader) + /// + /// This can be None if the caboose contents were not valid. + pub caboose_stage0_next: Option, + /// Overall SP state pub sp_state: SpState, @@ -94,6 +104,24 @@ impl SpTestState { ) .await .map(|c| c.into_inner()); + let caboose_stage0 = mgs_client + .sp_component_caboose_get( + sp_type, + sp_slot, + SpComponent::STAGE0.const_as_str(), + 0, + ) + .await? + .into_inner(); + let caboose_stage0_next = mgs_client + .sp_component_caboose_get( + sp_type, + sp_slot, + SpComponent::STAGE0.const_as_str(), + 1, + ) + .await + .map(|c| c.into_inner()); let sp_info = mgs_client.sp_get(sp_type, sp_slot).await?.into_inner(); let sp_boot_info = mgs_client .sp_rot_boot_info( @@ -113,6 +141,10 @@ impl SpTestState { ), caboose_rot_a: ignore_invalid_caboose_error(caboose_rot_a), caboose_rot_b: ignore_invalid_caboose_error(caboose_rot_b), + caboose_stage0, + caboose_stage0_next: ignore_invalid_caboose_error( + caboose_stage0_next, + ), sp_state: sp_info, sp_boot_info, }) @@ -141,6 +173,14 @@ impl SpTestState { } } + pub fn expect_caboose_stage0(&self) -> &SpComponentCaboose { + &self.caboose_stage0 + } + + pub fn expect_caboose_stage0_next(&self) -> &SpComponentCaboose { + self.caboose_stage0_next.as_ref().expect("stage0 next caboose") + } + pub fn expect_caboose_rot_inactive(&self) -> &SpComponentCaboose { let slot = self.expect_rot_active_slot().toggled(); match slot { @@ -255,6 +295,22 @@ impl SpTestState { }, } } + + pub fn expect_stage0_version(&self) -> ArtifactVersion { + self.expect_caboose_stage0() + .version + .parse() + .expect("valid artifact version") + } + + pub fn expect_stage0_next_version(&self) -> ExpectedVersion { + match &self.caboose_stage0_next { + Some(v) => ExpectedVersion::Version( + v.version.parse().expect("valid stage0 next version"), + ), + None => ExpectedVersion::NoValidVersion, + } + } } fn ignore_invalid_caboose_error( diff --git a/nexus/mgs-updates/src/test_util/test_artifacts.rs b/nexus/mgs-updates/src/test_util/test_artifacts.rs index 33cb4853009..4eb753f0c47 100644 --- a/nexus/mgs-updates/src/test_util/test_artifacts.rs +++ b/nexus/mgs-updates/src/test_util/test_artifacts.rs @@ -36,6 +36,8 @@ pub struct TestArtifacts { pub sp_sidecar_artifact_hash: ArtifactHash, pub rot_gimlet_artifact_hash: ArtifactHash, pub rot_sidecar_artifact_hash: ArtifactHash, + pub rot_bootloader_gimlet_artifact_hash: ArtifactHash, + pub rot_bootloader_sidecar_artifact_hash: ArtifactHash, pub artifact_cache: Arc, deployed_cabooses: BTreeMap, resolver: FixedResolver, @@ -108,12 +110,56 @@ impl TestArtifacts { ArtifactHash(digest.finalize().into()) }; + // Make an RoT bootloader update artifact for SimGimlet. + let rot_bootloader_gimlet_artifact_caboose = CabooseBuilder::default() + .git_commit("fake-git-commit") + .board(SIM_GIMLET_BOARD) + .version("0.0.0") + .name("fake-name") + .build(); + let mut builder = HubrisArchiveBuilder::with_fake_image(); + builder + .write_caboose(rot_bootloader_gimlet_artifact_caboose.as_slice()) + .unwrap(); + let rot_bootloader_gimlet_artifact = builder.build_to_vec().unwrap(); + let rot_bootloader_gimlet_artifact_hash = { + let mut digest = sha2::Sha256::default(); + digest.update(&rot_bootloader_gimlet_artifact); + ArtifactHash(digest.finalize().into()) + }; + + // Make an RoT bootloader update artifact for SimSidecar + let rot_bootloader_sidecar_artifact_caboose = CabooseBuilder::default() + .git_commit("fake-git-commit") + .board(SIM_SIDECAR_BOARD) + .version("0.0.0") + .name("fake-name") + .build(); + let mut builder = HubrisArchiveBuilder::with_fake_image(); + builder + .write_caboose(rot_bootloader_sidecar_artifact_caboose.as_slice()) + .unwrap(); + let rot_bootloader_sidecar_artifact = builder.build_to_vec().unwrap(); + let rot_bootloader_sidecar_artifact_hash = { + let mut digest = sha2::Sha256::default(); + digest.update(&rot_bootloader_sidecar_artifact); + ArtifactHash(digest.finalize().into()) + }; + // Assemble a map of artifact hash to artifact contents. let artifact_data = [ (sp_gimlet_artifact_hash, sp_gimlet_artifact), (sp_sidecar_artifact_hash, sp_sidecar_artifact), (rot_gimlet_artifact_hash, rot_gimlet_artifact), (rot_sidecar_artifact_hash, rot_sidecar_artifact), + ( + rot_bootloader_gimlet_artifact_hash, + rot_bootloader_gimlet_artifact, + ), + ( + rot_bootloader_sidecar_artifact_hash, + rot_bootloader_sidecar_artifact, + ), ] .into_iter() .collect(); @@ -124,6 +170,14 @@ impl TestArtifacts { (sp_sidecar_artifact_hash, sp_sidecar_artifact_caboose), (rot_gimlet_artifact_hash, rot_gimlet_artifact_caboose), (rot_sidecar_artifact_hash, rot_sidecar_artifact_caboose), + ( + rot_bootloader_gimlet_artifact_hash, + rot_bootloader_gimlet_artifact_caboose, + ), + ( + rot_bootloader_sidecar_artifact_hash, + rot_bootloader_sidecar_artifact_caboose, + ), ] .into_iter() .collect(); @@ -155,6 +209,8 @@ impl TestArtifacts { sp_sidecar_artifact_hash, rot_gimlet_artifact_hash, rot_sidecar_artifact_hash, + rot_bootloader_gimlet_artifact_hash, + rot_bootloader_sidecar_artifact_hash, deployed_cabooses, artifact_cache, resolver, diff --git a/nexus/mgs-updates/src/test_util/updates.rs b/nexus/mgs-updates/src/test_util/updates.rs index f59986c6315..fbcc17b0937 100644 --- a/nexus/mgs-updates/src/test_util/updates.rs +++ b/nexus/mgs-updates/src/test_util/updates.rs @@ -54,9 +54,10 @@ pub enum ExpectedSpComponent { override_expected_pending_persistent_boot_preference: Option, override_expected_transient_boot_preference: Option, }, - // TODO-K: Remove once fully implemented - #[allow(dead_code)] - RotBootloader {}, + RotBootloader { + override_expected_stage0: Option, + override_expected_stage0_next: Option, + }, } /// Describes an update operation that can later be executed any number of times @@ -152,7 +153,23 @@ impl UpdateDescription<'_> { expected_transient_boot_preference, } } - ExpectedSpComponent::RotBootloader {} => unimplemented!(), + ExpectedSpComponent::RotBootloader { + override_expected_stage0, + override_expected_stage0_next, + } => { + let expected_stage0_version = override_expected_stage0 + .clone() + .unwrap_or_else(|| sp1.expect_stage0_version()); + let expected_stage0_next_version = + override_expected_stage0_next + .clone() + .unwrap_or_else(|| sp1.expect_stage0_next_version()); + + PendingMgsUpdateDetails::RotBootloader { + expected_stage0_version, + expected_stage0_next_version, + } + } }; let deployed_caboose = self @@ -492,6 +509,64 @@ impl FinishedUpdateAttempt { } } + /// Asserts various conditions associated with successful RoT bootloader updates. + pub fn expect_rot_bootloader_success( + &self, + expected_result: UpdateCompletedHow, + ) { + let how = match self.result { + Ok(how) if how == expected_result => how, + _ => { + panic!( + "unexpected result from apply_update(): {:?}", + self.result, + ); + } + }; + + eprintln!("apply_update() -> {:?}", how); + let sp2 = &self.sp2; + + // The active slot should contain what we just updated to. + let deployed_caboose = &self.deployed_caboose; + assert!(cabooses_equal(&sp2.caboose_stage0, &deployed_caboose)); + + // RoT information should not have changed. + let sp1 = &self.sp1; + assert_eq!(sp1.expect_caboose_rot_a(), sp2.expect_caboose_rot_a()); + assert_eq!(sp1.expect_caboose_rot_b(), sp2.expect_caboose_rot_b()); + + // SP information should not have changed + let sp1 = &self.sp1; + assert_eq!( + sp1.expect_caboose_sp_inactive(), + sp2.expect_caboose_sp_inactive(), + ); + assert_eq!( + sp1.expect_caboose_sp_active(), + sp2.expect_caboose_sp_active(), + ); + + if how == UpdateCompletedHow::FoundNoChangesNeeded { + assert_eq!(sp1.caboose_stage0, sp2.caboose_stage0); + assert_eq!( + sp1.expect_caboose_stage0_next(), + sp2.expect_caboose_stage0_next() + ); + assert_eq!(sp1.sp_state, sp2.sp_state); + assert_eq!(sp1.sp_boot_info, sp2.sp_boot_info); + } else { + // One way or another, an update was completed. Stage 0 + // and Stage 0 next should contain the same contents + assert_eq!( + sp2.expect_caboose_stage0(), + sp2.expect_caboose_stage0_next() + ); + // RoT boot info should have changed + assert_ne!(sp1.sp_boot_info, sp2.sp_boot_info); + } + } + /// Asserts that the update failed and invokes `assert_error(error, /// initial_sp_state, final_sp_state)` for you to make your own assertions /// about why it failed and what state things were left in.