Skip to content

Commit b3656a8

Browse files
authored
[12/n] [sled-agent] don't start install dataset zones on zone manifest error (#8237)
For mupdate overrides, in order to be safe, we must know that the data stored in the JSON is consistent with the images stored on disk. We currently apply this logic to install dataset hashes, and will apply it to artifact hashes in the future. Questions: * [x] ~~Should we apply this logic to more critical zones like the switch zone? They're always going to be part of the ramdisk maybe?~~ We unconditionally succeed for RAMdisk zones, and in particular the switch zone, now. * [ ] How does this interact with config-reconciler work? On error, that would permanently be blocked it seems to me. Note: we decide errors per-zone now. * [ ] We ask for the boot zpool to be passed in but also cache it as part of constructing the info... how do we reconcile the relative presence or absence of this info? Depends on #8235.
1 parent 8b04704 commit b3656a8

File tree

6 files changed

+254
-33
lines changed

6 files changed

+254
-33
lines changed

Cargo.lock

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

illumos-utils/src/running_zone.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1339,7 +1339,7 @@ impl<'a> ZoneBuilder<'a> {
13391339
}
13401340

13411341
/// Places to look for a zone's image.
1342-
#[derive(Clone, Debug)]
1342+
#[derive(Clone, Debug, PartialEq, Eq)]
13431343
pub struct ZoneImageFileSource {
13441344
/// The file name to look for.
13451345
pub file_name: String,

sled-agent/src/services.rs

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ use internal_dns_types::names::BOUNDARY_NTP_DNS_NAME;
6464
use internal_dns_types::names::DNS_ZONE;
6565
use nexus_config::{ConfigDropshotWithTls, DeploymentConfig};
6666
use nexus_sled_agent_shared::inventory::{
67-
OmicronZoneConfig, OmicronZoneImageSource, OmicronZoneType, ZoneKind,
67+
OmicronZoneConfig, OmicronZoneType, ZoneKind,
6868
};
6969
use omicron_common::address::AZ_PREFIX;
7070
use omicron_common::address::DENDRITE_PORT;
@@ -94,6 +94,7 @@ use omicron_uuid_kinds::OmicronZoneUuid;
9494
use sled_agent_config_reconciler::InternalDisksReceiver;
9595
use sled_agent_types::sled::SWITCH_ZONE_BASEBOARD_FILE;
9696
use sled_agent_zone_images::ZoneImageSourceResolver;
97+
use sled_agent_zone_images::{MupdateOverrideReadError, ZoneImageSource};
9798
use sled_hardware::DendriteAsic;
9899
use sled_hardware::SledMode;
99100
use sled_hardware::is_gimlet;
@@ -262,6 +263,9 @@ pub enum Error {
262263
#[error("Unexpected zone config: zone {zone_id} is running on ramdisk ?!")]
263264
ZoneIsRunningOnRamdisk { zone_id: OmicronZoneUuid },
264265

266+
#[error("failed to read mupdate override info, not starting zones")]
267+
MupdateOverrideRead(#[source] MupdateOverrideReadError),
268+
265269
#[error(
266270
"Couldn't find requested zone image ({hash}) for \
267271
{zone_kind:?} {id} in artifact store: {err}"
@@ -1363,20 +1367,31 @@ impl ServiceManager {
13631367
};
13641368

13651369
// TODO: `InstallDataset` should be renamed to something more accurate
1366-
// when all the major changes here have landed. Some zones are
1367-
// distributed from the host OS image and are never placed in the
1368-
// install dataset; that enum variant more accurately reflects that we
1369-
// are falling back to searching `/opt/oxide` in addition to the install
1370-
// datasets.
1370+
// when all the major changes here have landed.
1371+
//
1372+
// Some zones are distributed from the host OS image and are never
1373+
// placed in the install dataset; the Ramdisk enum variant more
1374+
// accurately reflects that we are only search `/opt/oxide` for those
1375+
// zones.
1376+
//
1377+
// (Currently, only the switch zone goes through this code path. Other
1378+
// ramdisk zones like the probe zone construct the file source
1379+
// directly.)
13711380
let image_source = match &request {
1372-
ZoneArgs::Omicron(zone_config) => &zone_config.image_source,
1373-
ZoneArgs::Switch(_) => &OmicronZoneImageSource::InstallDataset,
1381+
ZoneArgs::Omicron(zone_config) => {
1382+
ZoneImageSource::Omicron(zone_config.image_source.clone())
1383+
}
1384+
ZoneArgs::Switch(_) => ZoneImageSource::Ramdisk,
13741385
};
1375-
let file_source = self.inner.zone_image_resolver.file_source_for(
1376-
zone_type_str,
1377-
image_source,
1378-
self.inner.internal_disks_rx.current(),
1379-
);
1386+
let file_source = self
1387+
.inner
1388+
.zone_image_resolver
1389+
.file_source_for(
1390+
zone_type_str,
1391+
&image_source,
1392+
self.inner.internal_disks_rx.current(),
1393+
)
1394+
.map_err(|error| Error::MupdateOverrideRead(error))?;
13801395

13811396
// We use the fake initialiser for testing
13821397
let mut zone_builder = match self.inner.system_api.fake_install_dir() {

sled-agent/zone-images/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ license = "MPL-2.0"
88
workspace = true
99

1010
[dependencies]
11-
anyhow.workspace = true
1211
camino.workspace = true
1312
iddqd.workspace = true
1413
illumos-utils.workspace = true

sled-agent/zone-images/src/source_resolver.rs

Lines changed: 219 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,33 @@
66
77
use crate::AllMupdateOverrides;
88
use crate::AllZoneManifests;
9+
use crate::MupdateOverrideReadError;
910
use crate::MupdateOverrideStatus;
1011
use crate::RAMDISK_IMAGE_PATH;
1112
use crate::ZoneManifestStatus;
1213
use crate::install_dataset_file_name;
14+
use crate::ramdisk_file_source;
1315
use camino::Utf8PathBuf;
1416
use illumos_utils::running_zone::ZoneImageFileSource;
1517
use nexus_sled_agent_shared::inventory::OmicronZoneImageSource;
1618
use sled_agent_config_reconciler::InternalDisks;
1719
use sled_agent_config_reconciler::InternalDisksWithBootDisk;
20+
use slog::error;
1821
use slog::o;
22+
use slog_error_chain::InlineErrorChain;
1923
use std::sync::Arc;
2024
use std::sync::Mutex;
2125

26+
/// A zone image source.
27+
#[derive(Clone, Debug)]
28+
pub enum ZoneImageSource {
29+
/// An Omicron zone.
30+
Omicron(OmicronZoneImageSource),
31+
32+
/// A RAM disk-based zone.
33+
Ramdisk,
34+
}
35+
2236
/// Resolves [`OmicronZoneImageSource`] instances into file names and search
2337
/// paths.
2438
///
@@ -57,11 +71,19 @@ impl ZoneImageSourceResolver {
5771
pub fn file_source_for(
5872
&self,
5973
zone_type: &str,
60-
image_source: &OmicronZoneImageSource,
74+
image_source: &ZoneImageSource,
6175
internal_disks: InternalDisks,
62-
) -> ZoneImageFileSource {
63-
let inner = self.inner.lock().unwrap();
64-
inner.file_source_for(zone_type, image_source, internal_disks)
76+
) -> Result<ZoneImageFileSource, MupdateOverrideReadError> {
77+
match image_source {
78+
ZoneImageSource::Ramdisk => {
79+
// RAM disk images are always stored on the RAM disk path.
80+
Ok(ramdisk_file_source(zone_type))
81+
}
82+
ZoneImageSource::Omicron(image_source) => {
83+
let inner = self.inner.lock().unwrap();
84+
inner.file_source_for(zone_type, image_source, internal_disks)
85+
}
86+
}
6587
}
6688
}
6789

@@ -77,7 +99,6 @@ pub struct ResolverStatus {
7799

78100
#[derive(Debug)]
79101
struct ResolverInner {
80-
#[expect(unused)]
81102
log: slog::Logger,
82103
image_directory_override: Option<Utf8PathBuf>,
83104
// Store all collected information for zones -- we're going to need to
@@ -112,41 +133,222 @@ impl ResolverInner {
112133
zone_type: &str,
113134
image_source: &OmicronZoneImageSource,
114135
internal_disks: InternalDisks,
115-
) -> ZoneImageFileSource {
136+
) -> Result<ZoneImageFileSource, MupdateOverrideReadError> {
116137
match image_source {
117138
OmicronZoneImageSource::InstallDataset => {
118-
// Look for the image in the ramdisk first
139+
let file_name = install_dataset_file_name(zone_type);
140+
// Look for the image in the RAM disk first. Note that install
141+
// dataset images are not stored on the RAM disk in production,
142+
// just in development or test workflows.
119143
let mut zone_image_paths =
120144
vec![Utf8PathBuf::from(RAMDISK_IMAGE_PATH)];
145+
121146
// Inject an image path if requested by a test.
122147
if let Some(path) = &self.image_directory_override {
123148
zone_image_paths.push(path.clone());
124149
};
125150

126-
// If the boot disk exists, look for the image in the "install"
127-
// dataset on the boot zpool.
151+
// Any zones not part of the RAM disk are managed via the
152+
// zone manifest.
153+
//
154+
// XXX: we ask for the boot zpool to be passed in here. But
155+
// `AllZoneImages` also caches the boot zpool. How should we
156+
// reconcile the two?
128157
if let Some(path) = internal_disks.boot_disk_install_dataset() {
129-
zone_image_paths.push(path);
158+
match self.zone_manifests.boot_disk_result() {
159+
Ok(result) => {
160+
match result.data.get(file_name.as_str()) {
161+
Some(result) => {
162+
if result.is_valid() {
163+
zone_image_paths.push(path);
164+
} else {
165+
// If the zone is not valid, we refuse to start
166+
// it.
167+
error!(
168+
self.log,
169+
"zone {} is not valid in the zone manifest, \
170+
not returning it as a source",
171+
file_name;
172+
"error" => %result.display()
173+
);
174+
}
175+
}
176+
None => {
177+
error!(
178+
self.log,
179+
"zone {} is not present in the boot disk zone manifest",
180+
file_name,
181+
);
182+
}
183+
}
184+
}
185+
Err(error) => {
186+
error!(
187+
self.log,
188+
"error parsing boot disk zone manifest, not returning \
189+
install dataset as a source";
190+
"error" => InlineErrorChain::new(error),
191+
);
192+
}
193+
}
130194
}
131195

132-
ZoneImageFileSource {
133-
file_name: install_dataset_file_name(zone_type),
196+
Ok(ZoneImageFileSource {
197+
file_name,
134198
search_paths: zone_image_paths,
135-
}
199+
})
136200
}
137201
OmicronZoneImageSource::Artifact { hash } => {
202+
// TODO: implement mupdate override here.
203+
//
138204
// Search both artifact datasets. This iterator starts with the
139205
// dataset for the boot disk (if it exists), and then is followed
140206
// by all other disks.
141207
let search_paths =
142208
internal_disks.all_artifact_datasets().collect();
143-
ZoneImageFileSource {
144-
// Images in the artifact store are named by just their
145-
// hash.
209+
Ok(ZoneImageFileSource {
146210
file_name: hash.to_string(),
147211
search_paths,
148-
}
212+
})
213+
}
214+
}
215+
}
216+
}
217+
218+
#[cfg(test)]
219+
mod tests {
220+
use super::*;
221+
222+
use crate::test_utils::{
223+
BOOT_PATHS, BOOT_ZPOOL, WriteInstallDatasetContext,
224+
make_internal_disks_rx,
225+
};
226+
227+
use camino_tempfile_ext::prelude::*;
228+
use dropshot::{ConfigLogging, ConfigLoggingLevel, test_util::LogContext};
229+
230+
/// Test source resolver behavior when the zone manifest is invalid.
231+
#[test]
232+
fn file_source_zone_manifest_invalid() {
233+
let logctx = LogContext::new(
234+
"source_resolver_file_source_zone_manifest_invalid",
235+
&ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug },
236+
);
237+
let dir = Utf8TempDir::new().unwrap();
238+
dir.child(&BOOT_PATHS.install_dataset).create_dir_all().unwrap();
239+
240+
let internal_disks_rx =
241+
make_internal_disks_rx(dir.path(), BOOT_ZPOOL, &[]);
242+
let resolver = ZoneImageSourceResolver::new(
243+
&logctx.log,
244+
internal_disks_rx.current_with_boot_disk(),
245+
);
246+
247+
// RAM disk image sources should work as expected.
248+
let ramdisk_source = resolver
249+
.file_source_for(
250+
"zone1",
251+
&ZoneImageSource::Ramdisk,
252+
internal_disks_rx.current(),
253+
)
254+
.unwrap();
255+
assert_eq!(ramdisk_source, ramdisk_file_source("zone1"));
256+
257+
let file_source = resolver
258+
.file_source_for(
259+
"zone1",
260+
&ZoneImageSource::Omicron(
261+
OmicronZoneImageSource::InstallDataset,
262+
),
263+
internal_disks_rx.current(),
264+
)
265+
.unwrap();
266+
267+
// Because the zone manifest is missing, the file source should not
268+
// return the install dataset.
269+
assert_eq!(
270+
file_source,
271+
ZoneImageFileSource {
272+
file_name: install_dataset_file_name("zone1"),
273+
search_paths: vec![Utf8PathBuf::from(RAMDISK_IMAGE_PATH)]
149274
}
275+
);
276+
277+
logctx.cleanup_successful();
278+
}
279+
280+
/// Test source resolver behavior when the zone manifest detects errors.
281+
#[test]
282+
fn file_source_with_errors() {
283+
let logctx = LogContext::new(
284+
"source_resolver_file_source_with_errors",
285+
&ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug },
286+
);
287+
let dir = Utf8TempDir::new().unwrap();
288+
let mut cx = WriteInstallDatasetContext::new_basic();
289+
cx.make_error_cases();
290+
291+
cx.write_to(&dir.child(&BOOT_PATHS.install_dataset)).unwrap();
292+
293+
let internal_disks_rx =
294+
make_internal_disks_rx(dir.path(), BOOT_ZPOOL, &[]);
295+
let resolver = ZoneImageSourceResolver::new(
296+
&logctx.log,
297+
internal_disks_rx.current_with_boot_disk(),
298+
);
299+
300+
// The resolver should not fail for ramdisk images.
301+
let file_source = resolver
302+
.file_source_for(
303+
"fake-zone",
304+
&ZoneImageSource::Ramdisk,
305+
internal_disks_rx.current(),
306+
)
307+
.unwrap();
308+
assert_eq!(file_source, ramdisk_file_source("fake-zone"));
309+
310+
// zone1.tar.gz is valid.
311+
let file_source = resolver
312+
.file_source_for(
313+
"zone1",
314+
&ZoneImageSource::Omicron(
315+
OmicronZoneImageSource::InstallDataset,
316+
),
317+
internal_disks_rx.current(),
318+
)
319+
.unwrap();
320+
assert_eq!(
321+
file_source,
322+
ZoneImageFileSource {
323+
file_name: "zone1.tar.gz".to_string(),
324+
search_paths: vec![
325+
Utf8PathBuf::from(RAMDISK_IMAGE_PATH),
326+
dir.path().join(&BOOT_PATHS.install_dataset)
327+
]
328+
},
329+
);
330+
331+
// zone2, zone3, zone4 and zone5 aren't valid, and none of them will
332+
// return the install dataset path.
333+
for zone_name in ["zone2", "zone3", "zone4", "zone5"] {
334+
let file_source = resolver
335+
.file_source_for(
336+
zone_name,
337+
&ZoneImageSource::Omicron(
338+
OmicronZoneImageSource::InstallDataset,
339+
),
340+
internal_disks_rx.current(),
341+
)
342+
.unwrap();
343+
assert_eq!(
344+
file_source,
345+
ZoneImageFileSource {
346+
file_name: install_dataset_file_name(zone_name),
347+
search_paths: vec![Utf8PathBuf::from(RAMDISK_IMAGE_PATH)]
348+
}
349+
);
150350
}
351+
352+
logctx.cleanup_successful();
151353
}
152354
}

0 commit comments

Comments
 (0)