Skip to content

Commit

Permalink
Allow device to have multiple reasons for being in maintenance mode
Browse files Browse the repository at this point in the history
This change allows the device to have multiple reasons for being in
maintenance mode. This is useful when multiple conditions are met that
require the device to be in maintenance mode. For example, if the TPM
is in error and the device disk is full, the device can be in
maintenance mode for both reasons. Clearing one of the reasons will not
take the device out of maintenance mode if there are other reasons for
it to be in maintenance mode.

Signed-off-by: Shahriyar Jalayeri <[email protected]>
  • Loading branch information
shjala authored and eriknordmark committed Feb 5, 2025
1 parent 85905df commit f0f6fdb
Show file tree
Hide file tree
Showing 7 changed files with 178 additions and 137 deletions.
5 changes: 1 addition & 4 deletions pkg/pillar/cmd/nodeagent/handletimers.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,12 +131,9 @@ func handleRebootOnVaultLocked(ctxPtr *nodeagentContext) {
scheduleNodeOperation(ctxPtr, errStr, types.BootReasonVaultFailure,
types.DeviceOperationReboot)
} else {
log.Noticef("Setting %s",
types.MaintenanceModeReasonVaultLockedUp)
// there is no image update in progress, this happened after a normal
// reboot. enter maintenance mode
ctxPtr.maintMode = true
ctxPtr.maintModeReason = types.MaintenanceModeReasonVaultLockedUp
addMaintenanceModeReason(ctxPtr, types.MaintenanceModeReasonVaultLockedUp, "handleRebootOnVaultLocked")
publishNodeAgentStatus(ctxPtr)
}
} else {
Expand Down
120 changes: 60 additions & 60 deletions pkg/pillar/cmd/nodeagent/nodeagent.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,13 +110,13 @@ type nodeagentContext struct {
rebootStack string // From last reboot
rebootTime time.Time // From last reboot
restartCounter uint32
vaultOperational types.TriState // Is the vault fully operational?
vaultTestStartTime uint32 // Time at which we should start waiting for vault to be operational
maintMode bool // whether Maintenance mode should be triggered
maintModeReason types.MaintenanceModeReason //reason for entering Maintenance mode
configGetSuccess bool // got config from controller success
vaultmgrReported bool // got reports from vaultmgr
hvTypeKube bool // image is kubernetes cluster type
vaultOperational types.TriState // Is the vault fully operational?
vaultTestStartTime uint32 // Time at which we should start waiting for vault to be operational
maintMode bool // whether Maintenance mode should be triggered
maintModeReasons types.MaintenanceModeMultiReason // reasons for entering Maintenance mode
configGetSuccess bool // got config from controller success
vaultmgrReported bool // got reports from vaultmgr
hvTypeKube bool // image is kubernetes cluster type
waitDrainInProgress bool

// Some constants.. Declared here as variables to enable unit tests
Expand Down Expand Up @@ -734,24 +734,24 @@ func publishNodeAgentStatus(ctxPtr *nodeagentContext) {
pub := ctxPtr.pubNodeAgentStatus
ctxPtr.lastLock.Lock()
status := types.NodeAgentStatus{
Name: agentName,
CurPart: ctxPtr.curPart,
RemainingTestTime: ctxPtr.remainingTestTime,
UpdateInprogress: ctxPtr.updateInprogress,
DeviceReboot: ctxPtr.deviceReboot,
DeviceShutdown: ctxPtr.deviceShutdown,
DevicePoweroff: ctxPtr.devicePoweroff,
AllDomainsHalted: ctxPtr.allDomainsHalted,
RebootReason: ctxPtr.rebootReason,
BootReason: ctxPtr.bootReason,
RebootStack: ctxPtr.rebootStack,
RebootTime: ctxPtr.rebootTime,
RebootImage: ctxPtr.rebootImage,
RestartCounter: ctxPtr.restartCounter,
LocalMaintenanceMode: ctxPtr.maintMode,
LocalMaintenanceModeReason: ctxPtr.maintModeReason,
HVTypeKube: ctxPtr.hvTypeKube,
WaitDrainInProgress: ctxPtr.waitDrainInProgress,
Name: agentName,
CurPart: ctxPtr.curPart,
RemainingTestTime: ctxPtr.remainingTestTime,
UpdateInprogress: ctxPtr.updateInprogress,
DeviceReboot: ctxPtr.deviceReboot,
DeviceShutdown: ctxPtr.deviceShutdown,
DevicePoweroff: ctxPtr.devicePoweroff,
AllDomainsHalted: ctxPtr.allDomainsHalted,
RebootReason: ctxPtr.rebootReason,
BootReason: ctxPtr.bootReason,
RebootStack: ctxPtr.rebootStack,
RebootTime: ctxPtr.rebootTime,
RebootImage: ctxPtr.rebootImage,
RestartCounter: ctxPtr.restartCounter,
LocalMaintenanceMode: ctxPtr.maintMode,
LocalMaintenanceModeReasons: ctxPtr.maintModeReasons,
HVTypeKube: ctxPtr.hvTypeKube,
WaitDrainInProgress: ctxPtr.waitDrainInProgress,
}
ctxPtr.lastLock.Unlock()
pub.Publish(agentName, status)
Expand Down Expand Up @@ -788,14 +788,8 @@ func handleVaultStatusImpl(ctxArg interface{}, key string,
if vault.ConversionComplete {
ctx.vaultOperational = types.TS_ENABLED
// Do we need to clear maintenance?
if ctx.maintMode &&
ctx.maintModeReason == types.MaintenanceModeReasonVaultLockedUp {
log.Noticef("Clearing %s",
types.MaintenanceModeReasonVaultLockedUp)
ctx.maintMode = false
ctx.maintModeReason = types.MaintenanceModeReasonNone
publishNodeAgentStatus(ctx)
}
removeMaintenanceModeReason(ctx, types.MaintenanceModeReasonVaultLockedUp, "handleVaultStatusImpl")
publishNodeAgentStatus(ctx)
} else {
ctx.vaultOperational = types.TS_NONE
}
Expand All @@ -819,34 +813,19 @@ func handleVolumeMgrStatusImpl(ctxArg interface{}, key string,

ctx := ctxArg.(*nodeagentContext)
vms := statusArg.(types.VolumeMgrStatus)
changed := false
// This RemainingSpace takes into account the space reserved for
// /persist/newlog plus the percentage/minimum reserved for the rest
// of EVE-OS. Thus it can never go negative, but zero means that
// we neither have space to download new images nor space to deploy
// a tiny app instance.
if vms.RemainingSpace == 0 {
log.Warnf("MaintenanceMode due to no remaining diskspace")
// Do not overwrite a vault maintenance mode
if !ctx.maintMode {
log.Noticef("Setting %s",
types.MaintenanceModeReasonNoDiskSpace)
ctx.maintModeReason = types.MaintenanceModeReasonNoDiskSpace
ctx.maintMode = true
changed = true
}
// Add to maintenance mode reasons
addMaintenanceModeReason(ctx, types.MaintenanceModeReasonNoDiskSpace, "handleVolumeMgrStatusImpl")
publishNodeAgentStatus(ctx)
} else {
// Do we need to clear maintenance?
if ctx.maintMode &&
ctx.maintModeReason == types.MaintenanceModeReasonNoDiskSpace {
log.Noticef("Clearing %s",
types.MaintenanceModeReasonNoDiskSpace)
ctx.maintMode = false
ctx.maintModeReason = types.MaintenanceModeReasonNone
changed = true
}
}
if changed {
removeMaintenanceModeReason(ctx, types.MaintenanceModeReasonNoDiskSpace, "handleVolumeMgrStatusImpl")
publishNodeAgentStatus(ctx)
}
}
Expand Down Expand Up @@ -888,16 +867,37 @@ func handleTpmStatusImpl(ctxArg interface{}, key string,

if tpm.Status == types.MaintenanceModeReasonTpmEncFailure {
log.Errorf("handleTpmStatusImpl: TPM manager reported TPM error : %s", tpm.Error)
log.Noticef("Setting %s", types.MaintenanceModeReasonTpmEncFailure)
ctx.maintMode = true
ctx.maintModeReason = types.MaintenanceModeReasonTpmEncFailure
addMaintenanceModeReason(ctx, types.MaintenanceModeReasonTpmEncFailure, "handleTpmStatusImpl")
publishNodeAgentStatus(ctx)
} else {
if ctx.maintMode && ctx.maintModeReason == types.MaintenanceModeReasonTpmEncFailure {
log.Noticef("Clearing %s", types.MaintenanceModeReasonTpmEncFailure)
ctx.maintMode = false
ctx.maintModeReason = types.MaintenanceModeReasonNone
publishNodeAgentStatus(ctx)
removeMaintenanceModeReason(ctx, types.MaintenanceModeReasonTpmEncFailure, "handleTpmStatusImpl")
publishNodeAgentStatus(ctx)
}
}

// addMaintenanceModeReason logs the request on behalf of caller, marks the
// node as being in maintenance mode and records reason in
// ctx.maintModeReasons unless that reason is already listed there.
func addMaintenanceModeReason(ctx *nodeagentContext, reason types.MaintenanceModeReason, caller string) {
	log.Noticef("%s setting %s", caller, reason)
	ctx.maintMode = true

	// Look for an existing entry so each reason is stored at most once.
	alreadyListed := false
	for idx := range ctx.maintModeReasons {
		if ctx.maintModeReasons[idx] == reason {
			alreadyListed = true
			break
		}
	}
	if !alreadyListed {
		ctx.maintModeReasons = append(ctx.maintModeReasons, reason)
	}
}

// removeMaintenanceModeReason logs the request on behalf of caller and drops
// every occurrence of reason from ctx.maintModeReasons. When no reasons are
// left afterwards, ctx.maintMode is cleared and that is logged as well.
//
// The list is filtered rather than spliced inside a range loop over itself:
// a range loop keeps iterating over the original length, so splicing while
// ranging reads stale tail elements and hits a slice-bounds panic if the same
// reason is present more than once.
func removeMaintenanceModeReason(ctx *nodeagentContext, reason types.MaintenanceModeReason, caller string) {
	log.Noticef("%s clearing %s", caller, reason)

	// In-place filter: the write index never overtakes the read index, so
	// reusing the backing array is safe and needs no extra allocation.
	remaining := ctx.maintModeReasons[:0]
	for _, r := range ctx.maintModeReasons {
		if r != reason {
			remaining = append(remaining, r)
		}
	}
	ctx.maintModeReasons = remaining

	if len(ctx.maintModeReasons) == 0 {
		ctx.maintMode = false
		log.Noticef("%s : No reason to be in maintenance mode, clearing maintenance mode", caller)
	}
}
12 changes: 11 additions & 1 deletion pkg/pillar/cmd/zedagent/localinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -683,7 +683,7 @@ func prepareLocalDevInfo(ctx *zedagentContext) *profile.LocalDevInfo {
msg.DeviceUuid = devUUID.String()
msg.State = info.ZDeviceState(getDeviceState(ctx))
msg.MaintenanceModeReasons = append(msg.MaintenanceModeReasons,
info.MaintenanceModeReason(ctx.maintModeReason))
infoMaintModeReason(ctx.maintModeReasons)...)
hinfo, err := host.Info()
if err != nil {
log.Errorf("host.Info(): %s", err)
Expand Down Expand Up @@ -778,3 +778,13 @@ func processReceivedDevCommands(getconfigCtx *getconfigContext, cmd *profile.Loc
getconfigCtx.sideController.lastDevCmdTimestamp = cmd.Timestamp
saveLocalDevCmdTimestamp(getconfigCtx)
}

// infoMaintModeReason translates a types.MaintenanceModeMultiReason into the
// equivalent []info.MaintenanceModeReason by casting each entry, preserving
// order. The returned slice is never nil, even when the input is empty.
func infoMaintModeReason(mmmr types.MaintenanceModeMultiReason) []info.MaintenanceModeReason {
	converted := make([]info.MaintenanceModeReason, len(mmmr))
	for idx := range mmmr {
		converted[idx] = info.MaintenanceModeReason(mmmr[idx])
	}
	return converted
}
20 changes: 10 additions & 10 deletions pkg/pillar/cmd/zedagent/parseconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ func parseConfig(getconfigCtx *getconfigContext, config *zconfig.EdgeDevConfig,
// Did MaintenanceMode change?
if ctx.apiMaintenanceMode != config.MaintenanceMode {
ctx.apiMaintenanceMode = config.MaintenanceMode
mergeMaintenanceMode(ctx)
mergeMaintenanceMode(ctx, "parseConfig")
}

// Did the ForceFallbackCounter change? If so we publish for
Expand Down Expand Up @@ -2749,7 +2749,7 @@ func parseConfigItems(ctx *getconfigContext, config *zconfig.EdgeDevConfig,
newMaintenanceMode := newGlobalConfig.GlobalValueTriState(types.MaintenanceMode)
if oldMaintenanceMode != newMaintenanceMode {
ctx.zedagentCtx.gcpMaintenanceMode = newMaintenanceMode
mergeMaintenanceMode(ctx.zedagentCtx)
mergeMaintenanceMode(ctx.zedagentCtx, "parseConfigItems")
}

pub := ctx.zedagentCtx.pubGlobalConfig
Expand All @@ -2765,29 +2765,29 @@ func parseConfigItems(ctx *getconfigContext, config *zconfig.EdgeDevConfig,

// mergeMaintenanceMode handles the configItem override (unless NONE)
// and the API setting
func mergeMaintenanceMode(ctx *zedagentContext) {
func mergeMaintenanceMode(ctx *zedagentContext, caller string) {
switch ctx.gcpMaintenanceMode {
case types.TS_ENABLED:
// Overrides everything, and sets maintenance mode
ctx.maintenanceMode = true
ctx.maintModeReason = types.MaintenanceModeReasonUserRequested
ctx.maintModeReasons = types.MaintenanceModeMultiReason{types.MaintenanceModeReasonUserRequested}
case types.TS_DISABLED:
// Overrides everything, and resets maintenance mode
ctx.maintenanceMode = false
ctx.maintModeReason = types.MaintenanceModeReasonNone
ctx.maintModeReasons = types.MaintenanceModeMultiReason{types.MaintenanceModeReasonNone}
case types.TS_NONE:
// Now, look at user config and local triggers
ctx.maintenanceMode = ctx.apiMaintenanceMode || ctx.localMaintenanceMode
if ctx.apiMaintenanceMode {
// set reason as user requested
ctx.maintModeReason = types.MaintenanceModeReasonUserRequested
} else if ctx.localMaintenanceMode {
ctx.maintModeReasons = types.MaintenanceModeMultiReason{types.MaintenanceModeReasonUserRequested}
} else if !equalMaintenanceMode(ctx.maintModeReasons, ctx.localMaintModeReasons) {
// set reason to reflect exact local reason
ctx.maintModeReason = ctx.localMaintModeReason
ctx.maintModeReasons = ctx.localMaintModeReasons
}
}
log.Noticef("Changed maintenanceMode to %t, with reason as %s, considering {%v, %v, %v}",
ctx.maintenanceMode, ctx.maintModeReason.String(), ctx.gcpMaintenanceMode,
log.Noticef("%s changed maintenanceMode to %t, with reason as %s, considering {%v, %v, %v}",
caller, ctx.maintenanceMode, ctx.maintModeReasons.String(), ctx.gcpMaintenanceMode,
ctx.apiMaintenanceMode, ctx.localMaintenanceMode)
}

Expand Down
8 changes: 6 additions & 2 deletions pkg/pillar/cmd/zedagent/reportinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -617,10 +617,14 @@ func PublishDeviceInfoToZedCloud(ctx *zedagentContext, dest destinationBitset) {

// EVE needs to fill deprecated MaintenanceMode until it is removed
ReportDeviceInfo.MaintenanceMode = ctx.maintenanceMode
ReportDeviceInfo.MaintenanceModeReason = info.MaintenanceModeReason(ctx.maintModeReason)
// if we are in maintenance mode, this must have at least one reason, but
// better safe than sorry.
if len(ctx.maintModeReasons) > 0 {
ReportDeviceInfo.MaintenanceModeReason = info.MaintenanceModeReason(ctx.maintModeReasons[0])
}
// For backward compatibility added new field
ReportDeviceInfo.MaintenanceModeReasons = append(ReportDeviceInfo.MaintenanceModeReasons,
info.MaintenanceModeReason(ctx.maintModeReason))
infoMaintModeReason(ctx.maintModeReasons)...)

// Watchdog
ReportDeviceInfo.HardwareWatchdogPresent = getHardwareWatchdogPresent(ctx)
Expand Down
Loading

0 comments on commit f0f6fdb

Please sign in to comment.