Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Set device to maintenance mode if TPM error is detected #4462

Merged
merged 6 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions pkg/pillar/cmd/nodeagent/handletimers.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,12 +131,9 @@ func handleRebootOnVaultLocked(ctxPtr *nodeagentContext) {
scheduleNodeOperation(ctxPtr, errStr, types.BootReasonVaultFailure,
types.DeviceOperationReboot)
} else {
log.Noticef("Setting %s",
types.MaintenanceModeReasonVaultLockedUp)
// there is no image update in progress, this happened after a normal
// reboot. enter maintenance mode
ctxPtr.maintMode = true
ctxPtr.maintModeReason = types.MaintenanceModeReasonVaultLockedUp
addMaintenanceModeReason(ctxPtr, types.MaintenanceModeReasonVaultLockedUp, "handleRebootOnVaultLocked")
publishNodeAgentStatus(ctxPtr)
}
} else {
Expand Down
157 changes: 105 additions & 52 deletions pkg/pillar/cmd/nodeagent/nodeagent.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ type nodeagentContext struct {
subVaultStatus pubsub.Subscription
subVolumeMgrStatus pubsub.Subscription
subNodeDrainStatus pubsub.Subscription
subTpmStatus pubsub.Subscription
pubZbootConfig pubsub.Publication
pubNodeAgentStatus pubsub.Publication
curPart string
Expand Down Expand Up @@ -109,13 +110,13 @@ type nodeagentContext struct {
rebootStack string // From last reboot
rebootTime time.Time // From last reboot
restartCounter uint32
vaultOperational types.TriState // Is the vault fully operational?
vaultTestStartTime uint32 // Time at which we should start waiting for vault to be operational
maintMode bool // whether Maintenance mode should be triggered
maintModeReason types.MaintenanceModeReason //reason for entering Maintenance mode
configGetSuccess bool // got config from controller success
vaultmgrReported bool // got reports from vaultmgr
hvTypeKube bool // image is kubernetes cluster type
vaultOperational types.TriState // Is the vault fully operational?
vaultTestStartTime uint32 // Time at which we should start waiting for vault to be operational
maintMode bool // whether Maintenance mode should be triggered
maintModeReasons types.MaintenanceModeMultiReason // reasons for entering Maintenance mode
configGetSuccess bool // got config from controller success
vaultmgrReported bool // got reports from vaultmgr
hvTypeKube bool // image is kubernetes cluster type
waitDrainInProgress bool

// Some constants.. Declared here as variables to enable unit tests
Expand Down Expand Up @@ -281,6 +282,24 @@ func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, ar
ctxPtr.subVolumeMgrStatus = subVolumeMgrStatus
subVolumeMgrStatus.Activate()

// Look for Tpm status
subTpmStatus, err := ps.NewSubscription(pubsub.SubscriptionOptions{
AgentName: "tpmmgr",
MyAgentName: agentName,
TopicImpl: types.TpmSanityStatus{},
Activate: false,
Ctx: ctxPtr,
CreateHandler: handleTpmStatusCreate,
ModifyHandler: handleTpmStatusModify,
WarningTime: warningTime,
ErrorTime: errorTime,
})
if err != nil {
log.Fatal(err)
}
ctxPtr.subTpmStatus = subTpmStatus
subTpmStatus.Activate()

// publish zboot config as of now
publishZbootConfigAll(ctxPtr)

Expand Down Expand Up @@ -395,6 +414,9 @@ func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, ar
case change := <-ctxPtr.subNodeDrainStatus.MsgChan():
ctxPtr.subNodeDrainStatus.ProcessChange(change)

case change := <-subTpmStatus.MsgChan():
subTpmStatus.ProcessChange(change)

case <-ctxPtr.stillRunning.C:
}
ps.StillRunning(agentName, warningTime, errorTime)
Expand Down Expand Up @@ -712,24 +734,24 @@ func publishNodeAgentStatus(ctxPtr *nodeagentContext) {
pub := ctxPtr.pubNodeAgentStatus
ctxPtr.lastLock.Lock()
status := types.NodeAgentStatus{
Name: agentName,
CurPart: ctxPtr.curPart,
RemainingTestTime: ctxPtr.remainingTestTime,
UpdateInprogress: ctxPtr.updateInprogress,
DeviceReboot: ctxPtr.deviceReboot,
DeviceShutdown: ctxPtr.deviceShutdown,
DevicePoweroff: ctxPtr.devicePoweroff,
AllDomainsHalted: ctxPtr.allDomainsHalted,
RebootReason: ctxPtr.rebootReason,
BootReason: ctxPtr.bootReason,
RebootStack: ctxPtr.rebootStack,
RebootTime: ctxPtr.rebootTime,
RebootImage: ctxPtr.rebootImage,
RestartCounter: ctxPtr.restartCounter,
LocalMaintenanceMode: ctxPtr.maintMode,
LocalMaintenanceModeReason: ctxPtr.maintModeReason,
HVTypeKube: ctxPtr.hvTypeKube,
WaitDrainInProgress: ctxPtr.waitDrainInProgress,
Name: agentName,
CurPart: ctxPtr.curPart,
RemainingTestTime: ctxPtr.remainingTestTime,
UpdateInprogress: ctxPtr.updateInprogress,
DeviceReboot: ctxPtr.deviceReboot,
DeviceShutdown: ctxPtr.deviceShutdown,
DevicePoweroff: ctxPtr.devicePoweroff,
AllDomainsHalted: ctxPtr.allDomainsHalted,
RebootReason: ctxPtr.rebootReason,
BootReason: ctxPtr.bootReason,
RebootStack: ctxPtr.rebootStack,
RebootTime: ctxPtr.rebootTime,
RebootImage: ctxPtr.rebootImage,
RestartCounter: ctxPtr.restartCounter,
LocalMaintenanceMode: ctxPtr.maintMode,
LocalMaintenanceModeReasons: ctxPtr.maintModeReasons,
HVTypeKube: ctxPtr.hvTypeKube,
WaitDrainInProgress: ctxPtr.waitDrainInProgress,
}
ctxPtr.lastLock.Unlock()
pub.Publish(agentName, status)
Expand Down Expand Up @@ -766,14 +788,8 @@ func handleVaultStatusImpl(ctxArg interface{}, key string,
if vault.ConversionComplete {
ctx.vaultOperational = types.TS_ENABLED
// Do we need to clear maintenance?
if ctx.maintMode &&
ctx.maintModeReason == types.MaintenanceModeReasonVaultLockedUp {
log.Noticef("Clearing %s",
types.MaintenanceModeReasonVaultLockedUp)
ctx.maintMode = false
ctx.maintModeReason = types.MaintenanceModeReasonNone
publishNodeAgentStatus(ctx)
}
removeMaintenanceModeReason(ctx, types.MaintenanceModeReasonVaultLockedUp, "handleVaultStatusImpl")
publishNodeAgentStatus(ctx)
} else {
ctx.vaultOperational = types.TS_NONE
}
Expand All @@ -797,34 +813,19 @@ func handleVolumeMgrStatusImpl(ctxArg interface{}, key string,

ctx := ctxArg.(*nodeagentContext)
vms := statusArg.(types.VolumeMgrStatus)
changed := false
// This RemainingSpace takes into account the space reserved for
// /persist/newlog plus the percentage/minimum reserved for the rest
// of EVE-OS. Thus it can never go negative, but zero means that
// we neither have space to download new images nor space to deploy
// a tiny app instance.
if vms.RemainingSpace == 0 {
log.Warnf("MaintenanceMode due to no remaining diskspace")
// Do not overwrite a vault maintenance mode
if !ctx.maintMode {
log.Noticef("Setting %s",
types.MaintenanceModeReasonNoDiskSpace)
ctx.maintModeReason = types.MaintenanceModeReasonNoDiskSpace
ctx.maintMode = true
changed = true
}
// Add to maintenance mode reasons
addMaintenanceModeReason(ctx, types.MaintenanceModeReasonNoDiskSpace, "handleVolumeMgrStatusImpl")
publishNodeAgentStatus(ctx)
} else {
// Do we need to clear maintenance?
if ctx.maintMode &&
ctx.maintModeReason == types.MaintenanceModeReasonNoDiskSpace {
log.Noticef("Clearing %s",
types.MaintenanceModeReasonNoDiskSpace)
ctx.maintMode = false
ctx.maintModeReason = types.MaintenanceModeReasonNone
changed = true
}
}
if changed {
removeMaintenanceModeReason(ctx, types.MaintenanceModeReasonNoDiskSpace, "handleVolumeMgrStatusImpl")
publishNodeAgentStatus(ctx)
}
}
Expand All @@ -848,3 +849,55 @@ func parseSMARTData() {
parseData(currentSMARTfilename, smartData)
parseData(previousSMARTfilename, previousSmartData)
}

// handleTpmStatusCreate is the create handler registered for the
// TpmSanityStatus subscription. It forwards its arguments unchanged to
// handleTpmStatusImpl.
func handleTpmStatusCreate(ctxArg interface{}, key string, statusArg interface{}) {
	handleTpmStatusImpl(ctxArg, key, statusArg)
}

// handleTpmStatusModify is the modify handler registered for the
// TpmSanityStatus subscription. The previous status (oldStatusArg) is not
// consulted; only the new status is forwarded to handleTpmStatusImpl.
func handleTpmStatusModify(ctxArg interface{}, key string, statusArg interface{}, oldStatusArg interface{}) {
	handleTpmStatusImpl(ctxArg, key, statusArg)
}

// handleTpmStatusImpl reacts to a TpmSanityStatus update.
//
// When the reported status is MaintenanceModeReasonTpmEncFailure the error is
// logged and that reason is added to the node agent's maintenance mode
// reasons. Any other status value removes that reason again. In both cases
// the node agent status is re-published afterwards.
//
// ctxArg must hold a *nodeagentContext and statusArg a types.TpmSanityStatus;
// the type assertions panic otherwise.
func handleTpmStatusImpl(ctxArg interface{}, key string, statusArg interface{}) {
	const caller = "handleTpmStatusImpl"

	nodeCtx := ctxArg.(*nodeagentContext)
	sanity := statusArg.(types.TpmSanityStatus)

	switch sanity.Status {
	case types.MaintenanceModeReasonTpmEncFailure:
		log.Errorf("handleTpmStatusImpl: TPM manager reported TPM error : %s", sanity.Error)
		addMaintenanceModeReason(nodeCtx, types.MaintenanceModeReasonTpmEncFailure, caller)
	default:
		// NOTE(review): every status other than TpmEncFailure lands here,
		// so only the encrypt/decrypt failure reason is ever set or cleared
		// by this handler.
		removeMaintenanceModeReason(nodeCtx, types.MaintenanceModeReasonTpmEncFailure, caller)
	}

	publishNodeAgentStatus(nodeCtx)
}

// addMaintenanceModeReason switches the context into maintenance mode and
// records reason in ctx.maintModeReasons unless it is already listed there.
// caller is only used to tag the log line.
func addMaintenanceModeReason(ctx *nodeagentContext, reason types.MaintenanceModeReason, caller string) {
	log.Noticef("%s setting %s", caller, reason)
	ctx.maintMode = true

	// Scan the current reasons so the same reason is never stored twice.
	alreadyListed := false
	for idx := range ctx.maintModeReasons {
		if ctx.maintModeReasons[idx] == reason {
			alreadyListed = true
			break
		}
	}

	if !alreadyListed {
		ctx.maintModeReasons = append(ctx.maintModeReasons, reason)
	}
}

// removeMaintenanceModeReason drops every occurrence of reason from
// ctx.maintModeReasons. If no reasons remain afterwards, maintenance mode is
// cleared. caller is only used to tag the log lines.
//
// Calling it for a reason that is not currently recorded is harmless: the
// list is left as it was.
func removeMaintenanceModeReason(ctx *nodeagentContext, reason types.MaintenanceModeReason, caller string) {
	log.Noticef("%s clearing %s", caller, reason)

	// Filter in place instead of deleting while ranging over the same slice.
	// Deleting inside the range keeps iterating over the original length and
	// reads the shifted, stale tail; a second match would then slice past the
	// shortened length and panic. The write index never passes the read
	// index, so reusing the backing array is safe.
	kept := ctx.maintModeReasons[:0]
	for _, r := range ctx.maintModeReasons {
		if r != reason {
			kept = append(kept, r)
		}
	}
	ctx.maintModeReasons = kept

	if len(ctx.maintModeReasons) == 0 {
		ctx.maintMode = false
		log.Noticef("%s : No reason to be in maintenance mode, clearing maintenance mode", caller)
	}
}
113 changes: 113 additions & 0 deletions pkg/pillar/cmd/tpmmgr/tpmmgr.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package tpmmgr

import (
"bytes"
"crypto"
"crypto/aes"
"crypto/ecdsa"
Expand Down Expand Up @@ -36,12 +37,18 @@ import (
"github.com/sirupsen/logrus"
)

// tpmSanityCheckError is the failure result of tpmSanityCheck: the underlying
// error paired with the maintenance mode reason that classifies it.
type tpmSanityCheckError struct {
// err is the wrapped error describing which TPM operation failed.
err error
// tpmErrorType classifies the failure; it is published as the Status of
// the TpmSanityStatus.
tpmErrorType types.MaintenanceModeReason
}

type tpmMgrContext struct {
agentbase.AgentBase
subGlobalConfig pubsub.Subscription
subAttestNonce pubsub.Subscription
pubAttestQuote pubsub.Publication
pubEdgeNodeCert pubsub.Publication
pubTpmStatus pubsub.Publication
globalConfig *types.ConfigItemValueMap
GCInitialized bool // GlobalConfig initialized
// cli options
Expand Down Expand Up @@ -1334,6 +1341,17 @@ func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, ar
}
ctx.pubEdgeNodeCert = pubEdgeNodeCert

//to publish tpm sanity check results
pubTpmStatus, err := ps.NewPublication(
pubsub.PublicationOptions{
AgentName: agentName,
TopicType: types.TpmSanityStatus{},
})
if err != nil {
log.Fatal(err)
}
ctx.pubTpmStatus = pubTpmStatus

// publish ECDH cert
publishEdgeNodeCertToController(&ctx, ecdhCertFile, types.CertTypeEcdhXchange,
etpm.IsTpmEnabled() && !fileutils.FileExists(log, etpm.EcdhKeyFile), nil)
Expand Down Expand Up @@ -1370,6 +1388,35 @@ func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, ar
return 1
}
}

// check if TPM is working as expected every hour
tpmSanityCheckTicker := time.NewTicker(1 * time.Hour)
defer tpmSanityCheckTicker.Stop()
periodicTpmSanityCheck := func() {
// check if TPM is working as expected or not
if check := tpmSanityCheck(); check != nil {
log.Errorf("TPM sanity check failed: %v", check.err)
// Alert the Controller about the TPM error, and possible implications.
errorAndTime := types.ErrorAndTime{}
errorAndTime.SetErrorDescription(types.ErrorDescription{
Error: check.err.Error(),
ErrorTime: time.Now(),
ErrorSeverity: types.ErrorSeverityWarning,
ErrorRetryCondition: getTpmSanityStatus(check.tpmErrorType),
})
publishTpmStatus(&ctx, types.TpmSanityStatus{
Name: etpm.TpmDevicePath,
Status: check.tpmErrorType,
ErrorAndTime: errorAndTime,
})
} else {
publishTpmStatus(&ctx, types.TpmSanityStatus{
Name: etpm.TpmDevicePath,
Status: types.MaintenanceModeReasonNone,
})
}
}

for {
select {
case change := <-subGlobalConfig.MsgChan():
Expand All @@ -1378,6 +1425,8 @@ func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, ar
ctx.subAttestNonce.ProcessChange(change)
case <-stillRunning.C:
ps.StillRunning(agentName, warningTime, errorTime)
case <-tpmSanityCheckTicker.C:
periodicTpmSanityCheck()
}
}
}
Expand Down Expand Up @@ -1592,3 +1641,67 @@ func handleAttestNonceDelete(ctxArg interface{}, key string, statusArg interface
}
log.Functionf("handleAttestNonceDelete done")
}

// publishTpmStatus publishes status on the TPM manager's TpmSanityStatus
// publication, using status.Key() as the publication key.
func publishTpmStatus(ctx *tpmMgrContext, status types.TpmSanityStatus) {
	statusKey := status.Key()
	log.Tracef("Publishing TpmSanityStatus %s\n", statusKey)
	ctx.pubTpmStatus.Publish(statusKey, status)
}

// tpmSanityCheck exercises TPM operations whose failure is not detectable
// during the common TPM operations but affects EVE's ability to manage
// itself. Coverage of the individual operations:
//   - encrypt/decrypt (ECDHZGen): checked here.
//   - quote: checked during attestation and also here.
//   - seal/unseal: checked during vault creation, failure will set the device
//     in MaintenanceModeReasonVaultLockedUp.
//   - certificate and key creation: checked during device step.
//   - device key signing: checked during onboarding.
//
// It returns nil when every check passes. Otherwise it returns the first
// failure, carrying the wrapped error and the matching maintenance mode
// reason.
func tpmSanityCheck() *tpmSanityCheckError {
	probe := []byte("TPM Sanity Check, may god have mercy on us")

	// Round-trip the probe through TPM encrypt/decrypt (ECDHZGen). If this
	// fails we can't encrypt/decrypt the vault key and send/receive it
	// to/from the controller, which can prevent the device from being
	// upgraded.
	ciphertext, err := etpm.EncryptDecryptUsingTpm(probe, true)
	if err != nil {
		return &tpmSanityCheckError{
			err:          fmt.Errorf("failed to encrypt key using TPM: %w", err),
			tpmErrorType: types.MaintenanceModeReasonTpmEncFailure,
		}
	}

	plaintext, err := etpm.EncryptDecryptUsingTpm(ciphertext, false)
	if err != nil {
		return &tpmSanityCheckError{
			err:          fmt.Errorf("failed to decrypt key using TPM: %w", err),
			tpmErrorType: types.MaintenanceModeReasonTpmEncFailure,
		}
	}

	if !bytes.Equal(probe, plaintext) {
		return &tpmSanityCheckError{
			err:          fmt.Errorf("decrypted message is not matching original message"),
			tpmErrorType: types.MaintenanceModeReasonTpmEncFailure,
		}
	}

	// Exercise the TPM quote operation. This is key to successful
	// attestation; if it fails we can't attest the device and recover the
	// vault key.
	if _, _, _, err = getQuote(probe); err != nil {
		return &tpmSanityCheckError{
			err:          fmt.Errorf("failed to get quote using TPM: %w", err),
			tpmErrorType: types.MaintenanceModeReasonTpmQuoteFailure,
		}
	}

	return nil
}

// getTpmSanityStatus maps a TPM-related maintenance mode reason to a short
// human-readable description of its possible impact. Reasons other than the
// TPM encrypt/decrypt and quote failures yield the empty string.
func getTpmSanityStatus(status types.MaintenanceModeReason) string {
	if status == types.MaintenanceModeReasonTpmEncFailure {
		return "TPM error can possibly affect device upgrade"
	}
	if status == types.MaintenanceModeReasonTpmQuoteFailure {
		return "TPM error can affect attestation process and vault key retrieval"
	}
	return ""
}
Loading
Loading