Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convey failed maintenance by requestor #12

Merged
merged 1 commit into from
Aug 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions api/v1alpha1/nodemaintenance_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,17 @@ import (
)

const (
// ConditionTypeReady is the Ready Condition.Type for NodeMaintenance
// ConditionTypeReady is the Ready Condition type for NodeMaintenance.
// It is used to convey node readiness for maintenance operation by the requestor.
ConditionTypeReady string = "Ready"
// ConditionTypeRequestorFailed is the RequestorFailed Condition type for NodeMaintenance.
// It is used to convey failure during node maintenance operation by the requestor.
ConditionTypeRequestorFailed string = "RequestorFailed"
)

const (
// Condition Reasons for ConditionTypeReady condition type.

// ConditionReasonUninitialized means that Condition is unset for NodeMaintenance
ConditionReasonUninitialized string = ""
// ConditionReasonPending means that NodeMaintenance is in Pending state
Expand All @@ -40,8 +46,14 @@ const (
ConditionReasonDraining string = "Draining"
// ConditionReasonReady means that NodeMaintenance is in Ready state
ConditionReasonReady string = "Ready"
// ConditionReasonAborted means that NodeMaintenance is in Aborted state
ConditionReasonAborted string = "Aborted"
// ConditionReasonRequestorFailed means that NodeMaintenance operation by the requestor has failed
// garbage collection will not occur if this reason is set.
ConditionReasonRequestorFailed string = ConditionTypeRequestorFailed

// Condition Reasons for ConditionTypeRequestorFailed condition type.

// ConditionReasonFailedMaintenance means that maintenance operation failed in a non-recoverable way
ConditionReasonFailedMaintenance string = "FailedMaintenance"
)

const (
Expand Down Expand Up @@ -186,6 +198,7 @@ type DrainStatus struct {
// +kubebuilder:printcolumn:name="Requestor",type="string",JSONPath=`.spec.requestorID`
// +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=`.status.conditions[?(@.type=='Ready')].status`
// +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=`.status.conditions[?(@.type=='Ready')].reason`
// +kubebuilder:printcolumn:name="Failed",type="string",JSONPath=`.status.conditions[?(@.type=='Failed')].reason`

// NodeMaintenance is the Schema for the nodemaintenances API
type NodeMaintenance struct {
Expand Down
3 changes: 3 additions & 0 deletions config/crd/bases/maintenance.nvidia.com_nodemaintenances.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ spec:
- jsonPath: .status.conditions[?(@.type=='Ready')].reason
name: Phase
type: string
- jsonPath: .status.conditions[?(@.type=='Failed')].reason
name: Failed
type: string
name: v1alpha1
schema:
openAPIV3Schema:
Expand Down
2 changes: 2 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ rules:
- events
verbs:
- create
- patch
- update
- apiGroups:
- ""
resources:
Expand Down
74 changes: 55 additions & 19 deletions internal/controller/nodemaintenance_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,18 @@
package controller

import (
"cmp"
"context"
"errors"
"fmt"
"reflect"
"slices"
"time"

"github.com/go-logr/logr"
corev1 "k8s.io/api/core/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/runtime"
Expand Down Expand Up @@ -73,7 +76,7 @@ type NodeMaintenanceReconciler struct {
//+kubebuilder:rbac:groups=maintenance.nvidia.com,resources=nodemaintenances,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=maintenance.nvidia.com,resources=nodemaintenances/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=maintenance.nvidia.com,resources=nodemaintenances/finalizers,verbs=update
//+kubebuilder:rbac:groups="",resources=events,verbs=create
//+kubebuilder:rbac:groups="",resources=events,verbs=create;update;patch
//+kubebuilder:rbac:groups="",resources=nodes,verbs=get;update;patch
//+kubebuilder:rbac:groups="",resources=pods,verbs=get;watch;list;update;patch;delete
//+kubebuilder:rbac:groups="",resources=pods/eviction,verbs=create;get;list;update;patch;delete
Expand Down Expand Up @@ -153,8 +156,8 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
if err != nil {
reqLog.Error(err, "failed to handle drain state for NodeMaintenance object")
}
case maintenancev1.ConditionReasonReady:
res, err = r.handleReadyState(ctx, reqLog, nm, node)
case maintenancev1.ConditionReasonReady, maintenancev1.ConditionReasonRequestorFailed:
res, err = r.handleTerminalState(ctx, reqLog, nm, node)
if err != nil {
reqLog.Error(err, "failed to handle Ready state for NodeMaintenance object")
}
Expand Down Expand Up @@ -461,8 +464,8 @@ func (r *NodeMaintenanceReconciler) updateDrainStatus(ctx context.Context, nm *m
return nil
}

func (r *NodeMaintenanceReconciler) handleReadyState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance, node *corev1.Node) (ctrl.Result, error) {
reqLog.Info("Handle Ready NodeMaintenance")
func (r *NodeMaintenanceReconciler) handleTerminalState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance, node *corev1.Node) (ctrl.Result, error) {
reqLog.Info("Handle Ready/RequestorFailed NodeMaintenance")
var err error
var res ctrl.Result

Expand Down Expand Up @@ -494,12 +497,38 @@ func (r *NodeMaintenanceReconciler) handleReadyState(ctx context.Context, reqLog
if !metav1.HasAnnotation(nm.ObjectMeta, ReadyTimeAnnotation) ||
nm.Annotations[ReadyTimeAnnotation] == "" {
metav1.SetMetaDataAnnotation(&nm.ObjectMeta, ReadyTimeAnnotation, time.Now().UTC().Format(time.RFC3339))
err := r.Update(ctx, nm)
err = r.Update(ctx, nm)
if err != nil {
return res, err
}
}

// check RequestorFailed condition
currentReason := k8sutils.GetReadyConditionReason(nm)
conditionChanged := false
cond := meta.FindStatusCondition(nm.Status.Conditions, maintenancev1.ConditionTypeRequestorFailed)
if cond != nil && cond.Status == metav1.ConditionTrue {
// set Ready condition to RequestorFailed reason if not already set
if currentReason == maintenancev1.ConditionReasonReady {
err = k8sutils.SetReadyConditionReason(ctx, r.Client, nm, maintenancev1.ConditionReasonRequestorFailed)
conditionChanged = true
}
} else {
// set Ready condition to Ready reason if not already set
if currentReason == maintenancev1.ConditionReasonRequestorFailed {
err = k8sutils.SetReadyConditionReason(ctx, r.Client, nm, maintenancev1.ConditionReasonReady)
conditionChanged = true
}
}
if err != nil {
return res, err
}

if conditionChanged {
r.EventRecorder.Event(
nm, corev1.EventTypeNormal, maintenancev1.ConditionChangedEventType, k8sutils.GetReadyConditionReason(nm))
}

return res, nil
}

Expand All @@ -508,32 +537,32 @@ func (r *NodeMaintenanceReconciler) SetupWithManager(mgr ctrl.Manager, log logr.
r.EventRecorder = mgr.GetEventRecorderFor("nodemaintenancereconciler")

return ctrl.NewControllerManagedBy(mgr).
For(&maintenancev1.NodeMaintenance{}, builder.WithPredicates(NewReadyConditionChangedPredicate(log))).
For(&maintenancev1.NodeMaintenance{}, builder.WithPredicates(NewConditionChangedPredicate(log))).
Complete(r)
}

// NewReadyConditionChangedPredicate creates a new ReadyConditionChangedPredicate
func NewReadyConditionChangedPredicate(log logr.Logger) ReadyConditionChangedPredicate {
return ReadyConditionChangedPredicate{
// NewConditionChangedPredicate creates a new ConditionChangedPredicate
func NewConditionChangedPredicate(log logr.Logger) ConditionChangedPredicate {
return ConditionChangedPredicate{
Funcs: predicate.Funcs{},
log: log,
}
}

// ReadyConditionChangedPredicate will trigger enqueue of Event for reconcile in the following cases:
// 1. A change in NodeMaintenance Ready Condition
// ConditionChangedPredicate will trigger enqueue of Event for reconcile in the following cases:
// 1. A change in NodeMaintenance Conditions
// 2. Update to the object occurred and deletion timestamp is set
// 3. NodeMaintenance created
// 4. NodeMaintenance deleted
// 5. generic event received
type ReadyConditionChangedPredicate struct {
type ConditionChangedPredicate struct {
predicate.Funcs

log logr.Logger
}

// Update implements Predicate.
func (p ReadyConditionChangedPredicate) Update(e event.TypedUpdateEvent[client.Object]) bool {
func (p ConditionChangedPredicate) Update(e event.TypedUpdateEvent[client.Object]) bool {
if e.ObjectOld == nil {
p.log.Error(nil, "old object is nil in update event, ignoring event.")
return false
Expand All @@ -555,15 +584,22 @@ func (p ReadyConditionChangedPredicate) Update(e event.TypedUpdateEvent[client.O
return false
}

oldRCR := k8sutils.GetReadyConditionReason(oldO)
newRCR := k8sutils.GetReadyConditionReason(newO)
cmpByType := func(a, b metav1.Condition) int {
return cmp.Compare(a.Type, b.Type)
}

// sort old and new obj.Status.Conditions so they can be compared using DeepEqual
slices.SortFunc(oldO.Status.Conditions, cmpByType)
slices.SortFunc(newO.Status.Conditions, cmpByType)

process := oldRCR != newRCR || !newO.GetDeletionTimestamp().IsZero()
condChanged := !reflect.DeepEqual(oldO.Status.Conditions, newO.Status.Conditions)
deleting := !newO.GetDeletionTimestamp().IsZero()
process := condChanged || deleting

p.log.V(operatorlog.DebugLevel).Info("Update event for NodeMaintenance",
"name", newO.Name, "namespace", newO.Namespace,
"condition-changed", oldRCR != newRCR, "old", oldRCR, "new", newRCR,
"deleting", !newO.GetDeletionTimestamp().IsZero(), "process", process)
"condition-changed", condChanged,
"deleting", deleting, "process", process)

return process
}
Loading