-
Notifications
You must be signed in to change notification settings - Fork 407
[OCPBUGS-50992]: Filter out unreachable taints from tolerations #1990
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
18697e8
f41482b
1dc0090
cd6a9c3
96859e5
dd2eb43
2849b32
b0502ca
af17d75
365fb3a
93d98b5
d27e839
6c551a3
0525fc0
d4b1267
8d01071
516c4a6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -51,6 +51,8 @@ import ( | |
|
||
const ( | ||
gatherContainerName = "gather" | ||
unreachableTaintKey = "node.kubernetes.io/unreachable" | ||
notReadyTaintKey = "node.kubernetes.io/not-ready" | ||
) | ||
|
||
var ( | ||
|
@@ -96,6 +98,20 @@ sleep 5 | |
done` | ||
) | ||
|
||
var ( | ||
tolerationNotReady = corev1.Toleration{ | ||
Key: "node.kubernetes.io/not-ready", | ||
Operator: corev1.TolerationOpExists, | ||
Effect: corev1.TaintEffectNoSchedule, | ||
} | ||
|
||
tolerationMasterNoSchedule = corev1.Toleration{ | ||
Key: "node-role.kubernetes.io/master", | ||
Operator: corev1.TolerationOpExists, | ||
Effect: corev1.TaintEffectNoSchedule, | ||
} | ||
) | ||
|
||
const ( | ||
// number of concurrent must-gather Pods to run if --all-images or multiple --image are provided | ||
concurrentMG = 4 | ||
|
@@ -401,6 +417,42 @@ func (o *MustGatherOptions) Validate() error { | |
return nil | ||
} | ||
|
||
// prioritizeHealthyNodes returns a preferred node to run the must-gather pod on, and a fallback node if no preferred node is found. | ||
func prioritizeHealthyNodes(nodes *corev1.NodeList) (preferred *corev1.Node, fallback *corev1.Node) { | ||
midu16 marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
var fallbackNode *corev1.Node | ||
var latestHeartbeat time.Time | ||
|
||
for _, node := range nodes.Items { | ||
var hasUnhealthyTaint bool | ||
for _, taint := range node.Spec.Taints { | ||
if taint.Key == unreachableTaintKey || taint.Key == notReadyTaintKey { | ||
hasUnhealthyTaint = true | ||
break | ||
} | ||
} | ||
|
||
// Look at heartbeat time for readiness hint | ||
for _, cond := range node.Status.Conditions { | ||
if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue { | ||
// Check if heartbeat is recent (less than 2m old) | ||
if time.Since(cond.LastHeartbeatTime.Time) < 2*time.Minute { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note: This assumes the clock running on a host where |
||
if !hasUnhealthyTaint { | ||
return &node, nil // return immediately on good candidate | ||
} | ||
} | ||
break | ||
} | ||
} | ||
|
||
if fallbackNode == nil || node.CreationTimestamp.Time.After(latestHeartbeat) { | ||
fallbackNode = &node | ||
latestHeartbeat = node.CreationTimestamp.Time | ||
} | ||
} | ||
|
||
return nil, fallbackNode | ||
} | ||
|
||
// Run creates and runs a must-gather pod | ||
func (o *MustGatherOptions) Run() error { | ||
var errs []error | ||
|
@@ -462,6 +514,10 @@ func (o *MustGatherOptions) Run() error { | |
nodes, err := o.Client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{ | ||
LabelSelector: o.NodeSelector, | ||
}) | ||
if err != nil { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Duplicated code |
||
return err | ||
} | ||
|
||
if err != nil { | ||
return err | ||
} | ||
|
@@ -940,6 +996,31 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P | |
cleanedSourceDir := path.Clean(o.SourceDir) | ||
volumeUsageChecker := fmt.Sprintf(volumeUsageCheckerScript, cleanedSourceDir, o.VolumePercentage, o.VolumePercentage, executedCommand) | ||
|
||
// Define taints we do not want to tolerate (can be extended with user input in the future) | ||
excludedTaints := []corev1.Taint{ | ||
{Key: unreachableTaintKey, Effect: corev1.TaintEffectNoExecute}, | ||
{Key: notReadyTaintKey, Effect: corev1.TaintEffectNoSchedule}, | ||
} | ||
|
||
// Define candidate tolerations we may want to apply | ||
candidateTolerations := []corev1.Toleration{tolerationNotReady} | ||
if node == "" && hasMaster { | ||
midu16 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
candidateTolerations = append(candidateTolerations, tolerationMasterNoSchedule) | ||
} | ||
|
||
// Build the toleration list by only adding tolerations that do NOT tolerate excluded taints | ||
filteredTolerations := make([]corev1.Toleration, 0, len(candidateTolerations)) | ||
TolerationLoop: | ||
for _, tol := range candidateTolerations { | ||
for _, excluded := range excludedTaints { | ||
if tol.ToleratesTaint(&excluded) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So you are excluding taints that are added in the same method. What about not adding the excluded taints instead of adding them and then excluding them? Or, is this code to be extended later with a list of user provided excluded taints? Also, none of those two tolerations tolerate the taint as the
Which makes There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Still unanswered There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ingvagabund with respect to this, i have added a comment that i am intending to support a user-defined exclusion tains in the hope to preserve this logic, should we define a unit test which is validating this logic further ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How far in the future do you expect the user-defined exclusion taints support added? If the current logic has no use it's better to introduce it in the PR that introduces the user-defined exclusion taints. So all the relevant changes are in the same PR to preserve the context. This way the code changes will be apart. Maybe never extended/finished. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ingvagabund I expect to expand it in the next PR with the experimental flag from here: #1990 (comment) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Gotcha. In that case it's better to introduce the loop as part of the next PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ingvagabund unfortunetly this cannot be removed for next PR, main feature its to manage the tolerations for the must-gather pod and avoid a random alocation to a pod which cannot be scheduled. Initial implementation was allowing the must-gather pod to tolerate everything, hence this section was change to exclude specific taints. |
||
// Skip this toleration if it tolerates an excluded taint | ||
continue TolerationLoop | ||
} | ||
} | ||
filteredTolerations = append(filteredTolerations, tol) | ||
} | ||
|
||
ret := &corev1.Pod{ | ||
ObjectMeta: metav1.ObjectMeta{ | ||
GenerateName: "must-gather-", | ||
|
@@ -1014,14 +1095,7 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P | |
HostNetwork: o.HostNetwork, | ||
NodeSelector: nodeSelector, | ||
TerminationGracePeriodSeconds: &zero, | ||
Tolerations: []corev1.Toleration{ | ||
{ | ||
// An empty key with operator Exists matches all keys, | ||
// values and effects which means this will tolerate everything. | ||
// As noted in https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/ | ||
Operator: "Exists", | ||
}, | ||
}, | ||
Tolerations: filteredTolerations, | ||
midu16 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
}, | ||
} | ||
if o.HostNetwork { | ||
|
Uh oh!
There was an error while loading. Please reload this page.