🐛 Fix e2e test for dockermachinePool #11440

Open · wants to merge 1 commit into base: main
@@ -21,6 +21,7 @@ import (
"context"
"fmt"
"sort"
"time"

"github.com/pkg/errors"
apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -136,8 +137,12 @@ func (r *DockerMachinePoolReconciler) Reconcile(ctx context.Context, req ctrl.Re
}()

// Handle deleted machines
if !dockerMachinePool.ObjectMeta.DeletionTimestamp.IsZero() {
return ctrl.Result{}, r.reconcileDelete(ctx, cluster, machinePool, dockerMachinePool)
if !dockerMachinePool.DeletionTimestamp.IsZero() {
// Perform the dockerMachinePool delete and requeue after 30s to give the containers time to be deleted.
if err = r.reconcileDelete(ctx, cluster, machinePool, dockerMachinePool); err != nil {
return ctrl.Result{}, err
}
return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
}

// Add finalizer and the InfrastructureMachineKind if they aren't already present, and requeue if either were added.
@@ -46,12 +46,12 @@ import (
// reconcileDockerContainers manages the Docker containers for a MachinePool such that it
// - Ensures the number of up-to-date Docker containers is equal to the MachinePool's desired replica count.
// - Does not delete any containers as that must be triggered in reconcileDockerMachines to ensure node cordon/drain.
// - Creates the DockerMachine CR after creating the container.
//
// Providers should similarly create their infrastructure instances and reconcile any additional logic.
func (r *DockerMachinePoolReconciler) reconcileDockerContainers(ctx context.Context, cluster *clusterv1.Cluster, machinePool *expv1.MachinePool, dockerMachinePool *infraexpv1.DockerMachinePool) error {
log := ctrl.LoggerFrom(ctx)

log.V(2).Info("Reconciling Docker containers", "DockerMachinePool", klog.KObj(dockerMachinePool))
log.Info("Reconciling Docker containers", "DockerMachinePool", klog.KObj(dockerMachinePool))

labelFilters := map[string]string{dockerMachinePoolLabel: dockerMachinePool.Name}

@@ -63,11 +63,17 @@ func (r *DockerMachinePoolReconciler) reconcileDockerContainers(ctx context.Cont
matchingMachineCount := len(machinesMatchingInfrastructureSpec(ctx, machines, machinePool, dockerMachinePool))
numToCreate := int(*machinePool.Spec.Replicas) - matchingMachineCount
for range numToCreate {
log.V(2).Info("Creating a new Docker container for machinePool", "MachinePool", klog.KObj(machinePool))
log.Info("Creating a new Docker container for machinePool", "MachinePool", klog.KObj(machinePool))
name := fmt.Sprintf("worker-%s", util.RandomString(6))
if err := createDockerContainer(ctx, name, cluster, machinePool, dockerMachinePool); err != nil {
return errors.Wrap(err, "failed to create a new docker machine")
}

log.Info("Creating a new DockerMachine for dockerMachinePool", "DockerMachinePool", klog.KObj(dockerMachinePool))
dockerMachine := computeDesiredDockerMachine(name, cluster, machinePool, dockerMachinePool, nil)
if err := ssa.Patch(ctx, r.Client, dockerMachinePoolControllerName, dockerMachine); err != nil {
return errors.Wrap(err, "failed to create a new docker machine")
}
}

return nil
@@ -107,15 +113,14 @@ func createDockerContainer(ctx context.Context, name string, cluster *clusterv1.

// reconcileDockerMachines creates and deletes DockerMachines to match the MachinePool's desired number of replicas and infrastructure spec.
// It is responsible for
// - Ensuring each Docker container has an associated DockerMachine by creating one if it doesn't already exist.
// - Ensuring that deletion for Docker container happens by calling delete on the associated Machine so that the node is cordoned/drained and the infrastructure is cleaned up.
Member:

The delete call seems unchanged.

Contributor Author:

I kept the delete because the container could be deleted outside of CAPI, and the DockerMachines have to reflect what actually exists in the infrastructure. It also handles the replica scale-down use case if something goes wrong.

Member:

But then we probably shouldn't drop the comment mentioning the deletion?

// - Deleting DockerMachines referencing a container whose Kubernetes version or custom image no longer matches the spec.
// - Deleting DockerMachines that correspond to a deleted/non-existent Docker container.
// - Deleting DockerMachines when scaling down such that DockerMachines whose owner Machine has the clusterv1.DeleteMachineAnnotation is given priority.
func (r *DockerMachinePoolReconciler) reconcileDockerMachines(ctx context.Context, cluster *clusterv1.Cluster, machinePool *expv1.MachinePool, dockerMachinePool *infraexpv1.DockerMachinePool) error {
log := ctrl.LoggerFrom(ctx)

log.V(2).Info("Reconciling DockerMachines", "DockerMachinePool", klog.KObj(dockerMachinePool))
log.Info("Reconciling DockerMachines", "DockerMachinePool", klog.KObj(dockerMachinePool))

dockerMachineList, err := getDockerMachines(ctx, r.Client, *cluster, *machinePool, *dockerMachinePool)
if err != nil {
@@ -140,36 +145,13 @@ func (r *DockerMachinePoolReconciler) reconcileDockerMachines(ctx context.Contex
}

// Step 1:
// Create a DockerMachine for each Docker container so we surface the information to the user. Use the same name as the Docker container for the Docker Machine for ease of lookup.
// Providers should iterate through their infrastructure instances and ensure that each instance has a corresponding InfraMachine.
for _, machine := range externalMachines {
if existingMachine, ok := dockerMachineMap[machine.Name()]; ok {
log.V(2).Info("Patching existing DockerMachine", "DockerMachine", klog.KObj(&existingMachine))
Member:

Are we losing this entire branch?

I don't follow how this change solves the problem.

Contributor Author (@serngawy, Nov 19, 2024):

I moved the creation of the DockerMachine next to the container creation to avoid the separate loop that creates DockerMachine CRs from the previously created containers.
Based on my investigation (example logs below): for some reason, while the DockerMachinePool is cleaning up all dockerMachines, there is a delay in deleting the containers, which makes the DockerMachines get re-created here at the same time the DockerMachinePool is being deleted.

Example logs: the dockerMachine fails to patch because it has been deleted; the next log shows the dockerMachine being created (I think the dockerMachinePool goes into delete at the same time the patch is creating the missing dockerMachine), and then we get stuck waiting for the Machine to be created, which never happens because the dockerMachinePool and machinePool are gone.

I1119 02:08:36.160435       1 dockermachinepool_controller_phases.go:147] "Patching existing DockerMachine" controller="dockermachinepool" controllerGroup="infrastructure.cluster.x-k8s.io" controllerKind="DockerMachinePool" DockerMachinePool="self-hosted-jqke01/self-hosted-r77ad9-mp-0-8xsxc" namespace="self-hosted-jqke01" name="self-hosted-r77ad9-mp-0-8xsxc" reconcileID="904d6ace-3baa-48e1-b000-97ed711f9802" MachinePool="self-hosted-r77ad9-mp-0-s8xbz" Cluster="self-hosted-jqke01/self-hosted-r77ad9" DockerMachine="self-hosted-jqke01/worker-1tpa4a"
E1119 02:08:36.181156       1 controller.go:316] "Reconciler error" err="failed to patch DockerMachine self-hosted-jqke01/worker-1tpa4a: dockermachines.infrastructure.cluster.x-k8s.io \"worker-1tpa4a\" not found" controller="dockermachine" controllerGroup="infrastructure.cluster.x-k8s.io" controllerKind="DockerMachine" DockerMachine="self-hosted-jqke01/worker-1tpa4a" namespace="self-hosted-jqke01" name="worker-1tpa4a" reconcileID="f3f06dcd-f11a-4c3f-ae30-8bbef471ca00"
E1119 02:08:36.183408       1 controller.go:316] "Reconciler error" err="failed to update DockerMachine \"self-hosted-jqke01/worker-1tpa4a\": failed to apply DockerMachine self-hosted-jqke01/worker-1tpa4a: Operation cannot be fulfilled on dockermachines.infrastructure.cluster.x-k8s.io \"worker-1tpa4a\": uid mismatch: the provided object specified uid fb561549-45d7-4a09-af27-6931b56bd020, and no existing object was found" controller="dockermachinepool" controllerGroup="infrastructure.cluster.x-k8s.io" controllerKind="DockerMachinePool" DockerMachinePool="self-hosted-jqke01/self-hosted-r77ad9-mp-0-8xsxc" namespace="self-hosted-jqke01" name="self-hosted-r77ad9-mp-0-8xsxc" reconcileID="904d6ace-3baa-48e1-b000-97ed711f9802"
I1119 02:08:36.186594       1 dockermachinepool_controller_phases.go:158] "Creating a new DockerMachine for Docker container" controller="dockermachinepool" controllerGroup="infrastructure.cluster.x-k8s.io" controllerKind="DockerMachinePool" DockerMachinePool="self-hosted-jqke01/self-hosted-r77ad9-mp-0-8xsxc" namespace="self-hosted-jqke01" name="self-hosted-r77ad9-mp-0-8xsxc" reconcileID="9cc3622a-5a30-41c7-8b0a-6f0b212d0bdb" MachinePool="self-hosted-r77ad9-mp-0-s8xbz" Cluster="self-hosted-jqke01/self-hosted-r77ad9" container="worker-1tpa4a"
I1119 02:08:36.253031       1 dockermachine_controller.go:104] "Waiting for Machine Controller to set OwnerRef on DockerMachine" controller="dockermachine" controllerGroup="infrastructure.cluster.x-k8s.io" controllerKind="DockerMachine" DockerMachine="self-hosted-jqke01/worker-1tpa4a" namespace="self-hosted-jqke01" name="worker-1tpa4a" reconcileID="9799d2ef-c1a7-4924-b347-875155993995" DockerMachinePool="self-hosted-jqke01/self-hosted-r77ad9-mp-0-8xsxc"

Member (@fabriziopandini, Nov 21, 2024):

> for some reason, while the DockerMachinePool is cleaning up all dockerMachines, there is a delay in deleting the containers, which makes the DockerMachines get re-created here at the same time the DockerMachinePool is being deleted

Have you considered preventing creation of new machines in this func when the DockerMachinePool has a deletion timestamp? (It should probably be an if around L155-L161.)
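A minimal sketch of that suggested guard, reusing identifiers from the diff above (externalMachines, dockerMachineMap, computeDesiredDockerMachine, ssa.Patch); it is an illustration of the alternative, not code from this PR:

```go
for _, machine := range externalMachines {
	if _, ok := dockerMachineMap[machine.Name()]; ok {
		// An up-to-date DockerMachine already exists: patch it as the current code does (omitted here).
		continue
	}
	// Suggested guard: skip creating new DockerMachines once the DockerMachinePool
	// has a deletion timestamp, so a late reconcile-normal cannot recreate machines
	// while reconcileDelete is tearing everything down.
	if !dockerMachinePool.DeletionTimestamp.IsZero() {
		continue
	}
	desiredMachine := computeDesiredDockerMachine(machine.Name(), cluster, machinePool, dockerMachinePool, nil)
	if err := ssa.Patch(ctx, r.Client, dockerMachinePoolControllerName, desiredMachine); err != nil {
		return errors.Wrap(err, "failed to create a new docker machine")
	}
	dockerMachineMap[desiredMachine.Name] = *desiredMachine
}
```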

Contributor Author:

Well, the code logic shouldn't allow this to happen (it should go through the delete reconcile). In any case, tying the creation of the dockerMachine to the container creation is a better implementation and avoids this kind of random execution.

Member (@fabriziopandini, Nov 25, 2024):

> Well, the code logic shouldn't allow this to happen (it should go through the delete reconcile).

This is confusing me a little bit (and my lack of knowledge in MP doesn't help).
If I got it right, we are trying to fix a race condition that happens when deleting the MP.
But, to fix this error on deletion, we are shuffling machine creation from reconcileDockerMachines to reconcileDockerContainers, and neither of those is called on reconcile delete 🤔

Contributor Author:

Sorry for the confusion; this change does fix the race condition. Here, during reconciliation of the dockerMachines, a dockerMachine gets created for any container that has none, and the delete could happen at the same time. Before this change, all containers were created first and then all dockerMachines were created (the race condition could happen in between). With this change, every container is created together with its dockerMachine, avoiding the race condition. Hope that makes it clear :)

Member (@fabriziopandini, Dec 2, 2024):

Thanks for bearing with me on the explanation.
Ok, we are probably getting to the point that confuses me.

When I review the PR, I start from the assumption that, if we consider a single MP, controller-runtime ensures that only one concurrent reconcile can happen at any time.

That means that if we are running the code here, we are in reconcile normal, and reconcileDelete won't start until the operation completes or fails.

This rules out most of the race conditions. Also, assuming normal completes, the order of operations doesn't matter much, because at the end both the dockerMachine and the container should exist.

However, what might happen is that it takes some time for the cache to see the newly created dockerMachines, so they will seem to be missing when reconcileDelete starts, even though they actually exist on the API server (but this requires a different fix).

Does this match what you are observing?
To check this you can look e.g. at reconcile IDs in logs + look at errors when creating machines.
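If the stale-cache hypothesis above turns out to be the cause, a common controller-runtime pattern is to read straight from the API server during deletion instead of relying on the informer cache. A rough sketch, assuming a hypothetical APIReader field (a client.Reader wired to mgr.GetAPIReader() at setup) and the package aliases used in this file; not part of this PR:

```go
// listDockerMachinesUncached lists DockerMachines directly from the API server,
// bypassing the informer cache, so objects created moments ago by reconcile-normal
// are visible to reconcileDelete.
func (r *DockerMachinePoolReconciler) listDockerMachinesUncached(ctx context.Context, dockerMachinePool *infraexpv1.DockerMachinePool) (*infrav1.DockerMachineList, error) {
	dockerMachineList := &infrav1.DockerMachineList{}
	if err := r.APIReader.List(ctx, dockerMachineList,
		client.InNamespace(dockerMachinePool.Namespace),
		client.MatchingLabels{dockerMachinePoolLabel: dockerMachinePool.Name},
	); err != nil {
		return nil, errors.Wrap(err, "failed to list DockerMachines from the API server")
	}
	return dockerMachineList, nil
}
```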

Contributor Author:

I'm following the patch func here, but I'm not sure how the cache gets delayed. Do we run the cache concurrently?
In any case, tying the creation of each container to its dockerMachine is a better implementation than the old way of creating all the containers and then creating the dockerMachines.

desiredMachine := computeDesiredDockerMachine(machine.Name(), cluster, machinePool, dockerMachinePool, &existingMachine)
if err := ssa.Patch(ctx, r.Client, dockerMachinePoolControllerName, desiredMachine, ssa.WithCachingProxy{Cache: r.ssaCache, Original: &existingMachine}); err != nil {
return errors.Wrapf(err, "failed to update DockerMachine %q", klog.KObj(desiredMachine))
}

dockerMachineMap[desiredMachine.Name] = *desiredMachine
} else {
log.V(2).Info("Creating a new DockerMachine for Docker container", "container", machine.Name())
desiredMachine := computeDesiredDockerMachine(machine.Name(), cluster, machinePool, dockerMachinePool, nil)
if err := ssa.Patch(ctx, r.Client, dockerMachinePoolControllerName, desiredMachine); err != nil {
return errors.Wrap(err, "failed to create a new docker machine")
}

dockerMachineMap[desiredMachine.Name] = *desiredMachine
}
}

// Step 2:
// Delete any DockerMachines that correspond to a deleted Docker container.
// Providers should iterate through the InfraMachines to ensure each one still corresponds to an existing infrastructure instance.
// This allows the InfraMachine (and owner Machine) to be deleted and avoid hanging resources when a user deletes an instance out-of-band.
for _, dockerMachine := range dockerMachineMap {
if _, ok := externalMachineMap[dockerMachine.Name]; !ok {
dockerMachine := dockerMachine
log.V(2).Info("Deleting DockerMachine with no underlying infrastructure", "DockerMachine", klog.KObj(&dockerMachine))
log.Info("Deleting DockerMachine with no underlying infrastructure", "DockerMachine", klog.KObj(&dockerMachine))
if err := r.deleteMachinePoolMachine(ctx, dockerMachine); err != nil {
return err
}
@@ -178,7 +160,7 @@ func (r *DockerMachinePoolReconciler) reconcileDockerMachines(ctx context.Contex
}
}

// Step 3:
// Step 2:
// This handles the scale down/excess replicas case and the case where a rolling upgrade is needed.
// If there are more ready DockerMachines than desired replicas, start to delete the excess DockerMachines such that
// - DockerMachines with an outdated Kubernetes version or custom image are deleted first (i.e. the rolling upgrade).
@@ -218,7 +200,7 @@ func (r *DockerMachinePoolReconciler) reconcileDockerMachines(ctx context.Contex
for _, dockerMachine := range outdatedMachines {
if overProvisionCount > 0 {
dockerMachine := dockerMachine
log.V(2).Info("Deleting DockerMachine because it is outdated", "DockerMachine", klog.KObj(&dockerMachine))
log.Info("Deleting DockerMachine because it is outdated", "DockerMachine", klog.KObj(&dockerMachine))
if err := r.deleteMachinePoolMachine(ctx, dockerMachine); err != nil {
return err
}
@@ -231,7 +213,7 @@ func (r *DockerMachinePoolReconciler) reconcileDockerMachines(ctx context.Contex
for _, dockerMachine := range readyMachines {
if overProvisionCount > 0 {
dockerMachine := dockerMachine
log.V(2).Info("Deleting DockerMachine because it is an excess replica", "DockerMachine", klog.KObj(&dockerMachine))
log.Info("Deleting DockerMachine because it is an excess replica", "DockerMachine", klog.KObj(&dockerMachine))
if err := r.deleteMachinePoolMachine(ctx, dockerMachine); err != nil {
return err
}
@@ -272,6 +254,7 @@ func computeDesiredDockerMachine(name string, cluster *clusterv1.Cluster, machin
Name: dockerMachinePool.Name,
UID: dockerMachinePool.UID,
}))

dockerMachine.Labels[clusterv1.ClusterNameLabel] = cluster.Name
dockerMachine.Labels[clusterv1.MachinePoolNameLabel] = format.MustFormatValue(machinePool.Name)

@@ -288,7 +271,7 @@ func (r *DockerMachinePoolReconciler) deleteMachinePoolMachine(ctx context.Conte
}
// util.GetOwnerMachine() returns a nil Machine without error if there is no Machine kind in the ownerRefs, so we must verify that machine is not nil.
if machine == nil {
log.V(2).Info("No owner Machine exists for DockerMachine", "dockerMachine", klog.KObj(&dockerMachine))
log.Info("No owner Machine exists for DockerMachine", "dockerMachine", klog.KObj(&dockerMachine))

// If the DockerMachine does not have an owner Machine, do not attempt to delete the DockerMachine as the MachinePool controller will create the
// Machine and we want to let it catch up. If we are too hasty to delete, that introduces a race condition where the DockerMachine could be deleted
@@ -297,7 +280,8 @@ func (r *DockerMachinePoolReconciler) deleteMachinePoolMachine(ctx context.Conte
// In the case where the MachinePool is being deleted and the Machine will never come online, the DockerMachine will be deleted via its ownerRef to the
// DockerMachinePool, so that is covered as well.

return nil
// Return an error so that reconciliation is retried rather than proceeding without an owner Machine.
return errors.New("no owner Machine exists for DockerMachine")
}

log.Info("Deleting Machine for DockerMachine", "Machine", klog.KObj(machine), "DockerMachine", klog.KObj(&dockerMachine))
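For context on the return errors.New(...) change above: when a reconciler returns a non-nil error, controller-runtime requeues the request with backoff, so the DockerMachine keeps being revisited until its owner Machine shows up (or the object is gone) instead of the deletion being silently skipped. A generic illustration with hypothetical names (exampleReconciler, doWork), not code from this repository:

```go
func (r *exampleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	if err := r.doWork(ctx, req); err != nil {
		// A non-nil error makes the controller's rate limiter requeue the request
		// with exponential backoff, i.e. the work is retried later.
		return ctrl.Result{}, err
	}
	// A nil error with an empty Result means the request is done and not requeued.
	return ctrl.Result{}, nil
}
```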
@@ -183,7 +183,7 @@ func (r *DockerMachineReconciler) Reconcile(ctx context.Context, req ctrl.Reques
}

// Handle deleted machines
if !dockerMachine.ObjectMeta.DeletionTimestamp.IsZero() {
if !dockerMachine.DeletionTimestamp.IsZero() {
return ctrl.Result{}, r.reconcileDelete(ctx, cluster, dockerCluster, machine, dockerMachine, externalMachine, externalLoadBalancer)
}

@@ -464,11 +464,11 @@ func (r *DockerMachineReconciler) reconcileDelete(ctx context.Context, cluster *

// delete the machine
if err := externalMachine.Delete(ctx); err != nil {
return errors.Wrap(err, "failed to delete DockerMachine")
return errors.Wrap(err, "failed to delete external DockerMachine")
}

// if the deleted machine is a control-plane node, remove it from the load balancer configuration;
if util.IsControlPlaneMachine(machine) {
if machine != nil && util.IsControlPlaneMachine(machine) {
if err := r.reconcileLoadBalancerConfiguration(ctx, cluster, dockerCluster, externalLoadBalancer); err != nil {
return err
}