From 369b1cdd6efb80b3753ae3ae8dfdb3506338cc0d Mon Sep 17 00:00:00 2001 From: Thomas Jungblut Date: Wed, 20 Nov 2024 14:07:52 +0100 Subject: [PATCH] OCPBUGS-42810: actively move bootstrap member lead This PR will actively try to move the leadership away from the bootstrap member to another healthy member. Signed-off-by: Thomas Jungblut --- pkg/etcdcli/etcdcli.go | 14 +++ pkg/etcdcli/helpers.go | 16 +++ pkg/etcdcli/interfaces.go | 5 + .../bootstrap_teardown_controller.go | 105 ++++++++++++++---- .../bootstrap_teardown_controller_test.go | 93 +++++++++++++++- 5 files changed, 206 insertions(+), 27 deletions(-) diff --git a/pkg/etcdcli/etcdcli.go b/pkg/etcdcli/etcdcli.go index 8eeca8a6ee..4b670859cc 100644 --- a/pkg/etcdcli/etcdcli.go +++ b/pkg/etcdcli/etcdcli.go @@ -259,6 +259,20 @@ func (g *etcdClientGetter) MemberRemove(ctx context.Context, memberID uint64) er return err } +func (g *etcdClientGetter) MoveLeader(ctx context.Context, toMember uint64) error { + cli, err := g.clientPool.Get() + if err != nil { + return err + } + + defer g.clientPool.Return(cli) + + ctx, cancel := context.WithTimeout(ctx, DefaultClientTimeout) + defer cancel() + _, err = cli.MoveLeader(ctx, toMember) + return err +} + func (g *etcdClientGetter) MemberList(ctx context.Context) ([]*etcdserverpb.Member, error) { cli, err := g.clientPool.Get() if err != nil { diff --git a/pkg/etcdcli/helpers.go b/pkg/etcdcli/helpers.go index e16f068779..16538d41cb 100644 --- a/pkg/etcdcli/helpers.go +++ b/pkg/etcdcli/helpers.go @@ -105,6 +105,14 @@ func (f *fakeEtcdClient) MemberRemove(ctx context.Context, memberID uint64) erro return nil } +func (f *fakeEtcdClient) MoveLeader(ctx context.Context, toMember uint64) error { + for _, status := range f.opts.status { + status.Leader = toMember + } + + return nil +} + func (f *fakeEtcdClient) MemberHealth(ctx context.Context) (memberHealth, error) { var healthy, unhealthy int var memberHealth memberHealth @@ -251,6 +259,14 @@ func WithFakeStatus(status []*clientv3.StatusResponse) FakeClientOption { } } +func WithLeader(leader uint64) FakeClientOption { + return func(fo *FakeClientOptions) { + for _, status := range fo.status { + status.Leader = leader + } + } +} + // WithFakeDefragErrors configures each call to Defrag to consume one error from the given slice func WithFakeDefragErrors(errors []error) FakeClientOption { return func(fo *FakeClientOptions) { diff --git a/pkg/etcdcli/interfaces.go b/pkg/etcdcli/interfaces.go index 6a31a9c620..ae6cf3e072 100644 --- a/pkg/etcdcli/interfaces.go +++ b/pkg/etcdcli/interfaces.go @@ -25,6 +25,7 @@ type EtcdClient interface { HealthyMemberLister UnhealthyMemberLister MemberStatusChecker + LeaderMover Status GetMember(ctx context.Context, name string) (*etcdserverpb.Member, error) @@ -64,6 +65,10 @@ type MemberRemover interface { MemberRemove(ctx context.Context, memberID uint64) error } +type LeaderMover interface { + MoveLeader(ctx context.Context, toMember uint64) error +} + type MemberLister interface { // MemberList lists all members in a cluster MemberList(ctx context.Context) ([]*etcdserverpb.Member, error) diff --git a/pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go b/pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go index 413b8b4854..1973c76e64 100644 --- a/pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go +++ b/pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go @@ -3,6 +3,7 @@ package bootstrapteardown import ( "context" "fmt" + "go.etcd.io/etcd/api/v3/etcdserverpb" "time" operatorv1 "github.com/openshift/api/operator/v1" @@ -62,12 +63,18 @@ func (c *BootstrapTeardownController) sync(ctx context.Context, _ factory.SyncCo return fmt.Errorf("failed to get bootstrap scaling strategy: %w", err) } // checks the actual etcd cluster membership API if etcd-bootstrap exists - safeToRemoveBootstrap, hasBootstrap, bootstrapID, err := c.canRemoveEtcdBootstrap(ctx, scalingStrategy) + safeToRemoveBootstrap, hasBootstrap, bootstrapMember, err := c.canRemoveEtcdBootstrap(ctx, scalingStrategy) if err != nil { return fmt.Errorf("error while canRemoveEtcdBootstrap: %w", err) } - err = c.removeBootstrap(timeoutCtx, safeToRemoveBootstrap, hasBootstrap, bootstrapID) + if hasBootstrap { + if err := c.ensureBootstrapIsNotLeader(ctx, bootstrapMember); err != nil { + klog.Errorf("error while ensuring bootstrap is not leader: %v", err) + } + } + + err = c.removeBootstrap(timeoutCtx, safeToRemoveBootstrap, hasBootstrap, bootstrapMember) if err != nil { _, _, updateErr := v1helpers.UpdateStatus(ctx, c.operatorClient, v1helpers.UpdateConditionFn(operatorv1.OperatorCondition{ Type: "BootstrapTeardownDegraded", @@ -90,13 +97,20 @@ func (c *BootstrapTeardownController) sync(ctx context.Context, _ factory.SyncCo return updateErr } -func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeToRemoveBootstrap bool, hasBootstrap bool, bootstrapID uint64) error { +func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeToRemoveBootstrap bool, hasBootstrap bool, bootstrapMember *etcdserverpb.Member) error { + bootstrapID := uint64(0) + bootstrapUrl := "unknown" + if bootstrapMember != nil { + bootstrapID = bootstrapMember.ID + bootstrapUrl = bootstrapMember.GetClientURLs()[0] + } + if !hasBootstrap { klog.V(4).Infof("no bootstrap anymore setting removal status") // this is to ensure the status is always set correctly, even if the status update below failed - updateErr := setSuccessfulBoostrapRemovalStatus(ctx, c.operatorClient) + updateErr := setSuccessfulBootstrapRemovalStatus(ctx, c.operatorClient) if updateErr != nil { - return fmt.Errorf("error while setSuccessfulBoostrapRemovalStatus: %w", updateErr) + return fmt.Errorf("error while setSuccessfulBootstrapRemovalStatus: %w", updateErr) } // if the bootstrap isn't present, then clearly we're available enough to terminate. This avoids any risk of flapping. @@ -141,20 +155,21 @@ func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeT if isBootstrapComplete, err := bootstrap.IsBootstrapComplete(c.configmapLister); !isBootstrapComplete || err != nil { return err } - klog.Warningf("Removing bootstrap member [%x]", bootstrapID) + + klog.Warningf("Removing bootstrap member [%x] (%s)", bootstrapID, bootstrapUrl) // this is ugly until bootkube is updated, but we want to be sure that bootkube has time to be waiting to watch the condition coming back. if err := c.etcdClient.MemberRemove(ctx, bootstrapID); err != nil { - return fmt.Errorf("error while removing bootstrap member [%x]: %w", bootstrapID, err) + return fmt.Errorf("error while removing bootstrap member [%x] (%s): %w", bootstrapID, bootstrapUrl, err) } - klog.Infof("Successfully removed bootstrap member [%x]", bootstrapID) + klog.Infof("Successfully removed bootstrap member [%x] (%s)", bootstrapID, bootstrapUrl) // below might fail, since the member removal can cause some downtime for raft to settle on a quorum // it's important that everything below is properly retried above during normal controller reconciliation - return setSuccessfulBoostrapRemovalStatus(ctx, c.operatorClient) + return setSuccessfulBootstrapRemovalStatus(ctx, c.operatorClient) } -func setSuccessfulBoostrapRemovalStatus(ctx context.Context, client v1helpers.StaticPodOperatorClient) error { +func setSuccessfulBootstrapRemovalStatus(ctx context.Context, client v1helpers.StaticPodOperatorClient) error { _, _, updateErr := v1helpers.UpdateStatus(ctx, client, v1helpers.UpdateConditionFn(operatorv1.OperatorCondition{ Type: "EtcdBootstrapMemberRemoved", Status: operatorv1.ConditionTrue, @@ -165,57 +180,101 @@ func setSuccessfulBoostrapRemovalStatus(ctx context.Context, client v1helpers.St } // canRemoveEtcdBootstrap returns whether it is safe to remove bootstrap, whether bootstrap is in the list, and an error -func (c *BootstrapTeardownController) canRemoveEtcdBootstrap(ctx context.Context, scalingStrategy ceohelpers.BootstrapScalingStrategy) (bool, bool, uint64, error) { +func (c *BootstrapTeardownController) canRemoveEtcdBootstrap(ctx context.Context, scalingStrategy ceohelpers.BootstrapScalingStrategy) (bool, bool, *etcdserverpb.Member, error) { members, err := c.etcdClient.MemberList(ctx) if err != nil { - return false, false, 0, err + return false, false, nil, err } var hasBootstrap bool - var bootstrapMemberID uint64 + var bootstrapMember *etcdserverpb.Member for _, member := range members { if member.Name == "etcd-bootstrap" { hasBootstrap = true - bootstrapMemberID = member.ID + bootstrapMember = member break } } if !hasBootstrap { - return false, hasBootstrap, bootstrapMemberID, nil + return false, hasBootstrap, bootstrapMember, nil } // First, enforce the main HA invariants in terms of member counts. switch scalingStrategy { case ceohelpers.HAScalingStrategy: if len(members) < 4 { - return false, hasBootstrap, bootstrapMemberID, nil + return false, hasBootstrap, bootstrapMember, nil } case ceohelpers.DelayedHAScalingStrategy: if len(members) < 3 { - return false, hasBootstrap, bootstrapMemberID, nil + return false, hasBootstrap, bootstrapMember, nil } case ceohelpers.UnsafeScalingStrategy: if len(members) < 2 { - return false, hasBootstrap, bootstrapMemberID, nil + return false, hasBootstrap, bootstrapMember, nil } } // Next, given member counts are satisfied, check member health. unhealthyMembers, err := c.etcdClient.UnhealthyMembers(ctx) if err != nil { - return false, hasBootstrap, bootstrapMemberID, nil + return false, hasBootstrap, bootstrapMember, nil } // the etcd-bootstrap member is allowed to be unhealthy and can still be removed switch { case len(unhealthyMembers) == 0: - return true, hasBootstrap, bootstrapMemberID, nil + return true, hasBootstrap, bootstrapMember, nil case len(unhealthyMembers) > 1: - return false, hasBootstrap, bootstrapMemberID, nil + return false, hasBootstrap, bootstrapMember, nil default: if unhealthyMembers[0].Name == "etcd-bootstrap" { - return true, true, unhealthyMembers[0].ID, nil + return true, true, bootstrapMember, nil + } + return false, hasBootstrap, bootstrapMember, nil + } +} + +func (c *BootstrapTeardownController) ensureBootstrapIsNotLeader(ctx context.Context, bootstrapMember *etcdserverpb.Member) error { + if bootstrapMember == nil { + return fmt.Errorf("bootstrap member was not provided") + } + status, err := c.etcdClient.Status(ctx, bootstrapMember.ClientURLs[0]) + if err != nil { + return fmt.Errorf("could not find bootstrap member status: %w", err) + } + + if bootstrapMember.ID != status.Leader { + return nil + } + + klog.Warningf("Bootstrap member [%x] (%s) detected as leader, trying to move elsewhere...", bootstrapMember.ID, bootstrapMember.GetClientURLs()[0]) + + memberHealth, err := c.etcdClient.MemberHealth(ctx) + if err != nil { + return fmt.Errorf("could not find member health: %w", err) + } + + var otherMember *etcdserverpb.Member + // we can pick any other healthy voting member as the target to move to + for _, m := range memberHealth.GetHealthyMembers() { + if m.ID != bootstrapMember.ID && !m.IsLearner { + otherMember = m + break } - return false, hasBootstrap, bootstrapMemberID, nil } + + if otherMember == nil { + return fmt.Errorf("could not find other healthy member to move leader") + } + + klog.Warningf("Moving lead from bootstrap member [%x] (%s) detected as leader to [%x] (%s)", bootstrapMember.ID, bootstrapMember.GetClientURLs()[0], otherMember.ID, otherMember.GetClientURLs()[0]) + err = c.etcdClient.MoveLeader(ctx, otherMember.ID) + if err != nil { + return err + } + + klog.Warningf("Moving lead from bootstrap member [%x] (%s) to [%x] (%s) succesfully!", bootstrapMember.ID, bootstrapMember.GetClientURLs()[0], otherMember.ID, otherMember.GetClientURLs()[0]) + + return nil } diff --git a/pkg/operator/bootstrapteardown/bootstrap_teardown_controller_test.go b/pkg/operator/bootstrapteardown/bootstrap_teardown_controller_test.go index d958dd75fb..3dce5a2dbe 100644 --- a/pkg/operator/bootstrapteardown/bootstrap_teardown_controller_test.go +++ b/pkg/operator/bootstrapteardown/bootstrap_teardown_controller_test.go @@ -3,6 +3,7 @@ package bootstrapteardown import ( "context" "fmt" + clientv3 "go.etcd.io/etcd/client/v3" "testing" operatorv1 "github.com/openshift/api/operator/v1" @@ -172,11 +173,95 @@ func TestCanRemoveEtcdBootstrap(t *testing.T) { etcdClient: fakeEtcdClient, } - safeToRemoveBootstrap, hasBootstrap, bootstrapId, err := c.canRemoveEtcdBootstrap(context.TODO(), test.scalingStrategy) + safeToRemoveBootstrap, hasBootstrap, bootstrapMember, err := c.canRemoveEtcdBootstrap(context.TODO(), test.scalingStrategy) require.NoError(t, err) require.Equal(t, test.safeToRemove, safeToRemoveBootstrap, "safe to remove") require.Equal(t, test.hasBootstrap, hasBootstrap, "has bootstrap") - require.Equal(t, test.bootstrapId, bootstrapId, "bootstrap id") + if hasBootstrap { + require.Equal(t, test.bootstrapId, bootstrapMember.ID, "bootstrap member") + } else { + require.Nil(t, bootstrapMember) + } + }) + } +} + +func TestMoveLeaderFromEtcdBootstrap(t *testing.T) { + tests := map[string]struct { + etcdMembers []*etcdserverpb.Member + clientFakeOpts []etcdcli.FakeClientOption + movedToLeader uint64 + expectedErr error + }{ + "happy path no bootstrap member": { + etcdMembers: u.DefaultEtcdMembers(), + expectedErr: fmt.Errorf("bootstrap member was not provided"), + }, + "happy path bootstrap not leader": { + etcdMembers: append(u.DefaultEtcdMembers(), u.FakeEtcdBootstrapMember(3)), + clientFakeOpts: []etcdcli.FakeClientOption{ + etcdcli.WithFakeStatus([]*clientv3.StatusResponse{ + {Header: &etcdserverpb.ResponseHeader{MemberId: 3}}}), + etcdcli.WithFakeClusterHealth(&etcdcli.FakeMemberHealth{Unhealthy: 3, Healthy: 1}), + etcdcli.WithLeader(1), + }, + expectedErr: nil, + }, + "no healthy destination": { + etcdMembers: append(u.DefaultEtcdMembers(), u.FakeEtcdBootstrapMember(3)), + clientFakeOpts: []etcdcli.FakeClientOption{ + etcdcli.WithFakeStatus([]*clientv3.StatusResponse{ + {Header: &etcdserverpb.ResponseHeader{MemberId: 0}}, + {Header: &etcdserverpb.ResponseHeader{MemberId: 1}}, + {Header: &etcdserverpb.ResponseHeader{MemberId: 2}}, + {Header: &etcdserverpb.ResponseHeader{MemberId: 3}}}), + etcdcli.WithFakeClusterHealth(&etcdcli.FakeMemberHealth{Unhealthy: 4, Healthy: 0}), + etcdcli.WithLeader(3), + }, + expectedErr: fmt.Errorf("could not find other healthy member to move leader"), + }, + "moving leader": { + etcdMembers: []*etcdserverpb.Member{ + u.FakeEtcdBootstrapMember(0), + u.FakeEtcdMemberWithoutServer(1), + }, + clientFakeOpts: []etcdcli.FakeClientOption{etcdcli.WithFakeStatus( + []*clientv3.StatusResponse{ + {Header: &etcdserverpb.ResponseHeader{MemberId: 0}}, + {Header: &etcdserverpb.ResponseHeader{MemberId: 1}}}), + etcdcli.WithLeader(0)}, + movedToLeader: 1, + expectedErr: nil, + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + if test.clientFakeOpts == nil { + test.clientFakeOpts = []etcdcli.FakeClientOption{etcdcli.WithFakeClusterHealth(&etcdcli.FakeMemberHealth{Unhealthy: 0})} + } + fakeEtcdClient, err := etcdcli.NewFakeEtcdClient(test.etcdMembers, test.clientFakeOpts...) + require.NoError(t, err) + + c := &BootstrapTeardownController{ + etcdClient: fakeEtcdClient, + } + + var bootstrapMember *etcdserverpb.Member + for _, m := range test.etcdMembers { + if m.Name == "etcd-bootstrap" { + bootstrapMember = m + break + } + } + + err = c.ensureBootstrapIsNotLeader(context.TODO(), bootstrapMember) + require.Equal(t, test.expectedErr, err) + if test.movedToLeader != 0 { + status, err := fakeEtcdClient.Status(context.TODO(), bootstrapMember.GetClientURLs()[0]) + require.NoError(t, err) + require.Equal(t, test.movedToLeader, status.Leader) + } }) } } @@ -246,7 +331,7 @@ func TestRemoveBootstrap(t *testing.T) { indexerObjs: []interface{}{ bootstrapComplete, }, - expectedErr: fmt.Errorf("error while removing bootstrap member [%x]: %w", 0, fmt.Errorf("member with the given ID: 0 doesn't exist")), + expectedErr: fmt.Errorf("error while removing bootstrap member [%x] (https://10.0.0.1:2907): %w", 0, fmt.Errorf("member with the given ID: 0 doesn't exist")), }, "safe, has bootstrap, complete process, successful removal": { safeToRemove: true, @@ -285,7 +370,7 @@ func TestRemoveBootstrap(t *testing.T) { fakeInfraLister, } - err = c.removeBootstrap(context.TODO(), test.safeToRemove, test.hasBootstrap, test.bootstrapId) + err = c.removeBootstrap(context.TODO(), test.safeToRemove, test.hasBootstrap, u.FakeEtcdMemberWithoutServer(int(test.bootstrapId))) require.Equal(t, test.expectedErr, err) _, status, _, err := fakeStaticPodClient.GetStaticPodOperatorState()