Skip to content

Commit

Permalink
OCPBUGS-42810: actively move bootstrap member lead
Browse files Browse the repository at this point in the history
This PR will actively try to move the leadership away from the bootstrap member to another healthy member.

Signed-off-by: Thomas Jungblut <[email protected]>
  • Loading branch information
tjungblu committed Nov 28, 2024
1 parent 9a2aa20 commit 369b1cd
Show file tree
Hide file tree
Showing 5 changed files with 206 additions and 27 deletions.
14 changes: 14 additions & 0 deletions pkg/etcdcli/etcdcli.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,20 @@ func (g *etcdClientGetter) MemberRemove(ctx context.Context, memberID uint64) er
return err
}

func (g *etcdClientGetter) MoveLeader(ctx context.Context, toMember uint64) error {
cli, err := g.clientPool.Get()
if err != nil {
return err
}

defer g.clientPool.Return(cli)

ctx, cancel := context.WithTimeout(ctx, DefaultClientTimeout)
defer cancel()
_, err = cli.MoveLeader(ctx, toMember)
return err
}

func (g *etcdClientGetter) MemberList(ctx context.Context) ([]*etcdserverpb.Member, error) {
cli, err := g.clientPool.Get()
if err != nil {
Expand Down
16 changes: 16 additions & 0 deletions pkg/etcdcli/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,14 @@ func (f *fakeEtcdClient) MemberRemove(ctx context.Context, memberID uint64) erro
return nil
}

func (f *fakeEtcdClient) MoveLeader(ctx context.Context, toMember uint64) error {
for _, status := range f.opts.status {
status.Leader = toMember
}

return nil
}

func (f *fakeEtcdClient) MemberHealth(ctx context.Context) (memberHealth, error) {
var healthy, unhealthy int
var memberHealth memberHealth
Expand Down Expand Up @@ -251,6 +259,14 @@ func WithFakeStatus(status []*clientv3.StatusResponse) FakeClientOption {
}
}

func WithLeader(leader uint64) FakeClientOption {
return func(fo *FakeClientOptions) {
for _, status := range fo.status {
status.Leader = leader
}
}
}

// WithFakeDefragErrors configures each call to Defrag to consume one error from the given slice
func WithFakeDefragErrors(errors []error) FakeClientOption {
return func(fo *FakeClientOptions) {
Expand Down
5 changes: 5 additions & 0 deletions pkg/etcdcli/interfaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ type EtcdClient interface {
HealthyMemberLister
UnhealthyMemberLister
MemberStatusChecker
LeaderMover
Status

GetMember(ctx context.Context, name string) (*etcdserverpb.Member, error)
Expand Down Expand Up @@ -64,6 +65,10 @@ type MemberRemover interface {
MemberRemove(ctx context.Context, memberID uint64) error
}

type LeaderMover interface {
MoveLeader(ctx context.Context, toMember uint64) error
}

type MemberLister interface {
// MemberList lists all members in a cluster
MemberList(ctx context.Context) ([]*etcdserverpb.Member, error)
Expand Down
105 changes: 82 additions & 23 deletions pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package bootstrapteardown
import (
"context"
"fmt"
"go.etcd.io/etcd/api/v3/etcdserverpb"
"time"

operatorv1 "github.com/openshift/api/operator/v1"
Expand Down Expand Up @@ -62,12 +63,18 @@ func (c *BootstrapTeardownController) sync(ctx context.Context, _ factory.SyncCo
return fmt.Errorf("failed to get bootstrap scaling strategy: %w", err)
}
// checks the actual etcd cluster membership API if etcd-bootstrap exists
safeToRemoveBootstrap, hasBootstrap, bootstrapID, err := c.canRemoveEtcdBootstrap(ctx, scalingStrategy)
safeToRemoveBootstrap, hasBootstrap, bootstrapMember, err := c.canRemoveEtcdBootstrap(ctx, scalingStrategy)
if err != nil {
return fmt.Errorf("error while canRemoveEtcdBootstrap: %w", err)
}

err = c.removeBootstrap(timeoutCtx, safeToRemoveBootstrap, hasBootstrap, bootstrapID)
if hasBootstrap {
if err := c.ensureBootstrapIsNotLeader(ctx, bootstrapMember); err != nil {
klog.Errorf("error while ensuring bootstrap is not leader: %v", err)
}
}

err = c.removeBootstrap(timeoutCtx, safeToRemoveBootstrap, hasBootstrap, bootstrapMember)
if err != nil {
_, _, updateErr := v1helpers.UpdateStatus(ctx, c.operatorClient, v1helpers.UpdateConditionFn(operatorv1.OperatorCondition{
Type: "BootstrapTeardownDegraded",
Expand All @@ -90,13 +97,20 @@ func (c *BootstrapTeardownController) sync(ctx context.Context, _ factory.SyncCo
return updateErr
}

func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeToRemoveBootstrap bool, hasBootstrap bool, bootstrapID uint64) error {
func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeToRemoveBootstrap bool, hasBootstrap bool, bootstrapMember *etcdserverpb.Member) error {
bootstrapID := uint64(0)
bootstrapUrl := "unknown"
if bootstrapMember != nil {
bootstrapID = bootstrapMember.ID
bootstrapUrl = bootstrapMember.GetClientURLs()[0]
}

if !hasBootstrap {
klog.V(4).Infof("no bootstrap anymore setting removal status")
// this is to ensure the status is always set correctly, even if the status update below failed
updateErr := setSuccessfulBoostrapRemovalStatus(ctx, c.operatorClient)
updateErr := setSuccessfulBootstrapRemovalStatus(ctx, c.operatorClient)
if updateErr != nil {
return fmt.Errorf("error while setSuccessfulBoostrapRemovalStatus: %w", updateErr)
return fmt.Errorf("error while setSuccessfulBootstrapRemovalStatus: %w", updateErr)
}

// if the bootstrap isn't present, then clearly we're available enough to terminate. This avoids any risk of flapping.
Expand Down Expand Up @@ -141,20 +155,21 @@ func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeT
if isBootstrapComplete, err := bootstrap.IsBootstrapComplete(c.configmapLister); !isBootstrapComplete || err != nil {
return err
}
klog.Warningf("Removing bootstrap member [%x]", bootstrapID)

klog.Warningf("Removing bootstrap member [%x] (%s)", bootstrapID, bootstrapUrl)

// this is ugly until bootkube is updated, but we want to be sure that bootkube has time to be waiting to watch the condition coming back.
if err := c.etcdClient.MemberRemove(ctx, bootstrapID); err != nil {
return fmt.Errorf("error while removing bootstrap member [%x]: %w", bootstrapID, err)
return fmt.Errorf("error while removing bootstrap member [%x] (%s): %w", bootstrapID, bootstrapUrl, err)
}

klog.Infof("Successfully removed bootstrap member [%x]", bootstrapID)
klog.Infof("Successfully removed bootstrap member [%x] (%s)", bootstrapID, bootstrapUrl)
// below might fail, since the member removal can cause some downtime for raft to settle on a quorum
// it's important that everything below is properly retried above during normal controller reconciliation
return setSuccessfulBoostrapRemovalStatus(ctx, c.operatorClient)
return setSuccessfulBootstrapRemovalStatus(ctx, c.operatorClient)
}

func setSuccessfulBoostrapRemovalStatus(ctx context.Context, client v1helpers.StaticPodOperatorClient) error {
func setSuccessfulBootstrapRemovalStatus(ctx context.Context, client v1helpers.StaticPodOperatorClient) error {
_, _, updateErr := v1helpers.UpdateStatus(ctx, client, v1helpers.UpdateConditionFn(operatorv1.OperatorCondition{
Type: "EtcdBootstrapMemberRemoved",
Status: operatorv1.ConditionTrue,
Expand All @@ -165,57 +180,101 @@ func setSuccessfulBoostrapRemovalStatus(ctx context.Context, client v1helpers.St
}

// canRemoveEtcdBootstrap returns whether it is safe to remove bootstrap, whether bootstrap is in the list, and an error
func (c *BootstrapTeardownController) canRemoveEtcdBootstrap(ctx context.Context, scalingStrategy ceohelpers.BootstrapScalingStrategy) (bool, bool, uint64, error) {
func (c *BootstrapTeardownController) canRemoveEtcdBootstrap(ctx context.Context, scalingStrategy ceohelpers.BootstrapScalingStrategy) (bool, bool, *etcdserverpb.Member, error) {
members, err := c.etcdClient.MemberList(ctx)
if err != nil {
return false, false, 0, err
return false, false, nil, err
}

var hasBootstrap bool
var bootstrapMemberID uint64
var bootstrapMember *etcdserverpb.Member
for _, member := range members {
if member.Name == "etcd-bootstrap" {
hasBootstrap = true
bootstrapMemberID = member.ID
bootstrapMember = member
break
}
}
if !hasBootstrap {
return false, hasBootstrap, bootstrapMemberID, nil
return false, hasBootstrap, bootstrapMember, nil
}

// First, enforce the main HA invariants in terms of member counts.
switch scalingStrategy {
case ceohelpers.HAScalingStrategy:
if len(members) < 4 {
return false, hasBootstrap, bootstrapMemberID, nil
return false, hasBootstrap, bootstrapMember, nil
}
case ceohelpers.DelayedHAScalingStrategy:
if len(members) < 3 {
return false, hasBootstrap, bootstrapMemberID, nil
return false, hasBootstrap, bootstrapMember, nil
}
case ceohelpers.UnsafeScalingStrategy:
if len(members) < 2 {
return false, hasBootstrap, bootstrapMemberID, nil
return false, hasBootstrap, bootstrapMember, nil
}
}

// Next, given member counts are satisfied, check member health.
unhealthyMembers, err := c.etcdClient.UnhealthyMembers(ctx)
if err != nil {
return false, hasBootstrap, bootstrapMemberID, nil
return false, hasBootstrap, bootstrapMember, nil
}

// the etcd-bootstrap member is allowed to be unhealthy and can still be removed
switch {
case len(unhealthyMembers) == 0:
return true, hasBootstrap, bootstrapMemberID, nil
return true, hasBootstrap, bootstrapMember, nil
case len(unhealthyMembers) > 1:
return false, hasBootstrap, bootstrapMemberID, nil
return false, hasBootstrap, bootstrapMember, nil
default:
if unhealthyMembers[0].Name == "etcd-bootstrap" {
return true, true, unhealthyMembers[0].ID, nil
return true, true, bootstrapMember, nil
}
return false, hasBootstrap, bootstrapMember, nil
}
}

func (c *BootstrapTeardownController) ensureBootstrapIsNotLeader(ctx context.Context, bootstrapMember *etcdserverpb.Member) error {
if bootstrapMember == nil {
return fmt.Errorf("bootstrap member was not provided")
}
status, err := c.etcdClient.Status(ctx, bootstrapMember.ClientURLs[0])
if err != nil {
return fmt.Errorf("could not find bootstrap member status: %w", err)
}

if bootstrapMember.ID != status.Leader {
return nil
}

klog.Warningf("Bootstrap member [%x] (%s) detected as leader, trying to move elsewhere...", bootstrapMember.ID, bootstrapMember.GetClientURLs()[0])

memberHealth, err := c.etcdClient.MemberHealth(ctx)
if err != nil {
return fmt.Errorf("could not find member health: %w", err)
}

var otherMember *etcdserverpb.Member
// we can pick any other healthy voting member as the target to move to
for _, m := range memberHealth.GetHealthyMembers() {
if m.ID != bootstrapMember.ID && !m.IsLearner {
otherMember = m
break
}
return false, hasBootstrap, bootstrapMemberID, nil
}

if otherMember == nil {
return fmt.Errorf("could not find other healthy member to move leader")
}

klog.Warningf("Moving lead from bootstrap member [%x] (%s) detected as leader to [%x] (%s)", bootstrapMember.ID, bootstrapMember.GetClientURLs()[0], otherMember.ID, otherMember.GetClientURLs()[0])
err = c.etcdClient.MoveLeader(ctx, otherMember.ID)
if err != nil {
return err
}

klog.Warningf("Moving lead from bootstrap member [%x] (%s) to [%x] (%s) succesfully!", bootstrapMember.ID, bootstrapMember.GetClientURLs()[0], otherMember.ID, otherMember.GetClientURLs()[0])

return nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package bootstrapteardown
import (
"context"
"fmt"
clientv3 "go.etcd.io/etcd/client/v3"
"testing"

operatorv1 "github.com/openshift/api/operator/v1"
Expand Down Expand Up @@ -172,11 +173,95 @@ func TestCanRemoveEtcdBootstrap(t *testing.T) {
etcdClient: fakeEtcdClient,
}

safeToRemoveBootstrap, hasBootstrap, bootstrapId, err := c.canRemoveEtcdBootstrap(context.TODO(), test.scalingStrategy)
safeToRemoveBootstrap, hasBootstrap, bootstrapMember, err := c.canRemoveEtcdBootstrap(context.TODO(), test.scalingStrategy)
require.NoError(t, err)
require.Equal(t, test.safeToRemove, safeToRemoveBootstrap, "safe to remove")
require.Equal(t, test.hasBootstrap, hasBootstrap, "has bootstrap")
require.Equal(t, test.bootstrapId, bootstrapId, "bootstrap id")
if hasBootstrap {
require.Equal(t, test.bootstrapId, bootstrapMember.ID, "bootstrap member")
} else {
require.Nil(t, bootstrapMember)
}
})
}
}

func TestMoveLeaderFromEtcdBootstrap(t *testing.T) {
tests := map[string]struct {
etcdMembers []*etcdserverpb.Member
clientFakeOpts []etcdcli.FakeClientOption
movedToLeader uint64
expectedErr error
}{
"happy path no bootstrap member": {
etcdMembers: u.DefaultEtcdMembers(),
expectedErr: fmt.Errorf("bootstrap member was not provided"),
},
"happy path bootstrap not leader": {
etcdMembers: append(u.DefaultEtcdMembers(), u.FakeEtcdBootstrapMember(3)),
clientFakeOpts: []etcdcli.FakeClientOption{
etcdcli.WithFakeStatus([]*clientv3.StatusResponse{
{Header: &etcdserverpb.ResponseHeader{MemberId: 3}}}),
etcdcli.WithFakeClusterHealth(&etcdcli.FakeMemberHealth{Unhealthy: 3, Healthy: 1}),
etcdcli.WithLeader(1),
},
expectedErr: nil,
},
"no healthy destination": {
etcdMembers: append(u.DefaultEtcdMembers(), u.FakeEtcdBootstrapMember(3)),
clientFakeOpts: []etcdcli.FakeClientOption{
etcdcli.WithFakeStatus([]*clientv3.StatusResponse{
{Header: &etcdserverpb.ResponseHeader{MemberId: 0}},
{Header: &etcdserverpb.ResponseHeader{MemberId: 1}},
{Header: &etcdserverpb.ResponseHeader{MemberId: 2}},
{Header: &etcdserverpb.ResponseHeader{MemberId: 3}}}),
etcdcli.WithFakeClusterHealth(&etcdcli.FakeMemberHealth{Unhealthy: 4, Healthy: 0}),
etcdcli.WithLeader(3),
},
expectedErr: fmt.Errorf("could not find other healthy member to move leader"),
},
"moving leader": {
etcdMembers: []*etcdserverpb.Member{
u.FakeEtcdBootstrapMember(0),
u.FakeEtcdMemberWithoutServer(1),
},
clientFakeOpts: []etcdcli.FakeClientOption{etcdcli.WithFakeStatus(
[]*clientv3.StatusResponse{
{Header: &etcdserverpb.ResponseHeader{MemberId: 0}},
{Header: &etcdserverpb.ResponseHeader{MemberId: 1}}}),
etcdcli.WithLeader(0)},
movedToLeader: 1,
expectedErr: nil,
},
}

for name, test := range tests {
t.Run(name, func(t *testing.T) {
if test.clientFakeOpts == nil {
test.clientFakeOpts = []etcdcli.FakeClientOption{etcdcli.WithFakeClusterHealth(&etcdcli.FakeMemberHealth{Unhealthy: 0})}
}
fakeEtcdClient, err := etcdcli.NewFakeEtcdClient(test.etcdMembers, test.clientFakeOpts...)
require.NoError(t, err)

c := &BootstrapTeardownController{
etcdClient: fakeEtcdClient,
}

var bootstrapMember *etcdserverpb.Member
for _, m := range test.etcdMembers {
if m.Name == "etcd-bootstrap" {
bootstrapMember = m
break
}
}

err = c.ensureBootstrapIsNotLeader(context.TODO(), bootstrapMember)
require.Equal(t, test.expectedErr, err)
if test.movedToLeader != 0 {
status, err := fakeEtcdClient.Status(context.TODO(), bootstrapMember.GetClientURLs()[0])
require.NoError(t, err)
require.Equal(t, test.movedToLeader, status.Leader)
}
})
}
}
Expand Down Expand Up @@ -246,7 +331,7 @@ func TestRemoveBootstrap(t *testing.T) {
indexerObjs: []interface{}{
bootstrapComplete,
},
expectedErr: fmt.Errorf("error while removing bootstrap member [%x]: %w", 0, fmt.Errorf("member with the given ID: 0 doesn't exist")),
expectedErr: fmt.Errorf("error while removing bootstrap member [%x] (https://10.0.0.1:2907): %w", 0, fmt.Errorf("member with the given ID: 0 doesn't exist")),
},
"safe, has bootstrap, complete process, successful removal": {
safeToRemove: true,
Expand Down Expand Up @@ -285,7 +370,7 @@ func TestRemoveBootstrap(t *testing.T) {
fakeInfraLister,
}

err = c.removeBootstrap(context.TODO(), test.safeToRemove, test.hasBootstrap, test.bootstrapId)
err = c.removeBootstrap(context.TODO(), test.safeToRemove, test.hasBootstrap, u.FakeEtcdMemberWithoutServer(int(test.bootstrapId)))
require.Equal(t, test.expectedErr, err)

_, status, _, err := fakeStaticPodClient.GetStaticPodOperatorState()
Expand Down

0 comments on commit 369b1cd

Please sign in to comment.