diff --git a/pkg/apis/v1/ec2nodeclass_status.go b/pkg/apis/v1/ec2nodeclass_status.go index 4c210ef81789..1c2a68015de1 100644 --- a/pkg/apis/v1/ec2nodeclass_status.go +++ b/pkg/apis/v1/ec2nodeclass_status.go @@ -25,6 +25,7 @@ const ( ConditionTypeAMIsReady = "AMIsReady" ConditionTypeInstanceProfileReady = "InstanceProfileReady" ConditionTypeValidationSucceeded = "ValidationSucceeded" + ConditionTypeNotDegraded = "NodeclassNotDegraded" ) // Subnet contains resolved Subnet selector values utilized for node launch diff --git a/pkg/cloudprovider/cloudprovider.go b/pkg/cloudprovider/cloudprovider.go index 3fe6b6f2706c..834165e39085 100644 --- a/pkg/cloudprovider/cloudprovider.go +++ b/pkg/cloudprovider/cloudprovider.go @@ -108,6 +108,9 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim) return nil, cloudprovider.NewNodeClassNotReadyError(err) } instance, err := c.instanceProvider.Create(ctx, nodeClass, nodeClaim, tags, instanceTypes) + if cloudprovider.IsNodeClassNotReadyError(err) { + return nil, err + } if err != nil { conditionMessage := "Error creating instance" var createError *cloudprovider.CreateError diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index 350a1cc7b484..d0f6e2d89458 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -82,7 +82,7 @@ func NewControllers( instanceTypeProvider *instancetype.DefaultProvider) []controller.Controller { controllers := []controller.Controller{ nodeclasshash.NewController(kubeClient), - nodeclassstatus.NewController(kubeClient, subnetProvider, securityGroupProvider, amiProvider, instanceProfileProvider, launchTemplateProvider), + nodeclassstatus.NewController(kubeClient, subnetProvider, securityGroupProvider, amiProvider, instanceProfileProvider, launchTemplateProvider, instanceTypeProvider, instanceProvider), nodeclasstermination.NewController(kubeClient, recorder, instanceProfileProvider, launchTemplateProvider), nodeclaimgarbagecollection.NewController(kubeClient, cloudProvider), nodeclaimtagging.NewController(kubeClient, cloudProvider, instanceProvider), diff --git a/pkg/controllers/nodeclass/status/controller.go b/pkg/controllers/nodeclass/status/controller.go index 9845bf5a064b..da8c24133822 100644 --- a/pkg/controllers/nodeclass/status/controller.go +++ b/pkg/controllers/nodeclass/status/controller.go @@ -20,12 +20,18 @@ import ( "go.uber.org/multierr" "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" controllerruntime "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" + karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/operator/injection" "sigs.k8s.io/karpenter/pkg/utils/result" @@ -34,7 +40,9 @@ import ( v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1" "github.com/aws/karpenter-provider-aws/pkg/providers/amifamily" + "github.com/aws/karpenter-provider-aws/pkg/providers/instance" "github.com/aws/karpenter-provider-aws/pkg/providers/instanceprofile" + "github.com/aws/karpenter-provider-aws/pkg/providers/instancetype" "github.com/aws/karpenter-provider-aws/pkg/providers/launchtemplate" "github.com/aws/karpenter-provider-aws/pkg/providers/securitygroup" "github.com/aws/karpenter-provider-aws/pkg/providers/subnet" @@ -52,11 +60,12 @@ type Controller struct { subnet *Subnet securitygroup *SecurityGroup validation *Validation + degraded *Degraded readiness *Readiness //TODO : Remove this when we have sub status conditions } func NewController(kubeClient client.Client, subnetProvider subnet.Provider, securityGroupProvider securitygroup.Provider, - amiProvider amifamily.Provider, instanceProfileProvider instanceprofile.Provider, launchTemplateProvider launchtemplate.Provider) *Controller { + amiProvider amifamily.Provider, instanceProfileProvider instanceprofile.Provider, launchTemplateProvider launchtemplate.Provider, instanceTypeProvider instancetype.Provider, instanceProvider instance.Provider) *Controller { return &Controller{ kubeClient: kubeClient, @@ -65,6 +74,7 @@ func NewController(kubeClient client.Client, subnetProvider subnet.Provider, sec securitygroup: &SecurityGroup{securityGroupProvider: securityGroupProvider}, instanceprofile: &InstanceProfile{instanceProfileProvider: instanceProfileProvider}, validation: &Validation{}, + degraded: &Degraded{kubeClient: kubeClient, instanceProvider: instanceProvider, instanceTypeProvider: instanceTypeProvider}, readiness: &Readiness{launchTemplateProvider: launchTemplateProvider}, } } @@ -96,6 +106,7 @@ func (c *Controller) Reconcile(ctx context.Context, nodeClass *v1.EC2NodeClass) c.securitygroup, c.instanceprofile, c.validation, + c.degraded, c.readiness, } { res, err := reconciler.Reconcile(ctx, nodeClass) @@ -124,6 +135,22 @@ func (c *Controller) Register(_ context.Context, m manager.Manager) error { return controllerruntime.NewControllerManagedBy(m). Named("nodeclass.status"). For(&v1.EC2NodeClass{}). + Watches( + &karpv1.NodeClaim{}, + handler.EnqueueRequestsFromMapFunc(func(_ context.Context, o client.Object) []reconcile.Request { + nc := o.(*karpv1.NodeClaim) + if nc.Spec.NodeClassRef == nil { + return nil + } + return []reconcile.Request{{NamespacedName: types.NamespacedName{Name: nc.Spec.NodeClassRef.Name}}} + }), + // Watch for NodeClaim creation events + builder.WithPredicates(predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { return false }, + UpdateFunc: func(e event.UpdateEvent) bool { return false }, + DeleteFunc: func(e event.DeleteEvent) bool { return true }, + }), + ). WithOptions(controller.Options{ RateLimiter: reasonable.RateLimiter(), MaxConcurrentReconciles: 10, diff --git a/pkg/controllers/nodeclass/status/degraded.go b/pkg/controllers/nodeclass/status/degraded.go new file mode 100644 index 000000000000..315b9c386482 --- /dev/null +++ b/pkg/controllers/nodeclass/status/degraded.go @@ -0,0 +1,70 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package status + +import ( + "context" + "fmt" + + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" + corecloudprovider "sigs.k8s.io/karpenter/pkg/cloudprovider" + "sigs.k8s.io/karpenter/pkg/scheduling" + nodeclaimutils "sigs.k8s.io/karpenter/pkg/utils/nodeclaim" + "sigs.k8s.io/karpenter/pkg/utils/resources" + + v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1" + "github.com/aws/karpenter-provider-aws/pkg/providers/instance" + "github.com/aws/karpenter-provider-aws/pkg/providers/instancetype" + "github.com/samber/lo" +) + +type Degraded struct { + instanceTypeProvider instancetype.Provider + instanceProvider instance.Provider + kubeClient client.Client +} + +func (d Degraded) Reconcile(ctx context.Context, nodeClass *v1.EC2NodeClass) (reconcile.Result, error) { + if nodeClass.StatusConditions().Get(v1.ConditionTypeSubnetsReady).IsFalse() || nodeClass.StatusConditions().Get(v1.ConditionTypeAMIsReady).IsFalse() { + return reconcile.Result{}, nil + } + nodeClaims := &karpv1.NodeClaimList{} + if err := d.kubeClient.List(ctx, nodeClaims, nodeclaimutils.ForNodeClass(nodeClass)); err != nil { + return reconcile.Result{}, fmt.Errorf("listing nodeclaims that are using nodeclass, %w", err) + } + _, err := d.instanceProvider.Create(ctx, nodeClass, &nodeClaims.Items[0], nodeClass.Spec.Tags, lo.Must(d.resolveInstanceTypes(ctx, &nodeClaims.Items[0], nodeClass))) + if corecloudprovider.IsNodeClassNotReadyError(err) { + nodeClass.StatusConditions().SetFalse(v1.ConditionTypeNotDegraded, "NodeClassDegraded", "Unauthorized Operation") + return reconcile.Result{}, fmt.Errorf("Unauthorized Operation %w", err) + } + nodeClass.StatusConditions().SetTrue(v1.ConditionTypeNotDegraded) + return reconcile.Result{}, nil +} + +func (d *Degraded) resolveInstanceTypes(ctx context.Context, nodeClaim *karpv1.NodeClaim, nodeClass *v1.EC2NodeClass) ([]*corecloudprovider.InstanceType, error) { + instanceTypes, err := d.instanceTypeProvider.List(ctx, nodeClass) + if err != nil { + return nil, fmt.Errorf("getting instance types, %w", err) + } + reqs := scheduling.NewNodeSelectorRequirementsWithMinValues(nodeClaim.Spec.Requirements...) + return lo.Filter(instanceTypes, func(i *corecloudprovider.InstanceType, _ int) bool { + return reqs.Compatible(i.Requirements, scheduling.AllowUndefinedWellKnownLabels) == nil && + len(i.Offerings.Compatible(reqs).Available()) > 0 && + resources.Fits(nodeClaim.Spec.Resources.Requests, i.Allocatable()) + }), nil +} diff --git a/pkg/controllers/nodeclass/status/degraded_test.go b/pkg/controllers/nodeclass/status/degraded_test.go new file mode 100644 index 000000000000..5b98b728a856 --- /dev/null +++ b/pkg/controllers/nodeclass/status/degraded_test.go @@ -0,0 +1,71 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package status_test + +import ( + "github.com/awslabs/operatorpkg/object" + + v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1" + "github.com/aws/karpenter-provider-aws/pkg/fake" + "github.com/aws/smithy-go" + karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" + coretest "sigs.k8s.io/karpenter/pkg/test" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + . "sigs.k8s.io/karpenter/pkg/test/expectations" +) + +var _ = Describe("NodeClass Degraded Status Controller", func() { + BeforeEach(func() { + env.Client.Create(ctx, coretest.NodeClaim(karpv1.NodeClaim{ + Spec: karpv1.NodeClaimSpec{ + NodeClassRef: &karpv1.NodeClassReference{ + Group: object.GVK(nodeClass).Group, + Kind: object.GVK(nodeClass).Kind, + Name: nodeClass.Name, + }, + }, + })) + }) + It("should update status condition on nodeClass as NotReady when nodeclass is degraded", func() { + ExpectApplied(ctx, env.Client, nodeClass) + awsEnv.EC2API.CreateFleetBehavior.Error.Set(&smithy.GenericAPIError{ + Code: "UnauthorizedOperation", + }, fake.MaxCalls(2)) + err := ExpectObjectReconcileFailed(ctx, env.Client, statusController, nodeClass) + Expect(err).To(HaveOccurred()) + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + Expect(nodeClass.Status.Conditions).To(HaveLen(7)) + Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeNotDegraded).IsFalse()).To(BeTrue()) + }) + It("should update status condition as Ready", func() { + nodeClass.Spec.Tags = map[string]string{} + ExpectApplied(ctx, env.Client, nodeClass) + env.Client.Create(ctx, coretest.NodeClaim(karpv1.NodeClaim{ + Spec: karpv1.NodeClaimSpec{ + NodeClassRef: &karpv1.NodeClassReference{ + Group: object.GVK(nodeClass).Group, + Kind: object.GVK(nodeClass).Kind, + Name: nodeClass.Name, + }, + }, + })) + ExpectObjectReconciled(ctx, env.Client, statusController, nodeClass) + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + + Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeValidationSucceeded).IsTrue()).To(BeTrue()) + }) +}) diff --git a/pkg/errors/errors.go b/pkg/errors/errors.go index 271edd8581e5..4723eb91e2ce 100644 --- a/pkg/errors/errors.go +++ b/pkg/errors/errors.go @@ -106,3 +106,14 @@ func IsLaunchTemplateNotFound(err error) bool { } return false } + +func IsUnauthorizedError(err error) bool { + if err == nil { + return false + } + var apiErr smithy.APIError + if errors.As(err, &apiErr) { + return apiErr.ErrorCode() == "UnauthorizedOperation" + } + return false +} diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index e1ac2b409de5..005f99e230bd 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -242,8 +242,13 @@ func (p *DefaultProvider) launchInstance(ctx context.Context, nodeClass *v1.EC2N } else { createFleetInput.OnDemandOptions = &ec2types.OnDemandOptionsRequest{AllocationStrategy: ec2types.FleetOnDemandAllocationStrategyLowestPrice} } - + createFleetInput.DryRun = lo.ToPtr(true) createFleetOutput, err := p.ec2Batcher.CreateFleet(ctx, createFleetInput) + if err != nil && awserrors.IsUnauthorizedError(err) { + return ec2types.CreateFleetInstance{}, cloudprovider.NewNodeClassNotReadyError(fmt.Errorf("dry run failed, %w", err)) + } + createFleetInput.DryRun = lo.ToPtr(false) + createFleetOutput, err = p.ec2Batcher.CreateFleet(ctx, createFleetInput) p.subnetProvider.UpdateInflightIPs(createFleetInput, createFleetOutput, instanceTypes, lo.Values(zonalSubnets), capacityType) if err != nil { conditionMessage := "Error creating fleet"