From c2a85cb5cbd3f8b59865a81a7ce255c7bf691e03 Mon Sep 17 00:00:00 2001 From: tan90github <81003453+tan90github@users.noreply.github.com> Date: Wed, 4 Dec 2024 17:52:07 +0800 Subject: [PATCH] koord-manager: consider NodeReserved when calculate mid resource. (#2253) Signed-off-by: wangyang60 --- .../plugins/batchresource/plugin.go | 4 +- .../plugins/batchresource/plugin_test.go | 61 +++ .../plugins/midresource/plugin.go | 57 ++- .../plugins/midresource/plugin_test.go | 355 +++++++++++++++--- .../noderesource/plugins/util/util.go | 25 +- .../noderesource/resource_calculator_test.go | 20 +- 6 files changed, 430 insertions(+), 92 deletions(-) diff --git a/pkg/slo-controller/noderesource/plugins/batchresource/plugin.go b/pkg/slo-controller/noderesource/plugins/batchresource/plugin.go index dcec72c7d..64831f85e 100644 --- a/pkg/slo-controller/noderesource/plugins/batchresource/plugin.go +++ b/pkg/slo-controller/noderesource/plugins/batchresource/plugin.go @@ -279,7 +279,7 @@ func (p *Plugin) calculateOnNode(strategy *configuration.ColocationStrategy, nod } } - hostAppHPUsed := resutil.GetHostAppHPUsed(resourceMetrics) + hostAppHPUsed := resutil.GetHostAppHPUsed(resourceMetrics, extension.PriorityBatch) // For the pods reported metrics but not shown in current list, count them according to the metric priority. 
podsDanglingUsed := util.NewZeroResourceList() for _, podMetric := range podMetricDanglingMap { @@ -352,7 +352,7 @@ func (p *Plugin) calculateOnNUMALevel(strategy *configuration.ColocationStrategy podsHPZoneMaxUsedReq := make([]corev1.ResourceList, zoneNum) batchZoneAllocatable := make([]corev1.ResourceList, zoneNum) - hostAppHPUsed := resutil.GetHostAppHPUsed(resourceMetrics) + hostAppHPUsed := resutil.GetHostAppHPUsed(resourceMetrics, extension.PriorityBatch) systemUsed := resutil.GetResourceListForCPUAndMemory(nodeMetric.Status.NodeMetric.SystemUsage.ResourceList) // resource usage of host applications with prod priority will be count as host system usage since they consumes the // node reserved resource. bind host app on single numa node is not supported yet. divide the usage by numa node number. diff --git a/pkg/slo-controller/noderesource/plugins/batchresource/plugin_test.go b/pkg/slo-controller/noderesource/plugins/batchresource/plugin_test.go index 40f12ba2c..5fce25ca9 100644 --- a/pkg/slo-controller/noderesource/plugins/batchresource/plugin_test.go +++ b/pkg/slo-controller/noderesource/plugins/batchresource/plugin_test.go @@ -3731,6 +3731,67 @@ func TestPluginCalculate(t *testing.T) { }, wantErr: false, }, + { + name: "calculate with memory usage, including mid host application usage", + args: args{ + strategy: &configuration.ColocationStrategy{ + Enable: pointer.Bool(true), + CPUReclaimThresholdPercent: pointer.Int64(65), + MemoryReclaimThresholdPercent: pointer.Int64(65), + DegradeTimeMinutes: pointer.Int64(15), + UpdateTimeThresholdSeconds: pointer.Int64(300), + ResourceDiffThreshold: pointer.Float64(0.1), + }, + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node1", + }, + Status: makeNodeStat("100", "120G"), + }, + resourceMetrics: &framework.ResourceMetrics{ + NodeMetric: &slov1alpha1.NodeMetric{ + Status: slov1alpha1.NodeMetricStatus{ + UpdateTime: &metav1.Time{Time: time.Now()}, + NodeMetric: &slov1alpha1.NodeMetricInfo{ + 
NodeUsage: slov1alpha1.ResourceMap{ + ResourceList: makeResourceList("50", "55G"), + }, + SystemUsage: slov1alpha1.ResourceMap{ + ResourceList: makeResourceList("4", "6G"), + }, + }, + PodsMetric: []*slov1alpha1.PodMetricInfo{ + genPodMetric("test", "podA", "11", "11G"), + genPodMetric("test", "podB", "10", "10G"), + genPodMetric("test", "podC", "22", "22G"), + }, + HostApplicationMetric: []*slov1alpha1.HostApplicationMetricInfo{ + { + Name: "test-mid-host-application", + Usage: slov1alpha1.ResourceMap{ + ResourceList: makeResourceList("3", "6G"), + }, + Priority: extension.PriorityMid, + }, + }, + }, + }, + }, + }, + want: []framework.ResourceItem{ + { + Name: extension.BatchCPU, + Quantity: resource.NewQuantity(25000, resource.DecimalSI), + Message: "batchAllocatable[CPU(Milli-Core)]:25000 = nodeCapacity:100000 - nodeSafetyMargin:35000 - systemUsageOrNodeReserved:7000 - podHPUsed:33000", + }, + { + Name: extension.BatchMemory, + Quantity: resource.NewScaledQuantity(33, 9), + Message: "batchAllocatable[Mem(GB)]:33 = nodeCapacity:120 - nodeSafetyMargin:42 - systemUsage:12 - podHPUsed:33", + }, + }, + wantErr: false, + }, { name: "calculate with memory usage, including batch host application usage", args: args{ diff --git a/pkg/slo-controller/noderesource/plugins/midresource/plugin.go b/pkg/slo-controller/noderesource/plugins/midresource/plugin.go index 2cf74fb65..1368785f5 100644 --- a/pkg/slo-controller/noderesource/plugins/midresource/plugin.go +++ b/pkg/slo-controller/noderesource/plugins/midresource/plugin.go @@ -21,6 +21,7 @@ import ( "time" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" quotav1 "k8s.io/apiserver/pkg/quota/v1" "k8s.io/klog/v2" "k8s.io/utils/clock" @@ -126,9 +127,9 @@ func (p *Plugin) degradeCalculate(node *corev1.Node, message string) []framework return p.Reset(node, message) } -// Unallocated[Mid] = max(NodeAllocatable - Allocated[Prod], 0) -func (p *Plugin) getUnallocated(node *corev1.Node, podList *corev1.PodList) 
corev1.ResourceList { - allocated := corev1.ResourceList{} +// Unallocated[Mid] = max(NodeCapacity - NodeReserved - Allocated[Prod], 0) +func (p *Plugin) getUnallocated(nodeName string, podList *corev1.PodList, nodeCapacity, nodeReserved corev1.ResourceList) corev1.ResourceList { + prodPodAllocated := corev1.ResourceList{} for i := range podList.Items { pod := &podList.Items[i] priorityClass := extension.GetPodPriorityClassWithDefault(pod) @@ -142,19 +143,27 @@ func (p *Plugin) getUnallocated(node *corev1.Node, podList *corev1.PodList) core continue } podRequest := util.GetPodRequest(pod, corev1.ResourceCPU, corev1.ResourceMemory) - allocated = quotav1.Add(allocated, podRequest) + prodPodAllocated = quotav1.Add(prodPodAllocated, podRequest) } - return quotav1.SubtractWithNonNegativeResult(node.Status.Allocatable, allocated) + midUnallocated := quotav1.Max(quotav1.Subtract(quotav1.Subtract(nodeCapacity, nodeReserved), prodPodAllocated), util.NewZeroResourceList()) + cpuMsg := fmt.Sprintf("midUnallocatedCPU[core]:%v = max(nodeCapacity:%v - nodeReserved:%v - prodPodAllocated:%v, 0)", + midUnallocated.Cpu(), nodeCapacity.Cpu(), nodeReserved.Cpu(), prodPodAllocated.Cpu()) + memMsg := fmt.Sprintf("midUnallocatedMem[GB]:%v = max(nodeCapacity:%v - nodeReserved:%v - prodPodAllocated:%v, 0)", + midUnallocated.Memory().ScaledValue(resource.Giga), nodeCapacity.Memory().ScaledValue(resource.Giga), + nodeReserved.Memory().ScaledValue(resource.Giga), prodPodAllocated.Memory().ScaledValue(resource.Giga)) + + klog.V(6).Infof("calculated mid unallocated for node %s, cpu(core) %v, memory(GB) %v", nodeName, cpuMsg, memMsg) + return midUnallocated } func (p *Plugin) calculate(strategy *configuration.ColocationStrategy, node *corev1.Node, podList *corev1.PodList, resourceMetrics *framework.ResourceMetrics) []framework.ResourceItem { // Allocatable[Mid]' := min(Reclaimable[Mid], NodeAllocatable * thresholdRatio) + Unallocated[Mid] * midUnallocatedRatio - // Unallocated[Mid] = 
max(NodeAllocatable - Allocated[Prod], 0) + // Unallocated[Mid] = max(NodeCapacity - NodeReserved - Allocated[Prod], 0) - var allocatableMilliCPU, allocatableMemory, prodReclaimableMilliCPU int64 - var prodReclaimableMemory string = "0" + var allocatableMilliCPU, allocatableMemory int64 + prodReclaimableCPU, prodReclaimableMemory := resource.NewQuantity(0, resource.DecimalSI), resource.NewQuantity(0, resource.BinarySI) prodReclaimableMetic := resourceMetrics.NodeMetric.Status.ProdReclaimableMetric if prodReclaimableMetic == nil || prodReclaimableMetic.Resource.ResourceList == nil { @@ -163,18 +172,34 @@ func (p *Plugin) calculate(strategy *configuration.ColocationStrategy, node *cor allocatableMemory = 0 } else { prodReclaimable := resourceMetrics.NodeMetric.Status.ProdReclaimableMetric.Resource - allocatableMilliCPU = prodReclaimable.Cpu().MilliValue() - allocatableMemory = prodReclaimable.Memory().Value() - prodReclaimableMilliCPU = allocatableMilliCPU - prodReclaimableMemory = prodReclaimable.Memory().String() + prodReclaimableCPU = prodReclaimable.Cpu() + prodReclaimableMemory = prodReclaimable.Memory() + allocatableMilliCPU = prodReclaimableCPU.MilliValue() + allocatableMemory = prodReclaimableMemory.Value() } - nodeAllocatable := node.Status.Allocatable + nodeMetric := resourceMetrics.NodeMetric + + hostAppHPUsed := resutil.GetHostAppHPUsed(resourceMetrics, extension.PriorityMid) + + nodeCapacity := resutil.GetNodeCapacity(node) + + systemUsed := resutil.GetResourceListForCPUAndMemory(nodeMetric.Status.NodeMetric.SystemUsage.ResourceList) + // resource usage of host applications with prod priority will be count as host system usage since they consumes the + // node reserved resource. 
+ systemUsed = quotav1.Add(systemUsed, hostAppHPUsed) + + // System.Reserved = max(Node.Anno.Reserved, Node.Kubelet.Reserved) + nodeAnnoReserved := util.GetNodeReservationFromAnnotation(node.Annotations) + nodeKubeletReserved := util.GetNodeReservationFromKubelet(node) + // FIXME: resource reservation taking max is rather confusing. + nodeReserved := quotav1.Max(nodeKubeletReserved, nodeAnnoReserved) + nodeReserved = quotav1.Max(systemUsed, nodeReserved) - // TODO: consider SafetyMargin and NodeReserved - unallocated := p.getUnallocated(node, podList) + unallocated := p.getUnallocated(node.Name, podList, nodeCapacity, nodeReserved) - cpuInMilliCores, memory, cpuMsg, memMsg := resutil.CalculateMidResourceByPolicy(strategy, nodeAllocatable, unallocated, allocatableMilliCPU, allocatableMemory, prodReclaimableMilliCPU, prodReclaimableMemory, node.Name) + cpuInMilliCores, memory, cpuMsg, memMsg := resutil.CalculateMidResourceByPolicy(strategy, nodeCapacity, + unallocated, allocatableMilliCPU, allocatableMemory, prodReclaimableCPU, prodReclaimableMemory, node.Name) metrics.RecordNodeExtendedResourceAllocatableInternal(node, string(extension.MidCPU), metrics.UnitInteger, float64(cpuInMilliCores.MilliValue())/1000) metrics.RecordNodeExtendedResourceAllocatableInternal(node, string(extension.MidMemory), metrics.UnitByte, float64(memory.Value())) diff --git a/pkg/slo-controller/noderesource/plugins/midresource/plugin_test.go b/pkg/slo-controller/noderesource/plugins/midresource/plugin_test.go index f9b7c0be2..d51b0f2c0 100644 --- a/pkg/slo-controller/noderesource/plugins/midresource/plugin_test.go +++ b/pkg/slo-controller/noderesource/plugins/midresource/plugin_test.go @@ -44,15 +44,15 @@ func TestPluginNeedSync(t *testing.T) { testNode := getTestNode(nil) testNodeMidNotChange := getTestNode(corev1.ResourceList{ extension.BatchCPU: resource.MustParse("50000"), - extension.BatchMemory: resource.MustParse("90Gi"), + extension.BatchMemory: resource.MustParse("90G"), 
extension.MidCPU: resource.MustParse("20000"), - extension.MidMemory: resource.MustParse("40Gi"), + extension.MidMemory: resource.MustParse("40G"), }) testNodeMidChanged := getTestNode(corev1.ResourceList{ extension.BatchCPU: resource.MustParse("40000"), - extension.BatchMemory: resource.MustParse("80Gi"), + extension.BatchMemory: resource.MustParse("80G"), extension.MidCPU: resource.MustParse("10000"), - extension.MidMemory: resource.MustParse("30Gi"), + extension.MidMemory: resource.MustParse("30G"), }) type args struct { strategy *configuration.ColocationStrategy @@ -223,11 +223,11 @@ func TestPluginCalculate(t *testing.T) { Resources: corev1.ResourceRequirements{ Requests: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("20"), - corev1.ResourceMemory: resource.MustParse("40Gi"), + corev1.ResourceMemory: resource.MustParse("40G"), }, Limits: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("20"), - corev1.ResourceMemory: resource.MustParse("40Gi"), + corev1.ResourceMemory: resource.MustParse("40G"), }, }, }, @@ -254,11 +254,11 @@ func TestPluginCalculate(t *testing.T) { Resources: corev1.ResourceRequirements{ Requests: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("15"), - corev1.ResourceMemory: resource.MustParse("30Gi"), + corev1.ResourceMemory: resource.MustParse("30G"), }, Limits: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("15"), - corev1.ResourceMemory: resource.MustParse("30Gi"), + corev1.ResourceMemory: resource.MustParse("30G"), }, }, }, @@ -268,18 +268,6 @@ func TestPluginCalculate(t *testing.T) { Phase: corev1.PodRunning, }, } - testCPUQuant := resource.MustParse("10000") - testCPUQuant2 := resource.MustParse("18000") - testCPUQuant3 := resource.MustParse("0") - - //NOTE: if not call String, cpu String will be diff - _ = testCPUQuant.String() - _ = testCPUQuant2.String() - _ = testCPUQuant3.String() - testMemoryQuant := resource.MustParse("15Gi") - testMemoryQuant2 := resource.MustParse("46Gi") - 
testMemoryQuant3 := resource.MustParse("0Gi") - _ = testMemoryQuant3.String() type args struct { strategy *configuration.ColocationStrategy @@ -324,7 +312,7 @@ func TestPluginCalculate(t *testing.T) { NodeUsage: slov1alpha1.ResourceMap{ ResourceList: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("20"), - corev1.ResourceMemory: resource.MustParse("45Gi"), + corev1.ResourceMemory: resource.MustParse("45G"), }, }, }, @@ -335,7 +323,7 @@ func TestPluginCalculate(t *testing.T) { PodUsage: slov1alpha1.ResourceMap{ ResourceList: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("10"), - corev1.ResourceMemory: resource.MustParse("20Gi"), + corev1.ResourceMemory: resource.MustParse("20G"), }, }, }, @@ -345,7 +333,7 @@ func TestPluginCalculate(t *testing.T) { PodUsage: slov1alpha1.ResourceMap{ ResourceList: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("15"), - corev1.ResourceMemory: resource.MustParse("20Gi"), + corev1.ResourceMemory: resource.MustParse("20G"), }, }, }, @@ -393,7 +381,7 @@ func TestPluginCalculate(t *testing.T) { NodeUsage: slov1alpha1.ResourceMap{ ResourceList: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("20"), - corev1.ResourceMemory: resource.MustParse("45Gi"), + corev1.ResourceMemory: resource.MustParse("45G"), }, }, }, @@ -404,7 +392,7 @@ func TestPluginCalculate(t *testing.T) { PodUsage: slov1alpha1.ResourceMap{ ResourceList: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("10"), - corev1.ResourceMemory: resource.MustParse("20Gi"), + corev1.ResourceMemory: resource.MustParse("20G"), }, }, }, @@ -414,7 +402,7 @@ func TestPluginCalculate(t *testing.T) { PodUsage: slov1alpha1.ResourceMap{ ResourceList: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("15"), - corev1.ResourceMemory: resource.MustParse("20Gi"), + corev1.ResourceMemory: resource.MustParse("20G"), }, }, }, @@ -423,7 +411,7 @@ func TestPluginCalculate(t *testing.T) { Resource: slov1alpha1.ResourceMap{ ResourceList: 
corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("10"), - corev1.ResourceMemory: resource.MustParse("15Gi"), + corev1.ResourceMemory: resource.MustParse("15G"), }, }, }, @@ -434,13 +422,13 @@ func TestPluginCalculate(t *testing.T) { want: []framework.ResourceItem{ { Name: extension.MidCPU, - Message: "midAllocatable[CPU(milli-core)]:10000 = min(nodeAllocatable:100000 * thresholdRatio:1, ProdReclaimable:10000) + Unallocated:80000 * midUnallocatedRatio:0", - Quantity: &testCPUQuant, + Message: "midAllocatable[CPU(milli-core)]:10000 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:10000) + Unallocated:80000 * midUnallocatedRatio:0", + Quantity: resource.NewQuantity(10000, resource.DecimalSI), }, { Name: extension.MidMemory, - Message: "midAllocatable[Memory(byte)]:15Gi = min(nodeAllocatable:200Gi * thresholdRatio:1, ProdReclaimable:15Gi) + Unallocated:160Gi * midUnallocatedRatio:0", - Quantity: &testMemoryQuant, + Message: "midAllocatable[Memory(GB)]:15 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:15) + Unallocated:160 * midUnallocatedRatio:0", + Quantity: resource.NewScaledQuantity(15, 9), }, }, wantErr: false, @@ -473,7 +461,7 @@ func TestPluginCalculate(t *testing.T) { NodeUsage: slov1alpha1.ResourceMap{ ResourceList: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("30"), - corev1.ResourceMemory: resource.MustParse("50Gi"), + corev1.ResourceMemory: resource.MustParse("50G"), }, }, }, @@ -484,7 +472,7 @@ func TestPluginCalculate(t *testing.T) { PodUsage: slov1alpha1.ResourceMap{ ResourceList: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("5"), - corev1.ResourceMemory: resource.MustParse("10Gi"), + corev1.ResourceMemory: resource.MustParse("10G"), }, }, }, @@ -494,7 +482,7 @@ func TestPluginCalculate(t *testing.T) { PodUsage: slov1alpha1.ResourceMap{ ResourceList: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("15"), - corev1.ResourceMemory: resource.MustParse("30Gi"), + 
corev1.ResourceMemory: resource.MustParse("30G"), }, }, }, @@ -503,7 +491,7 @@ func TestPluginCalculate(t *testing.T) { Resource: slov1alpha1.ResourceMap{ ResourceList: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("15"), - corev1.ResourceMemory: resource.MustParse("30Gi"), + corev1.ResourceMemory: resource.MustParse("30G"), }, }, }, @@ -514,13 +502,12 @@ func TestPluginCalculate(t *testing.T) { want: []framework.ResourceItem{ { Name: extension.MidCPU, - Message: "midAllocatable[CPU(milli-core)]:18000 = min(nodeAllocatable:100000 * thresholdRatio:0.1, ProdReclaimable:15000) + Unallocated:80000 * midUnallocatedRatio:0.1", - Quantity: &testCPUQuant2, - }, + Message: "midAllocatable[CPU(milli-core)]:18000 = min(nodeCapacity:100000 * thresholdRatio:0.1, ProdReclaimable:15000) + Unallocated:80000 * midUnallocatedRatio:0.1", + Quantity: resource.NewQuantity(18000, resource.DecimalSI)}, { Name: extension.MidMemory, - Message: "midAllocatable[Memory(byte)]:46Gi = min(nodeAllocatable:200Gi * thresholdRatio:0.2, ProdReclaimable:30Gi) + Unallocated:160Gi * midUnallocatedRatio:0.1", - Quantity: &testMemoryQuant2, + Message: "midAllocatable[Memory(GB)]:46 = min(nodeCapacity:210 * thresholdRatio:0.2, ProdReclaimable:30) + Unallocated:160 * midUnallocatedRatio:0.1", + Quantity: resource.NewScaledQuantity(46, 9), }, }, wantErr: false, @@ -550,7 +537,241 @@ func TestPluginCalculate(t *testing.T) { NodeUsage: slov1alpha1.ResourceMap{ ResourceList: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("30"), - corev1.ResourceMemory: resource.MustParse("50Gi"), + corev1.ResourceMemory: resource.MustParse("50G"), + }, + }, + }, + PodsMetric: []*slov1alpha1.PodMetricInfo{ + { + Name: testProdLSPod.Name, + Namespace: testProdLSPod.Namespace, + PodUsage: slov1alpha1.ResourceMap{ + ResourceList: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("5"), + corev1.ResourceMemory: resource.MustParse("10G"), + }, + }, + }, + { + Name: testBatchBEPod.Name, + 
Namespace: testBatchBEPod.Namespace, + PodUsage: slov1alpha1.ResourceMap{ + ResourceList: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("15"), + corev1.ResourceMemory: resource.MustParse("30G"), + }, + }, + }, + }, + ProdReclaimableMetric: &slov1alpha1.ReclaimableMetric{}, + }, + }, + }, + }, + want: []framework.ResourceItem{ + { + Name: extension.MidCPU, + Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:80000 * midUnallocatedRatio:0", + Quantity: resource.NewQuantity(0, resource.DecimalSI), + }, + { + Name: extension.MidMemory, + Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:160 * midUnallocatedRatio:0", + Quantity: resource.NewScaledQuantity(0, 0), + }, + }, + wantErr: false, + }, + { + name: "including product host application usage", + args: args{ + strategy: &configuration.ColocationStrategy{ + Enable: pointer.Bool(true), + DegradeTimeMinutes: pointer.Int64(10), + }, + node: testNode, + podList: &corev1.PodList{ + Items: []corev1.Pod{ + *testProdLSPod, + *testBatchBEPod, + }, + }, + metrics: &framework.ResourceMetrics{ + NodeMetric: &slov1alpha1.NodeMetric{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + }, + Status: slov1alpha1.NodeMetricStatus{ + UpdateTime: &metav1.Time{Time: time.Now().Add(-20 * time.Second)}, + NodeMetric: &slov1alpha1.NodeMetricInfo{ + NodeUsage: slov1alpha1.ResourceMap{ + ResourceList: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("30"), + corev1.ResourceMemory: resource.MustParse("50G"), + }, + }, + }, + PodsMetric: []*slov1alpha1.PodMetricInfo{ + { + Name: testProdLSPod.Name, + Namespace: testProdLSPod.Namespace, + PodUsage: slov1alpha1.ResourceMap{ + ResourceList: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("5"), + corev1.ResourceMemory: resource.MustParse("10G"), + }, + }, + }, + { + Name: testBatchBEPod.Name, + Namespace: 
testBatchBEPod.Namespace, + PodUsage: slov1alpha1.ResourceMap{ + ResourceList: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("15"), + corev1.ResourceMemory: resource.MustParse("30G"), + }, + }, + }, + }, + HostApplicationMetric: []*slov1alpha1.HostApplicationMetricInfo{ + { + Name: "test-prod-host-application", + Usage: slov1alpha1.ResourceMap{ + ResourceList: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("5"), + corev1.ResourceMemory: resource.MustParse("15G"), + }, + }, + Priority: extension.PriorityProd, + }, + }, + ProdReclaimableMetric: &slov1alpha1.ReclaimableMetric{}, + }, + }, + }, + }, + want: []framework.ResourceItem{ + { + Name: extension.MidCPU, + Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:75000 * midUnallocatedRatio:0", + Quantity: resource.NewQuantity(0, resource.DecimalSI), + }, + { + Name: extension.MidMemory, + Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:155 * midUnallocatedRatio:0", + Quantity: resource.NewScaledQuantity(0, 0), + }, + }, + wantErr: false, + }, + { + name: "including mid host application usage", + args: args{ + strategy: &configuration.ColocationStrategy{ + Enable: pointer.Bool(true), + DegradeTimeMinutes: pointer.Int64(10), + }, + node: testNode, + podList: &corev1.PodList{ + Items: []corev1.Pod{ + *testProdLSPod, + *testBatchBEPod, + }, + }, + metrics: &framework.ResourceMetrics{ + NodeMetric: &slov1alpha1.NodeMetric{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + }, + Status: slov1alpha1.NodeMetricStatus{ + UpdateTime: &metav1.Time{Time: time.Now().Add(-20 * time.Second)}, + NodeMetric: &slov1alpha1.NodeMetricInfo{ + NodeUsage: slov1alpha1.ResourceMap{ + ResourceList: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("30"), + corev1.ResourceMemory: resource.MustParse("50G"), + }, + }, + }, + PodsMetric: 
[]*slov1alpha1.PodMetricInfo{ + { + Name: testProdLSPod.Name, + Namespace: testProdLSPod.Namespace, + PodUsage: slov1alpha1.ResourceMap{ + ResourceList: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("5"), + corev1.ResourceMemory: resource.MustParse("10G"), + }, + }, + }, + { + Name: testBatchBEPod.Name, + Namespace: testBatchBEPod.Namespace, + PodUsage: slov1alpha1.ResourceMap{ + ResourceList: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("15"), + corev1.ResourceMemory: resource.MustParse("30G"), + }, + }, + }, + }, + HostApplicationMetric: []*slov1alpha1.HostApplicationMetricInfo{ + { + Name: "test-mid-host-application", + Usage: slov1alpha1.ResourceMap{ + ResourceList: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("5"), + corev1.ResourceMemory: resource.MustParse("15G"), + }, + }, + Priority: extension.PriorityMid, + }, + }, + ProdReclaimableMetric: &slov1alpha1.ReclaimableMetric{}, + }, + }, + }, + }, + want: []framework.ResourceItem{ + { + Name: extension.MidCPU, + Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:80000 * midUnallocatedRatio:0", + Quantity: resource.NewQuantity(0, resource.DecimalSI), + }, + { + Name: extension.MidMemory, + Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:160 * midUnallocatedRatio:0", + Quantity: resource.NewScaledQuantity(0, 0), + }, + }, + wantErr: false, + }, + { + name: "including batch host application usage", + args: args{ + strategy: &configuration.ColocationStrategy{ + Enable: pointer.Bool(true), + DegradeTimeMinutes: pointer.Int64(10), + }, + node: testNode, + podList: &corev1.PodList{ + Items: []corev1.Pod{ + *testProdLSPod, + *testBatchBEPod, + }, + }, + metrics: &framework.ResourceMetrics{ + NodeMetric: &slov1alpha1.NodeMetric{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + }, + Status: slov1alpha1.NodeMetricStatus{ + 
UpdateTime: &metav1.Time{Time: time.Now().Add(-20 * time.Second)}, + NodeMetric: &slov1alpha1.NodeMetricInfo{ + NodeUsage: slov1alpha1.ResourceMap{ + ResourceList: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("30"), + corev1.ResourceMemory: resource.MustParse("50G"), }, }, }, @@ -561,7 +782,7 @@ func TestPluginCalculate(t *testing.T) { PodUsage: slov1alpha1.ResourceMap{ ResourceList: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("5"), - corev1.ResourceMemory: resource.MustParse("10Gi"), + corev1.ResourceMemory: resource.MustParse("10G"), }, }, }, @@ -571,11 +792,23 @@ func TestPluginCalculate(t *testing.T) { PodUsage: slov1alpha1.ResourceMap{ ResourceList: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("15"), - corev1.ResourceMemory: resource.MustParse("30Gi"), + corev1.ResourceMemory: resource.MustParse("30G"), }, }, }, }, + HostApplicationMetric: []*slov1alpha1.HostApplicationMetricInfo{ + { + Name: "test-batch-host-application", + Usage: slov1alpha1.ResourceMap{ + ResourceList: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("10G"), + }, + }, + Priority: extension.PriorityBatch, + }, + }, ProdReclaimableMetric: &slov1alpha1.ReclaimableMetric{}, }, }, @@ -584,13 +817,13 @@ func TestPluginCalculate(t *testing.T) { want: []framework.ResourceItem{ { Name: extension.MidCPU, - Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeAllocatable:100000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:80000 * midUnallocatedRatio:0", - Quantity: &testCPUQuant3, + Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:80000 * midUnallocatedRatio:0", + Quantity: resource.NewQuantity(0, resource.DecimalSI), }, { Name: extension.MidMemory, - Message: "midAllocatable[Memory(byte)]:0 = min(nodeAllocatable:200Gi * thresholdRatio:1, ProdReclaimable:0) + Unallocated:160Gi * midUnallocatedRatio:0", - 
Quantity: &testMemoryQuant3, + Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:160 * midUnallocatedRatio:0", + Quantity: resource.NewScaledQuantity(0, 0), }, }, wantErr: false, @@ -600,8 +833,8 @@ func TestPluginCalculate(t *testing.T) { t.Run(tt.name, func(t *testing.T) { p := &Plugin{} got, gotErr := p.Calculate(tt.args.strategy, tt.args.node, tt.args.podList, tt.args.metrics) - assert.Equal(t, tt.want, got) assert.Equal(t, tt.wantErr, gotErr != nil) + testingCorrectResourceItems(t, tt.want, got) }) } } @@ -798,19 +1031,19 @@ func getTestNode(resourceList corev1.ResourceList, resetResources ...corev1.Reso Status: corev1.NodeStatus{ Allocatable: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("100"), - corev1.ResourceMemory: resource.MustParse("200Gi"), + corev1.ResourceMemory: resource.MustParse("200G"), extension.BatchCPU: resource.MustParse("40000"), - extension.BatchMemory: resource.MustParse("80Gi"), + extension.BatchMemory: resource.MustParse("80G"), extension.MidCPU: resource.MustParse("20000"), - extension.MidMemory: resource.MustParse("40Gi"), + extension.MidMemory: resource.MustParse("40G"), }, Capacity: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("100"), - corev1.ResourceMemory: resource.MustParse("210Gi"), + corev1.ResourceMemory: resource.MustParse("210G"), extension.BatchCPU: resource.MustParse("40000"), - extension.BatchMemory: resource.MustParse("80Gi"), + extension.BatchMemory: resource.MustParse("80G"), extension.MidCPU: resource.MustParse("20000"), - extension.MidMemory: resource.MustParse("40Gi"), + extension.MidMemory: resource.MustParse("40G"), }, }, } @@ -824,3 +1057,17 @@ func getTestNode(resourceList corev1.ResourceList, resetResources ...corev1.Reso } return testNode } + +func testingCorrectResourceItems(t *testing.T, want, got []framework.ResourceItem) { + assert.Equal(t, len(want), len(got)) + for i := range want { + qWant, qGot := 
want[i].Quantity, got[i].Quantity + want[i].Quantity, got[i].Quantity = nil, nil + assert.Equal(t, want[i], got[i], "equal fields for resource "+want[i].Name) + if qWant == nil && qGot == nil { + continue + } + assert.Equal(t, qWant.MilliValue(), qGot.MilliValue(), "equal values for resource "+want[i].Name) + want[i].Quantity, got[i].Quantity = qWant, qGot + } +} diff --git a/pkg/slo-controller/noderesource/plugins/util/util.go b/pkg/slo-controller/noderesource/plugins/util/util.go index a158fcc83..619f52443 100644 --- a/pkg/slo-controller/noderesource/plugins/util/util.go +++ b/pkg/slo-controller/noderesource/plugins/util/util.go @@ -102,10 +102,11 @@ func CalculateBatchResourceByPolicy(strategy *configuration.ColocationStrategy, return batchAllocatable, cpuMsg, memMsg } -func CalculateMidResourceByPolicy(strategy *configuration.ColocationStrategy, nodeAllocatable, unallocated corev1.ResourceList, allocatableMilliCPU, allocatableMemory, prodReclaimableMilliCPU int64, prodReclaimableMemory, nodeName string) (*resource.Quantity, *resource.Quantity, string, string) { +func CalculateMidResourceByPolicy(strategy *configuration.ColocationStrategy, nodeCapacity, unallocated corev1.ResourceList, allocatableMilliCPU, allocatableMemory int64, + prodReclaimableCPU, prodReclaimableMemory *resource.Quantity, nodeName string) (*resource.Quantity, *resource.Quantity, string, string) { defaultStrategy := sloconfig.DefaultColocationStrategy() cpuThresholdRatio := getPercentFromStrategy(strategy, &defaultStrategy, MidCPUThreshold) - if maxMilliCPU := float64(nodeAllocatable.Cpu().MilliValue()) * cpuThresholdRatio; allocatableMilliCPU > int64(maxMilliCPU) { + if maxMilliCPU := float64(nodeCapacity.Cpu().MilliValue()) * cpuThresholdRatio; allocatableMilliCPU > int64(maxMilliCPU) { allocatableMilliCPU = int64(maxMilliCPU) } if allocatableMilliCPU < 0 { @@ -116,7 +117,7 @@ func CalculateMidResourceByPolicy(strategy *configuration.ColocationStrategy, no cpuInMilliCores := 
resource.NewQuantity(allocatableMilliCPU, resource.DecimalSI) memThresholdRatio := getPercentFromStrategy(strategy, &defaultStrategy, MidMemoryThreshold) - if maxMemory := float64(nodeAllocatable.Memory().Value()) * memThresholdRatio; allocatableMemory > int64(maxMemory) { + if maxMemory := float64(nodeCapacity.Memory().Value()) * memThresholdRatio; allocatableMemory > int64(maxMemory) { allocatableMemory = int64(maxMemory) } if allocatableMemory < 0 { @@ -135,11 +136,15 @@ func CalculateMidResourceByPolicy(strategy *configuration.ColocationStrategy, no cpuInMilliCores.Add(*adjustedUnallocatedMilliCPU) memory.Add(*adjustedUnallocatedMemory) - cpuMsg := fmt.Sprintf("midAllocatable[CPU(milli-core)]:%v = min(nodeAllocatable:%v * thresholdRatio:%v, ProdReclaimable:%v) + Unallocated:%v * midUnallocatedRatio:%v", - cpuInMilliCores.Value(), nodeAllocatable.Cpu().MilliValue(), cpuThresholdRatio, prodReclaimableMilliCPU, unallocatedMilliCPU.Value(), midUnallocatedRatio) + cpuMsg := fmt.Sprintf("midAllocatable[CPU(milli-core)]:%v = min(nodeCapacity:%v * thresholdRatio:%v, ProdReclaimable:%v) + Unallocated:%v * midUnallocatedRatio:%v", + cpuInMilliCores.Value(), nodeCapacity.Cpu().MilliValue(), + cpuThresholdRatio, prodReclaimableCPU.MilliValue(), + unallocatedMilliCPU.Value(), midUnallocatedRatio) - memMsg := fmt.Sprintf("midAllocatable[Memory(byte)]:%s = min(nodeAllocatable:%s * thresholdRatio:%v, ProdReclaimable:%s) + Unallocated:%v * midUnallocatedRatio:%v", - memory.String(), nodeAllocatable.Memory().String(), memThresholdRatio, prodReclaimableMemory, unallocatedMemory.String(), midUnallocatedRatio) + memMsg := fmt.Sprintf("midAllocatable[Memory(GB)]:%v = min(nodeCapacity:%v * thresholdRatio:%v, ProdReclaimable:%v) + Unallocated:%v * midUnallocatedRatio:%v", + memory.ScaledValue(resource.Giga), nodeCapacity.Memory().ScaledValue(resource.Giga), + memThresholdRatio, prodReclaimableMemory.ScaledValue(resource.Giga), + unallocatedMemory.ScaledValue(resource.Giga), 
midUnallocatedRatio) return cpuInMilliCores, memory, cpuMsg, memMsg } @@ -181,11 +186,11 @@ func GetPodMetricUsage(info *slov1alpha1.PodMetricInfo) corev1.ResourceList { return GetResourceListForCPUAndMemory(info.PodUsage.ResourceList) } -func GetHostAppHPUsed(resourceMetrics *framework.ResourceMetrics) corev1.ResourceList { +func GetHostAppHPUsed(resourceMetrics *framework.ResourceMetrics, resPriority extension.PriorityClass) corev1.ResourceList { hostAppHPUsed := util.NewZeroResourceList() for _, hostAppMetric := range resourceMetrics.NodeMetric.Status.HostApplicationMetric { - if hostAppMetric.Priority == extension.PriorityBatch || hostAppMetric.Priority == extension.PriorityFree { - // only consider higher priority usage for batch allocatable + if extension.GetDefaultPriorityByPriorityClass(hostAppMetric.Priority) <= extension.GetDefaultPriorityByPriorityClass(resPriority) { + // consider higher priority usage for mid or batch allocatable // now only support product and batch(hadoop-yarn) priority for host application continue } diff --git a/pkg/slo-controller/noderesource/resource_calculator_test.go b/pkg/slo-controller/noderesource/resource_calculator_test.go index 3d20ec9f8..99fdb8fe1 100644 --- a/pkg/slo-controller/noderesource/resource_calculator_test.go +++ b/pkg/slo-controller/noderesource/resource_calculator_test.go @@ -190,12 +190,12 @@ func Test_calculateNodeResource(t *testing.T) { { Name: extension.MidCPU, Quantity: resource.NewQuantity(0, resource.DecimalSI), - Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeAllocatable:20000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:0 * midUnallocatedRatio:0", + Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:20000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:0 * midUnallocatedRatio:0", }, { Name: extension.MidMemory, Quantity: resource.NewQuantity(0, resource.BinarySI), - Message: "midAllocatable[Memory(byte)]:0 = min(nodeAllocatable:40G * thresholdRatio:1, 
ProdReclaimable:0) + Unallocated:20G * midUnallocatedRatio:0", + Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:40 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:20 * midUnallocatedRatio:0", }, }...), }, @@ -397,12 +397,12 @@ func Test_calculateNodeResource(t *testing.T) { { Name: extension.MidCPU, Quantity: resource.NewQuantity(0, resource.DecimalSI), - Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeAllocatable:100000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:60000 * midUnallocatedRatio:0", + Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:53000 * midUnallocatedRatio:0", }, { Name: extension.MidMemory, Quantity: resource.NewQuantity(0, resource.BinarySI), - Message: "midAllocatable[Memory(byte)]:0 = min(nodeAllocatable:120G * thresholdRatio:1, ProdReclaimable:0) + Unallocated:60G * midUnallocatedRatio:0", + Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:120 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:48 * midUnallocatedRatio:0", }, }...), }, @@ -607,12 +607,12 @@ func Test_calculateNodeResource(t *testing.T) { { Name: extension.MidCPU, Quantity: resource.NewQuantity(0, resource.DecimalSI), - Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeAllocatable:100000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:60000 * midUnallocatedRatio:0", + Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:53000 * midUnallocatedRatio:0", }, { Name: extension.MidMemory, Quantity: resource.NewQuantity(0, resource.BinarySI), - Message: "midAllocatable[Memory(byte)]:0 = min(nodeAllocatable:120G * thresholdRatio:1, ProdReclaimable:0) + Unallocated:60G * midUnallocatedRatio:0", + Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:120 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:48 * midUnallocatedRatio:0", }, }...), }, @@ -817,12 +817,12 @@ func 
Test_calculateNodeResource(t *testing.T) { { Name: extension.MidCPU, Quantity: resource.NewQuantity(0, resource.DecimalSI), - Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeAllocatable:100000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:60000 * midUnallocatedRatio:0", + Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:53000 * midUnallocatedRatio:0", }, { Name: extension.MidMemory, Quantity: resource.NewQuantity(0, resource.BinarySI), - Message: "midAllocatable[Memory(byte)]:0 = min(nodeAllocatable:120G * thresholdRatio:1, ProdReclaimable:0) + Unallocated:60G * midUnallocatedRatio:0", + Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:120 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:48 * midUnallocatedRatio:0", }, }...), }, @@ -1032,12 +1032,12 @@ func Test_calculateNodeResource(t *testing.T) { { Name: extension.MidCPU, Quantity: resource.NewQuantity(10000, resource.DecimalSI), - Message: "midAllocatable[CPU(milli-core)]:10000 = min(nodeAllocatable:100000 * thresholdRatio:1, ProdReclaimable:10000) + Unallocated:60000 * midUnallocatedRatio:0", + Message: "midAllocatable[CPU(milli-core)]:10000 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:10000) + Unallocated:53000 * midUnallocatedRatio:0", }, { Name: extension.MidMemory, Quantity: resource.NewQuantity(20000000000, resource.BinarySI), - Message: "midAllocatable[Memory(byte)]:19531250Ki = min(nodeAllocatable:120G * thresholdRatio:1, ProdReclaimable:20G) + Unallocated:60G * midUnallocatedRatio:0", + Message: "midAllocatable[Memory(GB)]:20 = min(nodeCapacity:120 * thresholdRatio:1, ProdReclaimable:20) + Unallocated:48 * midUnallocatedRatio:0", }, }...), },