From 053d636656895de426ac4e4dbb78dece56cb9ec4 Mon Sep 17 00:00:00 2001
From: leason <2395491531@qq.com>
Date: Tue, 10 Dec 2024 13:56:11 +0800
Subject: [PATCH] koordlet: add metrics about be used cpu and node used memory (#2283)

Signed-off-by: lisen
Co-authored-by: lisen
---
 pkg/koordlet/metrics/common.go                    | 15 +++++++++++++++
 pkg/koordlet/metrics/cpu_suppress.go              | 15 +++++++++++++++
 pkg/koordlet/metrics/metrics_test.go              |  2 ++
 .../beresource/be_resource_collector.go           |  3 ++-
 .../noderesource/node_resource_collector.go       |  1 +
 5 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/pkg/koordlet/metrics/common.go b/pkg/koordlet/metrics/common.go
index 9528d3580..19a27c183 100644
--- a/pkg/koordlet/metrics/common.go
+++ b/pkg/koordlet/metrics/common.go
@@ -65,6 +65,12 @@ var (
 		Help:      "Number of cpu cores used by node in realtime",
 	}, []string{NodeKey})
 
+	NodeUsedMemory = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Subsystem: KoordletSubsystem,
+		Name:      "node_used_memory_bytes",
+		Help:      "Memory used by node in realtime",
+	}, []string{NodeKey})
+
 	CommonCollectors = []prometheus.Collector{
 		KoordletStartTime,
 		CollectNodeCPUInfoStatus,
@@ -73,6 +79,7 @@ var (
 		PodEviction,
 		PodEvictionDetail.GetCounterVec(),
 		NodeUsedCPU,
+		NodeUsedMemory,
 	}
 )
 
@@ -141,6 +148,14 @@ func RecordNodeUsedCPU(value float64) {
 	NodeUsedCPU.With(labels).Set(value)
 }
 
+func RecordNodeUsedMemory(value float64) {
+	labels := genNodeLabels()
+	if labels == nil {
+		return
+	}
+	NodeUsedMemory.With(labels).Set(value)
+}
+
 func labelsClone(labels prometheus.Labels) prometheus.Labels {
 	copyLabels := prometheus.Labels{}
 	for key, value := range labels {
diff --git a/pkg/koordlet/metrics/cpu_suppress.go b/pkg/koordlet/metrics/cpu_suppress.go
index ba98aa764..942e10b35 100644
--- a/pkg/koordlet/metrics/cpu_suppress.go
+++ b/pkg/koordlet/metrics/cpu_suppress.go
@@ -31,9 +31,16 @@ var (
 		Help:      "Number of cpu cores used by LS. We consider non-BE pods and podMeta-missing pods as LS.",
 	}, []string{NodeKey})
 
+	BESuppressBEUsedCPU = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Subsystem: KoordletSubsystem,
+		Name:      "be_suppress_be_used_cpu_cores",
+		Help:      "Number of cpu cores used by BE.",
+	}, []string{NodeKey})
+
 	CPUSuppressCollector = []prometheus.Collector{
 		BESuppressCPU,
 		BESuppressLSUsedCPU,
+		BESuppressBEUsedCPU,
 	}
 )
 
@@ -53,3 +60,11 @@ func RecordBESuppressLSUsedCPU(value float64) {
 	}
 	BESuppressLSUsedCPU.With(labels).Set(value)
 }
+
+func RecordBESuppressBEUsedCPU(value float64) {
+	labels := genNodeLabels()
+	if labels == nil {
+		return
+	}
+	BESuppressBEUsedCPU.With(labels).Set(value)
+}
diff --git a/pkg/koordlet/metrics/metrics_test.go b/pkg/koordlet/metrics/metrics_test.go
index 75312a79c..9b785823c 100644
--- a/pkg/koordlet/metrics/metrics_test.go
+++ b/pkg/koordlet/metrics/metrics_test.go
@@ -138,7 +138,9 @@ func TestCommonCollectors(t *testing.T) {
 		RecordCollectNodeLocalStorageInfoStatus(nil)
 		RecordBESuppressCores("cfsQuota", float64(1000))
 		RecordBESuppressLSUsedCPU(1.0)
+		RecordBESuppressBEUsedCPU(1.0)
 		RecordNodeUsedCPU(2.0)
+		RecordNodeUsedMemory(float64(1024))
 		RecordContainerScaledCFSBurstUS(testingPod.Namespace, testingPod.Name, testingContainer.ContainerID, testingContainer.Name, 1000000)
 		RecordContainerScaledCFSQuotaUS(testingPod.Namespace, testingPod.Name, testingContainer.ContainerID, testingContainer.Name, 1000000)
 		RecordPodEviction(testingPod.Namespace, testingPod.Name, "evictByCPU")
diff --git a/pkg/koordlet/metricsadvisor/collectors/beresource/be_resource_collector.go b/pkg/koordlet/metricsadvisor/collectors/beresource/be_resource_collector.go
index 46fd43434..56552c7a9 100644
--- a/pkg/koordlet/metricsadvisor/collectors/beresource/be_resource_collector.go
+++ b/pkg/koordlet/metricsadvisor/collectors/beresource/be_resource_collector.go
@@ -27,6 +27,7 @@ import (
 
 	apiext "github.com/koordinator-sh/koordinator/apis/extension"
 	"github.com/koordinator-sh/koordinator/pkg/koordlet/metriccache"
+	"github.com/koordinator-sh/koordinator/pkg/koordlet/metrics"
 	"github.com/koordinator-sh/koordinator/pkg/koordlet/metricsadvisor/framework"
 	"github.com/koordinator-sh/koordinator/pkg/koordlet/resourceexecutor"
 	"github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer"
@@ -107,7 +108,7 @@ func (b *beResourceCollector) collectBECPUResourceMetric() {
 		klog.Errorf("failed to collect node BECPU, beLimitGenerateSampleErr: %v, beRequestGenerateSampleErr: %v, beUsageGenerateSampleErr: %v", err01, err02, err03)
 		return
 	}
-
+	metrics.RecordBESuppressBEUsedCPU(float64(beCPUUsageMilliCores) / 1000)
 	beMetrics := make([]metriccache.MetricSample, 0)
 	beMetrics = append(beMetrics, beLimit, beRequest, beUsage)
 
diff --git a/pkg/koordlet/metricsadvisor/collectors/noderesource/node_resource_collector.go b/pkg/koordlet/metricsadvisor/collectors/noderesource/node_resource_collector.go
index 75b7d12fd..482173898 100644
--- a/pkg/koordlet/metricsadvisor/collectors/noderesource/node_resource_collector.go
+++ b/pkg/koordlet/metricsadvisor/collectors/noderesource/node_resource_collector.go
@@ -159,6 +159,7 @@ func (n *nodeResourceCollector) collectNodeResUsed() {
 	// update collect time
 	n.started.Store(true)
 	metrics.RecordNodeUsedCPU(cpuUsageValue) // in cpu cores
+	metrics.RecordNodeUsedMemory(memUsageValue)
 
 	klog.V(4).Infof("collectNodeResUsed finished, count %v, cpu[%v], mem[%v]",
 		len(nodeMetrics), cpuUsageValue, memUsageValue)