Skip to content

Commit

Permalink
koordlet: add metrics about be used cpu and node used memory (#2283)
Browse files Browse the repository at this point in the history
Signed-off-by: lisen <[email protected]>
Co-authored-by: lisen <[email protected]>
  • Loading branch information
leason00 and lisen authored Dec 10, 2024
1 parent 7c94119 commit 053d636
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 1 deletion.
15 changes: 15 additions & 0 deletions pkg/koordlet/metrics/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,12 @@ var (
Help: "Number of cpu cores used by node in realtime",
}, []string{NodeKey})

// NodeUsedMemory is a gauge vector labeled by NodeKey and exported as
// "node_used_memory_bytes" under KoordletSubsystem. Per its Help text it
// reports the memory used by the node in realtime.
// NOTE(review): the metric name implies the unit is bytes — confirm that
// callers pass a byte count.
NodeUsedMemory = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Subsystem: KoordletSubsystem,
Name: "node_used_memory_bytes",
Help: "Memory used by node in realtime",
}, []string{NodeKey})

CommonCollectors = []prometheus.Collector{
KoordletStartTime,
CollectNodeCPUInfoStatus,
Expand All @@ -73,6 +79,7 @@ var (
PodEviction,
PodEvictionDetail.GetCounterVec(),
NodeUsedCPU,
NodeUsedMemory,
}
)

Expand Down Expand Up @@ -141,6 +148,14 @@ func RecordNodeUsedCPU(value float64) {
NodeUsedCPU.With(labels).Set(value)
}

// RecordNodeUsedMemory sets the NodeUsedMemory gauge for the current node
// labels to value. When genNodeLabels yields nil labels the sample is
// dropped and nothing is recorded.
func RecordNodeUsedMemory(value float64) {
	if nodeLabels := genNodeLabels(); nodeLabels != nil {
		gauge := NodeUsedMemory.With(nodeLabels)
		gauge.Set(value)
	}
}

func labelsClone(labels prometheus.Labels) prometheus.Labels {
copyLabels := prometheus.Labels{}
for key, value := range labels {
Expand Down
15 changes: 15 additions & 0 deletions pkg/koordlet/metrics/cpu_suppress.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,16 @@ var (
Help: "Number of cpu cores used by LS. We consider non-BE pods and podMeta-missing pods as LS.",
}, []string{NodeKey})

// BESuppressBEUsedCPU is a gauge vector labeled by NodeKey and exported as
// "be_suppress_be_used_cpu_cores" under KoordletSubsystem. Per its Help text
// it reports the number of cpu cores used by BE.
BESuppressBEUsedCPU = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Subsystem: KoordletSubsystem,
Name: "be_suppress_be_used_cpu_cores",
Help: "Number of cpu cores used by BE.",
}, []string{NodeKey})

// CPUSuppressCollector groups the CPU-suppress related collectors:
// BESuppressCPU, BESuppressLSUsedCPU and BESuppressBEUsedCPU.
// NOTE(review): presumably this slice is registered with a prometheus
// registry elsewhere — confirm against the registration code.
CPUSuppressCollector = []prometheus.Collector{
BESuppressCPU,
BESuppressLSUsedCPU,
BESuppressBEUsedCPU,
}
)

Expand All @@ -53,3 +60,11 @@ func RecordBESuppressLSUsedCPU(value float64) {
}
BESuppressLSUsedCPU.With(labels).Set(value)
}

// RecordBESuppressBEUsedCPU sets the BESuppressBEUsedCPU gauge for the
// current node labels to value. When genNodeLabels yields nil labels the
// sample is dropped and nothing is recorded.
func RecordBESuppressBEUsedCPU(value float64) {
	if nodeLabels := genNodeLabels(); nodeLabels != nil {
		gauge := BESuppressBEUsedCPU.With(nodeLabels)
		gauge.Set(value)
	}
}
2 changes: 2 additions & 0 deletions pkg/koordlet/metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,9 @@ func TestCommonCollectors(t *testing.T) {
RecordCollectNodeLocalStorageInfoStatus(nil)
RecordBESuppressCores("cfsQuota", float64(1000))
RecordBESuppressLSUsedCPU(1.0)
RecordBESuppressBEUsedCPU(1.0)
RecordNodeUsedCPU(2.0)
RecordNodeUsedMemory(float64(1024))
RecordContainerScaledCFSBurstUS(testingPod.Namespace, testingPod.Name, testingContainer.ContainerID, testingContainer.Name, 1000000)
RecordContainerScaledCFSQuotaUS(testingPod.Namespace, testingPod.Name, testingContainer.ContainerID, testingContainer.Name, 1000000)
RecordPodEviction(testingPod.Namespace, testingPod.Name, "evictByCPU")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (

apiext "github.com/koordinator-sh/koordinator/apis/extension"
"github.com/koordinator-sh/koordinator/pkg/koordlet/metriccache"
"github.com/koordinator-sh/koordinator/pkg/koordlet/metrics"
"github.com/koordinator-sh/koordinator/pkg/koordlet/metricsadvisor/framework"
"github.com/koordinator-sh/koordinator/pkg/koordlet/resourceexecutor"
"github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer"
Expand Down Expand Up @@ -107,7 +108,7 @@ func (b *beResourceCollector) collectBECPUResourceMetric() {
klog.Errorf("failed to collect node BECPU, beLimitGenerateSampleErr: %v, beRequestGenerateSampleErr: %v, beUsageGenerateSampleErr: %v", err01, err02, err03)
return
}

metrics.RecordBESuppressBEUsedCPU(float64(beCPUUsageMilliCores) / 1000)
beMetrics := make([]metriccache.MetricSample, 0)
beMetrics = append(beMetrics, beLimit, beRequest, beUsage)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ func (n *nodeResourceCollector) collectNodeResUsed() {
// update collect time
n.started.Store(true)
metrics.RecordNodeUsedCPU(cpuUsageValue) // in cpu cores
metrics.RecordNodeUsedMemory(memUsageValue)

klog.V(4).Infof("collectNodeResUsed finished, count %v, cpu[%v], mem[%v]",
len(nodeMetrics), cpuUsageValue, memUsageValue)
Expand Down

0 comments on commit 053d636

Please sign in to comment.