Skip to content

Commit

Permalink
[exporter/node] total node count per partition per metric (#87)
Browse files Browse the repository at this point in the history
* total node count per partition per metric

* rev micro version
  • Loading branch information
abhinavDhulipala authored Aug 12, 2024
1 parent afbcb41 commit 3ec6076
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 3 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ You can also install the exporter directly with `go install github.com/rivosinc/

```bash
# example installation
$ go install github.com/rivosinc/prometheus-slurm-exporter@vX.Y.2
$ go install github.com/rivosinc/prometheus-slurm-exporter@vX.Y.3
# or if you like living on the edge
$ go install github.com/rivosinc/prometheus-slurm-exporter@latest
# if not already added, ensure
Expand Down
11 changes: 9 additions & 2 deletions exporter/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ type PartitionMetric struct {
FreeMemory float64
StateAllocMemory map[string]float64
StateAllocCpus map[string]float64
StateNodeCount map[string]float64
CpuLoad float64
IdleCpus float64
Weight float64
Expand All @@ -216,11 +217,13 @@ func fetchNodePartitionMetrics(nodes []NodeMetric) map[string]*PartitionMetric {
partition = &PartitionMetric{
StateAllocMemory: make(map[string]float64),
StateAllocCpus: make(map[string]float64),
StateNodeCount: make(map[string]float64),
}
partitions[p] = partition
}
partition.StateAllocCpus[node.State] += node.AllocCpus
partition.StateAllocMemory[node.State] += node.AllocMemory
partition.StateNodeCount[node.State] += 1
partition.TotalCpus += node.Cpus
partition.CpuLoad += node.CpuLoad
partition.FreeMemory += node.FreeMemory
Expand Down Expand Up @@ -295,6 +298,7 @@ type NodesCollector struct {
partitionFreeMemory *prometheus.Desc
partitionAllocMemory *prometheus.Desc
partitionAllocCpus *prometheus.Desc
partitionNodeCount *prometheus.Desc
partitionIdleCpus *prometheus.Desc
partitionWeight *prometheus.Desc
partitionCpuLoad *prometheus.Desc
Expand Down Expand Up @@ -332,8 +336,9 @@ func NewNodeCollecter(config *Config) *NodesCollector {
partitionCpus: prometheus.NewDesc("slurm_partition_total_cpus", "Total cpus per partition", []string{"partition"}, nil),
partitionRealMemory: prometheus.NewDesc("slurm_partition_real_mem", "Real mem per partition", []string{"partition"}, nil),
partitionFreeMemory: prometheus.NewDesc("slurm_partition_free_mem", "Free mem per partition", []string{"partition"}, nil),
partitionAllocMemory: prometheus.NewDesc("slurm_partition_alloc_mem", "Alloc mem per partition", []string{"partition", "state"}, nil),
partitionAllocCpus: prometheus.NewDesc("slurm_partition_alloc_cpus", "Alloc cpus per partition", []string{"partition", "state"}, nil),
partitionAllocMemory: prometheus.NewDesc("slurm_partition_alloc_mem", "Alloc mem per partition per state", []string{"partition", "state"}, nil),
partitionAllocCpus: prometheus.NewDesc("slurm_partition_alloc_cpus", "Alloc cpus per partition per state", []string{"partition", "state"}, nil),
partitionNodeCount: prometheus.NewDesc("slurm_partition_node_count", "Node count per partition per state", []string{"partition", "state"}, nil),
partitionIdleCpus: prometheus.NewDesc("slurm_partition_idle_cpus", "Idle cpus per partition", []string{"partition"}, nil),
partitionWeight: prometheus.NewDesc("slurm_partition_weight", "Total node weight per partition??", []string{"partition"}, nil),
partitionCpuLoad: prometheus.NewDesc("slurm_partition_cpu_load", "Total cpu load per partition", []string{"partition"}, nil),
Expand All @@ -356,6 +361,7 @@ func NewNodeCollecter(config *Config) *NodesCollector {
func (nc *NodesCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- nc.partitionAllocCpus
ch <- nc.partitionAllocMemory
ch <- nc.partitionNodeCount
ch <- nc.partitionCpus
ch <- nc.partitionCpuLoad
ch <- nc.partitionFreeMemory
Expand Down Expand Up @@ -394,6 +400,7 @@ func (nc *NodesCollector) Collect(ch chan<- prometheus.Metric) {
for partition, metric := range partitionMetrics {
emitStateVal(partition, metric.StateAllocCpus, nc.partitionAllocCpus)
emitStateVal(partition, metric.StateAllocMemory, nc.partitionAllocMemory)
emitStateVal(partition, metric.StateNodeCount, nc.partitionNodeCount)
if metric.TotalCpus > 0 {
ch <- prometheus.MustNewConstMetric(nc.partitionCpus, prometheus.GaugeValue, metric.TotalCpus, partition)
}
Expand Down
1 change: 1 addition & 0 deletions exporter/nodes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ func TestPartitionMetric(t *testing.T) {
assert.Equal(1.823573e+06, metrics["hw"].FreeMemory)
assert.Equal(2e+06, metrics["hw"].RealMemory)
assert.Equal(252., metrics["hw"].IdleCpus)
assert.Equal(4., sumStateMetric(metrics["hw"].StateNodeCount))
}

func TestNodeSummaryCpuMetric(t *testing.T) {
Expand Down

0 comments on commit 3ec6076

Please sign in to comment.