Merge branch 'feature-distro-packages' of github.com:drewstinnett/prometheus-slurm-exporter into feature-distro-packages
drewstinnett committed Sep 3, 2024
2 parents 5a66f9c + f62d0b2 commit 0969bb7
Showing 9 changed files with 304 additions and 153 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -24,4 +24,4 @@ tmp*
 coverage.html
 coverage.out
 .DS_Store
-dist/
+dist
3 changes: 1 addition & 2 deletions .goreleaser.yaml
@@ -3,8 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 version: 2
 builds:
-  -
-    goos:
+  - goos:
     - linux
     goarch:
     - amd64
45 changes: 40 additions & 5 deletions README.md
@@ -147,11 +147,46 @@ $ curl localhost:9092/metrics | grep "# HELP"
### Exporter Env Var Docs

Env vars can be specified in a `.env` file while using `just`.
-| Var           | Default Value | Purpose                                                                      |
-|---------------|---------------|------------------------------------------------------------------------------|
-| POLL_LIMIT    | 10            | # of seconds to wait before polling slurmctl again (client-side throttling)  |
-| LOGLEVEL      | info          | Log Level: debug, info, warn, error                                          |
-| CLI_TIMEOUT   | 10.           | # seconds before the exporter terminates command.                            |
+| Var             | Default Value | Purpose                                                                      |
+|-----------------|---------------|------------------------------------------------------------------------------|
+| POLL_LIMIT      | 10            | # of seconds to wait before polling slurmctl again (client-side throttling)  |
+| LOGLEVEL        | info          | Log Level: debug, info, warn, error                                          |
+| CLI_TIMEOUT     | 10.           | # seconds before the exporter terminates command.                            |
+| TRACE_ROOT_PATH | "cwd"         | path to ./templates directory where html files are located                   |
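
For illustration only (not part of this commit), a `.env` file using these variables could look like the following; the values shown are examples, not defaults:

```text
POLL_LIMIT=10
LOGLEVEL=debug
CLI_TIMEOUT=30
TRACE_ROOT_PATH=/opt/prometheus-slurm-exporter
```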

### RPM/DEB Packages

You can download RPM or DEB versions from the [Releases](releases/) tab. The
packages are configured to use systemd to start and stop the service.
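
As an illustrative install sequence (the package file names below are placeholders for whatever you download from Releases):

```text
# RPM-based distros
$ sudo rpm -ivh prometheus-slurm-exporter-<version>.x86_64.rpm
# DEB-based distros
$ sudo dpkg -i prometheus-slurm-exporter_<version>_amd64.deb
# enable and start the unit
$ sudo systemctl enable --now prometheus-slurm-exporter.service
```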

Configuring the systemd service

`$ systemctl edit prometheus-slurm-exporter.service`

```text
### Editing /etc/systemd/system/prometheus-slurm-exporter.service.d/override.conf
### Anything between here and the comment below will become the new contents of the file
[Service]
Environment="PATH=/opt/slurm/bin"
Environment="POLL_INTERVAL=300"
Environment="CLI_TIMEOUT=60"
Environment="LOGLEVEL=debug"
### Lines below this comment will be discarded
### /usr/lib/systemd/system/prometheus-slurm-exporter.service
# [Unit]
# Description=Prometheus SLURM Exporter
#
# [Service]
# ExecStart=/usr/bin/prometheus-slurm-exporter
# Restart=always
# RestartSec=15
#
# [Install]
# WantedBy=multi-user.target
```
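
After saving the override, restart the service so the new environment takes effect (running `daemon-reload` first is harmless and is only strictly needed if you edited the unit files directly):

```text
$ sudo systemctl daemon-reload
$ sudo systemctl restart prometheus-slurm-exporter.service
```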

### RPM/DEB Packages

119 changes: 59 additions & 60 deletions exporter/jobs.go
@@ -55,13 +55,28 @@ type JobJsonFetcher struct {
 	errCounter prometheus.Counter
 }
 
-func (jjf *JobJsonFetcher) FetchMetrics() ([]JobMetric, error) {
+func (jjf *JobJsonFetcher) fetch() ([]JobMetric, error) {
 	data, err := jjf.scraper.FetchRawBytes()
 	if err != nil {
 		jjf.errCounter.Inc()
 		return nil, err
 	}
-	return jjf.cache.FetchOrThrottle(func() ([]JobMetric, error) { return parseJobMetrics(data) })
+	var squeue squeueResponse
+	err = json.Unmarshal(data, &squeue)
+	if err != nil {
+		slog.Error("Unmarshaling node metrics %q", err)
+		return nil, err
+	}
+	for _, j := range squeue.Jobs {
+		for _, resource := range j.JobResources.AllocNodes {
+			resource.Mem *= 1e9
+		}
+	}
+	return squeue.Jobs, nil
+}
+
+func (jjf *JobJsonFetcher) FetchMetrics() ([]JobMetric, error) {
+	return jjf.cache.FetchOrThrottle(jjf.fetch)
 }
 
 func (jjf *JobJsonFetcher) ScrapeDuration() time.Duration {
@@ -78,65 +93,11 @@ type JobCliFallbackFetcher struct {
 	errCounter prometheus.Counter
 }
 
-func (jcf *JobCliFallbackFetcher) FetchMetrics() ([]JobMetric, error) {
-	data, err := jcf.scraper.FetchRawBytes()
+func (jcf *JobCliFallbackFetcher) fetch() ([]JobMetric, error) {
+	squeue, err := jcf.scraper.FetchRawBytes()
 	if err != nil {
 		jcf.errCounter.Inc()
 		return nil, err
 	}
-	return jcf.cache.FetchOrThrottle(func() ([]JobMetric, error) { return parseCliFallback(data, jcf.errCounter) })
-}
-
-func (jcf *JobCliFallbackFetcher) ScrapeDuration() time.Duration {
-	return jcf.scraper.Duration()
-}
-
-func (jcf *JobCliFallbackFetcher) ScrapeError() prometheus.Counter {
-	return jcf.errCounter
-}
-
-func totalAllocMem(resource *JobResource) float64 {
-	var allocMem float64
-	for _, node := range resource.AllocNodes {
-		allocMem += node.Mem
-	}
-	return allocMem
-}
-
-func parseJobMetrics(jsonJobList []byte) ([]JobMetric, error) {
-	var squeue squeueResponse
-	err := json.Unmarshal(jsonJobList, &squeue)
-	if err != nil {
-		slog.Error("Unmarshaling node metrics %q", err)
-		return nil, err
-	}
-	for _, j := range squeue.Jobs {
-		for _, resource := range j.JobResources.AllocNodes {
-			resource.Mem *= 1e9
-		}
-	}
-	return squeue.Jobs, nil
-}
-
-type NAbleTime struct{ time.Time }
-
-// report beginning of time in the case of N/A
-func (nat *NAbleTime) UnmarshalJSON(data []byte) error {
-	var tString string
-	if err := json.Unmarshal(data, &tString); err != nil {
-		return err
-	}
-	nullSet := map[string]struct{}{"N/A": {}, "NONE": {}}
-	if _, ok := nullSet[tString]; ok {
-		nat.Time = time.Time{}
-		return nil
-	}
-	t, err := time.Parse("2006-01-02T15:04:05", tString)
-	nat.Time = t
-	return err
-}
-
-func parseCliFallback(squeue []byte, errorCounter prometheus.Counter) ([]JobMetric, error) {
 	jobMetrics := make([]JobMetric, 0)
 	// clean input
 	squeue = bytes.TrimSpace(squeue)
@@ -159,13 +120,13 @@ func parseCliFallback(squeue []byte, errorCounter prometheus.Counter) ([]JobMetric, error) {
 		}
 		if err := json.Unmarshal(line, &metric); err != nil {
 			slog.Error(fmt.Sprintf("squeue fallback parse error: failed on line %d `%s`", i, line))
-			errorCounter.Inc()
+			jcf.errCounter.Inc()
 			continue
 		}
 		mem, err := MemToFloat(metric.Mem)
 		if err != nil {
 			slog.Error(fmt.Sprintf("squeue fallback parse error: failed on line %d `%s` with err `%q`", i, line, err))
-			errorCounter.Inc()
+			jcf.errCounter.Inc()
 			continue
 		}
 		openapiJobMetric := JobMetric{
@@ -185,6 +146,44 @@ func parseCliFallback(squeue []byte, errorCounter prometheus.Counter) ([]JobMetric, error) {
 	return jobMetrics, nil
 }
 
+func (jcf *JobCliFallbackFetcher) FetchMetrics() ([]JobMetric, error) {
+	return jcf.cache.FetchOrThrottle(jcf.fetch)
+}
+
+func (jcf *JobCliFallbackFetcher) ScrapeDuration() time.Duration {
+	return jcf.scraper.Duration()
+}
+
+func (jcf *JobCliFallbackFetcher) ScrapeError() prometheus.Counter {
+	return jcf.errCounter
+}
+
+func totalAllocMem(resource *JobResource) float64 {
+	var allocMem float64
+	for _, node := range resource.AllocNodes {
+		allocMem += node.Mem
+	}
+	return allocMem
+}
+
+type NAbleTime struct{ time.Time }
+
+// report beginning of time in the case of N/A
+func (nat *NAbleTime) UnmarshalJSON(data []byte) error {
+	var tString string
+	if err := json.Unmarshal(data, &tString); err != nil {
+		return err
+	}
+	nullSet := map[string]struct{}{"N/A": {}, "NONE": {}}
+	if _, ok := nullSet[tString]; ok {
+		nat.Time = time.Time{}
+		return nil
+	}
+	t, err := time.Parse("2006-01-02T15:04:05", tString)
+	nat.Time = t
+	return err
+}
+
 type UserJobMetric struct {
 	stateJobCount map[string]float64
 	totalJobCount float64
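
Both fetchers now route their work through `cache.FetchOrThrottle(jjf.fetch)` / `cache.FetchOrThrottle(jcf.fetch)`, so a throttled scrape skips the raw `squeue` call entirely instead of only skipping the parsing step. The cache type itself is not part of this diff; the sketch below is only an assumption about the shape of such a throttled cache (the type name, fields, and generic signature are illustrative, not the project's actual implementation):

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// throttledCache memoizes the result of an expensive fetch for a fixed
// interval, so repeated Prometheus scrapes don't re-run squeue/sinfo.
// Sketch only: the exporter's real cache type and fields may differ.
type throttledCache[T any] struct {
	mu        sync.Mutex
	limit     time.Duration // e.g. POLL_LIMIT seconds
	lastFetch time.Time
	cached    []T
}

// FetchOrThrottle returns the cached slice while it is still fresh,
// otherwise it invokes fetch and stores the result.
func (c *throttledCache[T]) FetchOrThrottle(fetch func() ([]T, error)) ([]T, error) {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.cached != nil && time.Since(c.lastFetch) < c.limit {
		return c.cached, nil
	}
	data, err := fetch()
	if err != nil {
		return nil, err
	}
	c.cached = data
	c.lastFetch = time.Now()
	return data, nil
}

func main() {
	cache := &throttledCache[int]{limit: 10 * time.Second}
	calls := 0
	fetch := func() ([]int, error) { calls++; return []int{1, 2, 3}, nil }

	cache.FetchOrThrottle(fetch)
	cache.FetchOrThrottle(fetch)                   // served from cache; fetch not called again
	fmt.Println("fetch invoked", calls, "time(s)") // fetch invoked 1 time(s)
}
```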