job_price.py
import os
import time
from datetime import datetime

from prometheus.client import PrometheusClient
from k8s.node import Node
from k8s.cluster import Cluster
from k8s.pricing import PriceIndex


class Job:
    def __init__(self, gitlab_job_id, pod, node, mem_request, cpu_request, package=None):
        self.gitlab_id = gitlab_job_id
        self.pod = pod
        self.node = node
        self.package = package
        self.mem_request = mem_request
        self.cpu_request = cpu_request
def get_jobs(prometheus_client, start, end):
    client = prometheus_client
    filters = 'namespace="pipeline", container="build"'

    # Run this query first: it drives the loop below, and running it after the
    # others could leave it holding a pod that the earlier queries never saw.
    cpu_util_query = 'sum(rate(container_cpu_usage_seconds_total{' + filters + '}[90s])) by (pod)'
    resp = client.query_range(cpu_util_query, start, end)
    cpu_utilization = resp.json()["data"]["result"]

    pod_labels_query = 'kube_pod_labels{namespace="pipeline"}'
    resp = client.query_range(pod_labels_query, start, end)
    pod_labels = resp.json()["data"]["result"]
    pod_labels_dict = {x["metric"]["pod"]: x["metric"] for x in pod_labels}

    cpu_request_query = 'kube_pod_container_resource_requests{' + filters + ', resource="cpu"}'
    resp = client.query_range(cpu_request_query, start, end)
    cpu_requests = resp.json()["data"]["result"]
    cpu_requests_dict = {x["metric"]["pod"]: x for x in cpu_requests}

    mem_request_query = 'kube_pod_container_resource_requests{' + filters + ', resource="memory"}'
    resp = client.query_range(mem_request_query, start, end)
    mem_requests = resp.json()["data"]["result"]
    mem_requests_dict = {x["metric"]["pod"]: x for x in mem_requests}

    mem_util_query = 'sum(container_memory_working_set_bytes{' + filters + '}) by (pod)'
    resp = client.query_range(mem_util_query, start, end)
    mem_utilization = resp.json()["data"]["result"]
    memory_utilization_dict = {x["metric"]["pod"]: x for x in mem_utilization}

    jobs = []
    for cpu_utilization_data in cpu_utilization:
        pod = cpu_utilization_data["metric"]["pod"]
        # disregard builds that haven't yet started running on a node
        try:
            node = cpu_requests_dict[pod]["metric"]["node"]
            gitlab_job = pod_labels_dict[pod]["label_gitlab_ci_job_id"]
            cpu_request = cpu_requests_dict[pod]["values"][0][1]
            mem_request = mem_requests_dict[pod]["values"][0][1]
            # first/last sample timestamps of this pod's CPU utilization series
            first_seen = cpu_utilization_data["values"][0][0]
            last_seen = cpu_utilization_data["values"][-1][0]
            package = None
            if "label_metrics_spack_job_spec_pkg_name" in pod_labels_dict[pod]:
                package = pod_labels_dict[pod]["label_metrics_spack_job_spec_pkg_name"]
            jobs.append(Job(gitlab_job, pod, node, mem_request, cpu_request, package=package))
        except KeyError:
            pass
    return jobs
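
# For reference: client.query_range is assumed to wrap Prometheus's standard
# /api/v1/query_range HTTP API, whose JSON response get_jobs() indexes into
# above. The snippet below is an illustrative sketch only -- the pod name,
# node name, timestamps, and values are made up, not data from this pipeline.
example_series = {
    "metric": {
        "pod": "runner-example-project-1-concurrent-0",  # hypothetical pod
        "node": "ip-10-0-0-1.ec2.internal",              # hypothetical node
        "resource": "cpu",
    },
    # query_range returns [unix_timestamp, value-as-string] sample pairs
    "values": [
        [1700000000, "4"],
        [1700000090, "4"],
    ],
}

# This is the shape behind lookups such as cpu_requests_dict[pod]["values"][0][1]:
example_cpu_request = example_series["values"][0][1]  # -> "4"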
def main():
    prometheus_url = os.environ.get("CIDA_PROMETHEUS_URL")
    if prometheus_url is None:
        print("Error: CIDA_PROMETHEUS_URL is undefined")
        exit(1)

    prometheus_cookie = os.environ.get("CIDA_PROMETHEUS_COOKIE")
    if prometheus_cookie is None:
        print("Error: CIDA_PROMETHEUS_COOKIE is undefined")
        exit(1)

    client = PrometheusClient(prometheus_url, prometheus_cookie)
    cluster = Cluster(client)

    now = time.time()
    start = now - (60 * 60 * 6)  # grab the last 6 hours' worth of data
    end = now

    price_index = PriceIndex(client, "us-east-1", start, end, capacity_type="spot")
    nodes = cluster.get_nodes(start, end)
    jobs = get_jobs(client, start, end)

    for job in jobs:
        print(f"{job.gitlab_id}: {job.pod} :: {job.node}")


if __name__ == "__main__":
    main()
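
As written, main() only prints the jobs it found; price_index and nodes are built but not yet combined with them. As a hedged illustration of where that could go (the real PriceIndex and Node interfaces are not shown in this file, so every name and number below is a stand-in rather than their actual API), a request-based cost attribution might look like this:

def estimate_job_cost(cpu_request, node_cpu_capacity, node_hourly_price, hours):
    # Attribute a share of the node's hourly price to the job in proportion to
    # its CPU request. All arguments are plain numbers supplied by the caller;
    # nothing here calls the k8s.* or prometheus.* modules used above.
    return (float(cpu_request) / node_cpu_capacity) * node_hourly_price * hours


# e.g. a job requesting 4 of 16 CPUs on a $0.50/hour spot node for 1.5 hours:
# estimate_job_cost("4", 16, 0.50, 1.5) -> 0.1875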