Skip to content

Commit

Permalink
total pending reason metric
Browse files Browse the repository at this point in the history
  • Loading branch information
abhinavDhulipala committed Oct 24, 2024
1 parent 3d726f4 commit c16c7ab
Show file tree
Hide file tree
Showing 7 changed files with 382 additions and 22 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ coverage.html
coverage.out
.DS_Store
dist
**/__debug_bin*
8 changes: 4 additions & 4 deletions exporter/fixtures/squeue_fallback.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{"a": "account1", "id": 26515966, "end_time": "2023-09-21T00:21:42", "state": "RUNNING", "p": "hw-h", "cpu": 1, "mem": "128G", "array_id": "N/A"}
{"a": "account1", "id": 50580016, "end_time": "2023-09-21T14:31:11", "state": "RUNNING", "p": "hw-l", "cpu": 1, "mem": "62.50G", "array_id": "N/A"}
{"a": "account1", "id": 51447051, "end_time": "N/A", "state": "PENDING", "p": "hw-h", "cpu": 1, "mem": "40000M", "array_id": "N/A"}
{"a": "account1", "id": 18804, "end_time": "NONE", "state": "PENDING", "p": "magma", "cpu": 24, "mem": "118G", "array_id": "N/A"}
{"a": "account1", "id": 26515966, "end_time": "2023-09-21T00:21:42", "state": "RUNNING", "p": "hw-h", "cpu": 1, "mem": "128G", "array_id": "N/A", "r": "cs10"}
{"a": "account1", "id": 50580016, "end_time": "2023-09-21T14:31:11", "state": "RUNNING", "p": "hw-l", "cpu": 1, "mem": "62.50G", "array_id": "N/A", "r": "cs10"}
{"a": "account1", "id": 51447051, "end_time": "N/A", "state": "PENDING", "p": "hw-h", "cpu": 1, "mem": "40000M", "array_id": "N/A", "r": "(Dependency)"}
{"a": "account1", "id": 18804, "end_time": "NONE", "state": "PENDING", "p": "magma", "cpu": 24, "mem": "118G", "array_id": "N/A", "r": "(Priority)"}
# test counter inc with faulty inputs
{"a": "account1", "id": 18805, "end_time": "NONE", "state": "PENDING", "p": "magma", "cpu": xx, "mem": "118G", "array_id": "N/A"}
{"a": "account1", "id": 18806, "end_time": "NONE", "state": "PENDING", "p": "magma", "cpu": xx, "mem": "118G", "array_id": "N/A"}
289 changes: 288 additions & 1 deletion exporter/fixtures/squeue_out.json
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,293 @@
"user_name": "bkd",
"wckey": "",
"current_working_directory": "/somedir/on/nfs"
}
},
{
"account": "account1",
"accrue_time": 0,
"admin_comment": "",
"allocating_node": "bkd",
"array_job_id": {
"set": true,
"infinite": false,
"number": 58948420
},
"array_task_id": {
"set": false,
"infinite": false,
"number": 0
},
"array_max_tasks": {
"set": true,
"infinite": false,
"number": 0
},
"array_task_string": "1-10",
"association_id": 4001,
"batch_features": "",
"batch_flag": true,
"batch_host": "",
"flags": [
"EXACT_TASK_COUNT_REQUESTED",
"EXACT_CPU_COUNT_REQUESTED",
"EXACT_MEMORY_REQUESTED",
"USING_DEFAULT_QOS",
"USING_DEFAULT_WCKEY",
"DEPENDENT"
],
"burst_buffer": "",
"burst_buffer_state": "",
"cluster": "rivos",
"cluster_features": "",
"command": "",
"comment": "",
"container": "",
"container_id": "",
"contiguous": false,
"core_spec": 0,
"thread_spec": 32766,
"cores_per_socket": {
"set": false,
"infinite": false,
"number": 0
},
"billable_tres": {
"set": false,
"infinite": false,
"number": 0.0
},
"cpus_per_task": {
"set": true,
"infinite": false,
"number": 1
},
"cpu_frequency_minimum": {
"set": false,
"infinite": false,
"number": 0
},
"cpu_frequency_maximum": {
"set": false,
"infinite": false,
"number": 0
},
"cpu_frequency_governor": {
"set": false,
"infinite": false,
"number": 0
},
"cpus_per_tres": "",
"cron": "",
"deadline": 1729802248,
"delay_boot": {
"set": true,
"infinite": false,
"number": 0
},
"dependency": "afterok:58948419(unfulfilled)",
"derived_exit_code": {
"set": true,
"infinite": false,
"number": 0
},
"eligible_time": 0,
"end_time": 0,
"excluded_nodes": "",
"exit_code": {
"set": true,
"infinite": false,
"number": 0
},
"extra": "",
"failed_node": "",
"features": "",
"federation_origin": "",
"federation_siblings_active": "",
"federation_siblings_viable": "",
"gres_detail": [
],
"group_id": 1977700000,
"group_name": "rvs",
"het_job_id": {
"set": true,
"infinite": false,
"number": 0
},
"het_job_id_set": "",
"het_job_offset": {
"set": true,
"infinite": false,
"number": 0
},
"job_id": 58948420,
"job_resources": {
},
"job_size_str": [
],
"job_state": "PENDING",
"last_sched_evaluation": 1729715848,
"licenses": "rtl_single_core@r,",
"mail_type": [
],
"mail_user": "bkd",
"max_cpus": {
"set": true,
"infinite": false,
"number": 0
},
"max_nodes": {
"set": true,
"infinite": false,
"number": 0
},
"mcs_label": "",
"memory_per_tres": "",
"name": "some job name",
"network": "",
"nodes": "",
"nice": 0,
"tasks_per_core": {
"set": false,
"infinite": true,
"number": 0
},
"tasks_per_tres": {
"set": true,
"infinite": false,
"number": 0
},
"tasks_per_node": {
"set": true,
"infinite": false,
"number": 0
},
"tasks_per_socket": {
"set": false,
"infinite": true,
"number": 0
},
"tasks_per_board": {
"set": true,
"infinite": false,
"number": 0
},
"cpus": {
"set": true,
"infinite": false,
"number": 1
},
"node_count": {
"set": true,
"infinite": false,
"number": 1
},
"tasks": {
"set": true,
"infinite": false,
"number": 1
},
"partition": "hw-m",
"prefer": "",
"memory_per_cpu": {
"set": false,
"infinite": false,
"number": 0
},
"memory_per_node": {
"set": true,
"infinite": false,
"number": 131072
},
"minimum_cpus_per_node": {
"set": true,
"infinite": false,
"number": 1
},
"minimum_tmp_disk_per_node": {
"set": true,
"infinite": false,
"number": 0
},
"power": {
"flags": [
]
},
"preempt_time": 0,
"preemptable_time": 0,
"pre_sus_time": 0,
"hold": false,
"priority": {
"set": true,
"infinite": false,
"number": 1368
},
"profile": [
"NOT_SET"
],
"qos": "normal",
"reboot": false,
"required_nodes": "",
"minimum_switches": 0,
"requeue": true,
"resize_time": 0,
"restart_cnt": 0,
"resv_name": "",
"scheduled_nodes": "",
"selinux_context": "",
"shared": [
"oversubscribe"
],
"exclusive": [
"false"
],
"oversubscribe": true,
"show_flags": [
"DETAIL",
"LOCAL"
],
"sockets_per_board": 0,
"sockets_per_node": {
"set": false,
"infinite": false,
"number": 0
},
"start_time": 0,
"state_description": "",
"state_reason": "Dependency",
"standard_error": "\/path\/to\/some\/dir.\/logs\/slurm-58948420.out",
"standard_input": "\/dev\/null",
"standard_output": "\/path\/to\/some\/dir\/.\/logs\/slurm-58948420.out",
"submit_time": 1729715848,
"suspend_time": 0,
"system_comment": "",
"time_limit": {
"set": true,
"infinite": false,
"number": 1439
},
"time_minimum": {
"set": true,
"infinite": false,
"number": 0
},
"threads_per_core": {
"set": false,
"infinite": false,
"number": 0
},
"tres_bind": "",
"tres_freq": "",
"tres_per_job": "",
"tres_per_node": "",
"tres_per_socket": "",
"tres_per_task": "",
"tres_req_str": "cpu=1,mem=128G,node=1,billing=1",
"tres_alloc_str": "",
"user_id": 1234,
"user_name": "bkd",
"maximum_switch_wait_time": 0,
"wckey": "",
"current_working_directory": "\/path\/to\/some\/dir"
}
]
}
Loading

0 comments on commit c16c7ab

Please sign in to comment.