Add retry limit check on the collection side
If a job is over the defined retry limit, we won't mark it as needing to be retried. However, because we handle this at the pipeline level, if another job in the pipeline was OOMed but is not over the retry limit, the pipeline will still be retried, leading to some idiosyncrasies.
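
To illustrate the pipeline-level behavior described above, here is a minimal, self-contained sketch. The function name, the job dictionaries, and the RETRY_COUNT_LIMIT value are simplified assumptions for the example, not the actual gantry code.

# Illustrative sketch only; the real logic lives in gantry/routes/collection.py.
RETRY_COUNT_LIMIT = 2  # assumed value for this example

def should_retry_pipeline(failed_jobs):
    """Retry the pipeline if any failed job was OOM killed and is under the retry limit."""
    retry_pipeline = False
    for job in failed_jobs:
        # a job at or over the retry limit never counts as "oomed"
        if job["oom_killed"] and job["retry_count"] < RETRY_COUNT_LIMIT:
            retry_pipeline = True
    return retry_pipeline

# One OOMed job is over the limit, another is not: the pipeline is still retried,
# which is the idiosyncrasy mentioned above.
jobs = [
    {"oom_killed": True, "retry_count": RETRY_COUNT_LIMIT},  # skipped
    {"oom_killed": True, "retry_count": 0},                  # triggers the retry
]
print(should_retry_pipeline(jobs))  # True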
cmelone committed Oct 29, 2024
1 parent 888db7a commit 1b2ba20
Showing 4 changed files with 33 additions and 5 deletions.
gantry/clients/prometheus/job.py (4 changes: 2 additions & 2 deletions)
@@ -53,8 +53,8 @@ async def get_annotations(self, gl_id: int, time: float) -> dict:
                     "annotation_metrics_spack_job_spec_compiler_version"
                 ],
                 "stack": annotations["annotation_metrics_spack_ci_stack_name"],
-                "retry_count": annotations.get(
-                    "annotation_metrics_spack_job_retry_count", 0
+                "retry_count": int(
+                    annotations.get("annotation_metrics_spack_job_retry_count", 0)
                 ),
             }
         except KeyError as e:
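
For context, Prometheus exposes annotation values as label strings (the kube_pod_annotations fixture further down shows the retry count as the string '0'), so the int() wrapper above is what makes the later comparison against RETRY_COUNT_LIMIT numeric. A tiny hand-written example, not code from the repository:

annotations = {"annotation_metrics_spack_job_retry_count": "1"}  # Prometheus label values are strings

# without int(), "1" < 2 would raise TypeError in Python 3
retry_count = int(annotations.get("annotation_metrics_spack_job_retry_count", 0))
print(retry_count < 2)  # True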
gantry/routes/collection.py (13 changes: 11 additions & 2 deletions)
@@ -9,6 +9,7 @@
 from gantry.clients.prometheus import PrometheusClient
 from gantry.clients.prometheus.util import IncompleteData
 from gantry.models import Job
+from gantry.routes.prediction.prediction import RETRY_COUNT_LIMIT

 MB_IN_BYTES = 1_000_000
 BUILD_STAGE_REGEX = r"^stage-\d+$"
@@ -58,15 +59,19 @@ async def handle_pipeline(

     for job in failed_jobs:
         # insert every potentially oomed job
+        # if a job has been retried RETRY_COUNT_LIMIT times, oomed will be False
+        # start_pipeline will be called if any of the failed_jobs fit the criteria
+        # the same check is performed on the prediction side, and won't re-bump memory
         oomed = await fetch_job(job, db_conn, gitlab, prometheus, from_pipeline=True)

         # fetch_job can return None or (job_id: int, oomed: bool)
         if oomed and oomed[1]:
             retry_pipeline = True

     # once all jobs are collected/discarded, retry the pipeline if needed
     if retry_pipeline:
         await gitlab.start_pipeline(ref)
-        return True
+    return retry_pipeline


@@ -135,8 +140,12 @@ async def fetch_job(
     annotations = await prometheus.job.get_annotations(job.gl_id, job.midpoint)
     # check if failed job was OOM killed,
     # return early if it wasn't because we don't care about it anymore
+    # do not retry if the job has already been retried RETRY_COUNT_LIMIT times
     if job.status == "failed":
-        if await prometheus.job.is_oom(annotations["pod"], job.start, job.end):
+        if (
+            await prometheus.job.is_oom(annotations["pod"], job.start, job.end)
+            and annotations["retry_count"] < RETRY_COUNT_LIMIT
+        ):
             oomed = True
         else:
             return
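
A small sketch of how the caller consumes fetch_job's return value, following the comment in handle_pipeline above. The helper name is hypothetical and the real coroutine takes the database and client handles shown in the diff.

# fetch_job returns None (job discarded) or a (job_id, oomed) tuple
def job_triggers_retry(result):
    # truthiness covers the None case; result[1] is the oomed flag
    return bool(result and result[1])

print(job_triggers_retry(None))         # False: job was discarded
print(job_triggers_retry((42, False)))  # False: collected, but not OOM killed or over the retry limit
print(job_triggers_retry((42, True)))   # True: OOM killed and under the retry limit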
gantry/tests/defs/collection.py (2 changes: 1 addition & 1 deletion)
@@ -36,7 +36,7 @@

# these were obtained by executing the respective queries to Prometheus and capturing the JSON output
# or the raw output of PrometheusClient._query
VALID_ANNOTATIONS = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_pod_annotations', 'annotation_gitlab_ci_job_id': '9892514', 'annotation_metrics_spack_ci_stack_name': 'e4s', 'annotation_metrics_spack_job_spec_arch': 'linux', 'annotation_metrics_spack_job_spec_compiler_name': 'gcc', 'annotation_metrics_spack_job_spec_compiler_version': '11.4.0', 'annotation_metrics_spack_job_spec_pkg_name': 'gmsh', 'annotation_metrics_spack_job_spec_pkg_version': '4.8.4', 'annotation_metrics_spack_job_spec_variants': '+alglib~cairo+cgns+compression~eigen~external+fltk+gmp~hdf5~ipo+med+metis+mmg+mpi+netgen+oce~opencascade~openmp~petsc~privateapi+shared~slepc+tetgen+voropp build_system=cmake build_type=Release generator=make', 'container': 'kube-state-metrics', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0'}, 'value': [1706117733, '1']}]}}
VALID_ANNOTATIONS = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_pod_annotations', 'annotation_gitlab_ci_job_id': '9892514', 'annotation_metrics_spack_ci_stack_name': 'e4s', 'annotation_metrics_spack_job_spec_arch': 'linux', 'annotation_metrics_spack_job_spec_compiler_name': 'gcc', 'annotation_metrics_spack_job_spec_compiler_version': '11.4.0', 'annotation_metrics_spack_job_retry_count': '0', 'annotation_metrics_spack_job_spec_pkg_name': 'gmsh', 'annotation_metrics_spack_job_spec_pkg_version': '4.8.4', 'annotation_metrics_spack_job_spec_variants': '+alglib~cairo+cgns+compression~eigen~external+fltk+gmp~hdf5~ipo+med+metis+mmg+mpi+netgen+oce~opencascade~openmp~petsc~privateapi+shared~slepc+tetgen+voropp build_system=cmake build_type=Release generator=make', 'container': 'kube-state-metrics', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0'}, 'value': [1706117733, '1']}]}}
VALID_RESOURCE_REQUESTS = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_pod_container_resource_requests', 'container': 'build', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'resource': 'cpu', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0', 'unit': 'core'}, 'value': [1706117733, '0.75']}, {'metric': {'__name__': 'kube_pod_container_resource_requests', 'container': 'build', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'resource': 'memory', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0', 'unit': 'byte'}, 'value': [1706117733, '2000000000']}]}}
VALID_RESOURCE_LIMITS = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_pod_container_resource_limits', 'container': 'build', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'resource': 'memory', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0', 'unit': 'byte'}, 'value': [1706117733, '48000000000']}]}}
VALID_MEMORY_USAGE = {'status': 'success', 'data': {'resultType': 'matrix', 'result': [{'metric': {'__name__': 'container_memory_working_set_bytes', 'container': 'build', 'endpoint': 'https-metrics', 'id': '/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-podd7aa13e0_998c_4f21_b1d6_62781f4980b0.slice/cri-containerd-48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1.scope', 'image': 'ghcr.io/spack/ubuntu20.04-runner-amd64-gcc-11.4:2023.08.01', 'instance': '192.168.86.107:10250', 'job': 'kubelet', 'metrics_path': '/metrics/cadvisor', 'name': '48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'service': 'kube-prometheus-stack-kubelet'}, 'values': [[1706117115, '2785280'], [1706117116, '2785280'], [1706117117, '2785280'], [1706117118, '2785280'], [1706117119, '2785280'], [1706117120, '2785280'], [1706117121, '2785280'], [1706117122, '2785280'], [1706117123, '2785280'], [1706117124, '2785280'], [1706117125, '2785280'], [1706117126, '2785280'], [1706117127, '2785280'], [1706117128, '2785280'], [1706117129, '2785280'], [1706117130, '2785280'], [1706118416, '594620416'], [1706118417, '594620416'], [1706118418, '594620416'], [1706118419, '594620416'], [1706118420, '594620416']]}]}}
gantry/tests/test_collection.py (19 changes: 19 additions & 0 deletions)
@@ -1,8 +1,11 @@
+import copy
+
 import pytest

 from gantry.clients.gitlab import GitlabClient
 from gantry.clients.prometheus import PrometheusClient
 from gantry.routes.collection import fetch_job, fetch_node, handle_pipeline
+from gantry.routes.prediction.prediction import RETRY_COUNT_LIMIT
 from gantry.tests.defs import collection as defs

 # mapping of prometheus request shortcuts
@@ -186,6 +189,22 @@ async def test_handle_pipeline(db_conn, gitlab, prometheus):
         await handle_pipeline(defs.FAILED_PIPELINE, db_conn, gitlab, prometheus) is None
     )

+    # job oom killed, but over retry limit
+    p_list = list(p.values())
+    # modify the annotations
+    # deepcopy so the original annotations are not modified
+    p_list[0] = copy.deepcopy(p_list[0])
+    p_list[0]["data"]["result"][0]["metric"] |= {
+        "annotation_metrics_spack_job_retry_count": str(RETRY_COUNT_LIMIT)
+    }
+    p_list.insert(1, defs.OOM_KILLED)
+    prometheus._query.side_effect = p_list
+    # handle_pipeline should not allow a retry because the retry count is over the limit
+    # however, if another job was oomed but not over the limit, it should be retried
+    assert (
+        await handle_pipeline(defs.FAILED_PIPELINE, db_conn, gitlab, prometheus) is None
+    )
+
     # pipeline failed, one job was oomed the other was not
     p_list = list(p.values())
     # after verifying job was not oomed, go onto the next job to insert annotations
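
The fixture-patching pattern used in the new test case, shown in isolation. This is a sketch with a made-up fixture and an assumed limit; the real test patches the defs.VALID_ANNOTATIONS response and a mocked PrometheusClient._query.

import copy
from unittest import mock

RETRY_COUNT_LIMIT = 2  # assumed value for this example

fixture = {"data": {"result": [{"metric": {"annotation_metrics_spack_job_retry_count": "0"}}]}}

# deepcopy so other test cases still see the original retry count
patched = copy.deepcopy(fixture)
patched["data"]["result"][0]["metric"] |= {
    "annotation_metrics_spack_job_retry_count": str(RETRY_COUNT_LIMIT)
}

# side_effect with a list returns one canned response per call, in order
query = mock.AsyncMock(side_effect=[patched, fixture])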
