From 1b2ba2027fe7514795529e801f2ab865be3975c7 Mon Sep 17 00:00:00 2001
From: caetano melone
Date: Fri, 27 Sep 2024 14:08:28 -0300
Subject: [PATCH] Add retry limit check on the collection side

If a job is over the defined retry limit, we won't mark it as needing
to be retried. However, because retries are handled at the pipeline
level, the pipeline will still be retried if another job in it was
OOMed but is not over the retry limit. In that case the over-limit job
is re-run as part of the retried pipeline, although its memory request
is not re-bumped, since the same limit check is performed on the
prediction side.
---
 gantry/clients/prometheus/job.py |  4 ++--
 gantry/routes/collection.py      | 13 +++++++++++--
 gantry/tests/defs/collection.py  |  2 +-
 gantry/tests/test_collection.py  | 19 +++++++++++++++++++
 4 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/gantry/clients/prometheus/job.py b/gantry/clients/prometheus/job.py
index f7cbf0f..54e956f 100644
--- a/gantry/clients/prometheus/job.py
+++ b/gantry/clients/prometheus/job.py
@@ -53,8 +53,8 @@ async def get_annotations(self, gl_id: int, time: float) -> dict:
                     "annotation_metrics_spack_job_spec_compiler_version"
                 ],
                 "stack": annotations["annotation_metrics_spack_ci_stack_name"],
-                "retry_count": annotations.get(
-                    "annotation_metrics_spack_job_retry_count", 0
+                "retry_count": int(
+                    annotations.get("annotation_metrics_spack_job_retry_count", 0)
                 ),
             }
         except KeyError as e:
diff --git a/gantry/routes/collection.py b/gantry/routes/collection.py
index d88e8eb..c817e9f 100644
--- a/gantry/routes/collection.py
+++ b/gantry/routes/collection.py
@@ -9,6 +9,7 @@
 from gantry.clients.prometheus import PrometheusClient
 from gantry.clients.prometheus.util import IncompleteData
 from gantry.models import Job
+from gantry.routes.prediction.prediction import RETRY_COUNT_LIMIT
 
 MB_IN_BYTES = 1_000_000
 BUILD_STAGE_REGEX = r"^stage-\d+$"
@@ -58,7 +59,11 @@ async def handle_pipeline(
 
     for job in failed_jobs:
         # insert every potentially oomed job
+        # if a job has been retried RETRY_COUNT_LIMIT times, oomed will be False
+        # start_pipeline will be called if any of the failed_jobs fit the criteria
+        # the same check is performed on the prediction side, and won't re-bump memory
         oomed = await fetch_job(job, db_conn, gitlab, prometheus, from_pipeline=True)
+        # fetch_job can return None or (job_id: int, oomed: bool)
         if oomed and oomed[1]:
             retry_pipeline = True
 
@@ -66,7 +71,7 @@ async def handle_pipeline(
     # once all jobs are collected/discarded, retry the pipeline if needed
     if retry_pipeline:
         await gitlab.start_pipeline(ref)
-        return True
+        return retry_pipeline
 
 
 async def fetch_job(
@@ -135,8 +140,12 @@
     annotations = await prometheus.job.get_annotations(job.gl_id, job.midpoint)
     # check if failed job was OOM killed,
    # return early if it wasn't because we don't care about it anymore
+    # do not retry if the job has already been retried RETRY_COUNT_LIMIT times
     if job.status == "failed":
-        if await prometheus.job.is_oom(annotations["pod"], job.start, job.end):
+        if (
+            await prometheus.job.is_oom(annotations["pod"], job.start, job.end)
+            and annotations["retry_count"] < RETRY_COUNT_LIMIT
+        ):
             oomed = True
         else:
             return
diff --git a/gantry/tests/defs/collection.py b/gantry/tests/defs/collection.py
index bc13abb..dce51fd 100644
--- a/gantry/tests/defs/collection.py
+++ b/gantry/tests/defs/collection.py
@@ -36,7 +36,7 @@
 
 # these were obtained by executing the respective queries to Prometheus and capturing the JSON output
 # or the raw output of PrometheusClient._query
-VALID_ANNOTATIONS = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_pod_annotations', 'annotation_gitlab_ci_job_id': '9892514', 'annotation_metrics_spack_ci_stack_name': 'e4s', 'annotation_metrics_spack_job_spec_arch': 'linux', 'annotation_metrics_spack_job_spec_compiler_name': 'gcc', 'annotation_metrics_spack_job_spec_compiler_version': '11.4.0', 'annotation_metrics_spack_job_spec_pkg_name': 'gmsh', 'annotation_metrics_spack_job_spec_pkg_version': '4.8.4', 'annotation_metrics_spack_job_spec_variants': '+alglib~cairo+cgns+compression~eigen~external+fltk+gmp~hdf5~ipo+med+metis+mmg+mpi+netgen+oce~opencascade~openmp~petsc~privateapi+shared~slepc+tetgen+voropp build_system=cmake build_type=Release generator=make', 'container': 'kube-state-metrics', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0'}, 'value': [1706117733, '1']}]}}
+VALID_ANNOTATIONS = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_pod_annotations', 'annotation_gitlab_ci_job_id': '9892514', 'annotation_metrics_spack_ci_stack_name': 'e4s', 'annotation_metrics_spack_job_spec_arch': 'linux', 'annotation_metrics_spack_job_spec_compiler_name': 'gcc', 'annotation_metrics_spack_job_spec_compiler_version': '11.4.0', 'annotation_metrics_spack_job_retry_count': '0', 'annotation_metrics_spack_job_spec_pkg_name': 'gmsh', 'annotation_metrics_spack_job_spec_pkg_version': '4.8.4', 'annotation_metrics_spack_job_spec_variants': '+alglib~cairo+cgns+compression~eigen~external+fltk+gmp~hdf5~ipo+med+metis+mmg+mpi+netgen+oce~opencascade~openmp~petsc~privateapi+shared~slepc+tetgen+voropp build_system=cmake build_type=Release generator=make', 'container': 'kube-state-metrics', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0'}, 'value': [1706117733, '1']}]}}
 VALID_RESOURCE_REQUESTS = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_pod_container_resource_requests', 'container': 'build', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'resource': 'cpu', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0', 'unit': 'core'}, 'value': [1706117733, '0.75']}, {'metric': {'__name__': 'kube_pod_container_resource_requests', 'container': 'build', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'resource': 'memory', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0', 'unit': 'byte'}, 'value': [1706117733, '2000000000']}]}}
 VALID_RESOURCE_LIMITS = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_pod_container_resource_limits', 'container': 'build', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'resource': 'memory', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0', 'unit': 'byte'}, 'value': [1706117733, '48000000000']}]}}
 VALID_MEMORY_USAGE = {'status': 'success', 'data': {'resultType': 'matrix', 'result': [{'metric': {'__name__': 'container_memory_working_set_bytes', 'container': 'build', 'endpoint': 'https-metrics', 'id': '/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-podd7aa13e0_998c_4f21_b1d6_62781f4980b0.slice/cri-containerd-48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1.scope', 'image': 'ghcr.io/spack/ubuntu20.04-runner-amd64-gcc-11.4:2023.08.01', 'instance': '192.168.86.107:10250', 'job': 'kubelet', 'metrics_path': '/metrics/cadvisor', 'name': '48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'service': 'kube-prometheus-stack-kubelet'}, 'values': [[1706117115, '2785280'], [1706117116, '2785280'], [1706117117, '2785280'], [1706117118, '2785280'], [1706117119, '2785280'], [1706117120, '2785280'], [1706117121, '2785280'], [1706117122, '2785280'], [1706117123, '2785280'], [1706117124, '2785280'], [1706117125, '2785280'], [1706117126, '2785280'], [1706117127, '2785280'], [1706117128, '2785280'], [1706117129, '2785280'], [1706117130, '2785280'], [1706118416, '594620416'], [1706118417, '594620416'], [1706118418, '594620416'], [1706118419, '594620416'], [1706118420, '594620416']]}]}}
diff --git a/gantry/tests/test_collection.py b/gantry/tests/test_collection.py
index c87f6bf..01fad5b 100644
--- a/gantry/tests/test_collection.py
+++ b/gantry/tests/test_collection.py
@@ -1,8 +1,11 @@
+import copy
+
 import pytest
 
 from gantry.clients.gitlab import GitlabClient
 from gantry.clients.prometheus import PrometheusClient
 from gantry.routes.collection import fetch_job, fetch_node, handle_pipeline
+from gantry.routes.prediction.prediction import RETRY_COUNT_LIMIT
 from gantry.tests.defs import collection as defs
 
 # mapping of prometheus request shortcuts
@@ -186,6 +189,22 @@ async def test_handle_pipeline(db_conn, gitlab, prometheus):
         await handle_pipeline(defs.FAILED_PIPELINE, db_conn, gitlab, prometheus) is None
     )
 
+    # job oom killed, but over retry limit
+    p_list = list(p.values())
+    # modify the annotations
+    # deepcopy so the original annotations are not modified
+    p_list[0] = copy.deepcopy(p_list[0])
+    p_list[0]["data"]["result"][0]["metric"] |= {
+        "annotation_metrics_spack_job_retry_count": str(RETRY_COUNT_LIMIT)
+    }
+    p_list.insert(1, defs.OOM_KILLED)
+    prometheus._query.side_effect = p_list
+    # handle_pipeline should not allow a retry because the retry count is over the limit
+    # however, if another job was oomed but not over the limit, it should be retried
+    assert (
+        await handle_pipeline(defs.FAILED_PIPELINE, db_conn, gitlab, prometheus) is None
+    )
+
     # pipeline failed, one job was oomed the other was not
     p_list = list(p.values())
     # after verifying job was not oomed, go onto the next job to insert annotations
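
Note (illustration, not part of the patch): a minimal sketch of the pipeline-level behavior described in the commit message. It uses plain data and synchronous stand-ins for gantry's fetch_job/handle_pipeline and their GitLab/Prometheus/database dependencies; only the under-limit OOM check and the (job_id, oomed) return shape are taken from the patch. The RETRY_COUNT_LIMIT value, the FailedJob fields, and the example data are hypothetical.

# Illustrative sketch only; simplified, synchronous stand-ins for the collection routines.
from dataclasses import dataclass

RETRY_COUNT_LIMIT = 2  # assumed value; the real constant comes from gantry.routes.prediction.prediction


@dataclass
class FailedJob:
    gl_id: int          # GitLab job id
    oom_killed: bool    # whether Prometheus reported an OOM kill
    retry_count: int    # value of the retry-count annotation


def fetch_job(job: FailedJob) -> tuple[int, bool] | None:
    # collection-side check from the patch: a failed job only counts as OOMed
    # (and therefore as a reason to retry) while it is under the retry limit
    if not job.oom_killed or job.retry_count >= RETRY_COUNT_LIMIT:
        return None
    return (job.gl_id, True)


def handle_pipeline(failed_jobs: list[FailedJob]) -> bool | None:
    retry_pipeline = False
    for job in failed_jobs:
        oomed = fetch_job(job)
        if oomed and oomed[1]:
            retry_pipeline = True
    if retry_pipeline:
        # this is where the real code awaits gitlab.start_pipeline(ref); the retry
        # re-runs the pipeline's failed jobs, including any over-limit one, but the
        # over-limit job's memory request is not re-bumped on the prediction side
        return retry_pipeline
    return None


# one OOMed job is over the limit, another is not: the pipeline is still retried
jobs = [FailedJob(gl_id=1, oom_killed=True, retry_count=RETRY_COUNT_LIMIT),
        FailedJob(gl_id=2, oom_killed=True, retry_count=0)]
assert handle_pipeline(jobs) is True

# if every OOMed job is over the limit, nothing triggers a retry
assert handle_pipeline([FailedJob(gl_id=3, oom_killed=True, retry_count=RETRY_COUNT_LIMIT)]) is None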