Add retry limit check on the collection side
If a job is over the defined retry limit, we won't mark it as needing to be retried. However, because we handle this at the pipeline level, if another job in the pipeline was OOMed but is not over the retry limit, the pipeline will still be retried, leading to some idiosyncrasies.
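
To illustrate the pipeline-level behavior described above, here is a minimal, self-contained sketch. The function name, the job dictionaries, and the RETRY_COUNT_LIMIT value are simplified assumptions for the example, not the actual gantry code.

# Illustrative sketch only; the real logic lives in gantry/routes/collection.py.
RETRY_COUNT_LIMIT = 2  # assumed value for this example

def should_retry_pipeline(failed_jobs):
    """Retry the pipeline if any failed job was OOM killed and is under the retry limit."""
    retry_pipeline = False
    for job in failed_jobs:
        # a job at or over the retry limit never counts as "oomed"
        if job["oom_killed"] and job["retry_count"] < RETRY_COUNT_LIMIT:
            retry_pipeline = True
    return retry_pipeline

# One OOMed job is over the limit, another is not: the pipeline is still retried,
# which is the idiosyncrasy mentioned above.
jobs = [
    {"oom_killed": True, "retry_count": RETRY_COUNT_LIMIT},  # skipped
    {"oom_killed": True, "retry_count": 0},                  # triggers the retry
]
print(should_retry_pipeline(jobs))  # True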
cmelone committed Oct 29, 2024
1 parent 888db7a commit 1b2ba20
Showing 4 changed files with 33 additions and 5 deletions.
gantry/clients/prometheus/job.py (4 changes: 2 additions & 2 deletions)
@@ -53,8 +53,8 @@ async def get_annotations(self, gl_id: int, time: float) -> dict:
                     "annotation_metrics_spack_job_spec_compiler_version"
                 ],
                 "stack": annotations["annotation_metrics_spack_ci_stack_name"],
-                "retry_count": annotations.get(
-                    "annotation_metrics_spack_job_retry_count", 0
+                "retry_count": int(
+                    annotations.get("annotation_metrics_spack_job_retry_count", 0)
                 ),
             }
         except KeyError as e:
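
For context, Prometheus exposes annotation values as label strings (the kube_pod_annotations fixture further down shows the retry count as the string '0'), so the int() wrapper above is what makes the later comparison against RETRY_COUNT_LIMIT numeric. A tiny hand-written example, not code from the repository:

annotations = {"annotation_metrics_spack_job_retry_count": "1"}  # Prometheus label values are strings

# without int(), "1" < 2 would raise TypeError in Python 3
retry_count = int(annotations.get("annotation_metrics_spack_job_retry_count", 0))
print(retry_count < 2)  # True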
gantry/routes/collection.py (13 changes: 11 additions & 2 deletions)
@@ -9,6 +9,7 @@
 from gantry.clients.prometheus import PrometheusClient
 from gantry.clients.prometheus.util import IncompleteData
 from gantry.models import Job
+from gantry.routes.prediction.prediction import RETRY_COUNT_LIMIT

 MB_IN_BYTES = 1_000_000
 BUILD_STAGE_REGEX = r"^stage-\d+$"
@@ -58,15 +59,19 @@ async def handle_pipeline(

     for job in failed_jobs:
         # insert every potentially oomed job
+        # if a job has been retried RETRY_COUNT_LIMIT times, oomed will be False
+        # start_pipeline will be called if any of the failed_jobs fit the criteria
+        # the same check is performed on the prediction side, and won't re-bump memory
         oomed = await fetch_job(job, db_conn, gitlab, prometheus, from_pipeline=True)

         # fetch_job can return None or (job_id: int, oomed: bool)
         if oomed and oomed[1]:
             retry_pipeline = True

     # once all jobs are collected/discarded, retry the pipeline if needed
     if retry_pipeline:
         await gitlab.start_pipeline(ref)
-        return True
+    return retry_pipeline


@@ -135,8 +140,12 @@ async def fetch_job(
     annotations = await prometheus.job.get_annotations(job.gl_id, job.midpoint)
     # check if failed job was OOM killed,
     # return early if it wasn't because we don't care about it anymore
+    # do not retry if the job has already been retried RETRY_COUNT_LIMIT times
     if job.status == "failed":
-        if await prometheus.job.is_oom(annotations["pod"], job.start, job.end):
+        if (
+            await prometheus.job.is_oom(annotations["pod"], job.start, job.end)
+            and annotations["retry_count"] < RETRY_COUNT_LIMIT
+        ):
             oomed = True
         else:
             return
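
A small sketch of how the caller consumes fetch_job's return value, following the comment in handle_pipeline above. The helper name is hypothetical and the real coroutine takes the database and client handles shown in the diff.

# fetch_job returns None (job discarded) or a (job_id, oomed) tuple
def job_triggers_retry(result):
    # truthiness covers the None case; result[1] is the oomed flag
    return bool(result and result[1])

print(job_triggers_retry(None))         # False: job was discarded
print(job_triggers_retry((42, False)))  # False: collected, but not OOM killed or over the retry limit
print(job_triggers_retry((42, True)))   # True: OOM killed and under the retry limit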
gantry/tests/defs/collection.py (2 changes: 1 addition & 1 deletion)
@@ -36,7 +36,7 @@

# these were obtained by executing the respective queries to Prometheus and capturing the JSON output
# or the raw output of PrometheusClient._query
VALID_ANNOTATIONS = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_pod_annotations', 'annotation_gitlab_ci_job_id': '9892514', 'annotation_metrics_spack_ci_stack_name': 'e4s', 'annotation_metrics_spack_job_spec_arch': 'linux', 'annotation_metrics_spack_job_spec_compiler_name': 'gcc', 'annotation_metrics_spack_job_spec_compiler_version': '11.4.0', 'annotation_metrics_spack_job_spec_pkg_name': 'gmsh', 'annotation_metrics_spack_job_spec_pkg_version': '4.8.4', 'annotation_metrics_spack_job_spec_variants': '+alglib~cairo+cgns+compression~eigen~external+fltk+gmp~hdf5~ipo+med+metis+mmg+mpi+netgen+oce~opencascade~openmp~petsc~privateapi+shared~slepc+tetgen+voropp build_system=cmake build_type=Release generator=make', 'container': 'kube-state-metrics', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0'}, 'value': [1706117733, '1']}]}}
VALID_ANNOTATIONS = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_pod_annotations', 'annotation_gitlab_ci_job_id': '9892514', 'annotation_metrics_spack_ci_stack_name': 'e4s', 'annotation_metrics_spack_job_spec_arch': 'linux', 'annotation_metrics_spack_job_spec_compiler_name': 'gcc', 'annotation_metrics_spack_job_spec_compiler_version': '11.4.0', 'annotation_metrics_spack_job_retry_count': '0', 'annotation_metrics_spack_job_spec_pkg_name': 'gmsh', 'annotation_metrics_spack_job_spec_pkg_version': '4.8.4', 'annotation_metrics_spack_job_spec_variants': '+alglib~cairo+cgns+compression~eigen~external+fltk+gmp~hdf5~ipo+med+metis+mmg+mpi+netgen+oce~opencascade~openmp~petsc~privateapi+shared~slepc+tetgen+voropp build_system=cmake build_type=Release generator=make', 'container': 'kube-state-metrics', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0'}, 'value': [1706117733, '1']}]}}
VALID_RESOURCE_REQUESTS = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_pod_container_resource_requests', 'container': 'build', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'resource': 'cpu', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0', 'unit': 'core'}, 'value': [1706117733, '0.75']}, {'metric': {'__name__': 'kube_pod_container_resource_requests', 'container': 'build', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'resource': 'memory', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0', 'unit': 'byte'}, 'value': [1706117733, '2000000000']}]}}
VALID_RESOURCE_LIMITS = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_pod_container_resource_limits', 'container': 'build', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'resource': 'memory', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0', 'unit': 'byte'}, 'value': [1706117733, '48000000000']}]}}
VALID_MEMORY_USAGE = {'status': 'success', 'data': {'resultType': 'matrix', 'result': [{'metric': {'__name__': 'container_memory_working_set_bytes', 'container': 'build', 'endpoint': 'https-metrics', 'id': '/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-podd7aa13e0_998c_4f21_b1d6_62781f4980b0.slice/cri-containerd-48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1.scope', 'image': 'ghcr.io/spack/ubuntu20.04-runner-amd64-gcc-11.4:2023.08.01', 'instance': '192.168.86.107:10250', 'job': 'kubelet', 'metrics_path': '/metrics/cadvisor', 'name': '48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'service': 'kube-prometheus-stack-kubelet'}, 'values': [[1706117115, '2785280'], [1706117116, '2785280'], [1706117117, '2785280'], [1706117118, '2785280'], [1706117119, '2785280'], [1706117120, '2785280'], [1706117121, '2785280'], [1706117122, '2785280'], [1706117123, '2785280'], [1706117124, '2785280'], [1706117125, '2785280'], [1706117126, '2785280'], [1706117127, '2785280'], [1706117128, '2785280'], [1706117129, '2785280'], [1706117130, '2785280'], [1706118416, '594620416'], [1706118417, '594620416'], [1706118418, '594620416'], [1706118419, '594620416'], [1706118420, '594620416']]}]}}
gantry/tests/test_collection.py (19 changes: 19 additions & 0 deletions)
@@ -1,8 +1,11 @@
+import copy
+
 import pytest

 from gantry.clients.gitlab import GitlabClient
 from gantry.clients.prometheus import PrometheusClient
 from gantry.routes.collection import fetch_job, fetch_node, handle_pipeline
+from gantry.routes.prediction.prediction import RETRY_COUNT_LIMIT
 from gantry.tests.defs import collection as defs

 # mapping of prometheus request shortcuts
@@ -186,6 +189,22 @@ async def test_handle_pipeline(db_conn, gitlab, prometheus):
         await handle_pipeline(defs.FAILED_PIPELINE, db_conn, gitlab, prometheus) is None
     )

+    # job oom killed, but over retry limit
+    p_list = list(p.values())
+    # modify the annotations
+    # deepcopy so the original annotations are not modified
+    p_list[0] = copy.deepcopy(p_list[0])
+    p_list[0]["data"]["result"][0]["metric"] |= {
+        "annotation_metrics_spack_job_retry_count": str(RETRY_COUNT_LIMIT)
+    }
+    p_list.insert(1, defs.OOM_KILLED)
+    prometheus._query.side_effect = p_list
+    # handle_pipeline should not allow a retry because the retry count is over the limit
+    # however, if another job was oomed but not over the limit, it should be retried
+    assert (
+        await handle_pipeline(defs.FAILED_PIPELINE, db_conn, gitlab, prometheus) is None
+    )
+
     # pipeline failed, one job was oomed the other was not
     p_list = list(p.values())
     # after verifying job was not oomed, go onto the next job to insert annotations
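
The fixture-patching pattern used in the new test case, shown in isolation. This is a sketch with a made-up fixture and an assumed limit; the real test patches the defs.VALID_ANNOTATIONS response and a mocked PrometheusClient._query.

import copy
from unittest import mock

RETRY_COUNT_LIMIT = 2  # assumed value for this example

fixture = {"data": {"result": [{"metric": {"annotation_metrics_spack_job_retry_count": "0"}}]}}

# deepcopy so other test cases still see the original retry count
patched = copy.deepcopy(fixture)
patched["data"]["result"][0]["metric"] |= {
    "annotation_metrics_spack_job_retry_count": str(RETRY_COUNT_LIMIT)
}

# side_effect with a list returns one canned response per call, in order
query = mock.AsyncMock(side_effect=[patched, fixture])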
