async def is_oom(self, prometheus: PrometheusClient) -> bool:
    """Checks if a failed job was OOM killed.

    Runs a range query for the
    ``kube_pod_container_status_last_terminated_reason`` metric, filtered to
    the ``build`` container of this job's pod with reason ``OOMKilled``. The
    query window runs from ``self.start`` to ``self.end`` plus a 10 minute
    buffer.

    Args:
        prometheus: client whose ``query`` method executes the range query.

    Returns:
        True if the query returned any data, False otherwise.
    """
    import inspect

    oom_status = prometheus.query(
        type="range",
        query={
            "metric": "kube_pod_container_status_last_terminated_reason",
            "filters": {
                "container": "build",
                "pod": self.pod,
                "reason": "OOMKilled",
            },
        },
        start=self.start,
        end=self.end + (10 * 60),  # give a 10 minute buffer
    )

    # An un-awaited coroutine object is always truthy, which would classify
    # every failed job as OOM killed without ever running the query. Await
    # the result when the client is asynchronous; a synchronous client's
    # return value is used as-is.
    if inspect.isawaitable(oom_status):
        oom_status = await oom_status

    # TODO retry job here
    return bool(oom_status)