Skip to content

Commit

Permalink
handle oom draft
Browse files Browse the repository at this point in the history
  • Loading branch information
cmelone committed Jan 23, 2024
1 parent 45d8ef1 commit 5a3ede8
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 1 deletion.
6 changes: 5 additions & 1 deletion gantry/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,17 @@ async def fetch_build(payload: dict, db: aiosqlite.Connection) -> None:

# perform checks to see if we should collect data for this job
if (
build.status not in ("success",)
build.status not in ("success", "failed")
or not build.valid_name # is not a build job
or await build.in_db(db) # job already in the database
or await build.is_ghost(db, gitlab)
):
return

# only collect data for jobs that failed due to resource contention
if build.status == "failed" and not await build.is_oom(prometheus):
return

try:
await build.get_annotations(prometheus)
await build.get_resources(prometheus)
Expand Down
22 changes: 22 additions & 0 deletions gantry/models/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,28 @@ async def is_ghost(self, db: aiosqlite.Connection, gl: GitlabClient) -> bool:

return ghost

async def is_oom(self, prometheus: PrometheusClient) -> bool:
    """Checks if a failed job was OOM killed.

    Runs a range query against the
    ``kube_pod_container_status_last_terminated_reason`` metric, filtered to
    the ``build`` container of this job's pod and the ``OOMKilled`` reason.

    Args:
        prometheus: client used to run the query.

    Returns:
        True if the query returned any data, False if the result was empty.
    """
    # The query must be awaited: an un-awaited coroutine object is always
    # truthy, which would make every failed job look like an OOM kill.
    oom_status = await prometheus.query(
        type="range",
        query={
            "metric": "kube_pod_container_status_last_terminated_reason",
            "filters": {
                "container": "build",
                "pod": self.pod,
                "reason": "OOMKilled",
            },
        },
        start=self.start,
        # give a 10 minute buffer past the end of the job
        # NOTE(review): assumes start/end are epoch seconds -- confirm
        end=self.end + (10 * 60),
    )

    if not oom_status:
        return False

    # TODO retry job here
    return True

async def in_db(self, db: aiosqlite.Connection) -> bool:
"""Checks if the job is already in the db."""

Expand Down

0 comments on commit 5a3ede8

Please sign in to comment.