Skip to content

Commit

Permalink
add tests for OOM handling
Browse files Browse the repository at this point in the history
  • Loading branch information
cmelone committed Oct 29, 2024
1 parent ae5b611 commit 888db7a
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 17 deletions.
6 changes: 3 additions & 3 deletions gantry/tests/defs/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,12 @@

# used to compare successful insertions
# run SELECT * FROM table_name WHERE id = 1; from python sqlite api and grab fetchone() result
INSERTED_JOB = (1, 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 1, 1706117046, 1706118420, 9892514, 'success', 'pr42264_bugfix/mathomp4/hdf5-appleclang15', 'gmsh', '4.8.4', '{"alglib": true, "cairo": false, "cgns": true, "compression": true, "eigen": false, "external": false, "fltk": true, "gmp": true, "hdf5": false, "ipo": false, "med": true, "metis": true, "mmg": true, "mpi": true, "netgen": true, "oce": true, "opencascade": false, "openmp": false, "petsc": false, "privateapi": false, "shared": true, "slepc": false, "tetgen": true, "voropp": true, "build_system": "cmake", "build_type": "Release", "generator": "make"}', 'gcc', '11.4.0', 'linux-ubuntu20.04-x86_64_v3', 'e4s', 16, 0.75, None, 1.899768349523097, 0.2971597591741076, 4.128116379389054, 0.2483743618267752, 1.7602635378120381, 2000000000.0, 48000000000.0, 143698407.6190476, 2785280.0, 594620416.0, 2785280.0, 252073065.82263485, 0, 0)
INSERTED_JOB = (1, 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 1, 1706117046, 1706118420, 9892514, 'success', 'pr42264_bugfix/mathomp4/hdf5-appleclang15', 'gmsh', '4.8.4', '{"alglib": true, "cairo": false, "cgns": true, "compression": true, "eigen": false, "external": false, "fltk": true, "gmp": true, "hdf5": false, "ipo": false, "med": true, "metis": true, "mmg": true, "mpi": true, "netgen": true, "oce": true, "opencascade": false, "openmp": false, "petsc": false, "privateapi": false, "shared": true, "slepc": false, "tetgen": true, "voropp": true, "build_system": "cmake", "build_type": "Release", "generator": "make"}', 'gcc', '11.4.0', 'linux', 'e4s', 16, 0.75, None, 1.899768349523097, 0.2971597591741076, 4.128116379389054, 0.2483743618267752, 1.7602635378120381, 2000000000.0, 48000000000.0, 143698407.6190476, 2785280.0, 594620416.0, 2785280.0, 252073065.82263485, 0, 0)
INSERTED_NODE = (1, 'ec253b04-b1dc-f08b-acac-e23df83b3602', 'ip-192-168-86-107.ec2.internal', 24.0, 196608000000.0, 'amd64', 'linux', 'i3en.6xlarge')

# these were obtained by executing the respective queries to Prometheus and capturing the JSON output
# or the raw output of PrometheusClient._query
VALID_ANNOTATIONS = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_pod_annotations', 'annotation_gitlab_ci_job_id': '9892514', 'annotation_metrics_spack_ci_stack_name': 'e4s', 'annotation_metrics_spack_job_spec_arch': 'linux-ubuntu20.04-x86_64_v3', 'annotation_metrics_spack_job_spec_compiler_name': 'gcc', 'annotation_metrics_spack_job_spec_compiler_version': '11.4.0', 'annotation_metrics_spack_job_spec_pkg_name': 'gmsh', 'annotation_metrics_spack_job_spec_pkg_version': '4.8.4', 'annotation_metrics_spack_job_spec_variants': '+alglib~cairo+cgns+compression~eigen~external+fltk+gmp~hdf5~ipo+med+metis+mmg+mpi+netgen+oce~opencascade~openmp~petsc~privateapi+shared~slepc+tetgen+voropp build_system=cmake build_type=Release generator=make', 'container': 'kube-state-metrics', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0'}, 'value': [1706117733, '1']}]}}
VALID_ANNOTATIONS = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_pod_annotations', 'annotation_gitlab_ci_job_id': '9892514', 'annotation_metrics_spack_ci_stack_name': 'e4s', 'annotation_metrics_spack_job_spec_arch': 'linux', 'annotation_metrics_spack_job_spec_compiler_name': 'gcc', 'annotation_metrics_spack_job_spec_compiler_version': '11.4.0', 'annotation_metrics_spack_job_spec_pkg_name': 'gmsh', 'annotation_metrics_spack_job_spec_pkg_version': '4.8.4', 'annotation_metrics_spack_job_spec_variants': '+alglib~cairo+cgns+compression~eigen~external+fltk+gmp~hdf5~ipo+med+metis+mmg+mpi+netgen+oce~opencascade~openmp~petsc~privateapi+shared~slepc+tetgen+voropp build_system=cmake build_type=Release generator=make', 'container': 'kube-state-metrics', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0'}, 'value': [1706117733, '1']}]}}
VALID_RESOURCE_REQUESTS = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_pod_container_resource_requests', 'container': 'build', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'resource': 'cpu', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0', 'unit': 'core'}, 'value': [1706117733, '0.75']}, {'metric': {'__name__': 'kube_pod_container_resource_requests', 'container': 'build', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'resource': 'memory', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0', 'unit': 'byte'}, 'value': [1706117733, '2000000000']}]}}
VALID_RESOURCE_LIMITS = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_pod_container_resource_limits', 'container': 'build', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'resource': 'memory', 'service': 'kube-prometheus-stack-kube-state-metrics', 'uid': 'd7aa13e0-998c-4f21-b1d6-62781f4980b0', 'unit': 'byte'}, 'value': [1706117733, '48000000000']}]}}
VALID_MEMORY_USAGE = {'status': 'success', 'data': {'resultType': 'matrix', 'result': [{'metric': {'__name__': 'container_memory_working_set_bytes', 'container': 'build', 'endpoint': 'https-metrics', 'id': '/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-podd7aa13e0_998c_4f21_b1d6_62781f4980b0.slice/cri-containerd-48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1.scope', 'image': 'ghcr.io/spack/ubuntu20.04-runner-amd64-gcc-11.4:2023.08.01', 'instance': '192.168.86.107:10250', 'job': 'kubelet', 'metrics_path': '/metrics/cadvisor', 'name': '48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'service': 'kube-prometheus-stack-kubelet'}, 'values': [[1706117115, '2785280'], [1706117116, '2785280'], [1706117117, '2785280'], [1706117118, '2785280'], [1706117119, '2785280'], [1706117120, '2785280'], [1706117121, '2785280'], [1706117122, '2785280'], [1706117123, '2785280'], [1706117124, '2785280'], [1706117125, '2785280'], [1706117126, '2785280'], [1706117127, '2785280'], [1706117128, '2785280'], [1706117129, '2785280'], [1706117130, '2785280'], [1706118416, '594620416'], [1706118417, '594620416'], [1706118418, '594620416'], [1706118419, '594620416'], [1706118420, '594620416']]}]}}
Expand Down Expand Up @@ -67,4 +67,4 @@
]
}

INSERTED_OOM_JOB = (1, 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 1, 1706117046, 1706118420, 9892514, 'failed', 'pr42264_bugfix/mathomp4/hdf5-appleclang15', 'gmsh', '4.8.4', '{"alglib": true, "cairo": false, "cgns": true, "compression": true, "eigen": false, "external": false, "fltk": true, "gmp": true, "hdf5": false, "ipo": false, "med": true, "metis": true, "mmg": true, "mpi": true, "netgen": true, "oce": true, "opencascade": false, "openmp": false, "petsc": false, "privateapi": false, "shared": true, "slepc": false, "tetgen": true, "voropp": true, "build_system": "cmake", "build_type": "Release", "generator": "make"}', 'gcc', '11.4.0', 'linux-ubuntu20.04-x86_64_v3', 'e4s', 16, 0.75, None, 1.899768349523097, 0.2971597591741076, 4.128116379389054, 0.2483743618267752, 1.7602635378120381, 2000000000.0, 48000000000.0, 143698407.6190476, 2785280.0, 594620416.0, 2785280.0, 252073065.82263485, 1, 0)
INSERTED_OOM_JOB = (1, 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 1, 1706117046, 1706118420, 9892514, 'failed', 'pr42264_bugfix/mathomp4/hdf5-appleclang15', 'gmsh', '4.8.4', '{"alglib": true, "cairo": false, "cgns": true, "compression": true, "eigen": false, "external": false, "fltk": true, "gmp": true, "hdf5": false, "ipo": false, "med": true, "metis": true, "mmg": true, "mpi": true, "netgen": true, "oce": true, "opencascade": false, "openmp": false, "petsc": false, "privateapi": false, "shared": true, "slepc": false, "tetgen": true, "voropp": true, "build_system": "cmake", "build_type": "Release", "generator": "make"}', 'gcc', '11.4.0', 'linux', 'e4s', 16, 0.75, None, 1.899768349523097, 0.2971597591741076, 4.128116379389054, 0.2483743618267752, 1.7602635378120381, 2000000000.0, 48000000000.0, 143698407.6190476, 2785280.0, 594620416.0, 2785280.0, 252073065.82263485, 1, 0)
10 changes: 10 additions & 0 deletions gantry/tests/defs/prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,13 @@
"KUBERNETES_MEMORY_REQUEST": "2000M",
},
}

OOM_PREDICTION = {
"variables": {
"KUBERNETES_CPU_REQUEST": "14779m",
"KUBERNETES_CPU_LIMIT": "12001m",
"KUBERNETES_MEMORY_REQUEST": "9358M",
"KUBERNETES_MEMORY_LIMIT": "76800M",
"GANTRY_RETRY_COUNT": 1,
},
}
2 changes: 1 addition & 1 deletion gantry/tests/sql/insert_job.sql
Original file line number Diff line number Diff line change
@@ -1 +1 @@
INSERT INTO jobs VALUES(1,'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z',2,1706117046,1706118420,9892514,'success','pr42264_bugfix/mathomp4/hdf5-appleclang15','gmsh','4.8.4','{"alglib": true, "cairo": false, "cgns": true, "compression": true, "eigen": false, "external": false, "fltk": true, "gmp": true, "hdf5": false, "ipo": false, "med": true, "metis": true, "mmg": true, "mpi": true, "netgen": true, "oce": true, "opencascade": false, "openmp": false, "petsc": false, "privateapi": false, "shared": true, "slepc": false, "tetgen": true, "voropp": true, "build_system": "cmake", "build_type": "Release", "generator": "make"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',16,0.75,NULL,4.12532286694540495,3.15805864677520409,11.6038107294648877,0.248374361826775191,3.34888880339475214,2000000000.0,48000000000.0,1649868862.72588062,999763968.0,5679742976.0,2785280.0,1378705563.21018671, 0, 0);
INSERT INTO jobs VALUES(1,'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z',2,1706117046,1706118420,9892514,'success','pr42264_bugfix/mathomp4/hdf5-appleclang15','gmsh','4.8.4','{"alglib": true, "cairo": false, "cgns": true, "compression": true, "eigen": false, "external": false, "fltk": true, "gmp": true, "hdf5": false, "ipo": false, "med": true, "metis": true, "mmg": true, "mpi": true, "netgen": true, "oce": true, "opencascade": false, "openmp": false, "petsc": false, "privateapi": false, "shared": true, "slepc": false, "tetgen": true, "voropp": true, "build_system": "cmake", "build_type": "Release", "generator": "make"}','gcc','11.4.0','linux','e4s',16,0.75,NULL,4.12532286694540495,3.15805864677520409,11.6038107294648877,0.248374361826775191,3.34888880339475214,2000000000.0,48000000000.0,1649868862.72588062,999763968.0,5679742976.0,2785280.0,1378705563.21018671, 0, 0);
Loading

0 comments on commit 888db7a

Please sign in to comment.